author     Philipp Reisner <philipp.reisner@linbit.com>   2012-11-09 08:18:43 -0500
committer  Philipp Reisner <philipp.reisner@linbit.com>   2012-11-09 08:20:23 -0500
commit     986836503e49ccf7e84b813715d344964ec93566 (patch)
tree       b3bea7428efde5b77096cef80e5b6bfee494cc12
parent     ccae7868b0c5697508a541c531cf96b361d62c1c (diff)
parent     328e0f125bf41f4f33f684db22015f92cb44fe56 (diff)
Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6
30 files changed, 12020 insertions, 8533 deletions
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 0d3f337ff5ff..8b450338075e 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -1,5 +1,7 @@ | |||
1 | drbd-y := drbd_bitmap.o drbd_proc.o | 1 | drbd-y := drbd_bitmap.o drbd_proc.o |
2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o | 2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o |
3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o | 3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o |
4 | drbd-y += drbd_interval.o drbd_state.o | ||
5 | drbd-y += drbd_nla.o | ||
4 | 6 | ||
5 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o | 7 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o |
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d4dd563d0d54..92510f8ad013 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -24,21 +24,73 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/crc32c.h> | ||
27 | #include <linux/drbd.h> | 28 | #include <linux/drbd.h> |
29 | #include <linux/drbd_limits.h> | ||
30 | #include <linux/dynamic_debug.h> | ||
28 | #include "drbd_int.h" | 31 | #include "drbd_int.h" |
29 | #include "drbd_wrappers.h" | 32 | #include "drbd_wrappers.h" |
30 | 33 | ||
31 | /* We maintain a trivial checksum in our on disk activity log. | 34 | |
32 | * With that we can ensure correct operation even when the storage | 35 | enum al_transaction_types { |
33 | * device might do a partial (last) sector write while losing power. | 36 | AL_TR_UPDATE = 0, |
34 | */ | 37 | AL_TR_INITIALIZED = 0xffff |
35 | struct __packed al_transaction { | 38 | }; |
36 | u32 magic; | 39 | /* all fields on disc in big endian */ |
37 | u32 tr_number; | 40 | struct __packed al_transaction_on_disk { |
38 | struct __packed { | 41 | /* don't we all like magic */ |
39 | u32 pos; | 42 | __be32 magic; |
40 | u32 extent; } updates[1 + AL_EXTENTS_PT]; | 43 | |
41 | u32 xor_sum; | 44 | /* to identify the most recent transaction block |
45 | * in the on disk ring buffer */ | ||
46 | __be32 tr_number; | ||
47 | |||
48 | /* checksum on the full 4k block, with this field set to 0. */ | ||
49 | __be32 crc32c; | ||
50 | |||
51 | /* type of transaction, special transaction types like: | ||
52 | * purge-all, set-all-idle, set-all-active, ... to-be-defined | ||
53 | * see also enum al_transaction_types */ | ||
54 | __be16 transaction_type; | ||
55 | |||
56 | /* we currently allow only a few thousand extents, | ||
57 | * so 16bit will be enough for the slot number. */ | ||
58 | |||
59 | /* how many updates in this transaction */ | ||
60 | __be16 n_updates; | ||
61 | |||
62 | /* maximum slot number, "al-extents" in drbd.conf speak. | ||
63 | * Having this in each transaction should make reconfiguration | ||
64 | * of that parameter easier. */ | ||
65 | __be16 context_size; | ||
66 | |||
67 | /* slot number the context starts with */ | ||
68 | __be16 context_start_slot_nr; | ||
69 | |||
70 | /* Some reserved bytes. Expected usage is a 64bit counter of | ||
71 | * sectors-written since device creation, and other data generation tag | ||
72 | * supporting usage */ | ||
73 | __be32 __reserved[4]; | ||
74 | |||
75 | /* --- 36 byte used --- */ | ||
76 | |||
77 | /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes | ||
78 | * in one transaction, then use the remaining byte in the 4k block for | ||
79 | * context information. "Flexible" number of updates per transaction | ||
80 | * does not help, as we have to account for the case when all update | ||
81 | * slots are used anyways, so it would only complicate code without | ||
82 | * additional benefit. | ||
83 | */ | ||
84 | __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; | ||
85 | |||
86 | /* but the extent number is 32bit, which at an extent size of 4 MiB | ||
87 | * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ | ||
88 | __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; | ||
89 | |||
90 | /* --- 420 bytes used (36 + 64*6) --- */ | ||
91 | |||
92 | /* 4096 - 420 = 3676 = 919 * 4 */ | ||
93 | __be32 context[AL_CONTEXT_PER_TRANSACTION]; | ||
42 | }; | 94 | }; |
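
The layout comments inside struct al_transaction_on_disk add up to exactly one 4 KiB metadata block: 36 bytes of header, 420 bytes once the two update arrays are included, and 919 remaining context slots. A minimal standalone sketch, assuming AL_UPDATES_PER_TRANSACTION = 64 and AL_CONTEXT_PER_TRANSACTION = 919 as those comments imply (the real constants come from the DRBD headers), makes that arithmetic checkable at compile time:

#include <stdint.h>

#define AL_UPDATES_PER_TRANSACTION 64   /* assumed, per the "36 + 64*6" comment */
#define AL_CONTEXT_PER_TRANSACTION 919  /* assumed, per the "3676 = 919 * 4" comment */

struct al_transaction_layout {
	uint32_t magic;                   /*  4 bytes */
	uint32_t tr_number;               /*  4 */
	uint32_t crc32c;                  /*  4 */
	uint16_t transaction_type;        /*  2 */
	uint16_t n_updates;               /*  2 */
	uint16_t context_size;            /*  2 */
	uint16_t context_start_slot_nr;   /*  2 */
	uint32_t __reserved[4];           /* 16  -> 36 bytes so far */
	uint16_t update_slot_nr[AL_UPDATES_PER_TRANSACTION];    /* 128 */
	uint32_t update_extent_nr[AL_UPDATES_PER_TRANSACTION];  /* 256 -> 420 */
	uint32_t context[AL_CONTEXT_PER_TRANSACTION];           /* 3676 -> 4096 */
} __attribute__((packed));

/* refuses to compile unless the transaction fills exactly one 4k block */
_Static_assert(sizeof(struct al_transaction_layout) == 4096,
	       "AL transaction must be exactly one 4 KiB block");
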
43 | 95 | ||
44 | struct update_odbm_work { | 96 | struct update_odbm_work { |
@@ -48,22 +100,11 @@ struct update_odbm_work { | |||
48 | 100 | ||
49 | struct update_al_work { | 101 | struct update_al_work { |
50 | struct drbd_work w; | 102 | struct drbd_work w; |
51 | struct lc_element *al_ext; | ||
52 | struct completion event; | 103 | struct completion event; |
53 | unsigned int enr; | 104 | int err; |
54 | /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ | ||
55 | unsigned int old_enr; | ||
56 | }; | ||
57 | |||
58 | struct drbd_atodb_wait { | ||
59 | atomic_t count; | ||
60 | struct completion io_done; | ||
61 | struct drbd_conf *mdev; | ||
62 | int error; | ||
63 | }; | 105 | }; |
64 | 106 | ||
65 | 107 | static int al_write_transaction(struct drbd_conf *mdev); | |
66 | int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); | ||
67 | 108 | ||
68 | void *drbd_md_get_buffer(struct drbd_conf *mdev) | 109 | void *drbd_md_get_buffer(struct drbd_conf *mdev) |
69 | { | 110 | { |
@@ -85,12 +126,17 @@ void drbd_md_put_buffer(struct drbd_conf *mdev) | |||
85 | void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | 126 | void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
86 | unsigned int *done) | 127 | unsigned int *done) |
87 | { | 128 | { |
88 | long dt = bdev->dc.disk_timeout * HZ / 10; | 129 | long dt; |
130 | |||
131 | rcu_read_lock(); | ||
132 | dt = rcu_dereference(bdev->disk_conf)->disk_timeout; | ||
133 | rcu_read_unlock(); | ||
134 | dt = dt * HZ / 10; | ||
89 | if (dt == 0) | 135 | if (dt == 0) |
90 | dt = MAX_SCHEDULE_TIMEOUT; | 136 | dt = MAX_SCHEDULE_TIMEOUT; |
91 | 137 | ||
92 | dt = wait_event_timeout(mdev->misc_wait, | 138 | dt = wait_event_timeout(mdev->misc_wait, |
93 | *done || drbd_test_flag(mdev, FORCE_DETACH), dt); | 139 | *done || test_bit(FORCE_DETACH, &mdev->flags), dt); |
94 | if (dt == 0) { | 140 | if (dt == 0) { |
95 | dev_err(DEV, "meta-data IO operation timed out\n"); | 141 | dev_err(DEV, "meta-data IO operation timed out\n"); |
96 | drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH); | 142 | drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH); |
@@ -103,20 +149,20 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
103 | int rw, int size) | 149 | int rw, int size) |
104 | { | 150 | { |
105 | struct bio *bio; | 151 | struct bio *bio; |
106 | int ok; | 152 | int err; |
107 | 153 | ||
108 | mdev->md_io.done = 0; | 154 | mdev->md_io.done = 0; |
109 | mdev->md_io.error = -ENODEV; | 155 | mdev->md_io.error = -ENODEV; |
110 | 156 | ||
111 | if ((rw & WRITE) && !drbd_test_flag(mdev, MD_NO_FUA)) | 157 | if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) |
112 | rw |= REQ_FUA | REQ_FLUSH; | 158 | rw |= REQ_FUA | REQ_FLUSH; |
113 | rw |= REQ_SYNC; | 159 | rw |= REQ_SYNC; |
114 | 160 | ||
115 | bio = bio_alloc_drbd(GFP_NOIO); | 161 | bio = bio_alloc_drbd(GFP_NOIO); |
116 | bio->bi_bdev = bdev->md_bdev; | 162 | bio->bi_bdev = bdev->md_bdev; |
117 | bio->bi_sector = sector; | 163 | bio->bi_sector = sector; |
118 | ok = (bio_add_page(bio, page, size, 0) == size); | 164 | err = -EIO; |
119 | if (!ok) | 165 | if (bio_add_page(bio, page, size, 0) != size) |
120 | goto out; | 166 | goto out; |
121 | bio->bi_private = &mdev->md_io; | 167 | bio->bi_private = &mdev->md_io; |
122 | bio->bi_end_io = drbd_md_io_complete; | 168 | bio->bi_end_io = drbd_md_io_complete; |
@@ -124,7 +170,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
124 | 170 | ||
125 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ | 171 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ |
126 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); | 172 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); |
127 | ok = 0; | 173 | err = -ENODEV; |
128 | goto out; | 174 | goto out; |
129 | } | 175 | } |
130 | 176 | ||
@@ -135,85 +181,46 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
135 | else | 181 | else |
136 | submit_bio(rw, bio); | 182 | submit_bio(rw, bio); |
137 | wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done); | 183 | wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done); |
138 | ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; | 184 | if (bio_flagged(bio, BIO_UPTODATE)) |
185 | err = mdev->md_io.error; | ||
139 | 186 | ||
140 | out: | 187 | out: |
141 | bio_put(bio); | 188 | bio_put(bio); |
142 | return ok; | 189 | return err; |
143 | } | 190 | } |
144 | 191 | ||
145 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | 192 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
146 | sector_t sector, int rw) | 193 | sector_t sector, int rw) |
147 | { | 194 | { |
148 | int logical_block_size, mask, ok; | 195 | int err; |
149 | int offset = 0; | ||
150 | struct page *iop = mdev->md_io_page; | 196 | struct page *iop = mdev->md_io_page; |
151 | 197 | ||
152 | D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); | 198 | D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); |
153 | 199 | ||
154 | BUG_ON(!bdev->md_bdev); | 200 | BUG_ON(!bdev->md_bdev); |
155 | 201 | ||
156 | logical_block_size = bdev_logical_block_size(bdev->md_bdev); | 202 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", |
157 | if (logical_block_size == 0) | 203 | current->comm, current->pid, __func__, |
158 | logical_block_size = MD_SECTOR_SIZE; | 204 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
159 | |||
160 | /* in case logical_block_size != 512 [ s390 only? ] */ | ||
161 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
162 | mask = (logical_block_size / MD_SECTOR_SIZE) - 1; | ||
163 | D_ASSERT(mask == 1 || mask == 3 || mask == 7); | ||
164 | D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); | ||
165 | offset = sector & mask; | ||
166 | sector = sector & ~mask; | ||
167 | iop = mdev->md_io_tmpp; | ||
168 | |||
169 | if (rw & WRITE) { | ||
170 | /* these are GFP_KERNEL pages, pre-allocated | ||
171 | * on device initialization */ | ||
172 | void *p = page_address(mdev->md_io_page); | ||
173 | void *hp = page_address(mdev->md_io_tmpp); | ||
174 | |||
175 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, | ||
176 | READ, logical_block_size); | ||
177 | |||
178 | if (unlikely(!ok)) { | ||
179 | dev_err(DEV, "drbd_md_sync_page_io(,%llus," | ||
180 | "READ [logical_block_size!=512]) failed!\n", | ||
181 | (unsigned long long)sector); | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); | ||
186 | } | ||
187 | } | ||
188 | 205 | ||
189 | if (sector < drbd_md_first_sector(bdev) || | 206 | if (sector < drbd_md_first_sector(bdev) || |
190 | sector > drbd_md_last_sector(bdev)) | 207 | sector + 7 > drbd_md_last_sector(bdev)) |
191 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", | 208 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", |
192 | current->comm, current->pid, __func__, | 209 | current->comm, current->pid, __func__, |
193 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 210 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
194 | 211 | ||
195 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); | 212 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); |
196 | if (unlikely(!ok)) { | 213 | if (err) { |
197 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", | 214 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", |
198 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 215 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); |
199 | return 0; | ||
200 | } | 216 | } |
201 | 217 | return err; | |
202 | if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { | ||
203 | void *p = page_address(mdev->md_io_page); | ||
204 | void *hp = page_address(mdev->md_io_tmpp); | ||
205 | |||
206 | memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); | ||
207 | } | ||
208 | |||
209 | return ok; | ||
210 | } | 218 | } |
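
The out-of-range check above changed from testing the single start sector to testing sector + 7, because metadata IO now always covers one full MD_BLOCK_SIZE block. A small sketch of that bound, under the assumption that MD_BLOCK_SIZE is 4096 bytes (eight 512-byte sectors):

#define MD_BLOCK_SIZE 4096                       /* assumed block size */
#define MD_BLOCK_SECTORS (MD_BLOCK_SIZE / 512)   /* 8 sectors per metadata block */

/* a 4k metadata IO starting at 'sector' touches sectors sector..sector+7,
 * so both ends must lie inside [first, last] */
static int md_io_in_range(unsigned long long sector,
			  unsigned long long first, unsigned long long last)
{
	return sector >= first && sector + (MD_BLOCK_SECTORS - 1) <= last;
}
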
211 | 219 | ||
212 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | 220 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) |
213 | { | 221 | { |
214 | struct lc_element *al_ext; | 222 | struct lc_element *al_ext; |
215 | struct lc_element *tmp; | 223 | struct lc_element *tmp; |
216 | unsigned long al_flags = 0; | ||
217 | int wake; | 224 | int wake; |
218 | 225 | ||
219 | spin_lock_irq(&mdev->al_lock); | 226 | spin_lock_irq(&mdev->al_lock); |
@@ -228,76 +235,92 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | |||
228 | return NULL; | 235 | return NULL; |
229 | } | 236 | } |
230 | } | 237 | } |
231 | al_ext = lc_get(mdev->act_log, enr); | 238 | al_ext = lc_get(mdev->act_log, enr); |
232 | al_flags = mdev->act_log->flags; | ||
233 | spin_unlock_irq(&mdev->al_lock); | 239 | spin_unlock_irq(&mdev->al_lock); |
234 | |||
235 | /* | ||
236 | if (!al_ext) { | ||
237 | if (al_flags & LC_STARVING) | ||
238 | dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); | ||
239 | if (al_flags & LC_DIRTY) | ||
240 | dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); | ||
241 | } | ||
242 | */ | ||
243 | |||
244 | return al_ext; | 240 | return al_ext; |
245 | } | 241 | } |
246 | 242 | ||
247 | void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) | 243 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) |
248 | { | 244 | { |
249 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | 245 | /* for bios crossing activity log extent boundaries, |
250 | struct lc_element *al_ext; | 246 | * we may need to activate two extents in one go */ |
251 | struct update_al_work al_work; | 247 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); |
248 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
249 | unsigned enr; | ||
250 | bool locked = false; | ||
252 | 251 | ||
252 | |||
253 | D_ASSERT(first <= last); | ||
253 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | 254 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); |
254 | 255 | ||
255 | wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); | 256 | for (enr = first; enr <= last; enr++) |
257 | wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); | ||
256 | 258 | ||
257 | if (al_ext->lc_number != enr) { | 259 | /* Serialize multiple transactions. |
260 | * This uses test_and_set_bit, memory barrier is implicit. | ||
261 | */ | ||
262 | wait_event(mdev->al_wait, | ||
263 | mdev->act_log->pending_changes == 0 || | ||
264 | (locked = lc_try_lock_for_transaction(mdev->act_log))); | ||
265 | |||
266 | if (locked) { | ||
258 | /* drbd_al_write_transaction(mdev,al_ext,enr); | 267 | /* drbd_al_write_transaction(mdev,al_ext,enr); |
259 | * recurses into generic_make_request(), which | 268 | * recurses into generic_make_request(), which |
260 | * disallows recursion, bios being serialized on the | 269 | * disallows recursion, bios being serialized on the |
261 | * current->bio_tail list now. | 270 | * current->bio_tail list now. |
262 | * we have to delegate updates to the activity log | 271 | * we have to delegate updates to the activity log |
263 | * to the worker thread. */ | 272 | * to the worker thread. */ |
264 | init_completion(&al_work.event); | 273 | |
265 | al_work.al_ext = al_ext; | 274 | /* Double check: it may have been committed by someone else, |
266 | al_work.enr = enr; | 275 | * while we have been waiting for the lock. */ |
267 | al_work.old_enr = al_ext->lc_number; | 276 | if (mdev->act_log->pending_changes) { |
268 | al_work.w.cb = w_al_write_transaction; | 277 | bool write_al_updates; |
269 | drbd_queue_work_front(&mdev->data.work, &al_work.w); | 278 | |
270 | wait_for_completion(&al_work.event); | 279 | rcu_read_lock(); |
271 | 280 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; | |
272 | mdev->al_writ_cnt++; | 281 | rcu_read_unlock(); |
273 | 282 | ||
274 | spin_lock_irq(&mdev->al_lock); | 283 | if (write_al_updates) { |
275 | lc_changed(mdev->act_log, al_ext); | 284 | al_write_transaction(mdev); |
276 | spin_unlock_irq(&mdev->al_lock); | 285 | mdev->al_writ_cnt++; |
286 | } | ||
287 | |||
288 | spin_lock_irq(&mdev->al_lock); | ||
289 | /* FIXME | ||
290 | if (err) | ||
291 | we need an "lc_cancel" here; | ||
292 | */ | ||
293 | lc_committed(mdev->act_log); | ||
294 | spin_unlock_irq(&mdev->al_lock); | ||
295 | } | ||
296 | lc_unlock(mdev->act_log); | ||
277 | wake_up(&mdev->al_wait); | 297 | wake_up(&mdev->al_wait); |
278 | } | 298 | } |
279 | } | 299 | } |
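
drbd_al_begin_io() now derives a first/last extent pair from the request interval, so a write that straddles an activity-log extent boundary activates both extents before proceeding. A small userspace sketch of that arithmetic, assuming AL_EXTENT_SHIFT = 22 (4 MiB extents, matching the "extent size of 4 MiB" comment above) and 512-byte sectors:

#include <stdio.h>

#define AL_EXTENT_SHIFT 22   /* assumed: 4 MiB activity-log extents */

int main(void)
{
	unsigned long long sector = 8192 - 8;   /* 4 KiB before the first 4 MiB boundary */
	unsigned int size = 16 * 512;           /* 8 KiB request = 16 sectors */

	unsigned first = sector >> (AL_EXTENT_SHIFT - 9);
	unsigned last = (sector + (size >> 9) - 1) >> (AL_EXTENT_SHIFT - 9);

	/* prints "first=0 last=1": the request crosses the boundary, so both
	 * extents have to be active in the AL before the write may start */
	printf("first=%u last=%u\n", first, last);
	return 0;
}
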
280 | 300 | ||
281 | void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) | 301 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) |
282 | { | 302 | { |
283 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | 303 | /* for bios crossing activity log extent boundaries, |
304 | * we may need to activate two extents in one go */ | ||
305 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
306 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
307 | unsigned enr; | ||
284 | struct lc_element *extent; | 308 | struct lc_element *extent; |
285 | unsigned long flags; | 309 | unsigned long flags; |
286 | 310 | ||
311 | D_ASSERT(first <= last); | ||
287 | spin_lock_irqsave(&mdev->al_lock, flags); | 312 | spin_lock_irqsave(&mdev->al_lock, flags); |
288 | 313 | ||
289 | extent = lc_find(mdev->act_log, enr); | 314 | for (enr = first; enr <= last; enr++) { |
290 | 315 | extent = lc_find(mdev->act_log, enr); | |
291 | if (!extent) { | 316 | if (!extent) { |
292 | spin_unlock_irqrestore(&mdev->al_lock, flags); | 317 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); |
293 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); | 318 | continue; |
294 | return; | 319 | } |
320 | lc_put(mdev->act_log, extent); | ||
295 | } | 321 | } |
296 | |||
297 | if (lc_put(mdev->act_log, extent) == 0) | ||
298 | wake_up(&mdev->al_wait); | ||
299 | |||
300 | spin_unlock_irqrestore(&mdev->al_lock, flags); | 322 | spin_unlock_irqrestore(&mdev->al_lock, flags); |
323 | wake_up(&mdev->al_wait); | ||
301 | } | 324 | } |
302 | 325 | ||
303 | #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) | 326 | #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) |
@@ -323,296 +346,148 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) | |||
323 | return rs_enr >> | 346 | return rs_enr >> |
324 | /* bit to page */ | 347 | /* bit to page */ |
325 | ((PAGE_SHIFT + 3) - | 348 | ((PAGE_SHIFT + 3) - |
326 | /* al extent number to bit */ | 349 | /* resync extent number to bit */ |
327 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); | 350 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); |
328 | } | 351 | } |
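
rs_extent_to_bm_page() (and its AL counterpart al_extent_to_bm_page(), used further down) map an extent number to the bitmap page holding its bits by composing two shifts: extent number to bit, then bit to page. A sketch with assumed typical constants (PAGE_SHIFT 12, BM_BLOCK_SHIFT 12, AL_EXTENT_SHIFT 22, BM_EXT_SHIFT 24; the real values live in the DRBD headers):

#define PAGE_SHIFT      12   /* assumed: 4 KiB pages */
#define BM_BLOCK_SHIFT  12   /* assumed: one bitmap bit per 4 KiB */
#define AL_EXTENT_SHIFT 22   /* assumed: 4 MiB AL extent */
#define BM_EXT_SHIFT    24   /* assumed: 16 MiB resync extent */

/* a page holds 2^(PAGE_SHIFT + 3) = 32768 bits; one AL extent covers
 * 2^(22-12) = 1024 bits, so 32 AL extents share a bitmap page: */
static unsigned int al_extent_to_bm_page_sketch(unsigned int al_enr)
{
	return al_enr >> ((PAGE_SHIFT + 3) - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); /* >> 5 */
}

/* one resync extent covers 2^(24-12) = 4096 bits, so 8 of them share a page: */
static unsigned int rs_extent_to_bm_page_sketch(unsigned int rs_enr)
{
	return rs_enr >> ((PAGE_SHIFT + 3) - (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); /* >> 3 */
}
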
329 | 352 | ||
330 | int | 353 | static int |
331 | w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 354 | _al_write_transaction(struct drbd_conf *mdev) |
332 | { | 355 | { |
333 | struct update_al_work *aw = container_of(w, struct update_al_work, w); | 356 | struct al_transaction_on_disk *buffer; |
334 | struct lc_element *updated = aw->al_ext; | 357 | struct lc_element *e; |
335 | const unsigned int new_enr = aw->enr; | ||
336 | const unsigned int evicted = aw->old_enr; | ||
337 | struct al_transaction *buffer; | ||
338 | sector_t sector; | 358 | sector_t sector; |
339 | int i, n, mx; | 359 | int i, mx; |
340 | unsigned int extent_nr; | 360 | unsigned extent_nr; |
341 | u32 xor_sum = 0; | 361 | unsigned crc = 0; |
362 | int err = 0; | ||
342 | 363 | ||
343 | if (!get_ldev(mdev)) { | 364 | if (!get_ldev(mdev)) { |
344 | dev_err(DEV, | 365 | dev_err(DEV, "disk is %s, cannot start al transaction\n", |
345 | "disk is %s, cannot start al transaction (-%d +%d)\n", | 366 | drbd_disk_str(mdev->state.disk)); |
346 | drbd_disk_str(mdev->state.disk), evicted, new_enr); | 367 | return -EIO; |
347 | complete(&((struct update_al_work *)w)->event); | ||
348 | return 1; | ||
349 | } | 368 | } |
350 | /* do we have to do a bitmap write, first? | ||
351 | * TODO reduce maximum latency: | ||
352 | * submit both bios, then wait for both, | ||
353 | * instead of doing two synchronous sector writes. | ||
354 | * For now, we must not write the transaction, | ||
355 | * if we cannot write out the bitmap of the evicted extent. */ | ||
356 | if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) | ||
357 | drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted)); | ||
358 | 369 | ||
359 | /* The bitmap write may have failed, causing a state change. */ | 370 | /* The bitmap write may have failed, causing a state change. */ |
360 | if (mdev->state.disk < D_INCONSISTENT) { | 371 | if (mdev->state.disk < D_INCONSISTENT) { |
361 | dev_err(DEV, | 372 | dev_err(DEV, |
362 | "disk is %s, cannot write al transaction (-%d +%d)\n", | 373 | "disk is %s, cannot write al transaction\n", |
363 | drbd_disk_str(mdev->state.disk), evicted, new_enr); | 374 | drbd_disk_str(mdev->state.disk)); |
364 | complete(&((struct update_al_work *)w)->event); | ||
365 | put_ldev(mdev); | 375 | put_ldev(mdev); |
366 | return 1; | 376 | return -EIO; |
367 | } | 377 | } |
368 | 378 | ||
369 | buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ | 379 | buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ |
370 | if (!buffer) { | 380 | if (!buffer) { |
371 | dev_err(DEV, "disk failed while waiting for md_io buffer\n"); | 381 | dev_err(DEV, "disk failed while waiting for md_io buffer\n"); |
372 | complete(&((struct update_al_work *)w)->event); | ||
373 | put_ldev(mdev); | 382 | put_ldev(mdev); |
374 | return 1; | 383 | return -ENODEV; |
375 | } | 384 | } |
376 | 385 | ||
377 | buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); | 386 | memset(buffer, 0, sizeof(*buffer)); |
387 | buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); | ||
378 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); | 388 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); |
379 | 389 | ||
380 | n = lc_index_of(mdev->act_log, updated); | 390 | i = 0; |
381 | 391 | ||
382 | buffer->updates[0].pos = cpu_to_be32(n); | 392 | /* Even though no one can start to change this list |
383 | buffer->updates[0].extent = cpu_to_be32(new_enr); | 393 | * once we set the LC_LOCKED -- from drbd_al_begin_io(), |
394 | * lc_try_lock_for_transaction() --, someone may still | ||
395 | * be in the process of changing it. */ | ||
396 | spin_lock_irq(&mdev->al_lock); | ||
397 | list_for_each_entry(e, &mdev->act_log->to_be_changed, list) { | ||
398 | if (i == AL_UPDATES_PER_TRANSACTION) { | ||
399 | i++; | ||
400 | break; | ||
401 | } | ||
402 | buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); | ||
403 | buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); | ||
404 | if (e->lc_number != LC_FREE) | ||
405 | drbd_bm_mark_for_writeout(mdev, | ||
406 | al_extent_to_bm_page(e->lc_number)); | ||
407 | i++; | ||
408 | } | ||
409 | spin_unlock_irq(&mdev->al_lock); | ||
410 | BUG_ON(i > AL_UPDATES_PER_TRANSACTION); | ||
384 | 411 | ||
385 | xor_sum ^= new_enr; | 412 | buffer->n_updates = cpu_to_be16(i); |
413 | for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { | ||
414 | buffer->update_slot_nr[i] = cpu_to_be16(-1); | ||
415 | buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); | ||
416 | } | ||
417 | |||
418 | buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements); | ||
419 | buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle); | ||
386 | 420 | ||
387 | mx = min_t(int, AL_EXTENTS_PT, | 421 | mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, |
388 | mdev->act_log->nr_elements - mdev->al_tr_cycle); | 422 | mdev->act_log->nr_elements - mdev->al_tr_cycle); |
389 | for (i = 0; i < mx; i++) { | 423 | for (i = 0; i < mx; i++) { |
390 | unsigned idx = mdev->al_tr_cycle + i; | 424 | unsigned idx = mdev->al_tr_cycle + i; |
391 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; | 425 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; |
392 | buffer->updates[i+1].pos = cpu_to_be32(idx); | 426 | buffer->context[i] = cpu_to_be32(extent_nr); |
393 | buffer->updates[i+1].extent = cpu_to_be32(extent_nr); | ||
394 | xor_sum ^= extent_nr; | ||
395 | } | ||
396 | for (; i < AL_EXTENTS_PT; i++) { | ||
397 | buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); | ||
398 | buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); | ||
399 | xor_sum ^= LC_FREE; | ||
400 | } | 427 | } |
401 | mdev->al_tr_cycle += AL_EXTENTS_PT; | 428 | for (; i < AL_CONTEXT_PER_TRANSACTION; i++) |
429 | buffer->context[i] = cpu_to_be32(LC_FREE); | ||
430 | |||
431 | mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; | ||
402 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | 432 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) |
403 | mdev->al_tr_cycle = 0; | 433 | mdev->al_tr_cycle = 0; |
404 | 434 | ||
405 | buffer->xor_sum = cpu_to_be32(xor_sum); | ||
406 | |||
407 | sector = mdev->ldev->md.md_offset | 435 | sector = mdev->ldev->md.md_offset |
408 | + mdev->ldev->md.al_offset + mdev->al_tr_pos; | 436 | + mdev->ldev->md.al_offset |
437 | + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); | ||
409 | 438 | ||
410 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) | 439 | crc = crc32c(0, buffer, 4096); |
411 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 440 | buffer->crc32c = cpu_to_be32(crc); |
412 | |||
413 | if (++mdev->al_tr_pos > | ||
414 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
415 | mdev->al_tr_pos = 0; | ||
416 | 441 | ||
417 | D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); | 442 | if (drbd_bm_write_hinted(mdev)) |
418 | mdev->al_tr_number++; | 443 | err = -EIO; |
444 | /* drbd_chk_io_error done already */ | ||
445 | else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | ||
446 | err = -EIO; | ||
447 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
448 | } else { | ||
449 | /* advance ringbuffer position and transaction counter */ | ||
450 | mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); | ||
451 | mdev->al_tr_number++; | ||
452 | } | ||
419 | 453 | ||
420 | drbd_md_put_buffer(mdev); | 454 | drbd_md_put_buffer(mdev); |
421 | |||
422 | complete(&((struct update_al_work *)w)->event); | ||
423 | put_ldev(mdev); | 455 | put_ldev(mdev); |
424 | 456 | ||
425 | return 1; | 457 | return err; |
426 | } | 458 | } |
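
_al_write_transaction() computes the crc32c over the whole 4 KiB block while the crc32c field is still zero, stores it, writes the block, and only on success advances the ring-buffer slot and transaction counter. A compact userspace sketch of that commit/verify pairing (byte-order conversion omitted; AL_RING_SLOTS = 8 is an assumption for MD_AL_SECTORS*512/MD_BLOCK_SIZE, and crc32c_sim() is a plain bitwise stand-in for the kernel's crc32c()):

#include <stdint.h>
#include <string.h>

#define AL_RING_SLOTS 8   /* assumption: 32 KiB AL area / 4 KiB per transaction */

/* simple bitwise CRC32C; the kernel uses crc32c() from lib/ instead */
static uint32_t crc32c_sim(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;
	size_t i;
	int k;

	crc = ~crc;
	for (i = 0; i < len; i++) {
		crc ^= p[i];
		for (k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return ~crc;
}

struct al_tr { uint32_t magic, tr_number, crc32c; uint8_t rest[4084]; };

static void commit_al_transaction(struct al_tr *buf, uint32_t magic,
				  unsigned *tr_pos, uint32_t *tr_nr)
{
	memset(buf, 0, sizeof(*buf));           /* reserved bytes stay zero */
	buf->magic = magic;                     /* DRBD_AL_MAGIC in the real code */
	buf->tr_number = *tr_nr;
	/* checksum is computed while the crc32c field is still zero */
	buf->crc32c = crc32c_sim(0, buf, sizeof(*buf));
	/* ... write the 4 KiB block at slot *tr_pos, then advance the ring */
	*tr_pos = (*tr_pos + 1) % AL_RING_SLOTS;
	(*tr_nr)++;
}

static int al_transaction_valid(struct al_tr *buf)
{
	uint32_t stored = buf->crc32c;
	uint32_t computed;

	buf->crc32c = 0;                        /* the field was zero when checksummed */
	computed = crc32c_sim(0, buf, sizeof(*buf));
	buf->crc32c = stored;
	return computed == stored;
}

int main(void)
{
	static struct al_tr buf;
	unsigned pos = 0;
	uint32_t nr = 1;

	commit_al_transaction(&buf, 0x12345678 /* placeholder magic */, &pos, &nr);
	return al_transaction_valid(&buf) ? 0 : 1;   /* exits 0: checksum verifies */
}
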
427 | 459 | ||
428 | /** | ||
429 | * drbd_al_read_tr() - Read a single transaction from the on disk activity log | ||
430 | * @mdev: DRBD device. | ||
431 | * @bdev: Block device to read form. | ||
432 | * @b: pointer to an al_transaction. | ||
433 | * @index: On disk slot of the transaction to read. | ||
434 | * | ||
435 | * Returns -1 on IO error, 0 on checksum error and 1 upon success. | ||
436 | */ | ||
437 | static int drbd_al_read_tr(struct drbd_conf *mdev, | ||
438 | struct drbd_backing_dev *bdev, | ||
439 | struct al_transaction *b, | ||
440 | int index) | ||
441 | { | ||
442 | sector_t sector; | ||
443 | int rv, i; | ||
444 | u32 xor_sum = 0; | ||
445 | |||
446 | sector = bdev->md.md_offset + bdev->md.al_offset + index; | ||
447 | |||
448 | /* Dont process error normally, | ||
449 | * as this is done before disk is attached! */ | ||
450 | if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) | ||
451 | return -1; | ||
452 | 460 | ||
453 | rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); | 461 | static int w_al_write_transaction(struct drbd_work *w, int unused) |
454 | |||
455 | for (i = 0; i < AL_EXTENTS_PT + 1; i++) | ||
456 | xor_sum ^= be32_to_cpu(b->updates[i].extent); | ||
457 | rv &= (xor_sum == be32_to_cpu(b->xor_sum)); | ||
458 | |||
459 | return rv; | ||
460 | } | ||
461 | |||
462 | /** | ||
463 | * drbd_al_read_log() - Restores the activity log from its on disk representation. | ||
464 | * @mdev: DRBD device. | ||
465 | * @bdev: Block device to read form. | ||
466 | * | ||
467 | * Returns 1 on success, returns 0 when reading the log failed due to IO errors. | ||
468 | */ | ||
469 | int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
470 | { | 462 | { |
471 | struct al_transaction *buffer; | 463 | struct update_al_work *aw = container_of(w, struct update_al_work, w); |
472 | int i; | 464 | struct drbd_conf *mdev = w->mdev; |
473 | int rv; | 465 | int err; |
474 | int mx; | ||
475 | int active_extents = 0; | ||
476 | int transactions = 0; | ||
477 | int found_valid = 0; | ||
478 | int from = 0; | ||
479 | int to = 0; | ||
480 | u32 from_tnr = 0; | ||
481 | u32 to_tnr = 0; | ||
482 | u32 cnr; | ||
483 | |||
484 | mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); | ||
485 | |||
486 | /* lock out all other meta data io for now, | ||
487 | * and make sure the page is mapped. | ||
488 | */ | ||
489 | buffer = drbd_md_get_buffer(mdev); | ||
490 | if (!buffer) | ||
491 | return 0; | ||
492 | |||
493 | /* Find the valid transaction in the log */ | ||
494 | for (i = 0; i <= mx; i++) { | ||
495 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
496 | if (rv == 0) | ||
497 | continue; | ||
498 | if (rv == -1) { | ||
499 | drbd_md_put_buffer(mdev); | ||
500 | return 0; | ||
501 | } | ||
502 | cnr = be32_to_cpu(buffer->tr_number); | ||
503 | |||
504 | if (++found_valid == 1) { | ||
505 | from = i; | ||
506 | to = i; | ||
507 | from_tnr = cnr; | ||
508 | to_tnr = cnr; | ||
509 | continue; | ||
510 | } | ||
511 | if ((int)cnr - (int)from_tnr < 0) { | ||
512 | D_ASSERT(from_tnr - cnr + i - from == mx+1); | ||
513 | from = i; | ||
514 | from_tnr = cnr; | ||
515 | } | ||
516 | if ((int)cnr - (int)to_tnr > 0) { | ||
517 | D_ASSERT(cnr - to_tnr == i - to); | ||
518 | to = i; | ||
519 | to_tnr = cnr; | ||
520 | } | ||
521 | } | ||
522 | |||
523 | if (!found_valid) { | ||
524 | dev_warn(DEV, "No usable activity log found.\n"); | ||
525 | drbd_md_put_buffer(mdev); | ||
526 | return 1; | ||
527 | } | ||
528 | |||
529 | /* Read the valid transactions. | ||
530 | * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ | ||
531 | i = from; | ||
532 | while (1) { | ||
533 | int j, pos; | ||
534 | unsigned int extent_nr; | ||
535 | unsigned int trn; | ||
536 | |||
537 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
538 | ERR_IF(rv == 0) goto cancel; | ||
539 | if (rv == -1) { | ||
540 | drbd_md_put_buffer(mdev); | ||
541 | return 0; | ||
542 | } | ||
543 | |||
544 | trn = be32_to_cpu(buffer->tr_number); | ||
545 | |||
546 | spin_lock_irq(&mdev->al_lock); | ||
547 | |||
548 | /* This loop runs backwards because in the cyclic | ||
549 | elements there might be an old version of the | ||
550 | updated element (in slot 0). So the element in slot 0 | ||
551 | can overwrite old versions. */ | ||
552 | for (j = AL_EXTENTS_PT; j >= 0; j--) { | ||
553 | pos = be32_to_cpu(buffer->updates[j].pos); | ||
554 | extent_nr = be32_to_cpu(buffer->updates[j].extent); | ||
555 | |||
556 | if (extent_nr == LC_FREE) | ||
557 | continue; | ||
558 | |||
559 | lc_set(mdev->act_log, extent_nr, pos); | ||
560 | active_extents++; | ||
561 | } | ||
562 | spin_unlock_irq(&mdev->al_lock); | ||
563 | |||
564 | transactions++; | ||
565 | |||
566 | cancel: | ||
567 | if (i == to) | ||
568 | break; | ||
569 | i++; | ||
570 | if (i > mx) | ||
571 | i = 0; | ||
572 | } | ||
573 | |||
574 | mdev->al_tr_number = to_tnr+1; | ||
575 | mdev->al_tr_pos = to; | ||
576 | if (++mdev->al_tr_pos > | ||
577 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
578 | mdev->al_tr_pos = 0; | ||
579 | |||
580 | /* ok, we are done with it */ | ||
581 | drbd_md_put_buffer(mdev); | ||
582 | 466 | ||
583 | dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", | 467 | err = _al_write_transaction(mdev); |
584 | transactions, active_extents); | 468 | aw->err = err; |
469 | complete(&aw->event); | ||
585 | 470 | ||
586 | return 1; | 471 | return err != -EIO ? err : 0; |
587 | } | 472 | } |
588 | 473 | ||
589 | /** | 474 | /* Calls from worker context (see w_restart_disk_io()) need to write the |
590 | * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents | 475 | transaction directly. Others came through generic_make_request(), |
591 | * @mdev: DRBD device. | 476 | those need to delegate it to the worker. */ |
592 | */ | 477 | static int al_write_transaction(struct drbd_conf *mdev) |
593 | void drbd_al_apply_to_bm(struct drbd_conf *mdev) | ||
594 | { | 478 | { |
595 | unsigned int enr; | 479 | struct update_al_work al_work; |
596 | unsigned long add = 0; | ||
597 | char ppb[10]; | ||
598 | int i, tmp; | ||
599 | 480 | ||
600 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | 481 | if (current == mdev->tconn->worker.task) |
482 | return _al_write_transaction(mdev); | ||
601 | 483 | ||
602 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | 484 | init_completion(&al_work.event); |
603 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | 485 | al_work.w.cb = w_al_write_transaction; |
604 | if (enr == LC_FREE) | 486 | al_work.w.mdev = mdev; |
605 | continue; | 487 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); |
606 | tmp = drbd_bm_ALe_set_all(mdev, enr); | 488 | wait_for_completion(&al_work.event); |
607 | dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr); | ||
608 | add += tmp; | ||
609 | } | ||
610 | 489 | ||
611 | lc_unlock(mdev->act_log); | 490 | return al_work.err; |
612 | wake_up(&mdev->al_wait); | ||
613 | |||
614 | dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", | ||
615 | ppsize(ppb, Bit2KB(add))); | ||
616 | } | 491 | } |
617 | 492 | ||
618 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | 493 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) |
@@ -642,7 +517,7 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
642 | struct lc_element *al_ext; | 517 | struct lc_element *al_ext; |
643 | int i; | 518 | int i; |
644 | 519 | ||
645 | D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); | 520 | D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags)); |
646 | 521 | ||
647 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | 522 | for (i = 0; i < mdev->act_log->nr_elements; i++) { |
648 | al_ext = lc_element_by_index(mdev->act_log, i); | 523 | al_ext = lc_element_by_index(mdev->act_log, i); |
@@ -654,15 +529,17 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
654 | wake_up(&mdev->al_wait); | 529 | wake_up(&mdev->al_wait); |
655 | } | 530 | } |
656 | 531 | ||
657 | static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 532 | static int w_update_odbm(struct drbd_work *w, int unused) |
658 | { | 533 | { |
659 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | 534 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); |
535 | struct drbd_conf *mdev = w->mdev; | ||
536 | struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; | ||
660 | 537 | ||
661 | if (!get_ldev(mdev)) { | 538 | if (!get_ldev(mdev)) { |
662 | if (__ratelimit(&drbd_ratelimit_state)) | 539 | if (__ratelimit(&drbd_ratelimit_state)) |
663 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); | 540 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); |
664 | kfree(udw); | 541 | kfree(udw); |
665 | return 1; | 542 | return 0; |
666 | } | 543 | } |
667 | 544 | ||
668 | drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); | 545 | drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); |
@@ -680,9 +557,9 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused | |||
680 | break; | 557 | break; |
681 | } | 558 | } |
682 | } | 559 | } |
683 | drbd_bcast_sync_progress(mdev); | 560 | drbd_bcast_event(mdev, &sib); |
684 | 561 | ||
685 | return 1; | 562 | return 0; |
686 | } | 563 | } |
687 | 564 | ||
688 | 565 | ||
@@ -752,7 +629,9 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | |||
752 | } | 629 | } |
753 | ext->rs_left = rs_left; | 630 | ext->rs_left = rs_left; |
754 | ext->rs_failed = success ? 0 : count; | 631 | ext->rs_failed = success ? 0 : count; |
755 | lc_changed(mdev->resync, &ext->lce); | 632 | /* we don't keep a persistent log of the resync lru, |
633 | * we can commit any change right away. */ | ||
634 | lc_committed(mdev->resync); | ||
756 | } | 635 | } |
757 | lc_put(mdev->resync, &ext->lce); | 636 | lc_put(mdev->resync, &ext->lce); |
758 | /* no race, we are within the al_lock! */ | 637 | /* no race, we are within the al_lock! */ |
@@ -764,7 +643,8 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | |||
764 | if (udw) { | 643 | if (udw) { |
765 | udw->enr = ext->lce.lc_number; | 644 | udw->enr = ext->lce.lc_number; |
766 | udw->w.cb = w_update_odbm; | 645 | udw->w.cb = w_update_odbm; |
767 | drbd_queue_work_front(&mdev->data.work, &udw->w); | 646 | udw->w.mdev = mdev; |
647 | drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w); | ||
768 | } else { | 648 | } else { |
769 | dev_warn(DEV, "Could not kmalloc an udw\n"); | 649 | dev_warn(DEV, "Could not kmalloc an udw\n"); |
770 | } | 650 | } |
@@ -810,16 +690,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
810 | int wake_up = 0; | 690 | int wake_up = 0; |
811 | unsigned long flags; | 691 | unsigned long flags; |
812 | 692 | ||
813 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 693 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
814 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | 694 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", |
815 | (unsigned long long)sector, size); | 695 | (unsigned long long)sector, size); |
816 | return; | 696 | return; |
817 | } | 697 | } |
698 | |||
699 | if (!get_ldev(mdev)) | ||
700 | return; /* no disk, no metadata, no bitmap to clear bits in */ | ||
701 | |||
818 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 702 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
819 | esector = sector + (size >> 9) - 1; | 703 | esector = sector + (size >> 9) - 1; |
820 | 704 | ||
821 | ERR_IF(sector >= nr_sectors) return; | 705 | if (!expect(sector < nr_sectors)) |
822 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | 706 | goto out; |
707 | if (!expect(esector < nr_sectors)) | ||
708 | esector = nr_sectors - 1; | ||
823 | 709 | ||
824 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | 710 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
825 | 711 | ||
@@ -827,7 +713,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
827 | * round up start sector, round down end sector. we make sure we only | 713 | * round up start sector, round down end sector. we make sure we only |
828 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | 714 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ |
829 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | 715 | if (unlikely(esector < BM_SECT_PER_BIT-1)) |
830 | return; | 716 | goto out; |
831 | if (unlikely(esector == (nr_sectors-1))) | 717 | if (unlikely(esector == (nr_sectors-1))) |
832 | ebnr = lbnr; | 718 | ebnr = lbnr; |
833 | else | 719 | else |
@@ -835,14 +721,14 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
835 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | 721 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); |
836 | 722 | ||
837 | if (sbnr > ebnr) | 723 | if (sbnr > ebnr) |
838 | return; | 724 | goto out; |
839 | 725 | ||
840 | /* | 726 | /* |
841 | * ok, (capacity & 7) != 0 sometimes, but who cares... | 727 | * ok, (capacity & 7) != 0 sometimes, but who cares... |
842 | * we count rs_{total,left} in bits, not sectors. | 728 | * we count rs_{total,left} in bits, not sectors. |
843 | */ | 729 | */ |
844 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); | 730 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); |
845 | if (count && get_ldev(mdev)) { | 731 | if (count) { |
846 | drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); | 732 | drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); |
847 | spin_lock_irqsave(&mdev->al_lock, flags); | 733 | spin_lock_irqsave(&mdev->al_lock, flags); |
848 | drbd_try_clear_on_disk_bm(mdev, sector, count, true); | 734 | drbd_try_clear_on_disk_bm(mdev, sector, count, true); |
@@ -851,8 +737,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
851 | /* just wake_up unconditional now, various lc_chaged(), | 737 | /* just wake_up unconditional now, various lc_chaged(), |
852 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | 738 | * lc_put() in drbd_try_clear_on_disk_bm(). */ |
853 | wake_up = 1; | 739 | wake_up = 1; |
854 | put_ldev(mdev); | ||
855 | } | 740 | } |
741 | out: | ||
742 | put_ldev(mdev); | ||
856 | if (wake_up) | 743 | if (wake_up) |
857 | wake_up(&mdev->al_wait); | 744 | wake_up(&mdev->al_wait); |
858 | } | 745 | } |
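
__drbd_set_in_sync() may only clear whole, aligned BM_BLOCK_SIZE (4 KiB) bitmap bits, so the start sector is rounded up to the next bit boundary and the end sector rounded down. A standalone sketch of that rounding, assuming BM_SECT_PER_BIT = 8 (one bitmap bit per eight 512-byte sectors):

#include <stdio.h>

#define BM_SECT_PER_BIT 8ULL   /* assumed: one bitmap bit covers 4 KiB = 8 sectors */
#define BM_SECT_TO_BIT(s) ((s) / BM_SECT_PER_BIT)

int main(void)
{
	unsigned long long sector = 13;             /* unaligned start */
	unsigned long long esector = 13 + 32 - 1;   /* 16 KiB worth of sectors */
	unsigned long long sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT - 1);    /* round up   */
	unsigned long long ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT - 1)); /* round down */

	/* prints "clear bits 2..4": the partially covered bits at both ends stay set */
	if (sbnr <= ebnr)
		printf("clear bits %llu..%llu\n", sbnr, ebnr);
	return 0;
}
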
@@ -868,7 +755,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
868 | int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | 755 | int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, |
869 | const char *file, const unsigned int line) | 756 | const char *file, const unsigned int line) |
870 | { | 757 | { |
871 | unsigned long sbnr, ebnr, lbnr, flags; | 758 | unsigned long sbnr, ebnr, flags; |
872 | sector_t esector, nr_sectors; | 759 | sector_t esector, nr_sectors; |
873 | unsigned int enr, count = 0; | 760 | unsigned int enr, count = 0; |
874 | struct lc_element *e; | 761 | struct lc_element *e; |
@@ -877,7 +764,7 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
877 | if (size == 0) | 764 | if (size == 0) |
878 | return 0; | 765 | return 0; |
879 | 766 | ||
880 | if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 767 | if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
881 | dev_err(DEV, "sector: %llus, size: %d\n", | 768 | dev_err(DEV, "sector: %llus, size: %d\n", |
882 | (unsigned long long)sector, size); | 769 | (unsigned long long)sector, size); |
883 | return 0; | 770 | return 0; |
@@ -889,12 +776,10 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
889 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 776 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
890 | esector = sector + (size >> 9) - 1; | 777 | esector = sector + (size >> 9) - 1; |
891 | 778 | ||
892 | ERR_IF(sector >= nr_sectors) | 779 | if (!expect(sector < nr_sectors)) |
893 | goto out; | 780 | goto out; |
894 | ERR_IF(esector >= nr_sectors) | 781 | if (!expect(esector < nr_sectors)) |
895 | esector = (nr_sectors-1); | 782 | esector = nr_sectors - 1; |
896 | |||
897 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
898 | 783 | ||
899 | /* we set it out of sync, | 784 | /* we set it out of sync, |
900 | * we do not need to round anything here */ | 785 | * we do not need to round anything here */ |
@@ -937,7 +822,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
937 | if (bm_ext->lce.lc_number != enr) { | 822 | if (bm_ext->lce.lc_number != enr) { |
938 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | 823 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); |
939 | bm_ext->rs_failed = 0; | 824 | bm_ext->rs_failed = 0; |
940 | lc_changed(mdev->resync, &bm_ext->lce); | 825 | lc_committed(mdev->resync); |
941 | wakeup = 1; | 826 | wakeup = 1; |
942 | } | 827 | } |
943 | if (bm_ext->lce.refcnt == 1) | 828 | if (bm_ext->lce.refcnt == 1) |
@@ -953,7 +838,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
953 | if (rs_flags & LC_STARVING) | 838 | if (rs_flags & LC_STARVING) |
954 | dev_warn(DEV, "Have to wait for element" | 839 | dev_warn(DEV, "Have to wait for element" |
955 | " (resync LRU too small?)\n"); | 840 | " (resync LRU too small?)\n"); |
956 | BUG_ON(rs_flags & LC_DIRTY); | 841 | BUG_ON(rs_flags & LC_LOCKED); |
957 | } | 842 | } |
958 | 843 | ||
959 | return bm_ext; | 844 | return bm_ext; |
@@ -961,26 +846,12 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
961 | 846 | ||
962 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) | 847 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) |
963 | { | 848 | { |
964 | struct lc_element *al_ext; | 849 | int rv; |
965 | int rv = 0; | ||
966 | 850 | ||
967 | spin_lock_irq(&mdev->al_lock); | 851 | spin_lock_irq(&mdev->al_lock); |
968 | if (unlikely(enr == mdev->act_log->new_number)) | 852 | rv = lc_is_used(mdev->act_log, enr); |
969 | rv = 1; | ||
970 | else { | ||
971 | al_ext = lc_find(mdev->act_log, enr); | ||
972 | if (al_ext) { | ||
973 | if (al_ext->refcnt) | ||
974 | rv = 1; | ||
975 | } | ||
976 | } | ||
977 | spin_unlock_irq(&mdev->al_lock); | 853 | spin_unlock_irq(&mdev->al_lock); |
978 | 854 | ||
979 | /* | ||
980 | if (unlikely(rv)) { | ||
981 | dev_info(DEV, "Delaying sync read until app's write is done\n"); | ||
982 | } | ||
983 | */ | ||
984 | return rv; | 855 | return rv; |
985 | } | 856 | } |
986 | 857 | ||
@@ -1110,13 +981,13 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | |||
1110 | if (rs_flags & LC_STARVING) | 981 | if (rs_flags & LC_STARVING) |
1111 | dev_warn(DEV, "Have to wait for element" | 982 | dev_warn(DEV, "Have to wait for element" |
1112 | " (resync LRU too small?)\n"); | 983 | " (resync LRU too small?)\n"); |
1113 | BUG_ON(rs_flags & LC_DIRTY); | 984 | BUG_ON(rs_flags & LC_LOCKED); |
1114 | goto try_again; | 985 | goto try_again; |
1115 | } | 986 | } |
1116 | if (bm_ext->lce.lc_number != enr) { | 987 | if (bm_ext->lce.lc_number != enr) { |
1117 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | 988 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); |
1118 | bm_ext->rs_failed = 0; | 989 | bm_ext->rs_failed = 0; |
1119 | lc_changed(mdev->resync, &bm_ext->lce); | 990 | lc_committed(mdev->resync); |
1120 | wake_up(&mdev->al_wait); | 991 | wake_up(&mdev->al_wait); |
1121 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); | 992 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); |
1122 | } | 993 | } |
@@ -1127,8 +998,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | |||
1127 | } | 998 | } |
1128 | check_al: | 999 | check_al: |
1129 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | 1000 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { |
1130 | if (unlikely(al_enr+i == mdev->act_log->new_number)) | ||
1131 | goto try_again; | ||
1132 | if (lc_is_used(mdev->act_log, al_enr+i)) | 1001 | if (lc_is_used(mdev->act_log, al_enr+i)) |
1133 | goto try_again; | 1002 | goto try_again; |
1134 | } | 1003 | } |
@@ -1263,7 +1132,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | |||
1263 | sector_t esector, nr_sectors; | 1132 | sector_t esector, nr_sectors; |
1264 | int wake_up = 0; | 1133 | int wake_up = 0; |
1265 | 1134 | ||
1266 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 1135 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
1267 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | 1136 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", |
1268 | (unsigned long long)sector, size); | 1137 | (unsigned long long)sector, size); |
1269 | return; | 1138 | return; |
@@ -1271,8 +1140,10 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | |||
1271 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 1140 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
1272 | esector = sector + (size >> 9) - 1; | 1141 | esector = sector + (size >> 9) - 1; |
1273 | 1142 | ||
1274 | ERR_IF(sector >= nr_sectors) return; | 1143 | if (!expect(sector < nr_sectors)) |
1275 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | 1144 | return; |
1145 | if (!expect(esector < nr_sectors)) | ||
1146 | esector = nr_sectors - 1; | ||
1276 | 1147 | ||
1277 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | 1148 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
1278 | 1149 | ||
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8d8069758042..1ab205a4bf69 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) | |||
119 | if (!__ratelimit(&drbd_ratelimit_state)) | 119 | if (!__ratelimit(&drbd_ratelimit_state)) |
120 | return; | 120 | return; |
121 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", | 121 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", |
122 | current == mdev->receiver.task ? "receiver" : | 122 | drbd_task_to_thread_name(mdev->tconn, current), |
123 | current == mdev->asender.task ? "asender" : | 123 | func, b->bm_why ?: "?", |
124 | current == mdev->worker.task ? "worker" : current->comm, | 124 | drbd_task_to_thread_name(mdev->tconn, b->bm_task)); |
125 | func, b->bm_why ?: "?", | ||
126 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
127 | b->bm_task == mdev->asender.task ? "asender" : | ||
128 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
129 | } | 125 | } |
130 | 126 | ||
131 | void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) | 127 | void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) |
@@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) | |||
142 | 138 | ||
143 | if (trylock_failed) { | 139 | if (trylock_failed) { |
144 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", | 140 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", |
145 | current == mdev->receiver.task ? "receiver" : | 141 | drbd_task_to_thread_name(mdev->tconn, current), |
146 | current == mdev->asender.task ? "asender" : | 142 | why, b->bm_why ?: "?", |
147 | current == mdev->worker.task ? "worker" : current->comm, | 143 | drbd_task_to_thread_name(mdev->tconn, b->bm_task)); |
148 | why, b->bm_why ?: "?", | ||
149 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
150 | b->bm_task == mdev->asender.task ? "asender" : | ||
151 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
152 | mutex_lock(&b->bm_change); | 144 | mutex_lock(&b->bm_change); |
153 | } | 145 | } |
154 | if (BM_LOCKED_MASK & b->bm_flags) | 146 | if (BM_LOCKED_MASK & b->bm_flags) |
@@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev) | |||
196 | /* to mark for lazy writeout once syncer cleared all clearable bits, | 188 | /* to mark for lazy writeout once syncer cleared all clearable bits, |
197 | * we if bits have been cleared since last IO. */ | 189 | * we if bits have been cleared since last IO. */ |
198 | #define BM_PAGE_LAZY_WRITEOUT 28 | 190 | #define BM_PAGE_LAZY_WRITEOUT 28 |
191 | /* pages marked with this "HINT" will be considered for writeout | ||
192 | * on activity log transactions */ | ||
193 | #define BM_PAGE_HINT_WRITEOUT 27 | ||
199 | 194 | ||
200 | /* store_page_idx uses non-atomic assignment. It is only used directly after | 195 | /* store_page_idx uses non-atomic assignment. It is only used directly after |
201 | * allocating the page. All other bm_set_page_* and bm_clear_page_* need to | 196 | * allocating the page. All other bm_set_page_* and bm_clear_page_* need to |
@@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr) | |||
227 | { | 222 | { |
228 | struct drbd_bitmap *b = mdev->bitmap; | 223 | struct drbd_bitmap *b = mdev->bitmap; |
229 | void *addr = &page_private(b->bm_pages[page_nr]); | 224 | void *addr = &page_private(b->bm_pages[page_nr]); |
230 | clear_bit(BM_PAGE_IO_LOCK, addr); | 225 | clear_bit_unlock(BM_PAGE_IO_LOCK, addr); |
231 | smp_mb__after_clear_bit(); | ||
232 | wake_up(&mdev->bitmap->bm_io_wait); | 226 | wake_up(&mdev->bitmap->bm_io_wait); |
233 | } | 227 | } |
234 | 228 | ||
@@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page) | |||
246 | set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); | 240 | set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); |
247 | } | 241 | } |
248 | 242 | ||
243 | /** | ||
244 | * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout | ||
245 | * @mdev: DRBD device. | ||
246 | * @page_nr: the bitmap page to mark with the "hint" flag | ||
247 | * | ||
248 | * From within an activity log transaction, we mark a few pages with these | ||
249 | * hints, then call drbd_bm_write_hinted(), which will only write out changed | ||
250 | * pages which are flagged with this mark. | ||
251 | */ | ||
252 | void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr) | ||
253 | { | ||
254 | struct page *page; | ||
255 | if (page_nr >= mdev->bitmap->bm_number_of_pages) { | ||
256 | dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n", | ||
257 | page_nr, (int)mdev->bitmap->bm_number_of_pages); | ||
258 | return; | ||
259 | } | ||
260 | page = mdev->bitmap->bm_pages[page_nr]; | ||
261 | set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); | ||
262 | } | ||
263 | |||
249 | static int bm_test_page_unchanged(struct page *page) | 264 | static int bm_test_page_unchanged(struct page *page) |
250 | { | 265 | { |
251 | volatile const unsigned long *addr = &page_private(page); | 266 | volatile const unsigned long *addr = &page_private(page); |
@@ -376,7 +391,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | |||
376 | * GFP_NOIO, as this is called while drbd IO is "suspended", | 391 | * GFP_NOIO, as this is called while drbd IO is "suspended", |
377 | * and during resize or attach on diskless Primary, | 392 | * and during resize or attach on diskless Primary, |
378 | * we must not block on IO to ourselves. | 393 | * we must not block on IO to ourselves. |
379 | * Context is receiver thread or cqueue thread/dmsetup. */ | 394 | * Context is receiver thread or dmsetup. */ |
380 | bytes = sizeof(struct page *)*want; | 395 | bytes = sizeof(struct page *)*want; |
381 | new_pages = kzalloc(bytes, GFP_NOIO); | 396 | new_pages = kzalloc(bytes, GFP_NOIO); |
382 | if (!new_pages) { | 397 | if (!new_pages) { |
@@ -441,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev) | |||
441 | 456 | ||
442 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) | 457 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) |
443 | { | 458 | { |
444 | ERR_IF(!mdev->bitmap) return 0; | 459 | if (!expect(mdev->bitmap)) |
460 | return 0; | ||
445 | return mdev->bitmap->bm_dev_capacity; | 461 | return mdev->bitmap->bm_dev_capacity; |
446 | } | 462 | } |
447 | 463 | ||
@@ -449,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev) | |||
449 | */ | 465 | */ |
450 | void drbd_bm_cleanup(struct drbd_conf *mdev) | 466 | void drbd_bm_cleanup(struct drbd_conf *mdev) |
451 | { | 467 | { |
452 | ERR_IF (!mdev->bitmap) return; | 468 | if (!expect(mdev->bitmap)) |
469 | return; | ||
453 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); | 470 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); |
454 | bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); | 471 | bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); |
455 | kfree(mdev->bitmap); | 472 | kfree(mdev->bitmap); |
@@ -612,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) | |||
612 | int err = 0, growing; | 629 | int err = 0, growing; |
613 | int opages_vmalloced; | 630 | int opages_vmalloced; |
614 | 631 | ||
615 | ERR_IF(!b) return -ENOMEM; | 632 | if (!expect(b)) |
633 | return -ENOMEM; | ||
616 | 634 | ||
617 | drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); | 635 | drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); |
618 | 636 | ||
@@ -734,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) | |||
734 | unsigned long s; | 752 | unsigned long s; |
735 | unsigned long flags; | 753 | unsigned long flags; |
736 | 754 | ||
737 | ERR_IF(!b) return 0; | 755 | if (!expect(b)) |
738 | ERR_IF(!b->bm_pages) return 0; | 756 | return 0; |
757 | if (!expect(b->bm_pages)) | ||
758 | return 0; | ||
739 | 759 | ||
740 | spin_lock_irqsave(&b->bm_lock, flags); | 760 | spin_lock_irqsave(&b->bm_lock, flags); |
741 | s = b->bm_set; | 761 | s = b->bm_set; |
@@ -758,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) | |||
758 | size_t drbd_bm_words(struct drbd_conf *mdev) | 778 | size_t drbd_bm_words(struct drbd_conf *mdev) |
759 | { | 779 | { |
760 | struct drbd_bitmap *b = mdev->bitmap; | 780 | struct drbd_bitmap *b = mdev->bitmap; |
761 | ERR_IF(!b) return 0; | 781 | if (!expect(b)) |
762 | ERR_IF(!b->bm_pages) return 0; | 782 | return 0; |
783 | if (!expect(b->bm_pages)) | ||
784 | return 0; | ||
763 | 785 | ||
764 | return b->bm_words; | 786 | return b->bm_words; |
765 | } | 787 | } |
@@ -767,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev) | |||
767 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) | 789 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) |
768 | { | 790 | { |
769 | struct drbd_bitmap *b = mdev->bitmap; | 791 | struct drbd_bitmap *b = mdev->bitmap; |
770 | ERR_IF(!b) return 0; | 792 | if (!expect(b)) |
793 | return 0; | ||
771 | 794 | ||
772 | return b->bm_bits; | 795 | return b->bm_bits; |
773 | } | 796 | } |
@@ -788,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
788 | 811 | ||
789 | end = offset + number; | 812 | end = offset + number; |
790 | 813 | ||
791 | ERR_IF(!b) return; | 814 | if (!expect(b)) |
792 | ERR_IF(!b->bm_pages) return; | 815 | return; |
816 | if (!expect(b->bm_pages)) | ||
817 | return; | ||
793 | if (number == 0) | 818 | if (number == 0) |
794 | return; | 819 | return; |
795 | WARN_ON(offset >= b->bm_words); | 820 | WARN_ON(offset >= b->bm_words); |
@@ -833,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
833 | 858 | ||
834 | end = offset + number; | 859 | end = offset + number; |
835 | 860 | ||
836 | ERR_IF(!b) return; | 861 | if (!expect(b)) |
837 | ERR_IF(!b->bm_pages) return; | 862 | return; |
863 | if (!expect(b->bm_pages)) | ||
864 | return; | ||
838 | 865 | ||
839 | spin_lock_irq(&b->bm_lock); | 866 | spin_lock_irq(&b->bm_lock); |
840 | if ((offset >= b->bm_words) || | 867 | if ((offset >= b->bm_words) || |
@@ -862,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
862 | void drbd_bm_set_all(struct drbd_conf *mdev) | 889 | void drbd_bm_set_all(struct drbd_conf *mdev) |
863 | { | 890 | { |
864 | struct drbd_bitmap *b = mdev->bitmap; | 891 | struct drbd_bitmap *b = mdev->bitmap; |
865 | ERR_IF(!b) return; | 892 | if (!expect(b)) |
866 | ERR_IF(!b->bm_pages) return; | 893 | return; |
894 | if (!expect(b->bm_pages)) | ||
895 | return; | ||
867 | 896 | ||
868 | spin_lock_irq(&b->bm_lock); | 897 | spin_lock_irq(&b->bm_lock); |
869 | bm_memset(b, 0, 0xff, b->bm_words); | 898 | bm_memset(b, 0, 0xff, b->bm_words); |
@@ -876,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev) | |||
876 | void drbd_bm_clear_all(struct drbd_conf *mdev) | 905 | void drbd_bm_clear_all(struct drbd_conf *mdev) |
877 | { | 906 | { |
878 | struct drbd_bitmap *b = mdev->bitmap; | 907 | struct drbd_bitmap *b = mdev->bitmap; |
879 | ERR_IF(!b) return; | 908 | if (!expect(b)) |
880 | ERR_IF(!b->bm_pages) return; | 909 | return; |
910 | if (!expect(b->bm_pages)) | ||
911 | return; | ||
881 | 912 | ||
882 | spin_lock_irq(&b->bm_lock); | 913 | spin_lock_irq(&b->bm_lock); |
883 | bm_memset(b, 0, 0, b->bm_words); | 914 | bm_memset(b, 0, 0, b->bm_words); |
@@ -891,7 +922,8 @@ struct bm_aio_ctx { | |||
891 | unsigned int done; | 922 | unsigned int done; |
892 | unsigned flags; | 923 | unsigned flags; |
893 | #define BM_AIO_COPY_PAGES 1 | 924 | #define BM_AIO_COPY_PAGES 1 |
894 | #define BM_WRITE_ALL_PAGES 2 | 925 | #define BM_AIO_WRITE_HINTED 2 |
926 | #define BM_WRITE_ALL_PAGES 4 | ||
895 | int error; | 927 | int error; |
896 | struct kref kref; | 928 | struct kref kref; |
897 | }; | 929 | }; |
@@ -1062,6 +1094,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1062 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) | 1094 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) |
1063 | break; | 1095 | break; |
1064 | if (rw & WRITE) { | 1096 | if (rw & WRITE) { |
1097 | if ((flags & BM_AIO_WRITE_HINTED) && | ||
1098 | !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, | ||
1099 | &page_private(b->bm_pages[i]))) | ||
1100 | continue; | ||
1101 | |||
1065 | if (!(flags & BM_WRITE_ALL_PAGES) && | 1102 | if (!(flags & BM_WRITE_ALL_PAGES) && |
1066 | bm_test_page_unchanged(b->bm_pages[i])) { | 1103 | bm_test_page_unchanged(b->bm_pages[i])) { |
1067 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); | 1104 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); |
@@ -1094,9 +1131,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1094 | else | 1131 | else |
1095 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); | 1132 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); |
1096 | 1133 | ||
1097 | dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", | 1134 | /* summary for global bitmap IO */ |
1098 | rw == WRITE ? "WRITE" : "READ", | 1135 | if (flags == 0) |
1099 | count, jiffies - now); | 1136 | dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", |
1137 | rw == WRITE ? "WRITE" : "READ", | ||
1138 | count, jiffies - now); | ||
1100 | 1139 | ||
1101 | if (ctx->error) { | 1140 | if (ctx->error) { |
1102 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); | 1141 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); |
@@ -1117,8 +1156,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1117 | } | 1156 | } |
1118 | now = b->bm_set; | 1157 | now = b->bm_set; |
1119 | 1158 | ||
1120 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", | 1159 | if (flags == 0) |
1121 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | 1160 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", |
1161 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | ||
1122 | 1162 | ||
1123 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); | 1163 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); |
1124 | return err; | 1164 | return err; |
@@ -1181,9 +1221,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) | |||
1181 | return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); | 1221 | return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); |
1182 | } | 1222 | } |
1183 | 1223 | ||
1224 | /** | ||
1225 | * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. | ||
1226 | * @mdev: DRBD device. | ||
1227 | */ | ||
1228 | int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local) | ||
1229 | { | ||
1230 | return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); | ||
1231 | } | ||
1184 | 1232 | ||
1185 | /** | 1233 | /** |
1186 | * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap | 1234 | * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap |
1187 | * @mdev: DRBD device. | 1235 | * @mdev: DRBD device. |
1188 | * @idx: bitmap page index | 1236 | * @idx: bitmap page index |
1189 | * | 1237 | * |
@@ -1291,8 +1339,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev, | |||
1291 | struct drbd_bitmap *b = mdev->bitmap; | 1339 | struct drbd_bitmap *b = mdev->bitmap; |
1292 | unsigned long i = DRBD_END_OF_BITMAP; | 1340 | unsigned long i = DRBD_END_OF_BITMAP; |
1293 | 1341 | ||
1294 | ERR_IF(!b) return i; | 1342 | if (!expect(b)) |
1295 | ERR_IF(!b->bm_pages) return i; | 1343 | return i; |
1344 | if (!expect(b->bm_pages)) | ||
1345 | return i; | ||
1296 | 1346 | ||
1297 | spin_lock_irq(&b->bm_lock); | 1347 | spin_lock_irq(&b->bm_lock); |
1298 | if (BM_DONT_TEST & b->bm_flags) | 1348 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1393,8 +1443,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | |||
1393 | struct drbd_bitmap *b = mdev->bitmap; | 1443 | struct drbd_bitmap *b = mdev->bitmap; |
1394 | int c = 0; | 1444 | int c = 0; |
1395 | 1445 | ||
1396 | ERR_IF(!b) return 1; | 1446 | if (!expect(b)) |
1397 | ERR_IF(!b->bm_pages) return 0; | 1447 | return 1; |
1448 | if (!expect(b->bm_pages)) | ||
1449 | return 0; | ||
1398 | 1450 | ||
1399 | spin_lock_irqsave(&b->bm_lock, flags); | 1451 | spin_lock_irqsave(&b->bm_lock, flags); |
1400 | if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) | 1452 | if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) |
@@ -1425,13 +1477,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, | |||
1425 | { | 1477 | { |
1426 | int i; | 1478 | int i; |
1427 | int bits; | 1479 | int bits; |
1480 | int changed = 0; | ||
1428 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); | 1481 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); |
1429 | for (i = first_word; i < last_word; i++) { | 1482 | for (i = first_word; i < last_word; i++) { |
1430 | bits = hweight_long(paddr[i]); | 1483 | bits = hweight_long(paddr[i]); |
1431 | paddr[i] = ~0UL; | 1484 | paddr[i] = ~0UL; |
1432 | b->bm_set += BITS_PER_LONG - bits; | 1485 | changed += BITS_PER_LONG - bits; |
1433 | } | 1486 | } |
1434 | kunmap_atomic(paddr); | 1487 | kunmap_atomic(paddr); |
1488 | if (changed) { | ||
1489 | /* We only need lazy writeout, the information is still in the | ||
1490 | * remote bitmap as well, and is reconstructed during the next | ||
1491 | * bitmap exchange, if lost locally due to a crash. */ | ||
1492 | bm_set_page_lazy_writeout(b->bm_pages[page_nr]); | ||
1493 | b->bm_set += changed; | ||
1494 | } | ||
1435 | } | 1495 | } |
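bm_set_page_lazy_writeout() itself is not part of this hunk; it presumably follows the same per-page flag pattern as the hint mark above, i.e. a bit kept in page_private(). A sketch under that assumption (flag name and helper body assumed):

	/* sketch only: per-page state lives in page_private(page) */
	static void bm_set_page_lazy_writeout(struct page *page)
	{
		set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
	}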
1436 | 1496 | ||
1437 | /* Same thing as drbd_bm_set_bits, | 1497 | /* Same thing as drbd_bm_set_bits, |
@@ -1526,8 +1586,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) | |||
1526 | unsigned long *p_addr; | 1586 | unsigned long *p_addr; |
1527 | int i; | 1587 | int i; |
1528 | 1588 | ||
1529 | ERR_IF(!b) return 0; | 1589 | if (!expect(b)) |
1530 | ERR_IF(!b->bm_pages) return 0; | 1590 | return 0; |
1591 | if (!expect(b->bm_pages)) | ||
1592 | return 0; | ||
1531 | 1593 | ||
1532 | spin_lock_irqsave(&b->bm_lock, flags); | 1594 | spin_lock_irqsave(&b->bm_lock, flags); |
1533 | if (BM_DONT_TEST & b->bm_flags) | 1595 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1561,8 +1623,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi | |||
1561 | * robust in case we screwed up elsewhere, in that case pretend there | 1623 | * robust in case we screwed up elsewhere, in that case pretend there |
1562 | * was one dirty bit in the requested area, so we won't try to do a | 1624 | * was one dirty bit in the requested area, so we won't try to do a |
1563 | * local read there (no bitmap probably implies no disk) */ | 1625 | * local read there (no bitmap probably implies no disk) */ |
1564 | ERR_IF(!b) return 1; | 1626 | if (!expect(b)) |
1565 | ERR_IF(!b->bm_pages) return 1; | 1627 | return 1; |
1628 | if (!expect(b->bm_pages)) | ||
1629 | return 1; | ||
1566 | 1630 | ||
1567 | spin_lock_irqsave(&b->bm_lock, flags); | 1631 | spin_lock_irqsave(&b->bm_lock, flags); |
1568 | if (BM_DONT_TEST & b->bm_flags) | 1632 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1575,11 +1639,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi | |||
1575 | bm_unmap(p_addr); | 1639 | bm_unmap(p_addr); |
1576 | p_addr = bm_map_pidx(b, idx); | 1640 | p_addr = bm_map_pidx(b, idx); |
1577 | } | 1641 | } |
1578 | ERR_IF (bitnr >= b->bm_bits) { | 1642 | if (expect(bitnr < b->bm_bits)) |
1579 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1580 | } else { | ||
1581 | c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); | 1643 | c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); |
1582 | } | 1644 | else |
1645 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1583 | } | 1646 | } |
1584 | if (p_addr) | 1647 | if (p_addr) |
1585 | bm_unmap(p_addr); | 1648 | bm_unmap(p_addr); |
@@ -1609,8 +1672,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | |||
1609 | unsigned long flags; | 1672 | unsigned long flags; |
1610 | unsigned long *p_addr, *bm; | 1673 | unsigned long *p_addr, *bm; |
1611 | 1674 | ||
1612 | ERR_IF(!b) return 0; | 1675 | if (!expect(b)) |
1613 | ERR_IF(!b->bm_pages) return 0; | 1676 | return 0; |
1677 | if (!expect(b->bm_pages)) | ||
1678 | return 0; | ||
1614 | 1679 | ||
1615 | spin_lock_irqsave(&b->bm_lock, flags); | 1680 | spin_lock_irqsave(&b->bm_lock, flags); |
1616 | if (BM_DONT_TEST & b->bm_flags) | 1681 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1632,47 +1697,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | |||
1632 | spin_unlock_irqrestore(&b->bm_lock, flags); | 1697 | spin_unlock_irqrestore(&b->bm_lock, flags); |
1633 | return count; | 1698 | return count; |
1634 | } | 1699 | } |
1635 | |||
1636 | /* Set all bits covered by the AL-extent al_enr. | ||
1637 | * Returns number of bits changed. */ | ||
1638 | unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) | ||
1639 | { | ||
1640 | struct drbd_bitmap *b = mdev->bitmap; | ||
1641 | unsigned long *p_addr, *bm; | ||
1642 | unsigned long weight; | ||
1643 | unsigned long s, e; | ||
1644 | int count, i, do_now; | ||
1645 | ERR_IF(!b) return 0; | ||
1646 | ERR_IF(!b->bm_pages) return 0; | ||
1647 | |||
1648 | spin_lock_irq(&b->bm_lock); | ||
1649 | if (BM_DONT_SET & b->bm_flags) | ||
1650 | bm_print_lock_info(mdev); | ||
1651 | weight = b->bm_set; | ||
1652 | |||
1653 | s = al_enr * BM_WORDS_PER_AL_EXT; | ||
1654 | e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); | ||
1655 | /* assert that s and e are on the same page */ | ||
1656 | D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) | ||
1657 | == s >> (PAGE_SHIFT - LN2_BPL + 3)); | ||
1658 | count = 0; | ||
1659 | if (s < b->bm_words) { | ||
1660 | i = do_now = e-s; | ||
1661 | p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); | ||
1662 | bm = p_addr + MLPP(s); | ||
1663 | while (i--) { | ||
1664 | count += hweight_long(*bm); | ||
1665 | *bm = -1UL; | ||
1666 | bm++; | ||
1667 | } | ||
1668 | bm_unmap(p_addr); | ||
1669 | b->bm_set += do_now*BITS_PER_LONG - count; | ||
1670 | if (e == b->bm_words) | ||
1671 | b->bm_set -= bm_clear_surplus(b); | ||
1672 | } else { | ||
1673 | dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s); | ||
1674 | } | ||
1675 | weight = b->bm_set - weight; | ||
1676 | spin_unlock_irq(&b->bm_lock); | ||
1677 | return weight; | ||
1678 | } | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 277c69c9465b..ef72a72814c7 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -39,9 +39,13 @@ | |||
39 | #include <linux/major.h> | 39 | #include <linux/major.h> |
40 | #include <linux/blkdev.h> | 40 | #include <linux/blkdev.h> |
41 | #include <linux/genhd.h> | 41 | #include <linux/genhd.h> |
42 | #include <linux/idr.h> | ||
42 | #include <net/tcp.h> | 43 | #include <net/tcp.h> |
43 | #include <linux/lru_cache.h> | 44 | #include <linux/lru_cache.h> |
44 | #include <linux/prefetch.h> | 45 | #include <linux/prefetch.h> |
46 | #include <linux/drbd_genl_api.h> | ||
47 | #include <linux/drbd.h> | ||
48 | #include "drbd_state.h" | ||
45 | 49 | ||
46 | #ifdef __CHECKER__ | 50 | #ifdef __CHECKER__ |
47 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | 51 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) |
@@ -61,7 +65,6 @@ | |||
61 | extern unsigned int minor_count; | 65 | extern unsigned int minor_count; |
62 | extern bool disable_sendpage; | 66 | extern bool disable_sendpage; |
63 | extern bool allow_oos; | 67 | extern bool allow_oos; |
64 | extern unsigned int cn_idx; | ||
65 | 68 | ||
66 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 69 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
67 | extern int enable_faults; | 70 | extern int enable_faults; |
@@ -86,34 +89,44 @@ extern char usermode_helper[]; | |||
86 | */ | 89 | */ |
87 | #define DRBD_SIGKILL SIGHUP | 90 | #define DRBD_SIGKILL SIGHUP |
88 | 91 | ||
89 | /* All EEs on the free list should have ID_VACANT (== 0) | ||
90 | * freshly allocated EEs get !ID_VACANT (== 1) | ||
91 | * so if it says "cannot dereference null pointer at address 0x00000001", | ||
92 | * it is most likely one of these :( */ | ||
93 | |||
94 | #define ID_IN_SYNC (4711ULL) | 92 | #define ID_IN_SYNC (4711ULL) |
95 | #define ID_OUT_OF_SYNC (4712ULL) | 93 | #define ID_OUT_OF_SYNC (4712ULL) |
96 | |||
97 | #define ID_SYNCER (-1ULL) | 94 | #define ID_SYNCER (-1ULL) |
98 | #define ID_VACANT 0 | 95 | |
99 | #define is_syncer_block_id(id) ((id) == ID_SYNCER) | ||
100 | #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) | 96 | #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) |
101 | 97 | ||
102 | struct drbd_conf; | 98 | struct drbd_conf; |
99 | struct drbd_tconn; | ||
103 | 100 | ||
104 | 101 | ||
105 | /* to shorten dev_warn(DEV, "msg"); and related statements */ | 102 |
106 | #define DEV (disk_to_dev(mdev->vdisk)) | 103 | #define DEV (disk_to_dev(mdev->vdisk)) |
107 | 104 | ||
105 | #define conn_printk(LEVEL, TCONN, FMT, ARGS...) \ | ||
106 | printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS) | ||
107 | #define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS) | ||
108 | #define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS) | ||
109 | #define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS) | ||
110 | #define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS) | ||
111 | #define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS) | ||
112 | #define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS) | ||
113 | #define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS) | ||
114 | |||
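A quick usage sketch of the new connection-level logging helpers; the message text here is made up:

	/* prints: "d-con <resource name>: handshake failed, agreed_pro_version=..." */
	conn_err(tconn, "handshake failed, agreed_pro_version=%d\n",
		 tconn->agreed_pro_version);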
108 | #define D_ASSERT(exp) if (!(exp)) \ | 115 | #define D_ASSERT(exp) if (!(exp)) \ |
109 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) | 116 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) |
110 | 117 | ||
111 | #define ERR_IF(exp) if (({ \ | 118 | /** |
112 | int _b = (exp) != 0; \ | 119 | * expect - Make an assertion |
113 | if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \ | 120 | * |
114 | __func__, #exp, __FILE__, __LINE__); \ | 121 | * Unlike the assert macro, this macro returns a boolean result. |
115 | _b; \ | 122 | */ |
116 | })) | 123 | #define expect(exp) ({ \ |
124 | bool _bool = (exp); \ | ||
125 | if (!_bool) \ | ||
126 | dev_err(DEV, "ASSERTION %s FAILED in %s\n", \ | ||
127 | #exp, __func__); \ | ||
128 | _bool; \ | ||
129 | }) | ||
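This is the conversion pattern used throughout the patch: a call site that used to read ERR_IF(!mdev->bitmap) return 0; now becomes

	if (!expect(mdev->bitmap))	/* logs "ASSERTION mdev->bitmap FAILED in <func>" */
		return 0;

i.e. the failed assertion is still logged, but the early return is now spelled out at the call site.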
117 | 130 | ||
118 | /* Defines to control fault insertion */ | 131 | /* Defines to control fault insertion */ |
119 | enum { | 132 | enum { |
@@ -150,15 +163,12 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { | |||
150 | /* usual integer division */ | 163 | /* usual integer division */ |
151 | #define div_floor(A, B) ((A)/(B)) | 164 | #define div_floor(A, B) ((A)/(B)) |
152 | 165 | ||
153 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
154 | /* 4th incarnation of the disk layout. */ | ||
155 | #define DRBD_MD_MAGIC (DRBD_MAGIC+4) | ||
156 | |||
157 | extern struct drbd_conf **minor_table; | ||
158 | extern struct ratelimit_state drbd_ratelimit_state; | 166 | extern struct ratelimit_state drbd_ratelimit_state; |
167 | extern struct idr minors; /* RCU, updates: genl_lock() */ | ||
168 | extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */ | ||
159 | 169 | ||
160 | /* on the wire */ | 170 | /* on the wire */ |
161 | enum drbd_packets { | 171 | enum drbd_packet { |
162 | /* receiver (data socket) */ | 172 | /* receiver (data socket) */ |
163 | P_DATA = 0x00, | 173 | P_DATA = 0x00, |
164 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ | 174 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ |
@@ -186,7 +196,7 @@ enum drbd_packets { | |||
186 | P_RECV_ACK = 0x15, /* Used in protocol B */ | 196 | P_RECV_ACK = 0x15, /* Used in protocol B */ |
187 | P_WRITE_ACK = 0x16, /* Used in protocol C */ | 197 | P_WRITE_ACK = 0x16, /* Used in protocol C */ |
188 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ | 198 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ |
189 | P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ | 199 | P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ |
190 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ | 200 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ |
191 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ | 201 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ |
192 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ | 202 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ |
@@ -207,77 +217,23 @@ enum drbd_packets { | |||
207 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ | 217 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ |
208 | P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ | 218 | P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ |
209 | P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ | 219 | P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ |
220 | P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ | ||
221 | P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ | ||
222 | P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ | ||
223 | P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ | ||
210 | 224 | ||
211 | P_MAX_CMD = 0x2A, | ||
212 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ | 225 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ |
213 | P_MAX_OPT_CMD = 0x101, | 226 | P_MAX_OPT_CMD = 0x101, |
214 | 227 | ||
215 | /* special command ids for handshake */ | 228 | /* special command ids for handshake */ |
216 | 229 | ||
217 | P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ | 230 | P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ |
218 | P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ | 231 | P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ |
219 | 232 | ||
220 | P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ | 233 | P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ |
221 | }; | 234 | }; |
222 | 235 | ||
223 | static inline const char *cmdname(enum drbd_packets cmd) | 236 | extern const char *cmdname(enum drbd_packet cmd); |
224 | { | ||
225 | /* THINK may need to become several global tables | ||
226 | * when we want to support more than | ||
227 | * one PRO_VERSION */ | ||
228 | static const char *cmdnames[] = { | ||
229 | [P_DATA] = "Data", | ||
230 | [P_DATA_REPLY] = "DataReply", | ||
231 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
232 | [P_BARRIER] = "Barrier", | ||
233 | [P_BITMAP] = "ReportBitMap", | ||
234 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
235 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
236 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
237 | [P_DATA_REQUEST] = "DataRequest", | ||
238 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
239 | [P_SYNC_PARAM] = "SyncParam", | ||
240 | [P_SYNC_PARAM89] = "SyncParam89", | ||
241 | [P_PROTOCOL] = "ReportProtocol", | ||
242 | [P_UUIDS] = "ReportUUIDs", | ||
243 | [P_SIZES] = "ReportSizes", | ||
244 | [P_STATE] = "ReportState", | ||
245 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
246 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
247 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
248 | [P_PING] = "Ping", | ||
249 | [P_PING_ACK] = "PingAck", | ||
250 | [P_RECV_ACK] = "RecvAck", | ||
251 | [P_WRITE_ACK] = "WriteAck", | ||
252 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
253 | [P_DISCARD_ACK] = "DiscardAck", | ||
254 | [P_NEG_ACK] = "NegAck", | ||
255 | [P_NEG_DREPLY] = "NegDReply", | ||
256 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
257 | [P_BARRIER_ACK] = "BarrierAck", | ||
258 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
259 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
260 | [P_OV_REQUEST] = "OVRequest", | ||
261 | [P_OV_REPLY] = "OVReply", | ||
262 | [P_OV_RESULT] = "OVResult", | ||
263 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
264 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
265 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
266 | [P_DELAY_PROBE] = "DelayProbe", | ||
267 | [P_OUT_OF_SYNC] = "OutOfSync", | ||
268 | [P_MAX_CMD] = NULL, | ||
269 | }; | ||
270 | |||
271 | if (cmd == P_HAND_SHAKE_M) | ||
272 | return "HandShakeM"; | ||
273 | if (cmd == P_HAND_SHAKE_S) | ||
274 | return "HandShakeS"; | ||
275 | if (cmd == P_HAND_SHAKE) | ||
276 | return "HandShake"; | ||
277 | if (cmd >= P_MAX_CMD) | ||
278 | return "Unknown"; | ||
279 | return cmdnames[cmd]; | ||
280 | } | ||
281 | 237 | ||
282 | /* for sending/receiving the bitmap, | 238 | /* for sending/receiving the bitmap, |
283 | * possibly in some encoding scheme */ | 239 | * possibly in some encoding scheme */ |
@@ -337,37 +293,24 @@ struct p_header80 { | |||
337 | u32 magic; | 293 | u32 magic; |
338 | u16 command; | 294 | u16 command; |
339 | u16 length; /* bytes of data after this header */ | 295 | u16 length; /* bytes of data after this header */ |
340 | u8 payload[0]; | ||
341 | } __packed; | 296 | } __packed; |
342 | 297 | ||
343 | /* Header for big packets, Used for data packets exceeding 64kB */ | 298 | /* Header for big packets, Used for data packets exceeding 64kB */ |
344 | struct p_header95 { | 299 | struct p_header95 { |
345 | u16 magic; /* use DRBD_MAGIC_BIG here */ | 300 | u16 magic; /* use DRBD_MAGIC_BIG here */ |
346 | u16 command; | 301 | u16 command; |
347 | u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */ | 302 | u32 length; |
348 | u8 payload[0]; | ||
349 | } __packed; | 303 | } __packed; |
350 | 304 | ||
351 | union p_header { | 305 | struct p_header100 { |
352 | struct p_header80 h80; | 306 | u32 magic; |
353 | struct p_header95 h95; | 307 | u16 volume; |
354 | }; | 308 | u16 command; |
355 | 309 | u32 length; | |
356 | /* | 310 | u32 pad; |
357 | * short commands, packets without payload, plain p_header: | 311 | } __packed; |
358 | * P_PING | ||
359 | * P_PING_ACK | ||
360 | * P_BECOME_SYNC_TARGET | ||
361 | * P_BECOME_SYNC_SOURCE | ||
362 | * P_UNPLUG_REMOTE | ||
363 | */ | ||
364 | 312 | ||
365 | /* | 313 | extern unsigned int drbd_header_size(struct drbd_tconn *tconn); |
366 | * commands with out-of-struct payload: | ||
367 | * P_BITMAP (no additional fields) | ||
368 | * P_DATA, P_DATA_REPLY (see p_data) | ||
369 | * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) | ||
370 | */ | ||
371 | 314 | ||
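drbd_header_size() is only declared here; presumably it selects the on-the-wire header layout by the negotiated protocol version, roughly like this sketch (the version threshold is an assumption):

	unsigned int drbd_header_size(struct drbd_tconn *tconn)
	{
		if (tconn->agreed_pro_version >= 100)
			return sizeof(struct p_header100);
		return sizeof(struct p_header80);
	}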
372 | /* these defines must not be changed without changing the protocol version */ | 315 | /* these defines must not be changed without changing the protocol version */ |
373 | #define DP_HARDBARRIER 1 /* deprecated */ | 316 |
@@ -377,9 +320,10 @@ union p_header { | |||
377 | #define DP_FUA 16 /* equals REQ_FUA */ | 320 | #define DP_FUA 16 /* equals REQ_FUA */ |
378 | #define DP_FLUSH 32 /* equals REQ_FLUSH */ | 321 | #define DP_FLUSH 32 /* equals REQ_FLUSH */ |
379 | #define DP_DISCARD 64 /* equals REQ_DISCARD */ | 322 | #define DP_DISCARD 64 /* equals REQ_DISCARD */ |
323 | #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ | ||
324 | #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ | ||
380 | 325 | ||
381 | struct p_data { | 326 | struct p_data { |
382 | union p_header head; | ||
383 | u64 sector; /* 64 bits sector number */ | 327 | u64 sector; /* 64 bits sector number */ |
384 | u64 block_id; /* to identify the request in protocol B&C */ | 328 | u64 block_id; /* to identify the request in protocol B&C */ |
385 | u32 seq_num; | 329 | u32 seq_num; |
@@ -390,21 +334,18 @@ struct p_data { | |||
390 | * commands which share a struct: | 334 | * commands which share a struct: |
391 | * p_block_ack: | 335 | * p_block_ack: |
392 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), | 336 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), |
393 | * P_DISCARD_ACK (proto C, two-primaries conflict detection) | 337 | * P_SUPERSEDED (proto C, two-primaries conflict detection) |
394 | * p_block_req: | 338 | * p_block_req: |
395 | * P_DATA_REQUEST, P_RS_DATA_REQUEST | 339 | * P_DATA_REQUEST, P_RS_DATA_REQUEST |
396 | */ | 340 | */ |
397 | struct p_block_ack { | 341 | struct p_block_ack { |
398 | struct p_header80 head; | ||
399 | u64 sector; | 342 | u64 sector; |
400 | u64 block_id; | 343 | u64 block_id; |
401 | u32 blksize; | 344 | u32 blksize; |
402 | u32 seq_num; | 345 | u32 seq_num; |
403 | } __packed; | 346 | } __packed; |
404 | 347 | ||
405 | |||
406 | struct p_block_req { | 348 | struct p_block_req { |
407 | struct p_header80 head; | ||
408 | u64 sector; | 349 | u64 sector; |
409 | u64 block_id; | 350 | u64 block_id; |
410 | u32 blksize; | 351 | u32 blksize; |
@@ -413,59 +354,52 @@ struct p_block_req { | |||
413 | 354 | ||
414 | /* | 355 | /* |
415 | * commands with their own struct for additional fields: | 356 | * commands with their own struct for additional fields: |
416 | * P_HAND_SHAKE | 357 | * P_CONNECTION_FEATURES |
417 | * P_BARRIER | 358 | * P_BARRIER |
418 | * P_BARRIER_ACK | 359 | * P_BARRIER_ACK |
419 | * P_SYNC_PARAM | 360 | * P_SYNC_PARAM |
420 | * ReportParams | 361 | * ReportParams |
421 | */ | 362 | */ |
422 | 363 | ||
423 | struct p_handshake { | 364 | struct p_connection_features { |
424 | struct p_header80 head; /* 8 bytes */ | ||
425 | u32 protocol_min; | 365 | u32 protocol_min; |
426 | u32 feature_flags; | 366 | u32 feature_flags; |
427 | u32 protocol_max; | 367 | u32 protocol_max; |
428 | 368 | ||
429 | /* should be more than enough for future enhancements | 369 | /* should be more than enough for future enhancements |
430 | * for now, feature_flags and the reserverd array shall be zero. | 370 | * for now, feature_flags and the reserved array shall be zero. |
431 | */ | 371 | */ |
432 | 372 | ||
433 | u32 _pad; | 373 | u32 _pad; |
434 | u64 reserverd[7]; | 374 | u64 reserved[7]; |
435 | } __packed; | 375 | } __packed; |
436 | /* 80 bytes, FIXED for the next century */ | ||
437 | 376 | ||
438 | struct p_barrier { | 377 | struct p_barrier { |
439 | struct p_header80 head; | ||
440 | u32 barrier; /* barrier number _handle_ only */ | 378 | u32 barrier; /* barrier number _handle_ only */ |
441 | u32 pad; /* to multiple of 8 Byte */ | 379 | u32 pad; /* to multiple of 8 Byte */ |
442 | } __packed; | 380 | } __packed; |
443 | 381 | ||
444 | struct p_barrier_ack { | 382 | struct p_barrier_ack { |
445 | struct p_header80 head; | ||
446 | u32 barrier; | 383 | u32 barrier; |
447 | u32 set_size; | 384 | u32 set_size; |
448 | } __packed; | 385 | } __packed; |
449 | 386 | ||
450 | struct p_rs_param { | 387 | struct p_rs_param { |
451 | struct p_header80 head; | 388 | u32 resync_rate; |
452 | u32 rate; | ||
453 | 389 | ||
454 | /* Since protocol version 88 and higher. */ | 390 | /* Since protocol version 88 and higher. */ |
455 | char verify_alg[0]; | 391 | char verify_alg[0]; |
456 | } __packed; | 392 | } __packed; |
457 | 393 | ||
458 | struct p_rs_param_89 { | 394 | struct p_rs_param_89 { |
459 | struct p_header80 head; | 395 | u32 resync_rate; |
460 | u32 rate; | ||
461 | /* protocol version 89: */ | 396 | /* protocol version 89: */ |
462 | char verify_alg[SHARED_SECRET_MAX]; | 397 | char verify_alg[SHARED_SECRET_MAX]; |
463 | char csums_alg[SHARED_SECRET_MAX]; | 398 | char csums_alg[SHARED_SECRET_MAX]; |
464 | } __packed; | 399 | } __packed; |
465 | 400 | ||
466 | struct p_rs_param_95 { | 401 | struct p_rs_param_95 { |
467 | struct p_header80 head; | 402 | u32 resync_rate; |
468 | u32 rate; | ||
469 | char verify_alg[SHARED_SECRET_MAX]; | 403 | char verify_alg[SHARED_SECRET_MAX]; |
470 | char csums_alg[SHARED_SECRET_MAX]; | 404 | char csums_alg[SHARED_SECRET_MAX]; |
471 | u32 c_plan_ahead; | 405 | u32 c_plan_ahead; |
@@ -475,12 +409,11 @@ struct p_rs_param_95 { | |||
475 | } __packed; | 409 | } __packed; |
476 | 410 | ||
477 | enum drbd_conn_flags { | 411 | enum drbd_conn_flags { |
478 | CF_WANT_LOSE = 1, | 412 | CF_DISCARD_MY_DATA = 1, |
479 | CF_DRY_RUN = 2, | 413 | CF_DRY_RUN = 2, |
480 | }; | 414 | }; |
481 | 415 | ||
482 | struct p_protocol { | 416 | struct p_protocol { |
483 | struct p_header80 head; | ||
484 | u32 protocol; | 417 | u32 protocol; |
485 | u32 after_sb_0p; | 418 | u32 after_sb_0p; |
486 | u32 after_sb_1p; | 419 | u32 after_sb_1p; |
@@ -494,17 +427,14 @@ struct p_protocol { | |||
494 | } __packed; | 427 | } __packed; |
495 | 428 | ||
496 | struct p_uuids { | 429 | struct p_uuids { |
497 | struct p_header80 head; | ||
498 | u64 uuid[UI_EXTENDED_SIZE]; | 430 | u64 uuid[UI_EXTENDED_SIZE]; |
499 | } __packed; | 431 | } __packed; |
500 | 432 | ||
501 | struct p_rs_uuid { | 433 | struct p_rs_uuid { |
502 | struct p_header80 head; | ||
503 | u64 uuid; | 434 | u64 uuid; |
504 | } __packed; | 435 | } __packed; |
505 | 436 | ||
506 | struct p_sizes { | 437 | struct p_sizes { |
507 | struct p_header80 head; | ||
508 | u64 d_size; /* size of disk */ | 438 | u64 d_size; /* size of disk */ |
509 | u64 u_size; /* user requested size */ | 439 | u64 u_size; /* user requested size */ |
510 | u64 c_size; /* current exported size */ | 440 | u64 c_size; /* current exported size */ |
@@ -514,18 +444,15 @@ struct p_sizes { | |||
514 | } __packed; | 444 | } __packed; |
515 | 445 | ||
516 | struct p_state { | 446 | struct p_state { |
517 | struct p_header80 head; | ||
518 | u32 state; | 447 | u32 state; |
519 | } __packed; | 448 | } __packed; |
520 | 449 | ||
521 | struct p_req_state { | 450 | struct p_req_state { |
522 | struct p_header80 head; | ||
523 | u32 mask; | 451 | u32 mask; |
524 | u32 val; | 452 | u32 val; |
525 | } __packed; | 453 | } __packed; |
526 | 454 | ||
527 | struct p_req_state_reply { | 455 | struct p_req_state_reply { |
528 | struct p_header80 head; | ||
529 | u32 retcode; | 456 | u32 retcode; |
530 | } __packed; | 457 | } __packed; |
531 | 458 | ||
@@ -539,15 +466,7 @@ struct p_drbd06_param { | |||
539 | u32 bit_map_gen[5]; | 466 | u32 bit_map_gen[5]; |
540 | } __packed; | 467 | } __packed; |
541 | 468 | ||
542 | struct p_discard { | ||
543 | struct p_header80 head; | ||
544 | u64 block_id; | ||
545 | u32 seq_num; | ||
546 | u32 pad; | ||
547 | } __packed; | ||
548 | |||
549 | struct p_block_desc { | 469 | struct p_block_desc { |
550 | struct p_header80 head; | ||
551 | u64 sector; | 470 | u64 sector; |
552 | u32 blksize; | 471 | u32 blksize; |
553 | u32 pad; /* to multiple of 8 Byte */ | 472 | u32 pad; /* to multiple of 8 Byte */ |
@@ -563,7 +482,6 @@ enum drbd_bitmap_code { | |||
563 | }; | 482 | }; |
564 | 483 | ||
565 | struct p_compressed_bm { | 484 | struct p_compressed_bm { |
566 | struct p_header80 head; | ||
567 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code | 485 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code |
568 | * (encoding & 0x80): polarity (set/unset) of first runlength | 486 | * (encoding & 0x80): polarity (set/unset) of first runlength |
569 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits | 487 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits |
@@ -575,90 +493,22 @@ struct p_compressed_bm { | |||
575 | } __packed; | 493 | } __packed; |
576 | 494 | ||
577 | struct p_delay_probe93 { | 495 | struct p_delay_probe93 { |
578 | struct p_header80 head; | ||
579 | u32 seq_num; /* sequence number to match the two probe packets */ | 496 | u32 seq_num; /* sequence number to match the two probe packets */ |
580 | u32 offset; /* usecs the probe got sent after the reference time point */ | 497 | u32 offset; /* usecs the probe got sent after the reference time point */ |
581 | } __packed; | 498 | } __packed; |
582 | 499 | ||
583 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | 500 | /* |
584 | static inline enum drbd_bitmap_code | 501 | * Bitmap packets need to fit within a single page on the sender and receiver, |
585 | DCBP_get_code(struct p_compressed_bm *p) | 502 | * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). |
586 | { | ||
587 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
588 | } | ||
589 | |||
590 | static inline void | ||
591 | DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) | ||
592 | { | ||
593 | BUG_ON(code & ~0xf); | ||
594 | p->encoding = (p->encoding & ~0xf) | code; | ||
595 | } | ||
596 | |||
597 | static inline int | ||
598 | DCBP_get_start(struct p_compressed_bm *p) | ||
599 | { | ||
600 | return (p->encoding & 0x80) != 0; | ||
601 | } | ||
602 | |||
603 | static inline void | ||
604 | DCBP_set_start(struct p_compressed_bm *p, int set) | ||
605 | { | ||
606 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
607 | } | ||
608 | |||
609 | static inline int | ||
610 | DCBP_get_pad_bits(struct p_compressed_bm *p) | ||
611 | { | ||
612 | return (p->encoding >> 4) & 0x7; | ||
613 | } | ||
614 | |||
615 | static inline void | ||
616 | DCBP_set_pad_bits(struct p_compressed_bm *p, int n) | ||
617 | { | ||
618 | BUG_ON(n & ~0x7); | ||
619 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
620 | } | ||
621 | |||
622 | /* one bitmap packet, including the p_header, | ||
623 | * should fit within one _architecture independend_ page. | ||
624 | * so we need to use the fixed size 4KiB page size | ||
625 | * most architectures have used for a long time. | ||
626 | */ | 503 | */ |
627 | #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) | 504 | #define DRBD_SOCKET_BUFFER_SIZE 4096 |
628 | #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) | ||
629 | #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) | ||
630 | #if (PAGE_SIZE < 4096) | ||
631 | /* drbd_send_bitmap / receive_bitmap would break horribly */ | ||
632 | #error "PAGE_SIZE too small" | ||
633 | #endif | ||
634 | |||
635 | union p_polymorph { | ||
636 | union p_header header; | ||
637 | struct p_handshake handshake; | ||
638 | struct p_data data; | ||
639 | struct p_block_ack block_ack; | ||
640 | struct p_barrier barrier; | ||
641 | struct p_barrier_ack barrier_ack; | ||
642 | struct p_rs_param_89 rs_param_89; | ||
643 | struct p_rs_param_95 rs_param_95; | ||
644 | struct p_protocol protocol; | ||
645 | struct p_sizes sizes; | ||
646 | struct p_uuids uuids; | ||
647 | struct p_state state; | ||
648 | struct p_req_state req_state; | ||
649 | struct p_req_state_reply req_state_reply; | ||
650 | struct p_block_req block_req; | ||
651 | struct p_delay_probe93 delay_probe93; | ||
652 | struct p_rs_uuid rs_uuid; | ||
653 | struct p_block_desc block_desc; | ||
654 | } __packed; | ||
655 | 505 | ||
656 | /**********************************************************************/ | 506 | /**********************************************************************/ |
657 | enum drbd_thread_state { | 507 | enum drbd_thread_state { |
658 | None, | 508 | NONE, |
659 | Running, | 509 | RUNNING, |
660 | Exiting, | 510 | EXITING, |
661 | Restarting | 511 | RESTARTING |
662 | }; | 512 | }; |
663 | 513 | ||
664 | struct drbd_thread { | 514 | struct drbd_thread { |
@@ -667,8 +517,9 @@ struct drbd_thread { | |||
667 | struct completion stop; | 517 | struct completion stop; |
668 | enum drbd_thread_state t_state; | 518 | enum drbd_thread_state t_state; |
669 | int (*function) (struct drbd_thread *); | 519 | int (*function) (struct drbd_thread *); |
670 | struct drbd_conf *mdev; | 520 | struct drbd_tconn *tconn; |
671 | int reset_cpu_mask; | 521 | int reset_cpu_mask; |
522 | char name[9]; | ||
672 | }; | 523 | }; |
673 | 524 | ||
674 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | 525 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) |
@@ -681,58 +532,54 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | |||
681 | return thi->t_state; | 532 | return thi->t_state; |
682 | } | 533 | } |
683 | 534 | ||
684 | struct drbd_work; | ||
685 | typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); | ||
686 | struct drbd_work { | 535 | struct drbd_work { |
687 | struct list_head list; | 536 | struct list_head list; |
688 | drbd_work_cb cb; | 537 | int (*cb)(struct drbd_work *, int cancel); |
538 | union { | ||
539 | struct drbd_conf *mdev; | ||
540 | struct drbd_tconn *tconn; | ||
541 | }; | ||
689 | }; | 542 | }; |
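Work callbacks now receive only the work item and a cancel flag; the owning device (or connection) comes from the embedded union. A sketch of the resulting callback shape (callback name and message are illustrative):

	static int w_example(struct drbd_work *w, int cancel)
	{
		struct drbd_conf *mdev = w->mdev;	/* or w->tconn for connection-level work */

		if (cancel)
			return 0;
		dev_info(DEV, "example work item ran\n");
		return 0;
	}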
690 | 543 | ||
691 | struct drbd_tl_epoch; | 544 | #include "drbd_interval.h" |
545 | |||
546 | extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *); | ||
547 | |||
692 | struct drbd_request { | 548 | struct drbd_request { |
693 | struct drbd_work w; | 549 | struct drbd_work w; |
694 | struct drbd_conf *mdev; | ||
695 | 550 | ||
696 | /* if local IO is not allowed, will be NULL. | 551 | /* if local IO is not allowed, will be NULL. |
697 | * if local IO _is_ allowed, holds the locally submitted bio clone, | 552 | * if local IO _is_ allowed, holds the locally submitted bio clone, |
698 | * or, after local IO completion, the ERR_PTR(error). | 553 | * or, after local IO completion, the ERR_PTR(error). |
699 | * see drbd_endio_pri(). */ | 554 | * see drbd_request_endio(). */ |
700 | struct bio *private_bio; | 555 | struct bio *private_bio; |
701 | 556 | ||
702 | struct hlist_node collision; | 557 | struct drbd_interval i; |
703 | sector_t sector; | ||
704 | unsigned int size; | ||
705 | unsigned int epoch; /* barrier_nr */ | ||
706 | 558 | ||
707 | /* barrier_nr: used to check on "completion" whether this req was in | 559 | /* epoch: used to check on "completion" whether this req was in |
708 | * the current epoch, and we therefore have to close it, | 560 | * the current epoch, and we therefore have to close it, |
709 | * starting a new epoch... | 561 | * causing a p_barrier packet to be send, starting a new epoch. |
562 | * | ||
563 | * This corresponds to "barrier" in struct p_barrier[_ack], | ||
564 | * and to "barrier_nr" in struct drbd_epoch (and various | ||
565 | * comments/function parameters/local variable names). | ||
710 | */ | 566 | */ |
567 | unsigned int epoch; | ||
711 | 568 | ||
712 | struct list_head tl_requests; /* ring list in the transfer log */ | 569 | struct list_head tl_requests; /* ring list in the transfer log */ |
713 | struct bio *master_bio; /* master bio pointer */ | 570 | struct bio *master_bio; /* master bio pointer */ |
714 | unsigned long rq_state; /* see comments above _req_mod() */ | ||
715 | unsigned long start_time; | 571 | unsigned long start_time; |
716 | }; | ||
717 | 572 | ||
718 | struct drbd_tl_epoch { | 573 | /* once it hits 0, we may complete the master_bio */ |
719 | struct drbd_work w; | 574 | atomic_t completion_ref; |
720 | struct list_head requests; /* requests before */ | 575 | /* once it hits 0, we may destroy this drbd_request object */ |
721 | struct drbd_tl_epoch *next; /* pointer to the next barrier */ | 576 | struct kref kref; |
722 | unsigned int br_number; /* the barriers identifier. */ | ||
723 | int n_writes; /* number of requests attached before this barrier */ | ||
724 | }; | ||
725 | 577 | ||
726 | struct drbd_request; | 578 | unsigned rq_state; /* see comments above _req_mod() */ |
727 | 579 | }; | |
728 | /* These Tl_epoch_entries may be in one of 6 lists: | ||
729 | active_ee .. data packet being written | ||
730 | sync_ee .. syncer block being written | ||
731 | done_ee .. block written, need to send P_WRITE_ACK | ||
732 | read_ee .. [RS]P_DATA_REQUEST being read | ||
733 | */ | ||
734 | 580 | ||
735 | struct drbd_epoch { | 581 | struct drbd_epoch { |
582 | struct drbd_tconn *tconn; | ||
736 | struct list_head list; | 583 | struct list_head list; |
737 | unsigned int barrier_nr; | 584 | unsigned int barrier_nr; |
738 | atomic_t epoch_size; /* increased on every request added. */ | 585 | atomic_t epoch_size; /* increased on every request added. */ |
@@ -762,17 +609,14 @@ struct digest_info { | |||
762 | void *digest; | 609 | void *digest; |
763 | }; | 610 | }; |
764 | 611 | ||
765 | struct drbd_epoch_entry { | 612 | struct drbd_peer_request { |
766 | struct drbd_work w; | 613 | struct drbd_work w; |
767 | struct hlist_node collision; | ||
768 | struct drbd_epoch *epoch; /* for writes */ | 614 | struct drbd_epoch *epoch; /* for writes */ |
769 | struct drbd_conf *mdev; | ||
770 | struct page *pages; | 615 | struct page *pages; |
771 | atomic_t pending_bios; | 616 | atomic_t pending_bios; |
772 | unsigned int size; | 617 | struct drbd_interval i; |
773 | /* see comments on ee flag bits below */ | 618 | /* see comments on ee flag bits below */ |
774 | unsigned long flags; | 619 | unsigned long flags; |
775 | sector_t sector; | ||
776 | union { | 620 | union { |
777 | u64 block_id; | 621 | u64 block_id; |
778 | struct digest_info *digest; | 622 | struct digest_info *digest; |
@@ -793,31 +637,37 @@ enum { | |||
793 | * we need to resubmit without the barrier flag. */ | 637 | * we need to resubmit without the barrier flag. */ |
794 | __EE_RESUBMITTED, | 638 | __EE_RESUBMITTED, |
795 | 639 | ||
796 | /* we may have several bios per epoch entry. | 640 | /* we may have several bios per peer request. |
797 | * if any of those fail, we set this flag atomically | 641 | * if any of those fail, we set this flag atomically |
798 | * from the endio callback */ | 642 | * from the endio callback */ |
799 | __EE_WAS_ERROR, | 643 | __EE_WAS_ERROR, |
800 | 644 | ||
801 | /* This ee has a pointer to a digest instead of a block id */ | 645 | /* This ee has a pointer to a digest instead of a block id */ |
802 | __EE_HAS_DIGEST, | 646 | __EE_HAS_DIGEST, |
647 | |||
648 | /* Conflicting local requests need to be restarted after this request */ | ||
649 | __EE_RESTART_REQUESTS, | ||
650 | |||
651 | /* The peer wants a write ACK for this (wire proto C) */ | ||
652 | __EE_SEND_WRITE_ACK, | ||
653 | |||
654 | /* Is set when net_conf had two_primaries set while creating this peer_req */ | ||
655 | __EE_IN_INTERVAL_TREE, | ||
803 | }; | 656 | }; |
804 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | 657 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) |
805 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | 658 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) |
806 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) | 659 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) |
807 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) | 660 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) |
808 | #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) | 661 | #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) |
662 | #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) | ||
663 | #define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) | ||
664 | #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) | ||
809 | 665 | ||
810 | /* global flag bits */ | 666 | /* flag bits per mdev */ |
811 | enum drbd_flag { | 667 | enum { |
812 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
813 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
814 | SEND_PING, /* whether asender should send a ping asap */ | ||
815 | |||
816 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ | 668 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ |
817 | MD_DIRTY, /* current uuids and flags not yet on disk */ | 669 | MD_DIRTY, /* current uuids and flags not yet on disk */ |
818 | DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ | ||
819 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ | 670 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ |
820 | CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ | ||
821 | CL_ST_CHG_SUCCESS, | 671 | CL_ST_CHG_SUCCESS, |
822 | CL_ST_CHG_FAIL, | 672 | CL_ST_CHG_FAIL, |
823 | CRASHED_PRIMARY, /* This node was a crashed primary. | 673 | CRASHED_PRIMARY, /* This node was a crashed primary. |
@@ -835,33 +685,14 @@ enum drbd_flag { | |||
835 | WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ | 685 | WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ |
836 | FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ | 686 | FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ |
837 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | 687 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ |
838 | NET_CONGESTED, /* The data socket is congested */ | ||
839 | |||
840 | CONFIG_PENDING, /* serialization of (re)configuration requests. | ||
841 | * if set, also prevents the device from dying */ | ||
842 | DEVICE_DYING, /* device became unconfigured, | ||
843 | * but worker thread is still handling the cleanup. | ||
844 | * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, | ||
845 | * while this is set. */ | ||
846 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from | 688 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from |
847 | * the peer, if it changed there as well. */ | 689 | * the peer, if it changed there as well. */ |
848 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
849 | GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ | ||
850 | NEW_CUR_UUID, /* Create new current UUID when thawing IO */ | 690 | NEW_CUR_UUID, /* Create new current UUID when thawing IO */ |
851 | AL_SUSPENDED, /* Activity logging is currently suspended. */ | 691 | AL_SUSPENDED, /* Activity logging is currently suspended. */ |
852 | AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ | 692 | AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ |
853 | STATE_SENT, /* Do not change state/UUIDs while this is set */ | 693 | B_RS_H_DONE, /* Before resync handler done (already executed) */ |
854 | 694 | DISCARD_MY_DATA, /* discard_my_data flag per volume */ | |
855 | CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) | 695 | READ_BALANCE_RR, |
856 | * pending, from drbd worker context. | ||
857 | * If set, bdi_write_congested() returns true, | ||
858 | * so shrink_page_list() would not recurse into, | ||
859 | * and potentially deadlock on, this drbd worker. | ||
860 | */ | ||
861 | DISCONNECT_SENT, /* Currently the last bit in this 32bit word */ | ||
862 | |||
863 | /* keep last */ | ||
864 | DRBD_N_FLAGS, | ||
865 | }; | 696 | }; |
866 | 697 | ||
867 | struct drbd_bitmap; /* opaque for drbd_conf */ | 698 | struct drbd_bitmap; /* opaque for drbd_conf */ |
@@ -899,18 +730,17 @@ enum bm_flag { | |||
899 | 730 | ||
900 | struct drbd_work_queue { | 731 | struct drbd_work_queue { |
901 | struct list_head q; | 732 | struct list_head q; |
902 | struct semaphore s; /* producers up it, worker down()s it */ | ||
903 | spinlock_t q_lock; /* to protect the list. */ | 733 | spinlock_t q_lock; /* to protect the list. */ |
734 | wait_queue_head_t q_wait; | ||
904 | }; | 735 | }; |
905 | 736 | ||
906 | struct drbd_socket { | 737 | struct drbd_socket { |
907 | struct drbd_work_queue work; | ||
908 | struct mutex mutex; | 738 | struct mutex mutex; |
909 | struct socket *socket; | 739 | struct socket *socket; |
910 | /* this way we get our | 740 | /* this way we get our |
911 | * send/receive buffers off the stack */ | 741 | * send/receive buffers off the stack */ |
912 | union p_polymorph sbuf; | 742 | void *sbuf; |
913 | union p_polymorph rbuf; | 743 | void *rbuf; |
914 | }; | 744 | }; |
915 | 745 | ||
916 | struct drbd_md { | 746 | struct drbd_md { |
@@ -927,24 +757,16 @@ struct drbd_md { | |||
927 | s32 bm_offset; /* signed relative sector offset to bitmap */ | 757 | s32 bm_offset; /* signed relative sector offset to bitmap */ |
928 | 758 | ||
929 | /* u32 al_nr_extents; important for restoring the AL | 759 | /* u32 al_nr_extents; important for restoring the AL |
930 | * is stored into sync_conf.al_extents, which in turn | 760 | * is stored into ldev->dc.al_extents, which in turn |
931 | * gets applied to act_log->nr_elements | 761 | * gets applied to act_log->nr_elements |
932 | */ | 762 | */ |
933 | }; | 763 | }; |
934 | 764 | ||
935 | /* for sync_conf and other types... */ | ||
936 | #define NL_PACKET(name, number, fields) struct name { fields }; | ||
937 | #define NL_INTEGER(pn,pr,member) int member; | ||
938 | #define NL_INT64(pn,pr,member) __u64 member; | ||
939 | #define NL_BIT(pn,pr,member) unsigned member:1; | ||
940 | #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; | ||
941 | #include <linux/drbd_nl.h> | ||
942 | |||
943 | struct drbd_backing_dev { | 765 | struct drbd_backing_dev { |
944 | struct block_device *backing_bdev; | 766 | struct block_device *backing_bdev; |
945 | struct block_device *md_bdev; | 767 | struct block_device *md_bdev; |
946 | struct drbd_md md; | 768 | struct drbd_md md; |
947 | struct disk_conf dc; /* The user provided config... */ | 769 | struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */ |
948 | sector_t known_size; /* last known size of that backing device */ | 770 | sector_t known_size; /* last known size of that backing device */ |
949 | }; | 771 | }; |
950 | 772 | ||
@@ -968,17 +790,116 @@ enum write_ordering_e { | |||
968 | }; | 790 | }; |
969 | 791 | ||
970 | struct fifo_buffer { | 792 | struct fifo_buffer { |
971 | int *values; | ||
972 | unsigned int head_index; | 793 | unsigned int head_index; |
973 | unsigned int size; | 794 | unsigned int size; |
795 | int total; /* sum of all values */ | ||
796 | int values[0]; | ||
797 | }; | ||
798 | extern struct fifo_buffer *fifo_alloc(int fifo_size); | ||
799 | |||
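With values[] now a flexible array member, fifo_alloc() has to size the allocation for the requested number of slots; a minimal sketch under that assumption (the GFP flag is assumed, the real implementation lives elsewhere in this series):

	struct fifo_buffer *fifo_alloc(int fifo_size)
	{
		struct fifo_buffer *fb;

		fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size,
			     GFP_NOIO);
		if (!fb)
			return NULL;
		fb->size = fifo_size;
		/* head_index and total start at 0 thanks to kzalloc */
		return fb;
	}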
800 | /* flag bits per tconn */ | ||
801 | enum { | ||
802 | NET_CONGESTED, /* The data socket is congested */ | ||
803 | RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ | ||
804 | SEND_PING, /* whether asender should send a ping asap */ | ||
805 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
806 | GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ | ||
807 | CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ | ||
808 | CONN_WD_ST_CHG_OKAY, | ||
809 | CONN_WD_ST_CHG_FAIL, | ||
810 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
811 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
812 | STATE_SENT, /* Do not change state/UUIDs while this is set */ | ||
813 | CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) | ||
814 | * pending, from drbd worker context. | ||
815 | * If set, bdi_write_congested() returns true, | ||
816 | * so shrink_page_list() would not recurse into, | ||
817 | * and potentially deadlock on, this drbd worker. | ||
818 | */ | ||
819 | DISCONNECT_SENT, | ||
820 | }; | ||
821 | |||
822 | struct drbd_tconn { /* is a resource from the config file */ | ||
823 | char *name; /* Resource name */ | ||
824 | struct list_head all_tconn; /* linked on global drbd_tconns */ | ||
825 | struct kref kref; | ||
826 | struct idr volumes; /* <tconn, vnr> to mdev mapping */ | ||
827 | enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ | ||
828 | unsigned susp:1; /* IO suspended by user */ | ||
829 | unsigned susp_nod:1; /* IO suspended because no data */ | ||
830 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ | ||
831 | struct mutex cstate_mutex; /* Protects graceful disconnects */ | ||
832 | |||
833 | unsigned long flags; | ||
834 | struct net_conf *net_conf; /* content protected by rcu */ | ||
835 | struct mutex conf_update; /* mutex for read-copy-update of net_conf and disk_conf */ | ||
836 | wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ | ||
837 | struct res_opts res_opts; | ||
838 | |||
839 | struct sockaddr_storage my_addr; | ||
840 | int my_addr_len; | ||
841 | struct sockaddr_storage peer_addr; | ||
842 | int peer_addr_len; | ||
843 | |||
844 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | ||
845 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
846 | int agreed_pro_version; /* actually used protocol version */ | ||
847 | unsigned long last_received; /* in jiffies, either socket */ | ||
848 | unsigned int ko_count; | ||
849 | |||
850 | spinlock_t req_lock; | ||
851 | |||
852 | struct list_head transfer_log; /* all requests not yet fully processed */ | ||
853 | |||
854 | struct crypto_hash *cram_hmac_tfm; | ||
855 | struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */ | ||
856 | struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ | ||
857 | struct crypto_hash *csums_tfm; | ||
858 | struct crypto_hash *verify_tfm; | ||
859 | void *int_dig_in; | ||
860 | void *int_dig_vv; | ||
861 | |||
862 | /* receiver side */ | ||
863 | struct drbd_epoch *current_epoch; | ||
864 | spinlock_t epoch_lock; | ||
865 | unsigned int epochs; | ||
866 | enum write_ordering_e write_ordering; | ||
867 | atomic_t current_tle_nr; /* transfer log epoch number */ | ||
868 | unsigned current_tle_writes; /* writes seen within this tl epoch */ | ||
869 | |||
870 | unsigned long last_reconnect_jif; | ||
871 | struct drbd_thread receiver; | ||
872 | struct drbd_thread worker; | ||
873 | struct drbd_thread asender; | ||
874 | cpumask_var_t cpu_mask; | ||
875 | |||
876 | /* sender side */ | ||
877 | struct drbd_work_queue sender_work; | ||
878 | |||
879 | struct { | ||
880 | /* whether this sender thread | ||
881 | * has processed a single write yet. */ | ||
882 | bool seen_any_write_yet; | ||
883 | |||
884 | /* Which barrier number to send with the next P_BARRIER */ | ||
885 | int current_epoch_nr; | ||
886 | |||
887 | /* how many write requests have been sent | ||
888 | * with req->epoch == current_epoch_nr. | ||
889 | * If none, no P_BARRIER will be sent. */ | ||
890 | unsigned current_epoch_writes; | ||
891 | } send; | ||
974 | }; | 892 | }; |
975 | 893 | ||
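The send{} block above carries the per-connection barrier bookkeeping. A minimal sketch of the decision it encodes, purely for illustration: a P_BARRIER is only worth sending for an epoch that actually saw writes.

static inline bool demo_epoch_needs_barrier(struct drbd_tconn *tconn)
{
	/* no write seen at all, or none in the current epoch:
	 * nothing to separate, so no P_BARRIER is due */
	return tconn->send.seen_any_write_yet &&
	       tconn->send.current_epoch_writes != 0;
}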
976 | struct drbd_conf { | 894 | struct drbd_conf { |
977 | unsigned long drbd_flags[(DRBD_N_FLAGS + BITS_PER_LONG -1)/BITS_PER_LONG]; | 895 | struct drbd_tconn *tconn; |
896 | int vnr; /* volume number within the connection */ | ||
897 | struct kref kref; | ||
898 | |||
899 | /* things that are stored as / read from meta data on disk */ | ||
900 | unsigned long flags; | ||
978 | 901 | ||
979 | /* configured by drbdsetup */ | 902 | /* configured by drbdsetup */ |
980 | struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ | ||
981 | struct syncer_conf sync_conf; | ||
982 | struct drbd_backing_dev *ldev __protected_by(local); | 903 | struct drbd_backing_dev *ldev __protected_by(local); |
983 | 904 | ||
984 | sector_t p_size; /* partner's disk size */ | 905 | sector_t p_size; /* partner's disk size */ |
@@ -986,11 +907,7 @@ struct drbd_conf { | |||
986 | struct block_device *this_bdev; | 907 | struct block_device *this_bdev; |
987 | struct gendisk *vdisk; | 908 | struct gendisk *vdisk; |
988 | 909 | ||
989 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | 910 | unsigned long last_reattach_jif; |
990 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
991 | int agreed_pro_version; /* actually used protocol version */ | ||
992 | unsigned long last_received; /* in jiffies, either socket */ | ||
993 | unsigned int ko_count; | ||
994 | struct drbd_work resync_work, | 911 | struct drbd_work resync_work, |
995 | unplug_work, | 912 | unplug_work, |
996 | go_diskless, | 913 | go_diskless, |
@@ -1010,10 +927,9 @@ struct drbd_conf { | |||
1010 | /* Used after attach while negotiating new disk state. */ | 927 | /* Used after attach while negotiating new disk state. */ |
1011 | union drbd_state new_state_tmp; | 928 | union drbd_state new_state_tmp; |
1012 | 929 | ||
1013 | union drbd_state state; | 930 | union drbd_dev_state state; |
1014 | wait_queue_head_t misc_wait; | 931 | wait_queue_head_t misc_wait; |
1015 | wait_queue_head_t state_wait; /* upon each state change. */ | 932 | wait_queue_head_t state_wait; /* upon each state change. */ |
1016 | wait_queue_head_t net_cnt_wait; | ||
1017 | unsigned int send_cnt; | 933 | unsigned int send_cnt; |
1018 | unsigned int recv_cnt; | 934 | unsigned int recv_cnt; |
1019 | unsigned int read_cnt; | 935 | unsigned int read_cnt; |
@@ -1023,17 +939,12 @@ struct drbd_conf { | |||
1023 | atomic_t ap_bio_cnt; /* Requests we need to complete */ | 939 | atomic_t ap_bio_cnt; /* Requests we need to complete */ |
1024 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ | 940 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ |
1025 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ | 941 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ |
1026 | atomic_t unacked_cnt; /* Need to send replys for */ | 942 | atomic_t unacked_cnt; /* Need to send replies for */ |
1027 | atomic_t local_cnt; /* Waiting for local completion */ | 943 | atomic_t local_cnt; /* Waiting for local completion */ |
1028 | atomic_t net_cnt; /* Users of net_conf */ | 944 | |
1029 | spinlock_t req_lock; | 945 | /* Interval tree of pending local requests */ |
1030 | struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ | 946 | struct rb_root read_requests; |
1031 | struct drbd_tl_epoch *newest_tle; | 947 | struct rb_root write_requests; |
1032 | struct drbd_tl_epoch *oldest_tle; | ||
1033 | struct list_head out_of_sequence_requests; | ||
1034 | struct list_head barrier_acked_requests; | ||
1035 | struct hlist_head *tl_hash; | ||
1036 | unsigned int tl_hash_s; | ||
1037 | 948 | ||
1038 | /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ | 949 | /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ |
1039 | unsigned long rs_total; | 950 | unsigned long rs_total; |
@@ -1053,6 +964,7 @@ struct drbd_conf { | |||
1053 | unsigned long rs_mark_time[DRBD_SYNC_MARKS]; | 964 | unsigned long rs_mark_time[DRBD_SYNC_MARKS]; |
1054 | /* current index into rs_mark_{left,time} */ | 965 | /* current index into rs_mark_{left,time} */ |
1055 | int rs_last_mark; | 966 | int rs_last_mark; |
967 | unsigned long rs_last_bcast; /* [unit jiffies] */ | ||
1056 | 968 | ||
1057 | /* where does the admin want us to start? (sector) */ | 969 | /* where does the admin want us to start? (sector) */ |
1058 | sector_t ov_start_sector; | 970 | sector_t ov_start_sector; |
@@ -1064,14 +976,7 @@ struct drbd_conf { | |||
1064 | /* size of out-of-sync range in sectors. */ | 976 | /* size of out-of-sync range in sectors. */ |
1065 | sector_t ov_last_oos_size; | 977 | sector_t ov_last_oos_size; |
1066 | unsigned long ov_left; /* in bits */ | 978 | unsigned long ov_left; /* in bits */ |
1067 | struct crypto_hash *csums_tfm; | ||
1068 | struct crypto_hash *verify_tfm; | ||
1069 | 979 | ||
1070 | unsigned long last_reattach_jif; | ||
1071 | unsigned long last_reconnect_jif; | ||
1072 | struct drbd_thread receiver; | ||
1073 | struct drbd_thread worker; | ||
1074 | struct drbd_thread asender; | ||
1075 | struct drbd_bitmap *bitmap; | 980 | struct drbd_bitmap *bitmap; |
1076 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ | 981 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ |
1077 | 982 | ||
@@ -1084,29 +989,19 @@ struct drbd_conf { | |||
1084 | 989 | ||
1085 | int open_cnt; | 990 | int open_cnt; |
1086 | u64 *p_uuid; | 991 | u64 *p_uuid; |
1087 | struct drbd_epoch *current_epoch; | 992 | |
1088 | spinlock_t epoch_lock; | ||
1089 | unsigned int epochs; | ||
1090 | enum write_ordering_e write_ordering; | ||
1091 | struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ | 993 | struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ |
1092 | struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ | 994 | struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ |
1093 | struct list_head done_ee; /* send ack */ | 995 | struct list_head done_ee; /* need to send P_WRITE_ACK */ |
1094 | struct list_head read_ee; /* IO in progress (any read) */ | 996 | struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ |
1095 | struct list_head net_ee; /* zero-copy network send in progress */ | 997 | struct list_head net_ee; /* zero-copy network send in progress */ |
1096 | struct hlist_head *ee_hash; /* is protected by req_lock! */ ||
1097 | unsigned int ee_hash_s; | ||
1098 | |||
1099 | /* this one is protected by ee_lock, single thread */ | ||
1100 | struct drbd_epoch_entry *last_write_w_barrier; | ||
1101 | 998 | ||
1102 | int next_barrier_nr; | 999 | int next_barrier_nr; |
1103 | struct hlist_head *app_reads_hash; /* is protected by req_lock */ ||
1104 | struct list_head resync_reads; | 1000 | struct list_head resync_reads; |
1105 | atomic_t pp_in_use; /* allocated from page pool */ | 1001 | atomic_t pp_in_use; /* allocated from page pool */ |
1106 | atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ | 1002 | atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ |
1107 | wait_queue_head_t ee_wait; | 1003 | wait_queue_head_t ee_wait; |
1108 | struct page *md_io_page; /* one page buffer for md_io */ | 1004 | struct page *md_io_page; /* one page buffer for md_io */ |
1109 | struct page *md_io_tmpp; /* for logical_block_size != 512 */ | ||
1110 | struct drbd_md_io md_io; | 1005 | struct drbd_md_io md_io; |
1111 | atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ | 1006 | atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ |
1112 | spinlock_t al_lock; | 1007 | spinlock_t al_lock; |
@@ -1115,22 +1010,16 @@ struct drbd_conf { | |||
1115 | unsigned int al_tr_number; | 1010 | unsigned int al_tr_number; |
1116 | int al_tr_cycle; | 1011 | int al_tr_cycle; |
1117 | int al_tr_pos; /* position of the next transaction in the journal */ | 1012 | int al_tr_pos; /* position of the next transaction in the journal */ |
1118 | struct crypto_hash *cram_hmac_tfm; | ||
1119 | struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ | ||
1120 | struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ | ||
1121 | void *int_dig_out; | ||
1122 | void *int_dig_in; | ||
1123 | void *int_dig_vv; | ||
1124 | wait_queue_head_t seq_wait; | 1013 | wait_queue_head_t seq_wait; |
1125 | atomic_t packet_seq; | 1014 | atomic_t packet_seq; |
1126 | unsigned int peer_seq; | 1015 | unsigned int peer_seq; |
1127 | spinlock_t peer_seq_lock; | 1016 | spinlock_t peer_seq_lock; |
1128 | unsigned int minor; | 1017 | unsigned int minor; |
1129 | unsigned long comm_bm_set; /* communicated number of set bits. */ | 1018 | unsigned long comm_bm_set; /* communicated number of set bits. */ |
1130 | cpumask_var_t cpu_mask; | ||
1131 | struct bm_io_work bm_io_work; | 1019 | struct bm_io_work bm_io_work; |
1132 | u64 ed_uuid; /* UUID of the exposed data */ | 1020 | u64 ed_uuid; /* UUID of the exposed data */ |
1133 | struct mutex state_mutex; | 1021 | struct mutex own_state_mutex; |
1022 | struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */ | ||
1134 | char congestion_reason; /* Why we were congested... */ | 1023 | char congestion_reason; /* Why we were congested... */
1135 | atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ | 1024 | atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ |
1136 | atomic_t rs_sect_ev; /* for submitted resync data rate, both */ | 1025 | atomic_t rs_sect_ev; /* for submitted resync data rate, both */ |
@@ -1138,46 +1027,16 @@ struct drbd_conf { | |||
1138 | int rs_last_events; /* counter of read or write "events" (unit sectors) | 1027 | int rs_last_events; /* counter of read or write "events" (unit sectors) |
1139 | * on the lower level device when we last looked. */ | 1028 | * on the lower level device when we last looked. */ |
1140 | int c_sync_rate; /* current resync rate after syncer throttle magic */ | 1029 | int c_sync_rate; /* current resync rate after syncer throttle magic */ |
1141 | struct fifo_buffer rs_plan_s; /* correction values of resync planner */ | 1030 | struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, tconn->conf_update) */
1142 | int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ | 1031 | int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ |
1143 | int rs_planed; /* resync sectors already planned */ | ||
1144 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ | 1032 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ |
1145 | unsigned int peer_max_bio_size; | 1033 | unsigned int peer_max_bio_size; |
1146 | unsigned int local_max_bio_size; | 1034 | unsigned int local_max_bio_size; |
1147 | }; | 1035 | }; |
1148 | 1036 | ||
1149 | static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f) | ||
1150 | { | ||
1151 | set_bit(f, &mdev->drbd_flags[0]); | ||
1152 | } | ||
1153 | |||
1154 | static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f) | ||
1155 | { | ||
1156 | clear_bit(f, &mdev->drbd_flags[0]); | ||
1157 | } | ||
1158 | |||
1159 | static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f) | ||
1160 | { | ||
1161 | return test_bit(f, &mdev->drbd_flags[0]); | ||
1162 | } | ||
1163 | |||
1164 | static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f) | ||
1165 | { | ||
1166 | return test_and_set_bit(f, &mdev->drbd_flags[0]); | ||
1167 | } | ||
1168 | |||
1169 | static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f) | ||
1170 | { | ||
1171 | return test_and_clear_bit(f, &mdev->drbd_flags[0]); | ||
1172 | } | ||
1173 | |||
1174 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | 1037 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) |
1175 | { | 1038 | { |
1176 | struct drbd_conf *mdev; | 1039 | return (struct drbd_conf *)idr_find(&minors, minor); |
1177 | |||
1178 | mdev = minor < minor_count ? minor_table[minor] : NULL; | ||
1179 | |||
1180 | return mdev; | ||
1181 | } | 1040 | } |
1182 | 1041 | ||
1183 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | 1042 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) |
@@ -1185,29 +1044,9 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | |||
1185 | return mdev->minor; | 1044 | return mdev->minor; |
1186 | } | 1045 | } |
1187 | 1046 | ||
1188 | /* returns 1 if it was successful, | 1047 | static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr) |
1189 | * returns 0 if there was no data socket. | ||
1190 | * so wherever you are going to use the data.socket, e.g. do | ||
1191 | * if (!drbd_get_data_sock(mdev)) | ||
1192 | * return 0; | ||
1193 | * CODE(); | ||
1194 | * drbd_put_data_sock(mdev); | ||
1195 | */ | ||
1196 | static inline int drbd_get_data_sock(struct drbd_conf *mdev) | ||
1197 | { | ||
1198 | mutex_lock(&mdev->data.mutex); | ||
1199 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1200 | * while we were waiting in down()... */ | ||
1201 | if (unlikely(mdev->data.socket == NULL)) { | ||
1202 | mutex_unlock(&mdev->data.mutex); | ||
1203 | return 0; | ||
1204 | } | ||
1205 | return 1; | ||
1206 | } | ||
1207 | |||
1208 | static inline void drbd_put_data_sock(struct drbd_conf *mdev) | ||
1209 | { | 1048 | { |
1210 | mutex_unlock(&mdev->data.mutex); | 1049 | return (struct drbd_conf *)idr_find(&tconn->volumes, vnr); |
1211 | } | 1050 | } |
1212 | 1051 | ||
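Since volumes now live in the tconn->volumes idr, per-connection iteration follows the standard idr pattern. A small sketch (the helper name is made up):

static void demo_for_each_volume(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	int vnr;

	/* idr_for_each_entry() walks all <vnr, mdev> pairs in the idr */
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		/* ... act on each volume, e.g. send its current state ... */
	}
}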
1213 | /* | 1052 | /* |
@@ -1216,99 +1055,69 @@ static inline void drbd_put_data_sock(struct drbd_conf *mdev) | |||
1216 | 1055 | ||
1217 | /* drbd_main.c */ | 1056 | /* drbd_main.c */ |
1218 | 1057 | ||
1219 | enum chg_state_flags { | ||
1220 | CS_HARD = 1, | ||
1221 | CS_VERBOSE = 2, | ||
1222 | CS_WAIT_COMPLETE = 4, | ||
1223 | CS_SERIALIZE = 8, | ||
1224 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
1225 | }; | ||
1226 | |||
1227 | enum dds_flags { | 1058 | enum dds_flags { |
1228 | DDSF_FORCED = 1, | 1059 | DDSF_FORCED = 1, |
1229 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ | 1060 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ |
1230 | }; | 1061 | }; |
1231 | 1062 | ||
1232 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | 1063 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); |
1233 | extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, | ||
1234 | enum chg_state_flags f, | ||
1235 | union drbd_state mask, | ||
1236 | union drbd_state val); | ||
1237 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
1238 | union drbd_state); | ||
1239 | extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, | ||
1240 | union drbd_state, | ||
1241 | union drbd_state, | ||
1242 | enum chg_state_flags); | ||
1243 | extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
1244 | enum chg_state_flags, | ||
1245 | struct completion *done); | ||
1246 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
1247 | union drbd_state, int); | ||
1248 | extern int drbd_thread_start(struct drbd_thread *thi); | 1064 | extern int drbd_thread_start(struct drbd_thread *thi); |
1249 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); | 1065 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); |
1066 | extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task); | ||
1250 | #ifdef CONFIG_SMP | 1067 | #ifdef CONFIG_SMP |
1251 | extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); | 1068 | extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); |
1252 | extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); | 1069 | extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn); |
1253 | #else | 1070 | #else |
1254 | #define drbd_thread_current_set_cpu(A) ({}) | 1071 | #define drbd_thread_current_set_cpu(A) ({}) |
1255 | #define drbd_calc_cpu_mask(A) ({}) | 1072 | #define drbd_calc_cpu_mask(A) ({}) |
1256 | #endif | 1073 | #endif |
1257 | extern void drbd_free_resources(struct drbd_conf *mdev); | 1074 | extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr, |
1258 | extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
1259 | unsigned int set_size); | 1075 | unsigned int set_size); |
1260 | extern void tl_clear(struct drbd_conf *mdev); | 1076 | extern void tl_clear(struct drbd_tconn *); |
1261 | extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); | 1077 | extern void drbd_free_sock(struct drbd_tconn *tconn); |
1262 | extern void drbd_free_sock(struct drbd_conf *mdev); | 1078 | extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock, |
1263 | extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, | 1079 | void *buf, size_t size, unsigned msg_flags); |
1264 | void *buf, size_t size, unsigned msg_flags); | 1080 | extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t, |
1265 | extern int drbd_send_protocol(struct drbd_conf *mdev); | 1081 | unsigned); |
1082 | |||
1083 | extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd); | ||
1084 | extern int drbd_send_protocol(struct drbd_tconn *tconn); | ||
1266 | extern int drbd_send_uuids(struct drbd_conf *mdev); | 1085 | extern int drbd_send_uuids(struct drbd_conf *mdev); |
1267 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | 1086 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); |
1268 | extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); | 1087 | extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); |
1269 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); | 1088 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); |
1270 | extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); | 1089 | extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); |
1271 | extern int drbd_send_current_state(struct drbd_conf *mdev); | 1090 | extern int drbd_send_current_state(struct drbd_conf *mdev); |
1272 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 1091 | extern int drbd_send_sync_param(struct drbd_conf *mdev); |
1273 | enum drbd_packets cmd, struct p_header80 *h, | 1092 | extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, |
1274 | size_t size, unsigned msg_flags); | 1093 | u32 set_size); |
1275 | #define USE_DATA_SOCKET 1 | 1094 | extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet, |
1276 | #define USE_META_SOCKET 0 | 1095 | struct drbd_peer_request *); |
1277 | extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | 1096 | extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, |
1278 | enum drbd_packets cmd, struct p_header80 *h, | 1097 | struct p_block_req *rp); |
1279 | size_t size); | 1098 | extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, |
1280 | extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, | 1099 | struct p_data *dp, int data_size); |
1281 | char *data, size_t size); | 1100 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, |
1282 | extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); | ||
1283 | extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, | ||
1284 | u32 set_size); | ||
1285 | extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1286 | struct drbd_epoch_entry *e); | ||
1287 | extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1288 | struct p_block_req *rp); | ||
1289 | extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1290 | struct p_data *dp, int data_size); | ||
1291 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1292 | sector_t sector, int blksize, u64 block_id); | 1101 | sector_t sector, int blksize, u64 block_id); |
1293 | extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req); | 1102 | extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *); |
1294 | extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | 1103 | extern int drbd_send_block(struct drbd_conf *, enum drbd_packet, |
1295 | struct drbd_epoch_entry *e); | 1104 | struct drbd_peer_request *); |
1296 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); | 1105 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); |
1297 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | 1106 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, |
1298 | sector_t sector, int size, u64 block_id); | 1107 | sector_t sector, int size, u64 block_id); |
1299 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, | 1108 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, |
1300 | sector_t sector,int size, | 1109 | int size, void *digest, int digest_size, |
1301 | void *digest, int digest_size, | 1110 | enum drbd_packet cmd); |
1302 | enum drbd_packets cmd); | ||
1303 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); | 1111 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); |
1304 | 1112 | ||
1305 | extern int drbd_send_bitmap(struct drbd_conf *mdev); | 1113 | extern int drbd_send_bitmap(struct drbd_conf *mdev); |
1306 | extern int _drbd_send_bitmap(struct drbd_conf *mdev); | 1114 | extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); |
1307 | extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); | 1115 | extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode); |
1308 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); | 1116 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); |
1309 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | 1117 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); |
1310 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); | 1118 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); |
1311 | 1119 | ||
1120 | extern void conn_md_sync(struct drbd_tconn *tconn); | ||
1312 | extern void drbd_md_sync(struct drbd_conf *mdev); | 1121 | extern void drbd_md_sync(struct drbd_conf *mdev); |
1313 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | 1122 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); |
1314 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1123 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
@@ -1334,33 +1143,52 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
1334 | extern int drbd_bitmap_io(struct drbd_conf *mdev, | 1143 | extern int drbd_bitmap_io(struct drbd_conf *mdev, |
1335 | int (*io_fn)(struct drbd_conf *), | 1144 | int (*io_fn)(struct drbd_conf *), |
1336 | char *why, enum bm_flag flags); | 1145 | char *why, enum bm_flag flags); |
1146 | extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1147 | int (*io_fn)(struct drbd_conf *), | ||
1148 | char *why, enum bm_flag flags); | ||
1337 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | 1149 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); |
1338 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | 1150 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); |
1339 | extern void drbd_go_diskless(struct drbd_conf *mdev); | 1151 | extern void drbd_go_diskless(struct drbd_conf *mdev); |
1340 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); | 1152 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); |
1341 | 1153 | ||
1342 | |||
1343 | /* Meta data layout | 1154 | /* Meta data layout |
1344 | We reserve a 128MB Block (4k aligned) | 1155 | We reserve a 128MB Block (4k aligned) |
1345 | * either at the end of the backing device | 1156 | * either at the end of the backing device |
1346 | * or on a separate meta data device. */ | 1157 | * or on a separate meta data device. */ |
1347 | 1158 | ||
1348 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | ||
1349 | /* The following numbers are sectors */ | 1159 | /* The following numbers are sectors */ |
1350 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | 1160 | /* Allows up to about 3.8TB, so if you want more, |
1351 | #define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ | 1161 | * you need to use the "flexible" meta data format. */ |
1352 | /* Allows up to about 3.8TB */ | 1162 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ |
1353 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) | 1163 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ |
1354 | 1164 | #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ | |
1355 | /* Since the smallest IO unit is usually 512 bytes */ | 1165 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
1356 | #define MD_SECTOR_SHIFT 9 | 1166 | |
1357 | #define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) | 1167 | /* we do all meta data IO in 4k blocks */ |
1358 | 1168 | #define MD_BLOCK_SHIFT 12 | |
1359 | /* activity log */ | 1169 | #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) |
1360 | #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ | 1170 | |
1361 | #define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ | 1171 | /* One activity log extent represents 4M of storage */ |
1172 | #define AL_EXTENT_SHIFT 22 | ||
1362 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) | 1173 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) |
1363 | 1174 | ||
1175 | /* We could make these currently hardcoded constants configurable | ||
1176 | * variables at create-md time (or even re-configurable at runtime?). | ||
1177 | * Which will require some more changes to the DRBD "super block" | ||
1178 | * and attach code. | ||
1179 | * | ||
1180 | * updates per transaction: | ||
1181 | * This many changes to the active set can be logged with one transaction. | ||
1182 | * This number is arbitrary. | ||
1183 | * context per transaction: | ||
1184 | * This many context extent numbers are logged with each transaction. | ||
1185 | * This number is resulting from the transaction block size (4k), the layout | ||
1186 | * of the transaction header, and the number of updates per transaction. | ||
1187 | * See drbd_actlog.c:struct al_transaction_on_disk | ||
1188 | * */ | ||
1189 | #define AL_UPDATES_PER_TRANSACTION 64 // arbitrary | ||
1190 | #define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4 | ||
1191 | |||
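The 919 above follows directly from the 4 KiB transaction block: assuming a 36 byte header and 64 update slots of 6 bytes each (a 2 byte slot number plus a 4 byte extent number, matching the formula in the comment), the remainder holds 4 byte context extent numbers. A throwaway check:

#include <assert.h>

int main(void)
{
	enum { BLOCK = 4096, HEADER = 36, UPDATES = 64, BYTES_PER_UPDATE = 2 + 4 };

	/* (4096 - 36 - 384) / 4 == 919 context slots per transaction */
	assert((BLOCK - HEADER - UPDATES * BYTES_PER_UPDATE) / 4 == 919);
	return 0;
}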
1364 | #if BITS_PER_LONG == 32 | 1192 | #if BITS_PER_LONG == 32 |
1365 | #define LN2_BPL 5 | 1193 | #define LN2_BPL 5 |
1366 | #define cpu_to_lel(A) cpu_to_le32(A) | 1194 | #define cpu_to_lel(A) cpu_to_le32(A) |
@@ -1396,11 +1224,14 @@ struct bm_extent { | |||
1396 | 1224 | ||
1397 | #define SLEEP_TIME (HZ/10) | 1225 | #define SLEEP_TIME (HZ/10) |
1398 | 1226 | ||
1399 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | 1227 | /* We do bitmap IO in units of 4k blocks. |
1228 | * We also still have a hardcoded 4k per bit relation. */ | ||
1229 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | ||
1400 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) | 1230 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) |
1401 | /* (9+3) : 512 bytes @ 8 bits; representing 16M storage | 1231 | /* mostly arbitrarily set the represented size of one bitmap extent, |
1402 | * per sector of on disk bitmap */ | 1232 | * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap |
1403 | #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ | 1233 | * at 4k per bit resolution) */ |
1234 | #define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */ | ||
1404 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) | 1235 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) |
1405 | 1236 | ||
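As a quick consistency check of the two shifts above: with 4 KiB per bit, one 16 MiB resync extent spans 2^(24-12) = 4096 bits, i.e. exactly 512 bytes of bitmap, which is the same relation the old "512 bytes @ 8 bits" comment expressed.

#include <stdio.h>

int main(void)
{
	unsigned long bm_block_size = 1UL << 12;	/* BM_BLOCK_SIZE, 4 KiB per bit */
	unsigned long bm_ext_size   = 1UL << 24;	/* BM_EXT_SIZE, 16 MiB per extent */
	unsigned long bits_per_ext  = bm_ext_size / bm_block_size;

	printf("%lu bits per extent = %lu bytes of bitmap\n",
	       bits_per_ext, bits_per_ext / 8);	/* 4096 bits = 512 bytes */
	return 0;
}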
1406 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) | 1237 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) |
@@ -1468,17 +1299,20 @@ struct bm_extent { | |||
1468 | #endif | 1299 | #endif |
1469 | #endif | 1300 | #endif |
1470 | 1301 | ||
1471 | /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. | 1302 | /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, |
1472 | * With a value of 8 all IO in one 128K block make it to the same slot of the | 1303 | * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. |
1473 | * hash table. */ | 1304 | * Since we may live in a mixed-platform cluster, |
1474 | #define HT_SHIFT 8 | 1305 | * we limit us to a platform agnostic constant here for now. |
1475 | #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) | 1306 | * A followup commit may allow even bigger BIO sizes, |
1307 | * once we thought that through. */ | ||
1308 | #define DRBD_MAX_BIO_SIZE (1U << 20) | ||
1309 | #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE | ||
1310 | #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE | ||
1311 | #endif | ||
1476 | #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ | 1312 | #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ |
1477 | 1313 | ||
1478 | #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ | 1314 | #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ |
1479 | 1315 | #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ | |
1480 | /* Number of elements in the app_reads_hash */ | ||
1481 | #define APP_R_HSIZE 15 | ||
1482 | 1316 | ||
1483 | extern int drbd_bm_init(struct drbd_conf *mdev); | 1317 | extern int drbd_bm_init(struct drbd_conf *mdev); |
1484 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); | 1318 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); |
@@ -1500,11 +1334,11 @@ extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); | |||
1500 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | 1334 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); |
1501 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); | 1335 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); |
1502 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | 1336 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); |
1337 | extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr); | ||
1503 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | 1338 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); |
1339 | extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local); | ||
1504 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); | 1340 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); |
1505 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); | 1341 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); |
1506 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | ||
1507 | unsigned long al_enr); | ||
1508 | extern size_t drbd_bm_words(struct drbd_conf *mdev); | 1342 | extern size_t drbd_bm_words(struct drbd_conf *mdev); |
1509 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); | 1343 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); |
1510 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); | 1344 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); |
@@ -1529,7 +1363,7 @@ extern void drbd_bm_unlock(struct drbd_conf *mdev); | |||
1529 | /* drbd_main.c */ | 1363 | /* drbd_main.c */ |
1530 | 1364 | ||
1531 | extern struct kmem_cache *drbd_request_cache; | 1365 | extern struct kmem_cache *drbd_request_cache; |
1532 | extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ | 1366 | extern struct kmem_cache *drbd_ee_cache; /* peer requests */ |
1533 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | 1367 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ |
1534 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | 1368 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ |
1535 | extern mempool_t *drbd_request_mempool; | 1369 | extern mempool_t *drbd_request_mempool; |
@@ -1569,12 +1403,22 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); | |||
1569 | 1403 | ||
1570 | extern rwlock_t global_state_lock; | 1404 | extern rwlock_t global_state_lock; |
1571 | 1405 | ||
1572 | extern struct drbd_conf *drbd_new_device(unsigned int minor); | 1406 | extern int conn_lowest_minor(struct drbd_tconn *tconn); |
1573 | extern void drbd_free_mdev(struct drbd_conf *mdev); | 1407 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr); |
1408 | extern void drbd_minor_destroy(struct kref *kref); | ||
1409 | |||
1410 | extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts); | ||
1411 | extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts); | ||
1412 | extern void conn_destroy(struct kref *kref); | ||
1413 | struct drbd_tconn *conn_get_by_name(const char *name); | ||
1414 | extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, | ||
1415 | void *peer_addr, int peer_addr_len); | ||
1416 | extern void conn_free_crypto(struct drbd_tconn *tconn); | ||
1574 | 1417 | ||
1575 | extern int proc_details; | 1418 | extern int proc_details; |
1576 | 1419 | ||
1577 | /* drbd_req */ | 1420 | /* drbd_req */ |
1421 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); | ||
1578 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); | 1422 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); |
1579 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | 1423 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); |
1580 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); | 1424 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); |
@@ -1582,10 +1426,11 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); | |||
1582 | 1426 | ||
1583 | 1427 | ||
1584 | /* drbd_nl.c */ | 1428 | /* drbd_nl.c */ |
1429 | extern int drbd_msg_put_info(const char *info); | ||
1585 | extern void drbd_suspend_io(struct drbd_conf *mdev); | 1430 | extern void drbd_suspend_io(struct drbd_conf *mdev); |
1586 | extern void drbd_resume_io(struct drbd_conf *mdev); | 1431 | extern void drbd_resume_io(struct drbd_conf *mdev); |
1587 | extern char *ppsize(char *buf, unsigned long long size); | 1432 | extern char *ppsize(char *buf, unsigned long long size); |
1588 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); | 1433 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); |
1589 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | 1434 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; |
1590 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); | 1435 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); |
1591 | extern void resync_after_online_grow(struct drbd_conf *); | 1436 | extern void resync_after_online_grow(struct drbd_conf *); |
@@ -1593,13 +1438,14 @@ extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); | |||
1593 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, | 1438 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, |
1594 | enum drbd_role new_role, | 1439 | enum drbd_role new_role, |
1595 | int force); | 1440 | int force); |
1596 | extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); | 1441 | extern bool conn_try_outdate_peer(struct drbd_tconn *tconn); |
1597 | extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); | 1442 | extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn); |
1598 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); | 1443 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); |
1599 | 1444 | ||
1600 | /* drbd_worker.c */ | 1445 | /* drbd_worker.c */ |
1601 | extern int drbd_worker(struct drbd_thread *thi); | 1446 | extern int drbd_worker(struct drbd_thread *thi); |
1602 | extern int drbd_alter_sa(struct drbd_conf *mdev, int na); | 1447 | enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor); |
1448 | void drbd_resync_after_changed(struct drbd_conf *mdev); | ||
1603 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); | 1449 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); |
1604 | extern void resume_next_sg(struct drbd_conf *mdev); | 1450 | extern void resume_next_sg(struct drbd_conf *mdev); |
1605 | extern void suspend_other_sg(struct drbd_conf *mdev); | 1451 | extern void suspend_other_sg(struct drbd_conf *mdev); |
@@ -1608,13 +1454,13 @@ extern int drbd_resync_finished(struct drbd_conf *mdev); | |||
1608 | extern void *drbd_md_get_buffer(struct drbd_conf *mdev); | 1454 | extern void *drbd_md_get_buffer(struct drbd_conf *mdev); |
1609 | extern void drbd_md_put_buffer(struct drbd_conf *mdev); | 1455 | extern void drbd_md_put_buffer(struct drbd_conf *mdev); |
1610 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, | 1456 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, |
1611 | struct drbd_backing_dev *bdev, sector_t sector, int rw); | 1457 | struct drbd_backing_dev *bdev, sector_t sector, int rw); |
1458 | extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int); | ||
1612 | extern void wait_until_done_or_force_detached(struct drbd_conf *mdev, | 1459 | extern void wait_until_done_or_force_detached(struct drbd_conf *mdev, |
1613 | struct drbd_backing_dev *bdev, unsigned int *done); | 1460 | struct drbd_backing_dev *bdev, unsigned int *done); |
1614 | extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); | ||
1615 | extern void drbd_rs_controller_reset(struct drbd_conf *mdev); | 1461 | extern void drbd_rs_controller_reset(struct drbd_conf *mdev); |
1616 | 1462 | ||
1617 | static inline void ov_oos_print(struct drbd_conf *mdev) | 1463 | static inline void ov_out_of_sync_print(struct drbd_conf *mdev) |
1618 | { | 1464 | { |
1619 | if (mdev->ov_last_oos_size) { | 1465 | if (mdev->ov_last_oos_size) { |
1620 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", | 1466 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", |
@@ -1626,97 +1472,102 @@ static inline void ov_oos_print(struct drbd_conf *mdev) | |||
1626 | 1472 | ||
1627 | 1473 | ||
1628 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); | 1474 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); |
1629 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); | 1475 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, |
1476 | struct drbd_peer_request *, void *); | ||
1630 | /* worker callbacks */ | 1477 | /* worker callbacks */ |
1631 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); | 1478 | extern int w_e_end_data_req(struct drbd_work *, int); |
1632 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); | 1479 | extern int w_e_end_rsdata_req(struct drbd_work *, int); |
1633 | extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); | 1480 | extern int w_e_end_csum_rs_req(struct drbd_work *, int); |
1634 | extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); | 1481 | extern int w_e_end_ov_reply(struct drbd_work *, int); |
1635 | extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); | 1482 | extern int w_e_end_ov_req(struct drbd_work *, int); |
1636 | extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); | 1483 | extern int w_ov_finished(struct drbd_work *, int); |
1637 | extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); | 1484 | extern int w_resync_timer(struct drbd_work *, int); |
1638 | extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); | 1485 | extern int w_send_write_hint(struct drbd_work *, int); |
1639 | extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int); | 1486 | extern int w_make_resync_request(struct drbd_work *, int); |
1640 | extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); | 1487 | extern int w_send_dblock(struct drbd_work *, int); |
1641 | extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); | 1488 | extern int w_send_read_req(struct drbd_work *, int); |
1642 | extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); | 1489 | extern int w_prev_work_done(struct drbd_work *, int); |
1643 | extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); | 1490 | extern int w_e_reissue(struct drbd_work *, int); |
1644 | extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); | 1491 | extern int w_restart_disk_io(struct drbd_work *, int); |
1645 | extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); | 1492 | extern int w_send_out_of_sync(struct drbd_work *, int); |
1646 | extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); | 1493 | extern int w_start_resync(struct drbd_work *, int); |
1647 | extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); | ||
1648 | extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int); | ||
1649 | extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int); | ||
1650 | 1494 | ||
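The worker callbacks above no longer take an explicit mdev argument; they recover their context from the embedded work item. A hedged sketch of that convention (the container_of() target and its field names are assumptions, not taken from this hunk):

static int demo_work_cb(struct drbd_work *w, int cancel)
{
	/* assumed layout: the work item is embedded as member 'w'
	 * in a struct drbd_request */
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	if (cancel) {
		/* ... clean up without touching the network ... */
		return 0;
	}
	/* ... process req ... */
	return 0;
}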
1651 | extern void resync_timer_fn(unsigned long data); | 1495 | extern void resync_timer_fn(unsigned long data); |
1652 | extern void start_resync_timer_fn(unsigned long data); | 1496 | extern void start_resync_timer_fn(unsigned long data); |
1653 | 1497 | ||
1654 | /* drbd_receiver.c */ | 1498 | /* drbd_receiver.c */ |
1655 | extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); | 1499 | extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); |
1656 | extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1500 | extern int drbd_submit_peer_request(struct drbd_conf *, |
1657 | const unsigned rw, const int fault_type); | 1501 | struct drbd_peer_request *, const unsigned, |
1658 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | 1502 | const int); |
1659 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 1503 | extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *); |
1660 | u64 id, | 1504 | extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64, |
1661 | sector_t sector, | 1505 | sector_t, unsigned int, |
1662 | unsigned int data_size, | 1506 | gfp_t) __must_hold(local); |
1663 | gfp_t gfp_mask) __must_hold(local); | 1507 | extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *, |
1664 | extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1508 | int); |
1665 | int is_net); | 1509 | #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) |
1666 | #define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) | 1510 | #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) |
1667 | #define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) | 1511 | extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool); |
1668 | extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1669 | struct list_head *head); | ||
1670 | extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1671 | struct list_head *head); | ||
1672 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); | 1512 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); |
1673 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); | 1513 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); |
1674 | extern void drbd_flush_workqueue(struct drbd_conf *mdev); | 1514 | extern void conn_flush_workqueue(struct drbd_tconn *tconn); |
1675 | extern void drbd_free_tl_hash(struct drbd_conf *mdev); | 1515 | extern int drbd_connected(struct drbd_conf *mdev); |
1516 | static inline void drbd_flush_workqueue(struct drbd_conf *mdev) | ||
1517 | { | ||
1518 | conn_flush_workqueue(mdev->tconn); | ||
1519 | } | ||
1676 | 1520 | ||
1677 | /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to | 1521 | /* Yes, there is kernel_setsockopt, but only since 2.6.18. |
1678 | * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ | 1522 | * So we have our own copy of it here. */ |
1679 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, | 1523 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, |
1680 | char __user *optval, int optlen) | 1524 | char *optval, int optlen) |
1681 | { | 1525 | { |
1526 | mm_segment_t oldfs = get_fs(); | ||
1527 | char __user *uoptval; | ||
1682 | int err; | 1528 | int err; |
1529 | |||
1530 | uoptval = (char __user __force *)optval; | ||
1531 | |||
1532 | set_fs(KERNEL_DS); | ||
1683 | if (level == SOL_SOCKET) | 1533 | if (level == SOL_SOCKET) |
1684 | err = sock_setsockopt(sock, level, optname, optval, optlen); | 1534 | err = sock_setsockopt(sock, level, optname, uoptval, optlen); |
1685 | else | 1535 | else |
1686 | err = sock->ops->setsockopt(sock, level, optname, optval, | 1536 | err = sock->ops->setsockopt(sock, level, optname, uoptval, |
1687 | optlen); | 1537 | optlen); |
1538 | set_fs(oldfs); | ||
1688 | return err; | 1539 | return err; |
1689 | } | 1540 | } |
1690 | 1541 | ||
1691 | static inline void drbd_tcp_cork(struct socket *sock) | 1542 | static inline void drbd_tcp_cork(struct socket *sock) |
1692 | { | 1543 | { |
1693 | int __user val = 1; | 1544 | int val = 1; |
1694 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | 1545 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, |
1695 | (char __user *)&val, sizeof(val)); | 1546 | (char*)&val, sizeof(val)); |
1696 | } | 1547 | } |
1697 | 1548 | ||
1698 | static inline void drbd_tcp_uncork(struct socket *sock) | 1549 | static inline void drbd_tcp_uncork(struct socket *sock) |
1699 | { | 1550 | { |
1700 | int __user val = 0; | 1551 | int val = 0; |
1701 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | 1552 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, |
1702 | (char __user *)&val, sizeof(val)); | 1553 | (char*)&val, sizeof(val)); |
1703 | } | 1554 | } |
1704 | 1555 | ||
1705 | static inline void drbd_tcp_nodelay(struct socket *sock) | 1556 | static inline void drbd_tcp_nodelay(struct socket *sock) |
1706 | { | 1557 | { |
1707 | int __user val = 1; | 1558 | int val = 1; |
1708 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, | 1559 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, |
1709 | (char __user *)&val, sizeof(val)); | 1560 | (char*)&val, sizeof(val)); |
1710 | } | 1561 | } |
1711 | 1562 | ||
1712 | static inline void drbd_tcp_quickack(struct socket *sock) | 1563 | static inline void drbd_tcp_quickack(struct socket *sock) |
1713 | { | 1564 | { |
1714 | int __user val = 2; | 1565 | int val = 2; |
1715 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, | 1566 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, |
1716 | (char __user *)&val, sizeof(val)); | 1567 | (char*)&val, sizeof(val)); |
1717 | } | 1568 | } |
1718 | 1569 | ||
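A typical usage sketch for the cork helpers (not taken from this patch): corking lets several small packets accumulate so TCP can push them out as one segment.

static void demo_send_batched(struct socket *sock)
{
	drbd_tcp_cork(sock);
	/* ... send several small control packets, e.g. via drbd_send() ... */
	drbd_tcp_uncork(sock);	/* let TCP push whatever accumulated */
}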
1719 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); | 1570 | void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo); |
1720 | 1571 | ||
1721 | /* drbd_proc.c */ | 1572 | /* drbd_proc.c */ |
1722 | extern struct proc_dir_entry *drbd_proc; | 1573 | extern struct proc_dir_entry *drbd_proc; |
@@ -1725,8 +1576,8 @@ extern const char *drbd_conn_str(enum drbd_conns s); | |||
1725 | extern const char *drbd_role_str(enum drbd_role s); | 1576 | extern const char *drbd_role_str(enum drbd_role s); |
1726 | 1577 | ||
1727 | /* drbd_actlog.c */ | 1578 | /* drbd_actlog.c */ |
1728 | extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); | 1579 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1729 | extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); | 1580 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1730 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | 1581 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); |
1731 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1582 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
1732 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1583 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
@@ -1734,7 +1585,6 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev); | |||
1734 | extern int drbd_rs_del_all(struct drbd_conf *mdev); | 1585 | extern int drbd_rs_del_all(struct drbd_conf *mdev); |
1735 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, | 1586 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, |
1736 | sector_t sector, int size); | 1587 | sector_t sector, int size); |
1737 | extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); | ||
1738 | extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); | 1588 | extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); |
1739 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, | 1589 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, |
1740 | int size, const char *file, const unsigned int line); | 1590 | int size, const char *file, const unsigned int line); |
@@ -1744,73 +1594,24 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | |||
1744 | int size, const char *file, const unsigned int line); | 1594 | int size, const char *file, const unsigned int line); |
1745 | #define drbd_set_out_of_sync(mdev, sector, size) \ | 1595 | #define drbd_set_out_of_sync(mdev, sector, size) \ |
1746 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | 1596 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) |
1747 | extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); | ||
1748 | extern void drbd_al_shrink(struct drbd_conf *mdev); | 1597 | extern void drbd_al_shrink(struct drbd_conf *mdev); |
1749 | 1598 | ||
1750 | |||
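With the interval-based signatures above, the activity log is driven per request rather than per sector. A minimal sketch, assuming struct drbd_request carries its interval in a member named i (not shown in this hunk):

static void demo_local_write(struct drbd_conf *mdev, struct drbd_request *req)
{
	drbd_al_begin_io(mdev, &req->i);	/* may block until the AL extent is active */
	/* ... submit the local bio for this request ... */
	drbd_al_complete_io(mdev, &req->i);	/* normally called from the completion path */
}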
1751 | /* drbd_nl.c */ | 1599 | /* drbd_nl.c */ |
1752 | 1600 | /* state info broadcast */ | |
1753 | void drbd_nl_cleanup(void); | 1601 | struct sib_info { |
1754 | int __init drbd_nl_init(void); | 1602 | enum drbd_state_info_bcast_reason sib_reason; |
1755 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); | 1603 | union { |
1756 | void drbd_bcast_sync_progress(struct drbd_conf *mdev); | 1604 | struct { |
1757 | void drbd_bcast_ee(struct drbd_conf *mdev, | 1605 | char *helper_name; |
1758 | const char *reason, const int dgs, | 1606 | unsigned helper_exit_code; |
1759 | const char* seen_hash, const char* calc_hash, | 1607 | }; |
1760 | const struct drbd_epoch_entry* e); | 1608 | struct { |
1761 | 1609 | union drbd_state os; | |
1762 | 1610 | union drbd_state ns; | |
1763 | /** | 1611 | }; |
1764 | * DOC: DRBD State macros | 1612 | }; |
1765 | * | 1613 | }; |
1766 | * These macros are used to express state changes in easily readable form. | 1614 | void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib); |
1767 | * | ||
1768 | * The NS macros expand to a mask and a value, that can be bit ored onto the | ||
1769 | * current state as soon as the spinlock (req_lock) was taken. | ||
1770 | * | ||
1771 | * The _NS macros are used for state functions that get called with the | ||
1772 | * spinlock. These macros expand directly to the new state value. | ||
1773 | * | ||
1774 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
1775 | * to express state changes that affect more than one aspect of the state. | ||
1776 | * | ||
1777 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
1778 | * Means that the network connection was established and that the peer | ||
1779 | * is in secondary role. | ||
1780 | */ | ||
1781 | #define role_MASK R_MASK | ||
1782 | #define peer_MASK R_MASK | ||
1783 | #define disk_MASK D_MASK | ||
1784 | #define pdsk_MASK D_MASK | ||
1785 | #define conn_MASK C_MASK | ||
1786 | #define susp_MASK 1 | ||
1787 | #define user_isp_MASK 1 | ||
1788 | #define aftr_isp_MASK 1 | ||
1789 | #define susp_nod_MASK 1 | ||
1790 | #define susp_fen_MASK 1 | ||
1791 | |||
1792 | #define NS(T, S) \ | ||
1793 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
1794 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
1795 | #define NS2(T1, S1, T2, S2) \ | ||
1796 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1797 | mask.T2 = T2##_MASK; mask; }), \ | ||
1798 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1799 | val.T2 = (S2); val; }) | ||
1800 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
1801 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1802 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
1803 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1804 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
1805 | |||
1806 | #define _NS(D, T, S) \ | ||
1807 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) | ||
1808 | #define _NS2(D, T1, S1, T2, S2) \ | ||
1809 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1810 | __ns.T2 = (S2); __ns; }) | ||
1811 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
1812 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1813 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
1814 | 1615 | ||
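The state macros removed from this header built a (mask, value) pair over the packed union drbd_state, so a caller could change selected bitfields while leaving the rest of the state word alone. A minimal userspace sketch of that mask/value technique (field names and widths are illustrative, not DRBD's actual layout):

#include <stdio.h>

/* illustrative packed state word -- DRBD's real union has many more fields */
union state {
	struct {
		unsigned role:2;
		unsigned peer:2;
		unsigned conn:5;
	};
	unsigned int i;
};

/* apply a (mask, val) pair: clear the masked bits, then OR in the new value */
static union state apply_state_change(union state os, union state mask, union state val)
{
	union state ns;
	ns.i = (os.i & ~mask.i) | val.i;
	return ns;
}

int main(void)
{
	union state os, mask, val, ns;

	os.i = 0;
	os.role = 1; os.peer = 2; os.conn = 3;

	mask.i = 0; mask.conn = 0x1f;	/* conn_MASK analogue */
	val.i = 0;  val.conn = 10;	/* NS(conn, <new value>) analogue */

	ns = apply_state_change(os, mask, val);
	printf("conn: %d -> %d, role untouched: %d\n", os.conn, ns.conn, ns.role);
	return 0;
}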
1815 | /* | 1616 | /* |
1816 | * inline helper functions | 1617 | * inline helper functions |
@@ -1827,9 +1628,10 @@ static inline struct page *page_chain_next(struct page *page) | |||
1827 | #define page_chain_for_each_safe(page, n) \ | 1628 | #define page_chain_for_each_safe(page, n) \ |
1828 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) | 1629 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) |
1829 | 1630 | ||
1830 | static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | 1631 | |
1632 | static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) | ||
1831 | { | 1633 | { |
1832 | struct page *page = e->pages; | 1634 | struct page *page = peer_req->pages; |
1833 | page_chain_for_each(page) { | 1635 | page_chain_for_each(page) { |
1834 | if (page_count(page) > 1) | 1636 | if (page_count(page) > 1) |
1835 | return 1; | 1637 | return 1; |
@@ -1837,18 +1639,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | |||
1837 | return 0; | 1639 | return 0; |
1838 | } | 1640 | } |
1839 | 1641 | ||
1840 | static inline void drbd_state_lock(struct drbd_conf *mdev) | ||
1841 | { | ||
1842 | wait_event(mdev->misc_wait, | ||
1843 | !drbd_test_and_set_flag(mdev, CLUSTER_ST_CHANGE)); | ||
1844 | } | ||
1845 | |||
1846 | static inline void drbd_state_unlock(struct drbd_conf *mdev) | ||
1847 | { | ||
1848 | drbd_clear_flag(mdev, CLUSTER_ST_CHANGE); | ||
1849 | wake_up(&mdev->misc_wait); | ||
1850 | } | ||
1851 | |||
1852 | static inline enum drbd_state_rv | 1642 | static inline enum drbd_state_rv |
1853 | _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | 1643 | _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, |
1854 | enum chg_state_flags flags, struct completion *done) | 1644 | enum chg_state_flags flags, struct completion *done) |
@@ -1862,21 +1652,16 @@ _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | |||
1862 | return rv; | 1652 | return rv; |
1863 | } | 1653 | } |
1864 | 1654 | ||
1865 | /** | 1655 | static inline union drbd_state drbd_read_state(struct drbd_conf *mdev) |
1866 | * drbd_request_state() - Request a state change | ||
1867 | * @mdev: DRBD device. | ||
1868 | * @mask: mask of state bits to change. | ||
1869 | * @val: value of new state bits. | ||
1870 | * | ||
1871 | * This is the most graceful way of requesting a state change. It is | ||
1872 | * quite verbose in case the state change is not possible, and all those | ||
1873 | * state changes are globally serialized. | ||
1874 | */ | ||
1875 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
1876 | union drbd_state mask, | ||
1877 | union drbd_state val) | ||
1878 | { | 1656 | { |
1879 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | 1657 | union drbd_state rv; |
1658 | |||
1659 | rv.i = mdev->state.i; | ||
1660 | rv.susp = mdev->tconn->susp; | ||
1661 | rv.susp_nod = mdev->tconn->susp_nod; | ||
1662 | rv.susp_fen = mdev->tconn->susp_fen; | ||
1663 | |||
1664 | return rv; | ||
1880 | } | 1665 | } |
1881 | 1666 | ||
1882 | enum drbd_force_detach_flags { | 1667 | enum drbd_force_detach_flags { |
@@ -1891,8 +1676,13 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1891 | enum drbd_force_detach_flags df, | 1676 | enum drbd_force_detach_flags df, |
1892 | const char *where) | 1677 | const char *where) |
1893 | { | 1678 | { |
1894 | switch (mdev->ldev->dc.on_io_error) { | 1679 | enum drbd_io_error_p ep; |
1895 | case EP_PASS_ON: | 1680 | |
1681 | rcu_read_lock(); | ||
1682 | ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; | ||
1683 | rcu_read_unlock(); | ||
1684 | switch (ep) { | ||
1685 | case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ | ||
1896 | if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { | 1686 | if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { |
1897 | if (__ratelimit(&drbd_ratelimit_state)) | 1687 | if (__ratelimit(&drbd_ratelimit_state)) |
1898 | dev_err(DEV, "Local IO failed in %s.\n", where); | 1688 | dev_err(DEV, "Local IO failed in %s.\n", where); |
@@ -1923,11 +1713,11 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1923 | * we read meta data only once during attach, | 1713 | * we read meta data only once during attach, |
1924 | * which will fail in case of errors. | 1714 | * which will fail in case of errors. |
1925 | */ | 1715 | */ |
1926 | drbd_set_flag(mdev, WAS_IO_ERROR); | 1716 | set_bit(WAS_IO_ERROR, &mdev->flags); |
1927 | if (df == DRBD_READ_ERROR) | 1717 | if (df == DRBD_READ_ERROR) |
1928 | drbd_set_flag(mdev, WAS_READ_ERROR); | 1718 | set_bit(WAS_READ_ERROR, &mdev->flags); |
1929 | if (df == DRBD_FORCE_DETACH) | 1719 | if (df == DRBD_FORCE_DETACH) |
1930 | drbd_set_flag(mdev, FORCE_DETACH); | 1720 | set_bit(FORCE_DETACH, &mdev->flags); |
1931 | if (mdev->state.disk > D_FAILED) { | 1721 | if (mdev->state.disk > D_FAILED) { |
1932 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | 1722 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); |
1933 | dev_err(DEV, | 1723 | dev_err(DEV, |
@@ -1951,9 +1741,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1951 | { | 1741 | { |
1952 | if (error) { | 1742 | if (error) { |
1953 | unsigned long flags; | 1743 | unsigned long flags; |
1954 | spin_lock_irqsave(&mdev->req_lock, flags); | 1744 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
1955 | __drbd_chk_io_error_(mdev, forcedetach, where); | 1745 | __drbd_chk_io_error_(mdev, forcedetach, where); |
1956 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 1746 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
1957 | } | 1747 | } |
1958 | } | 1748 | } |
1959 | 1749 | ||
@@ -1965,9 +1755,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1965 | * BTW, for internal meta data, this happens to be the maximum capacity | 1755 | * BTW, for internal meta data, this happens to be the maximum capacity |
1966 | * we could agree upon with our peer node. | 1756 | * we could agree upon with our peer node. |
1967 | */ | 1757 | */ |
1968 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | 1758 | static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) |
1969 | { | 1759 | { |
1970 | switch (bdev->dc.meta_dev_idx) { | 1760 | switch (meta_dev_idx) { |
1971 | case DRBD_MD_INDEX_INTERNAL: | 1761 | case DRBD_MD_INDEX_INTERNAL: |
1972 | case DRBD_MD_INDEX_FLEX_INT: | 1762 | case DRBD_MD_INDEX_FLEX_INT: |
1973 | return bdev->md.md_offset + bdev->md.bm_offset; | 1763 | return bdev->md.md_offset + bdev->md.bm_offset; |
@@ -1977,13 +1767,30 @@ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | |||
1977 | } | 1767 | } |
1978 | } | 1768 | } |
1979 | 1769 | ||
1770 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1771 | { | ||
1772 | int meta_dev_idx; | ||
1773 | |||
1774 | rcu_read_lock(); | ||
1775 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1776 | rcu_read_unlock(); | ||
1777 | |||
1778 | return _drbd_md_first_sector(meta_dev_idx, bdev); | ||
1779 | } | ||
1780 | |||
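Throughout this hunk and the following ones, the old bdev->dc.* accesses become RCU reads of bdev->disk_conf: take rcu_read_lock(), snapshot the needed field through rcu_dereference(), and drop the lock before using the value. A sketch of the reader/updater pairing behind that, using assumed type names (my_dev/my_conf are illustrative stand-ins, not DRBD's structures):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_conf {
	int meta_dev_idx;
};

struct my_dev {
	struct my_conf __rcu *conf;
};

static int read_meta_dev_idx(struct my_dev *dev)
{
	int idx;

	rcu_read_lock();			/* readers never block the updater */
	idx = rcu_dereference(dev->conf)->meta_dev_idx;
	rcu_read_unlock();

	return idx;				/* use the snapshot after the unlock */
}

static void replace_conf(struct my_dev *dev, struct my_conf *new_conf)
{
	struct my_conf *old_conf = rcu_dereference_protected(dev->conf, 1);

	rcu_assign_pointer(dev->conf, new_conf);	/* publish the new config */
	synchronize_rcu();				/* wait for readers still using old_conf */
	kfree(old_conf);
}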
1980 | /** | 1781 | /** |
1981 | * drbd_md_last_sector() - Return the last sector number of the meta data area | 1782 | * drbd_md_last_sector() - Return the last sector number of the meta data area |
1982 | * @bdev: Meta data block device. | 1783 | * @bdev: Meta data block device. |
1983 | */ | 1784 | */ |
1984 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | 1785 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) |
1985 | { | 1786 | { |
1986 | switch (bdev->dc.meta_dev_idx) { | 1787 | int meta_dev_idx; |
1788 | |||
1789 | rcu_read_lock(); | ||
1790 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1791 | rcu_read_unlock(); | ||
1792 | |||
1793 | switch (meta_dev_idx) { | ||
1987 | case DRBD_MD_INDEX_INTERNAL: | 1794 | case DRBD_MD_INDEX_INTERNAL: |
1988 | case DRBD_MD_INDEX_FLEX_INT: | 1795 | case DRBD_MD_INDEX_FLEX_INT: |
1989 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | 1796 | return bdev->md.md_offset + MD_AL_OFFSET - 1; |
@@ -2011,12 +1818,18 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) | |||
2011 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | 1818 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) |
2012 | { | 1819 | { |
2013 | sector_t s; | 1820 | sector_t s; |
2014 | switch (bdev->dc.meta_dev_idx) { | 1821 | int meta_dev_idx; |
1822 | |||
1823 | rcu_read_lock(); | ||
1824 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1825 | rcu_read_unlock(); | ||
1826 | |||
1827 | switch (meta_dev_idx) { | ||
2015 | case DRBD_MD_INDEX_INTERNAL: | 1828 | case DRBD_MD_INDEX_INTERNAL: |
2016 | case DRBD_MD_INDEX_FLEX_INT: | 1829 | case DRBD_MD_INDEX_FLEX_INT: |
2017 | s = drbd_get_capacity(bdev->backing_bdev) | 1830 | s = drbd_get_capacity(bdev->backing_bdev) |
2018 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | 1831 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, |
2019 | drbd_md_first_sector(bdev)) | 1832 | _drbd_md_first_sector(meta_dev_idx, bdev)) |
2020 | : 0; | 1833 | : 0; |
2021 | break; | 1834 | break; |
2022 | case DRBD_MD_INDEX_FLEX_EXT: | 1835 | case DRBD_MD_INDEX_FLEX_EXT: |
@@ -2042,9 +1855,15 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | |||
2042 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | 1855 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, |
2043 | struct drbd_backing_dev *bdev) | 1856 | struct drbd_backing_dev *bdev) |
2044 | { | 1857 | { |
2045 | switch (bdev->dc.meta_dev_idx) { | 1858 | int meta_dev_idx; |
1859 | |||
1860 | rcu_read_lock(); | ||
1861 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1862 | rcu_read_unlock(); | ||
1863 | |||
1864 | switch (meta_dev_idx) { | ||
2046 | default: /* external, some index */ | 1865 | default: /* external, some index */ |
2047 | return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; | 1866 | return MD_RESERVED_SECT * meta_dev_idx; |
2048 | case DRBD_MD_INDEX_INTERNAL: | 1867 | case DRBD_MD_INDEX_INTERNAL: |
2049 | /* with drbd08, internal meta data is always "flexible" */ | 1868 | /* with drbd08, internal meta data is always "flexible" */ |
2050 | case DRBD_MD_INDEX_FLEX_INT: | 1869 | case DRBD_MD_INDEX_FLEX_INT: |
@@ -2070,9 +1889,8 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) | |||
2070 | unsigned long flags; | 1889 | unsigned long flags; |
2071 | spin_lock_irqsave(&q->q_lock, flags); | 1890 | spin_lock_irqsave(&q->q_lock, flags); |
2072 | list_add(&w->list, &q->q); | 1891 | list_add(&w->list, &q->q); |
2073 | up(&q->s); /* within the spinlock, | ||
2074 | see comment near end of drbd_worker() */ | ||
2075 | spin_unlock_irqrestore(&q->q_lock, flags); | 1892 | spin_unlock_irqrestore(&q->q_lock, flags); |
1893 | wake_up(&q->q_wait); | ||
2076 | } | 1894 | } |
2077 | 1895 | ||
2078 | static inline void | 1896 | static inline void |
@@ -2081,41 +1899,35 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | |||
2081 | unsigned long flags; | 1899 | unsigned long flags; |
2082 | spin_lock_irqsave(&q->q_lock, flags); | 1900 | spin_lock_irqsave(&q->q_lock, flags); |
2083 | list_add_tail(&w->list, &q->q); | 1901 | list_add_tail(&w->list, &q->q); |
2084 | up(&q->s); /* within the spinlock, | ||
2085 | see comment near end of drbd_worker() */ | ||
2086 | spin_unlock_irqrestore(&q->q_lock, flags); | 1902 | spin_unlock_irqrestore(&q->q_lock, flags); |
1903 | wake_up(&q->q_wait); | ||
2087 | } | 1904 | } |
2088 | 1905 | ||
2089 | static inline void wake_asender(struct drbd_conf *mdev) | 1906 | static inline void wake_asender(struct drbd_tconn *tconn) |
2090 | { | ||
2091 | if (drbd_test_flag(mdev, SIGNAL_ASENDER)) | ||
2092 | force_sig(DRBD_SIG, mdev->asender.task); | ||
2093 | } | ||
2094 | |||
2095 | static inline void request_ping(struct drbd_conf *mdev) | ||
2096 | { | 1907 | { |
2097 | drbd_set_flag(mdev, SEND_PING); | 1908 | if (test_bit(SIGNAL_ASENDER, &tconn->flags)) |
2098 | wake_asender(mdev); | 1909 | force_sig(DRBD_SIG, tconn->asender.task); |
2099 | } | 1910 | } |
2100 | 1911 | ||
2101 | static inline int drbd_send_short_cmd(struct drbd_conf *mdev, | 1912 | static inline void request_ping(struct drbd_tconn *tconn) |
2102 | enum drbd_packets cmd) | ||
2103 | { | 1913 | { |
2104 | struct p_header80 h; | 1914 | set_bit(SEND_PING, &tconn->flags); |
2105 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); | 1915 | wake_asender(tconn); |
2106 | } | 1916 | } |
2107 | 1917 | ||
2108 | static inline int drbd_send_ping(struct drbd_conf *mdev) | 1918 | extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *); |
2109 | { | 1919 | extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *); |
2110 | struct p_header80 h; | 1920 | extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *, |
2111 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); | 1921 | enum drbd_packet, unsigned int, void *, |
2112 | } | 1922 | unsigned int); |
1923 | extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *, | ||
1924 | enum drbd_packet, unsigned int, void *, | ||
1925 | unsigned int); | ||
2113 | 1926 | ||
2114 | static inline int drbd_send_ping_ack(struct drbd_conf *mdev) | 1927 | extern int drbd_send_ping(struct drbd_tconn *tconn); |
2115 | { | 1928 | extern int drbd_send_ping_ack(struct drbd_tconn *tconn); |
2116 | struct p_header80 h; | 1929 | extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); |
2117 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); | 1930 | extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state); |
2118 | } | ||
2119 | 1931 | ||
2120 | static inline void drbd_thread_stop(struct drbd_thread *thi) | 1932 | static inline void drbd_thread_stop(struct drbd_thread *thi) |
2121 | { | 1933 | { |
@@ -2137,21 +1949,21 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) | |||
2137 | * or implicit barrier packets as necessary. | 1949 | * or implicit barrier packets as necessary. |
2138 | * increased: | 1950 | * increased: |
2139 | * w_send_barrier | 1951 | * w_send_barrier |
2140 | * _req_mod(req, queue_for_net_write or queue_for_net_read); | 1952 | * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); |
2141 | * it is much easier and equally valid to count what we queue for the | 1953 | * it is much easier and equally valid to count what we queue for the |
2142 | * worker, even before it actually was queued or sent. | 1954 | * worker, even before it actually was queued or sent. |
2143 | * (drbd_make_request_common; recovery path on read io-error) | 1955 | * (drbd_make_request_common; recovery path on read io-error) |
2144 | * decreased: | 1956 | * decreased: |
2145 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) | 1957 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) |
2146 | * _req_mod(req, data_received) | 1958 | * _req_mod(req, DATA_RECEIVED) |
2147 | * [from receive_DataReply] | 1959 | * [from receive_DataReply] |
2148 | * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) | 1960 | * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) |
2149 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] | 1961 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] |
2150 | * for some reason it is NOT decreased in got_NegAck, | 1962 | * for some reason it is NOT decreased in got_NegAck, |
2151 | * but in the resulting cleanup code from report_params. | 1963 | * but in the resulting cleanup code from report_params. |
2152 | * we should try to remember the reason for that... | 1964 | * we should try to remember the reason for that... |
2153 | * _req_mod(req, send_failed or send_canceled) | 1965 | * _req_mod(req, SEND_FAILED or SEND_CANCELED) |
2154 | * _req_mod(req, connection_lost_while_pending) | 1966 | * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) |
2155 | * [from tl_clear_barrier] | 1967 | * [from tl_clear_barrier] |
2156 | */ | 1968 | */ |
2157 | static inline void inc_ap_pending(struct drbd_conf *mdev) | 1969 | static inline void inc_ap_pending(struct drbd_conf *mdev) |
@@ -2159,17 +1971,19 @@ static inline void inc_ap_pending(struct drbd_conf *mdev) | |||
2159 | atomic_inc(&mdev->ap_pending_cnt); | 1971 | atomic_inc(&mdev->ap_pending_cnt); |
2160 | } | 1972 | } |
2161 | 1973 | ||
2162 | #define ERR_IF_CNT_IS_NEGATIVE(which) \ | 1974 | #define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ |
2163 | if (atomic_read(&mdev->which) < 0) \ | 1975 | if (atomic_read(&mdev->which) < 0) \ |
2164 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ | 1976 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ |
2165 | __func__ , __LINE__ , \ | 1977 | func, line, \ |
2166 | atomic_read(&mdev->which)) | 1978 | atomic_read(&mdev->which)) |
2167 | 1979 | ||
2168 | #define dec_ap_pending(mdev) do { \ | 1980 | #define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__) |
2169 | typecheck(struct drbd_conf *, mdev); \ | 1981 | static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line) |
2170 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ | 1982 | { |
2171 | wake_up(&mdev->misc_wait); \ | 1983 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) |
2172 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) | 1984 | wake_up(&mdev->misc_wait); |
1985 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); | ||
1986 | } | ||
2173 | 1987 | ||
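dec_ap_pending(), dec_rs_pending() and dec_unacked() change from statement macros into thin wrapper macros around static inline functions, with __FUNCTION__/__LINE__ forwarded so the error message still names the call site. A standalone illustration of that conversion pattern (the counter type and names are made up):

#include <stdio.h>

struct counter { int cnt; };

/* the wrapper macro records the call site, the inline does the real work */
#define counter_dec(c) _counter_dec(c, __func__, __LINE__)
static inline void _counter_dec(struct counter *c, const char *func, int line)
{
	if (--c->cnt < 0)
		fprintf(stderr, "in %s:%d: cnt = %d < 0 !\n", func, line, c->cnt);
}

int main(void)
{
	struct counter c = { .cnt = 0 };

	counter_dec(&c);	/* underflow is reported against main() and this line */
	return 0;
}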
2174 | /* counts how many resync-related answers we still expect from the peer | 1988 | /* counts how many resync-related answers we still expect from the peer |
2175 | * increase decrease | 1989 | * increase decrease |
@@ -2182,10 +1996,12 @@ static inline void inc_rs_pending(struct drbd_conf *mdev) | |||
2182 | atomic_inc(&mdev->rs_pending_cnt); | 1996 | atomic_inc(&mdev->rs_pending_cnt); |
2183 | } | 1997 | } |
2184 | 1998 | ||
2185 | #define dec_rs_pending(mdev) do { \ | 1999 | #define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__) |
2186 | typecheck(struct drbd_conf *, mdev); \ | 2000 | static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line) |
2187 | atomic_dec(&mdev->rs_pending_cnt); \ | 2001 | { |
2188 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) | 2002 | atomic_dec(&mdev->rs_pending_cnt); |
2003 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); | ||
2004 | } | ||
2189 | 2005 | ||
2190 | /* counts how many answers we still need to send to the peer. | 2006 | /* counts how many answers we still need to send to the peer. |
2191 | * increased on | 2007 | * increased on |
@@ -2201,38 +2017,18 @@ static inline void inc_unacked(struct drbd_conf *mdev) | |||
2201 | atomic_inc(&mdev->unacked_cnt); | 2017 | atomic_inc(&mdev->unacked_cnt); |
2202 | } | 2018 | } |
2203 | 2019 | ||
2204 | #define dec_unacked(mdev) do { \ | 2020 | #define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__) |
2205 | typecheck(struct drbd_conf *, mdev); \ | 2021 | static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line) |
2206 | atomic_dec(&mdev->unacked_cnt); \ | ||
2207 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
2208 | |||
2209 | #define sub_unacked(mdev, n) do { \ | ||
2210 | typecheck(struct drbd_conf *, mdev); \ | ||
2211 | atomic_sub(n, &mdev->unacked_cnt); \ | ||
2212 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
2213 | |||
2214 | |||
2215 | static inline void put_net_conf(struct drbd_conf *mdev) | ||
2216 | { | 2022 | { |
2217 | if (atomic_dec_and_test(&mdev->net_cnt)) | 2023 | atomic_dec(&mdev->unacked_cnt); |
2218 | wake_up(&mdev->net_cnt_wait); | 2024 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); |
2219 | } | 2025 | } |
2220 | 2026 | ||
2221 | /** | 2027 | #define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__) |
2222 | * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there | 2028 | static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line) |
2223 | * @mdev: DRBD device. | ||
2224 | * | ||
2225 | * You have to call put_net_conf() when finished working with mdev->net_conf. | ||
2226 | */ | ||
2227 | static inline int get_net_conf(struct drbd_conf *mdev) | ||
2228 | { | 2029 | { |
2229 | int have_net_conf; | 2030 | atomic_sub(n, &mdev->unacked_cnt); |
2230 | 2031 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); | |
2231 | atomic_inc(&mdev->net_cnt); | ||
2232 | have_net_conf = mdev->state.conn >= C_UNCONNECTED; | ||
2233 | if (!have_net_conf) | ||
2234 | put_net_conf(mdev); | ||
2235 | return have_net_conf; | ||
2236 | } | 2032 | } |
2237 | 2033 | ||
2238 | /** | 2034 | /** |
@@ -2336,17 +2132,20 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, | |||
2336 | * maybe re-implement using semaphores? */ | 2132 | * maybe re-implement using semaphores? */ |
2337 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) | 2133 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) |
2338 | { | 2134 | { |
2339 | int mxb = 1000000; /* arbitrary limit on open requests */ | 2135 | struct net_conf *nc; |
2340 | if (get_net_conf(mdev)) { | 2136 | int mxb; |
2341 | mxb = mdev->net_conf->max_buffers; | 2137 | |
2342 | put_net_conf(mdev); | 2138 | rcu_read_lock(); |
2343 | } | 2139 | nc = rcu_dereference(mdev->tconn->net_conf); |
2140 | mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */ | ||
2141 | rcu_read_unlock(); | ||
2142 | |||
2344 | return mxb; | 2143 | return mxb; |
2345 | } | 2144 | } |
2346 | 2145 | ||
2347 | static inline int drbd_state_is_stable(struct drbd_conf *mdev) | 2146 | static inline int drbd_state_is_stable(struct drbd_conf *mdev) |
2348 | { | 2147 | { |
2349 | union drbd_state s = mdev->state; | 2148 | union drbd_dev_state s = mdev->state; |
2350 | 2149 | ||
2351 | /* DO NOT add a default clause, we want the compiler to warn us | 2150 | /* DO NOT add a default clause, we want the compiler to warn us |
2352 | * for any newly introduced state we may have forgotten to add here */ | 2151 | * for any newly introduced state we may have forgotten to add here */ |
@@ -2380,7 +2179,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2380 | 2179 | ||
2381 | /* Allow IO in BM exchange states with new protocols */ | 2180 | /* Allow IO in BM exchange states with new protocols */ |
2382 | case C_WF_BITMAP_S: | 2181 | case C_WF_BITMAP_S: |
2383 | if (mdev->agreed_pro_version < 96) | 2182 | if (mdev->tconn->agreed_pro_version < 96) |
2384 | return 0; | 2183 | return 0; |
2385 | break; | 2184 | break; |
2386 | 2185 | ||
@@ -2402,7 +2201,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2402 | /* disk state is stable as well. */ | 2201 | /* disk state is stable as well. */ |
2403 | break; | 2202 | break; |
2404 | 2203 | ||
2405 | /* no new io accepted during tansitional states */ | 2204 | /* no new io accepted during transitional states */ |
2406 | case D_ATTACHING: | 2205 | case D_ATTACHING: |
2407 | case D_NEGOTIATING: | 2206 | case D_NEGOTIATING: |
2408 | case D_UNKNOWN: | 2207 | case D_UNKNOWN: |
@@ -2414,18 +2213,20 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2414 | return 1; | 2213 | return 1; |
2415 | } | 2214 | } |
2416 | 2215 | ||
2417 | static inline int is_susp(union drbd_state s) | 2216 | static inline int drbd_suspended(struct drbd_conf *mdev) |
2418 | { | 2217 | { |
2419 | return s.susp || s.susp_nod || s.susp_fen; | 2218 | struct drbd_tconn *tconn = mdev->tconn; |
2219 | |||
2220 | return tconn->susp || tconn->susp_fen || tconn->susp_nod; | ||
2420 | } | 2221 | } |
2421 | 2222 | ||
2422 | static inline bool may_inc_ap_bio(struct drbd_conf *mdev) | 2223 | static inline bool may_inc_ap_bio(struct drbd_conf *mdev) |
2423 | { | 2224 | { |
2424 | int mxb = drbd_get_max_buffers(mdev); | 2225 | int mxb = drbd_get_max_buffers(mdev); |
2425 | 2226 | ||
2426 | if (is_susp(mdev->state)) | 2227 | if (drbd_suspended(mdev)) |
2427 | return false; | 2228 | return false; |
2428 | if (drbd_test_flag(mdev, SUSPEND_IO)) | 2229 | if (test_bit(SUSPEND_IO, &mdev->flags)) |
2429 | return false; | 2230 | return false; |
2430 | 2231 | ||
2431 | /* to avoid potential deadlock or bitmap corruption, | 2232 | /* to avoid potential deadlock or bitmap corruption, |
@@ -2440,35 +2241,35 @@ static inline bool may_inc_ap_bio(struct drbd_conf *mdev) | |||
2440 | * and we are within the spinlock anyways, we have this workaround. */ | 2241 | * and we are within the spinlock anyways, we have this workaround. */ |
2441 | if (atomic_read(&mdev->ap_bio_cnt) > mxb) | 2242 | if (atomic_read(&mdev->ap_bio_cnt) > mxb) |
2442 | return false; | 2243 | return false; |
2443 | if (drbd_test_flag(mdev, BITMAP_IO)) | 2244 | if (test_bit(BITMAP_IO, &mdev->flags)) |
2444 | return false; | 2245 | return false; |
2445 | return true; | 2246 | return true; |
2446 | } | 2247 | } |
2447 | 2248 | ||
2448 | static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count) | 2249 | static inline bool inc_ap_bio_cond(struct drbd_conf *mdev) |
2449 | { | 2250 | { |
2450 | bool rv = false; | 2251 | bool rv = false; |
2451 | 2252 | ||
2452 | spin_lock_irq(&mdev->req_lock); | 2253 | spin_lock_irq(&mdev->tconn->req_lock); |
2453 | rv = may_inc_ap_bio(mdev); | 2254 | rv = may_inc_ap_bio(mdev); |
2454 | if (rv) | 2255 | if (rv) |
2455 | atomic_add(count, &mdev->ap_bio_cnt); | 2256 | atomic_inc(&mdev->ap_bio_cnt); |
2456 | spin_unlock_irq(&mdev->req_lock); | 2257 | spin_unlock_irq(&mdev->tconn->req_lock); |
2457 | 2258 | ||
2458 | return rv; | 2259 | return rv; |
2459 | } | 2260 | } |
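inc_ap_bio_cond() shows the usual "check and take the reference under the spinlock, otherwise sleep" pattern: the condition function does the locked test-and-increment, and wait_event() keeps calling it until it succeeds. A kernel-style sketch of that pattern with illustrative types (not DRBD's):

#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/types.h>

struct slot_pool {
	spinlock_t lock;
	atomic_t users;
	wait_queue_head_t wait;
	int limit;
};

static bool try_get_slot(struct slot_pool *p)
{
	bool ok;

	spin_lock_irq(&p->lock);
	ok = atomic_read(&p->users) < p->limit;	/* may_inc_ap_bio() analogue */
	if (ok)
		atomic_inc(&p->users);		/* take the reference while still locked */
	spin_unlock_irq(&p->lock);

	return ok;
}

static void get_slot(struct slot_pool *p)
{
	/* sleep until one locked check succeeded and the reference is held */
	wait_event(p->wait, try_get_slot(p));
}

static void put_slot(struct slot_pool *p)
{
	atomic_dec(&p->users);
	wake_up(&p->wait);	/* like dec_ap_bio(): wake on every put */
}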
2460 | 2261 | ||
2461 | static inline void inc_ap_bio(struct drbd_conf *mdev, int count) | 2262 | static inline void inc_ap_bio(struct drbd_conf *mdev) |
2462 | { | 2263 | { |
2463 | /* we wait here | 2264 | /* we wait here |
2464 | * as long as the device is suspended | 2265 | * as long as the device is suspended |
2465 | * until the bitmap is no longer on the fly during connection | 2266 | * until the bitmap is no longer on the fly during connection |
2466 | * handshake as long as we would exeed the max_buffer limit. | 2267 | * handshake as long as we would exceed the max_buffer limit. |
2467 | * | 2268 | * |
2468 | * to avoid races with the reconnect code, | 2269 | * to avoid races with the reconnect code, |
2469 | * we need to atomic_inc within the spinlock. */ | 2270 | * we need to atomic_inc within the spinlock. */ |
2470 | 2271 | ||
2471 | wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count)); | 2272 | wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev)); |
2472 | } | 2273 | } |
2473 | 2274 | ||
2474 | static inline void dec_ap_bio(struct drbd_conf *mdev) | 2275 | static inline void dec_ap_bio(struct drbd_conf *mdev) |
@@ -2478,9 +2279,9 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) | |||
2478 | 2279 | ||
2479 | D_ASSERT(ap_bio >= 0); | 2280 | D_ASSERT(ap_bio >= 0); |
2480 | 2281 | ||
2481 | if (ap_bio == 0 && drbd_test_flag(mdev, BITMAP_IO)) { | 2282 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { |
2482 | if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED)) | 2283 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) |
2483 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | 2284 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); |
2484 | } | 2285 | } |
2485 | 2286 | ||
2486 | /* this currently does wake_up for every dec_ap_bio! | 2287 | /* this currently does wake_up for every dec_ap_bio! |
@@ -2490,6 +2291,12 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) | |||
2490 | wake_up(&mdev->misc_wait); | 2291 | wake_up(&mdev->misc_wait); |
2491 | } | 2292 | } |
2492 | 2293 | ||
2294 | static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev) | ||
2295 | { | ||
2296 | return mdev->tconn->agreed_pro_version >= 97 && | ||
2297 | mdev->tconn->agreed_pro_version != 100; | ||
2298 | } | ||
2299 | |||
2493 | static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | 2300 | static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) |
2494 | { | 2301 | { |
2495 | int changed = mdev->ed_uuid != val; | 2302 | int changed = mdev->ed_uuid != val; |
@@ -2497,40 +2304,6 @@ static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | |||
2497 | return changed; | 2304 | return changed; |
2498 | } | 2305 | } |
2499 | 2306 | ||
2500 | static inline int seq_cmp(u32 a, u32 b) | ||
2501 | { | ||
2502 | /* we assume wrap around at 32bit. | ||
2503 | * for wrap around at 24bit (old atomic_t), | ||
2504 | * we'd have to | ||
2505 | * a <<= 8; b <<= 8; | ||
2506 | */ | ||
2507 | return (s32)(a) - (s32)(b); | ||
2508 | } | ||
2509 | #define seq_lt(a, b) (seq_cmp((a), (b)) < 0) | ||
2510 | #define seq_gt(a, b) (seq_cmp((a), (b)) > 0) | ||
2511 | #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) | ||
2512 | #define seq_le(a, b) (seq_cmp((a), (b)) <= 0) | ||
2513 | /* CAUTION: please no side effects in arguments! */ | ||
2514 | #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) | ||
2515 | |||
2516 | static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) | ||
2517 | { | ||
2518 | unsigned int m; | ||
2519 | spin_lock(&mdev->peer_seq_lock); | ||
2520 | m = seq_max(mdev->peer_seq, new_seq); | ||
2521 | mdev->peer_seq = m; | ||
2522 | spin_unlock(&mdev->peer_seq_lock); | ||
2523 | if (m == new_seq) | ||
2524 | wake_up(&mdev->seq_wait); | ||
2525 | } | ||
2526 | |||
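The seq_cmp() helper dropped from the header above compares 32-bit packet sequence numbers so that wrap-around is handled correctly: subtract, then read the difference as a signed value. A standalone demo of the idea (the removed helper casts each operand to s32 before subtracting, which is numerically equivalent on two's-complement machines):

#include <stdint.h>
#include <stdio.h>

/* wrap-around safe comparison: the signed difference tells the order */
static int32_t seq_cmp(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b);
}

int main(void)
{
	uint32_t old_seq = 0xfffffffeU;	/* just before the 32bit wrap */
	uint32_t new_seq = 3U;		/* just after the wrap */

	printf("seq_cmp(new, old) = %d -> new sequence number is %s\n",
	       seq_cmp(new_seq, old_seq),
	       seq_cmp(new_seq, old_seq) > 0 ? "newer" : "older or equal");
	return 0;
}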
2527 | static inline void drbd_update_congested(struct drbd_conf *mdev) | ||
2528 | { | ||
2529 | struct sock *sk = mdev->data.socket->sk; | ||
2530 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
2531 | drbd_set_flag(mdev, NET_CONGESTED); | ||
2532 | } | ||
2533 | |||
2534 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) | 2307 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) |
2535 | { | 2308 | { |
2536 | /* sorry, we currently have no working implementation | 2309 | /* sorry, we currently have no working implementation |
@@ -2545,15 +2318,46 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) | |||
2545 | { | 2318 | { |
2546 | int r; | 2319 | int r; |
2547 | 2320 | ||
2548 | if (drbd_test_flag(mdev, MD_NO_FUA)) | 2321 | if (mdev->ldev == NULL) { |
2322 | dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n"); | ||
2323 | return; | ||
2324 | } | ||
2325 | |||
2326 | if (test_bit(MD_NO_FUA, &mdev->flags)) | ||
2549 | return; | 2327 | return; |
2550 | 2328 | ||
2551 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL); | 2329 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL); |
2552 | if (r) { | 2330 | if (r) { |
2553 | drbd_set_flag(mdev, MD_NO_FUA); | 2331 | set_bit(MD_NO_FUA, &mdev->flags); |
2554 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | 2332 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); |
2555 | } | 2333 | } |
2556 | } | 2334 | } |
2557 | 2335 | ||
2558 | |||
2559 | #endif | 2336 | #endif |
2337 | |||
2338 | /* This is defined in drivers/md/md.h as well. Should go into wait.h */ | ||
2339 | #define __wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
2340 | do { \ | ||
2341 | wait_queue_t __wait; \ | ||
2342 | init_waitqueue_entry(&__wait, current); \ | ||
2343 | \ | ||
2344 | add_wait_queue(&wq, &__wait); \ | ||
2345 | for (;;) { \ | ||
2346 | set_current_state(TASK_UNINTERRUPTIBLE); \ | ||
2347 | if (condition) \ | ||
2348 | break; \ | ||
2349 | spin_unlock_irq(&lock); \ | ||
2350 | cmd; \ | ||
2351 | schedule(); \ | ||
2352 | spin_lock_irq(&lock); \ | ||
2353 | } \ | ||
2354 | current->state = TASK_RUNNING; \ | ||
2355 | remove_wait_queue(&wq, &__wait); \ | ||
2356 | } while (0) | ||
2357 | |||
2358 | #define wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
2359 | do { \ | ||
2360 | if (condition) \ | ||
2361 | break; \ | ||
2362 | __wait_event_lock_irq(wq, condition, lock, cmd); \ | ||
2363 | } while (0) | ||
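wait_event_lock_irq() expects the caller to already hold "lock"; the macro drops it only around schedule() and re-acquires it before re-checking the condition, so the condition holds, under the lock, when the macro returns. A hypothetical caller sketch, reusing the drbd_work_queue field names from the queueing helpers earlier in this header (this exact caller does not exist in the patch):

static struct drbd_work *wait_for_work(struct drbd_work_queue *q)
{
	struct drbd_work *w;

	spin_lock_irq(&q->q_lock);
	wait_event_lock_irq(q->q_wait,
			    !list_empty(&q->q),
			    q->q_lock,
			    /* cmd: nothing extra to do before schedule() */);
	/* the lock is held again here and the list is known to be non-empty */
	w = list_first_entry(&q->q, struct drbd_work, list);
	list_del_init(&w->list);
	spin_unlock_irq(&q->q_lock);

	return w;
}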
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c new file mode 100644 index 000000000000..89c497c630b4 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.c | |||
@@ -0,0 +1,207 @@ | |||
1 | #include <asm/bug.h> | ||
2 | #include <linux/rbtree_augmented.h> | ||
3 | #include "drbd_interval.h" | ||
4 | |||
5 | /** | ||
6 | * interval_end - return end of @node | ||
7 | */ | ||
8 | static inline | ||
9 | sector_t interval_end(struct rb_node *node) | ||
10 | { | ||
11 | struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); | ||
12 | return this->end; | ||
13 | } | ||
14 | |||
15 | /** | ||
16 | * compute_subtree_last - compute end of @node | ||
17 | * | ||
18 | * The end of an interval is the highest (start + (size >> 9)) value of this | ||
19 | * node and of its children. Called for @node and its parents whenever the end | ||
20 | * may have changed. | ||
21 | */ | ||
22 | static inline sector_t | ||
23 | compute_subtree_last(struct drbd_interval *node) | ||
24 | { | ||
25 | sector_t max = node->sector + (node->size >> 9); | ||
26 | |||
27 | if (node->rb.rb_left) { | ||
28 | sector_t left = interval_end(node->rb.rb_left); | ||
29 | if (left > max) | ||
30 | max = left; | ||
31 | } | ||
32 | if (node->rb.rb_right) { | ||
33 | sector_t right = interval_end(node->rb.rb_right); | ||
34 | if (right > max) | ||
35 | max = right; | ||
36 | } | ||
37 | return max; | ||
38 | } | ||
39 | |||
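The interval tree stores sizes in bytes but works in 512-byte sectors, so an interval's end is start + (size >> 9). A quick standalone check of that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long long sector = 16;		/* start sector */
	unsigned int size = 4096;		/* size in bytes, multiple of 512 */
	unsigned long long end = sector + (size >> 9);

	printf("interval [%llu, %llu) spans %u sectors\n", sector, end, size >> 9);
	return 0;
}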
40 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop) | ||
41 | { | ||
42 | while (rb != stop) { | ||
43 | struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb); | ||
44 | sector_t subtree_last = compute_subtree_last(node); | ||
45 | if (node->end == subtree_last) | ||
46 | break; | ||
47 | node->end = subtree_last; | ||
48 | rb = rb_parent(&node->rb); | ||
49 | } | ||
50 | } | ||
51 | |||
52 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) | ||
53 | { | ||
54 | struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); | ||
55 | struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); | ||
56 | |||
57 | new->end = old->end; | ||
58 | } | ||
59 | |||
60 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) | ||
61 | { | ||
62 | struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); | ||
63 | struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); | ||
64 | |||
65 | new->end = old->end; | ||
66 | old->end = compute_subtree_last(old); | ||
67 | } | ||
68 | |||
69 | static const struct rb_augment_callbacks augment_callbacks = { | ||
70 | augment_propagate, | ||
71 | augment_copy, | ||
72 | augment_rotate, | ||
73 | }; | ||
74 | |||
75 | /** | ||
76 | * drbd_insert_interval - insert a new interval into a tree | ||
77 | */ | ||
78 | bool | ||
79 | drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) | ||
80 | { | ||
81 | struct rb_node **new = &root->rb_node, *parent = NULL; | ||
82 | |||
83 | BUG_ON(!IS_ALIGNED(this->size, 512)); | ||
84 | |||
85 | while (*new) { | ||
86 | struct drbd_interval *here = | ||
87 | rb_entry(*new, struct drbd_interval, rb); | ||
88 | |||
89 | parent = *new; | ||
90 | if (this->sector < here->sector) | ||
91 | new = &(*new)->rb_left; | ||
92 | else if (this->sector > here->sector) | ||
93 | new = &(*new)->rb_right; | ||
94 | else if (this < here) | ||
95 | new = &(*new)->rb_left; | ||
96 | else if (this > here) | ||
97 | new = &(*new)->rb_right; | ||
98 | else | ||
99 | return false; | ||
100 | } | ||
101 | |||
102 | rb_link_node(&this->rb, parent, new); | ||
103 | rb_insert_augmented(&this->rb, root, &augment_callbacks); | ||
104 | return true; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * drbd_contains_interval - check if a tree contains a given interval | ||
109 | * @sector: start sector of @interval | ||
110 | * @interval: may not be a valid pointer | ||
111 | * | ||
112 | * Returns if the tree contains the node @interval with start sector @start. | ||
113 | * Does not dereference @interval until @interval is known to be a valid object | ||
114 | * in @tree. Returns %false if @interval is in the tree but with a different | ||
115 | * sector number. | ||
116 | */ | ||
117 | bool | ||
118 | drbd_contains_interval(struct rb_root *root, sector_t sector, | ||
119 | struct drbd_interval *interval) | ||
120 | { | ||
121 | struct rb_node *node = root->rb_node; | ||
122 | |||
123 | while (node) { | ||
124 | struct drbd_interval *here = | ||
125 | rb_entry(node, struct drbd_interval, rb); | ||
126 | |||
127 | if (sector < here->sector) | ||
128 | node = node->rb_left; | ||
129 | else if (sector > here->sector) | ||
130 | node = node->rb_right; | ||
131 | else if (interval < here) | ||
132 | node = node->rb_left; | ||
133 | else if (interval > here) | ||
134 | node = node->rb_right; | ||
135 | else | ||
136 | return true; | ||
137 | } | ||
138 | return false; | ||
139 | } | ||
140 | |||
141 | /** | ||
142 | * drbd_remove_interval - remove an interval from a tree | ||
143 | */ | ||
144 | void | ||
145 | drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) | ||
146 | { | ||
147 | rb_erase_augmented(&this->rb, root, &augment_callbacks); | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) | ||
152 | * @sector: start sector | ||
153 | * @size: size, aligned to 512 bytes | ||
154 | * | ||
155 | * Returns an interval overlapping with [sector, sector + size), or NULL if | ||
156 | * there is none. When there is more than one overlapping interval in the | ||
157 | * tree, the interval with the lowest start sector is returned, and all other | ||
158 | * overlapping intervals will be on the right side of the tree, reachable with | ||
159 | * rb_next(). | ||
160 | */ | ||
161 | struct drbd_interval * | ||
162 | drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) | ||
163 | { | ||
164 | struct rb_node *node = root->rb_node; | ||
165 | struct drbd_interval *overlap = NULL; | ||
166 | sector_t end = sector + (size >> 9); | ||
167 | |||
168 | BUG_ON(!IS_ALIGNED(size, 512)); | ||
169 | |||
170 | while (node) { | ||
171 | struct drbd_interval *here = | ||
172 | rb_entry(node, struct drbd_interval, rb); | ||
173 | |||
174 | if (node->rb_left && | ||
175 | sector < interval_end(node->rb_left)) { | ||
176 | /* Overlap if any must be on left side */ | ||
177 | node = node->rb_left; | ||
178 | } else if (here->sector < end && | ||
179 | sector < here->sector + (here->size >> 9)) { | ||
180 | overlap = here; | ||
181 | break; | ||
182 | } else if (sector >= here->sector) { | ||
183 | /* Overlap if any must be on right side */ | ||
184 | node = node->rb_right; | ||
185 | } else | ||
186 | break; | ||
187 | } | ||
188 | return overlap; | ||
189 | } | ||
190 | |||
191 | struct drbd_interval * | ||
192 | drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) | ||
193 | { | ||
194 | sector_t end = sector + (size >> 9); | ||
195 | struct rb_node *node; | ||
196 | |||
197 | for (;;) { | ||
198 | node = rb_next(&i->rb); | ||
199 | if (!node) | ||
200 | return NULL; | ||
201 | i = rb_entry(node, struct drbd_interval, rb); | ||
202 | if (i->sector >= end) | ||
203 | return NULL; | ||
204 | if (sector < i->sector + (i->size >> 9)) | ||
205 | return i; | ||
206 | } | ||
207 | } | ||
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h new file mode 100644 index 000000000000..f38fcb00c10d --- /dev/null +++ b/drivers/block/drbd/drbd_interval.h | |||
@@ -0,0 +1,40 @@ | |||
1 | #ifndef __DRBD_INTERVAL_H | ||
2 | #define __DRBD_INTERVAL_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <linux/rbtree.h> | ||
6 | |||
7 | struct drbd_interval { | ||
8 | struct rb_node rb; | ||
9 | sector_t sector; /* start sector of the interval */ | ||
10 | unsigned int size; /* size in bytes */ | ||
11 | sector_t end; /* highest interval end in subtree */ | ||
12 | int local:1 /* local or remote request? */; | ||
13 | int waiting:1; | ||
14 | }; | ||
15 | |||
16 | static inline void drbd_clear_interval(struct drbd_interval *i) | ||
17 | { | ||
18 | RB_CLEAR_NODE(&i->rb); | ||
19 | } | ||
20 | |||
21 | static inline bool drbd_interval_empty(struct drbd_interval *i) | ||
22 | { | ||
23 | return RB_EMPTY_NODE(&i->rb); | ||
24 | } | ||
25 | |||
26 | extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); | ||
27 | extern bool drbd_contains_interval(struct rb_root *, sector_t, | ||
28 | struct drbd_interval *); | ||
29 | extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); | ||
30 | extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, | ||
31 | unsigned int); | ||
32 | extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, | ||
33 | unsigned int); | ||
34 | |||
35 | #define drbd_for_each_overlap(i, root, sector, size) \ | ||
36 | for (i = drbd_find_overlap(root, sector, size); \ | ||
37 | i; \ | ||
38 | i = drbd_next_overlap(i, sector, size)) | ||
39 | |||
40 | #endif /* __DRBD_INTERVAL_H */ | ||
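A hedged caller sketch for the interval API declared above: drbd_for_each_overlap() visits every interval overlapping [sector, sector + size), starting with the lowest start sector. The function name and the conflict test on i->local / i->waiting are illustrative only, not code from this patch:

#include "drbd_interval.h"

static bool range_is_busy(struct rb_root *root, sector_t sector, unsigned int size)
{
	struct drbd_interval *i;

	drbd_for_each_overlap(i, root, sector, size) {
		if (i->local || i->waiting)	/* any overlapping request counts as a conflict */
			return true;
	}
	return false;
}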
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 9b833e0fb440..52de26daa1f6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -56,14 +56,6 @@ | |||
56 | 56 | ||
57 | #include "drbd_vli.h" | 57 | #include "drbd_vli.h" |
58 | 58 | ||
59 | struct after_state_chg_work { | ||
60 | struct drbd_work w; | ||
61 | union drbd_state os; | ||
62 | union drbd_state ns; | ||
63 | enum chg_state_flags flags; | ||
64 | struct completion *done; | ||
65 | }; | ||
66 | |||
67 | static DEFINE_MUTEX(drbd_main_mutex); | 59 | static DEFINE_MUTEX(drbd_main_mutex); |
68 | int drbdd_init(struct drbd_thread *); | 60 | int drbdd_init(struct drbd_thread *); |
69 | int drbd_worker(struct drbd_thread *); | 61 | int drbd_worker(struct drbd_thread *); |
@@ -72,21 +64,17 @@ int drbd_asender(struct drbd_thread *); | |||
72 | int drbd_init(void); | 64 | int drbd_init(void); |
73 | static int drbd_open(struct block_device *bdev, fmode_t mode); | 65 | static int drbd_open(struct block_device *bdev, fmode_t mode); |
74 | static int drbd_release(struct gendisk *gd, fmode_t mode); | 66 | static int drbd_release(struct gendisk *gd, fmode_t mode); |
75 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 67 | static int w_md_sync(struct drbd_work *w, int unused); |
76 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
77 | union drbd_state ns, enum chg_state_flags flags); | ||
78 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
79 | static void md_sync_timer_fn(unsigned long data); | 68 | static void md_sync_timer_fn(unsigned long data); |
80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 69 | static int w_bitmap_io(struct drbd_work *w, int unused); |
81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 70 | static int w_go_diskless(struct drbd_work *w, int unused); |
82 | static void _tl_clear(struct drbd_conf *mdev); | ||
83 | 71 | ||
84 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | 72 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " |
85 | "Lars Ellenberg <lars@linbit.com>"); | 73 | "Lars Ellenberg <lars@linbit.com>"); |
86 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); | 74 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); |
87 | MODULE_VERSION(REL_VERSION); | 75 | MODULE_VERSION(REL_VERSION); |
88 | MODULE_LICENSE("GPL"); | 76 | MODULE_LICENSE("GPL"); |
89 | MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (" | 77 | MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" |
90 | __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); | 78 | __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); |
91 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); | 79 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); |
92 | 80 | ||
@@ -98,7 +86,6 @@ MODULE_PARM_DESC(allow_oos, "DONT USE!"); | |||
98 | module_param(minor_count, uint, 0444); | 86 | module_param(minor_count, uint, 0444); |
99 | module_param(disable_sendpage, bool, 0644); | 87 | module_param(disable_sendpage, bool, 0644); |
100 | module_param(allow_oos, bool, 0); | 88 | module_param(allow_oos, bool, 0); |
101 | module_param(cn_idx, uint, 0444); | ||
102 | module_param(proc_details, int, 0644); | 89 | module_param(proc_details, int, 0644); |
103 | 90 | ||
104 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 91 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
@@ -120,7 +107,6 @@ module_param(fault_devs, int, 0644); | |||
120 | unsigned int minor_count = DRBD_MINOR_COUNT_DEF; | 107 | unsigned int minor_count = DRBD_MINOR_COUNT_DEF; |
121 | bool disable_sendpage; | 108 | bool disable_sendpage; |
122 | bool allow_oos; | 109 | bool allow_oos; |
123 | unsigned int cn_idx = CN_IDX_DRBD; | ||
124 | int proc_details; /* Detail level in proc drbd*/ | 110 | int proc_details; /* Detail level in proc drbd*/ |
125 | 111 | ||
126 | /* Module parameter for setting the user mode helper program | 112 | /* Module parameter for setting the user mode helper program |
@@ -132,10 +118,11 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 | |||
132 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks | 118 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks |
133 | * as member "struct gendisk *vdisk;" | 119 | * as member "struct gendisk *vdisk;" |
134 | */ | 120 | */ |
135 | struct drbd_conf **minor_table; | 121 | struct idr minors; |
122 | struct list_head drbd_tconns; /* list of struct drbd_tconn */ | ||
136 | 123 | ||
137 | struct kmem_cache *drbd_request_cache; | 124 | struct kmem_cache *drbd_request_cache; |
138 | struct kmem_cache *drbd_ee_cache; /* epoch entries */ | 125 | struct kmem_cache *drbd_ee_cache; /* peer requests */ |
139 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | 126 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ |
140 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | 127 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ |
141 | mempool_t *drbd_request_mempool; | 128 | mempool_t *drbd_request_mempool; |
@@ -164,10 +151,15 @@ static const struct block_device_operations drbd_ops = { | |||
164 | 151 | ||
165 | struct bio *bio_alloc_drbd(gfp_t gfp_mask) | 152 | struct bio *bio_alloc_drbd(gfp_t gfp_mask) |
166 | { | 153 | { |
154 | struct bio *bio; | ||
155 | |||
167 | if (!drbd_md_io_bio_set) | 156 | if (!drbd_md_io_bio_set) |
168 | return bio_alloc(gfp_mask, 1); | 157 | return bio_alloc(gfp_mask, 1); |
169 | 158 | ||
170 | return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); | 159 | bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); |
160 | if (!bio) | ||
161 | return NULL; | ||
162 | return bio; | ||
171 | } | 163 | } |
172 | 164 | ||
173 | #ifdef __CHECKER__ | 165 | #ifdef __CHECKER__ |
@@ -190,158 +182,87 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | |||
190 | #endif | 182 | #endif |
191 | 183 | ||
192 | /** | 184 | /** |
193 | * DOC: The transfer log | 185 | * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch |
194 | * | 186 | * @tconn: DRBD connection. |
195 | * The transfer log is a single linked list of &struct drbd_tl_epoch objects. | ||
196 | * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail | ||
197 | * of the list. There is always at least one &struct drbd_tl_epoch object. | ||
198 | * | ||
199 | * Each &struct drbd_tl_epoch has a circular double linked list of requests | ||
200 | * attached. | ||
201 | */ | ||
202 | static int tl_init(struct drbd_conf *mdev) | ||
203 | { | ||
204 | struct drbd_tl_epoch *b; | ||
205 | |||
206 | /* during device minor initialization, we may well use GFP_KERNEL */ | ||
207 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); | ||
208 | if (!b) | ||
209 | return 0; | ||
210 | INIT_LIST_HEAD(&b->requests); | ||
211 | INIT_LIST_HEAD(&b->w.list); | ||
212 | b->next = NULL; | ||
213 | b->br_number = 4711; | ||
214 | b->n_writes = 0; | ||
215 | b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
216 | |||
217 | mdev->oldest_tle = b; | ||
218 | mdev->newest_tle = b; | ||
219 | INIT_LIST_HEAD(&mdev->out_of_sequence_requests); | ||
220 | INIT_LIST_HEAD(&mdev->barrier_acked_requests); | ||
221 | |||
222 | mdev->tl_hash = NULL; | ||
223 | mdev->tl_hash_s = 0; | ||
224 | |||
225 | return 1; | ||
226 | } | ||
227 | |||
228 | static void tl_cleanup(struct drbd_conf *mdev) | ||
229 | { | ||
230 | D_ASSERT(mdev->oldest_tle == mdev->newest_tle); | ||
231 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
232 | kfree(mdev->oldest_tle); | ||
233 | mdev->oldest_tle = NULL; | ||
234 | kfree(mdev->unused_spare_tle); | ||
235 | mdev->unused_spare_tle = NULL; | ||
236 | kfree(mdev->tl_hash); | ||
237 | mdev->tl_hash = NULL; | ||
238 | mdev->tl_hash_s = 0; | ||
239 | } | ||
240 | |||
241 | /** | ||
242 | * _tl_add_barrier() - Adds a barrier to the transfer log | ||
243 | * @mdev: DRBD device. | ||
244 | * @new: Barrier to be added before the current head of the TL. | ||
245 | * | ||
246 | * The caller must hold the req_lock. | ||
247 | */ | ||
248 | void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) | ||
249 | { | ||
250 | struct drbd_tl_epoch *newest_before; | ||
251 | |||
252 | INIT_LIST_HEAD(&new->requests); | ||
253 | INIT_LIST_HEAD(&new->w.list); | ||
254 | new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
255 | new->next = NULL; | ||
256 | new->n_writes = 0; | ||
257 | |||
258 | newest_before = mdev->newest_tle; | ||
259 | new->br_number = newest_before->br_number+1; | ||
260 | if (mdev->newest_tle != new) { | ||
261 | mdev->newest_tle->next = new; | ||
262 | mdev->newest_tle = new; | ||
263 | } | ||
264 | } | ||
265 | |||
266 | /** | ||
267 | * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL | ||
268 | * @mdev: DRBD device. | ||
269 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. | 187 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. |
270 | * @set_size: Expected number of requests before that barrier. | 188 | * @set_size: Expected number of requests before that barrier. |
271 | * | 189 | * |
272 | * In case the passed barrier_nr or set_size does not match the oldest | 190 | * In case the passed barrier_nr or set_size does not match the oldest |
273 | * &struct drbd_tl_epoch objects this function will cause a termination | 191 | * epoch of not yet barrier-acked requests, this function will cause a |
274 | * of the connection. | 192 | * termination of the connection. |
275 | */ | 193 | */ |
276 | void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | 194 | void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr, |
277 | unsigned int set_size) | 195 | unsigned int set_size) |
278 | { | 196 | { |
279 | struct drbd_tl_epoch *b, *nob; /* next old barrier */ | ||
280 | struct list_head *le, *tle; | ||
281 | struct drbd_request *r; | 197 | struct drbd_request *r; |
282 | 198 | struct drbd_request *req = NULL; | |
283 | spin_lock_irq(&mdev->req_lock); | 199 | int expect_epoch = 0; |
284 | 200 | int expect_size = 0; | |
285 | b = mdev->oldest_tle; | 201 | |
202 | spin_lock_irq(&tconn->req_lock); | ||
203 | |||
204 | /* find oldest not yet barrier-acked write request, | ||
205 | * count writes in its epoch. */ | ||
206 | list_for_each_entry(r, &tconn->transfer_log, tl_requests) { | ||
207 | const unsigned s = r->rq_state; | ||
208 | if (!req) { | ||
209 | if (!(s & RQ_WRITE)) | ||
210 | continue; | ||
211 | if (!(s & RQ_NET_MASK)) | ||
212 | continue; | ||
213 | if (s & RQ_NET_DONE) | ||
214 | continue; | ||
215 | req = r; | ||
216 | expect_epoch = req->epoch; | ||
217 | expect_size ++; | ||
218 | } else { | ||
219 | if (r->epoch != expect_epoch) | ||
220 | break; | ||
221 | if (!(s & RQ_WRITE)) | ||
222 | continue; | ||
223 | /* if (s & RQ_DONE): not expected */ | ||
224 | /* if (!(s & RQ_NET_MASK)): not expected */ | ||
225 | expect_size++; | ||
226 | } | ||
227 | } | ||
286 | 228 | ||
287 | /* first some paranoia code */ | 229 | /* first some paranoia code */ |
288 | if (b == NULL) { | 230 | if (req == NULL) { |
289 | dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", | 231 | conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", |
290 | barrier_nr); | 232 | barrier_nr); |
291 | goto bail; | 233 | goto bail; |
292 | } | 234 | } |
293 | if (b->br_number != barrier_nr) { | 235 | if (expect_epoch != barrier_nr) { |
294 | dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", | 236 | conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n", |
295 | barrier_nr, b->br_number); | 237 | barrier_nr, expect_epoch); |
296 | goto bail; | 238 | goto bail; |
297 | } | 239 | } |
298 | if (b->n_writes != set_size) { | 240 | |
299 | dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", | 241 | if (expect_size != set_size) { |
300 | barrier_nr, set_size, b->n_writes); | 242 | conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", |
243 | barrier_nr, set_size, expect_size); | ||
301 | goto bail; | 244 | goto bail; |
302 | } | 245 | } |
303 | 246 | ||
304 | /* Clean up list of requests processed during current epoch */ | 247 | /* Clean up list of requests processed during current epoch. */ |
305 | list_for_each_safe(le, tle, &b->requests) { | 248 | /* this extra list walk restart is paranoia, |
306 | r = list_entry(le, struct drbd_request, tl_requests); | 249 | * to catch requests being barrier-acked "unexpectedly". |
307 | _req_mod(r, barrier_acked); | 250 | * It usually should find the same req again, or some READ preceding it. */ |
308 | } | 251 | list_for_each_entry(req, &tconn->transfer_log, tl_requests) |
309 | /* There could be requests on the list waiting for completion | 252 | if (req->epoch == expect_epoch) |
310 | of the write to the local disk. To avoid corruptions of | 253 | break; |
311 | slab's data structures we have to remove the lists head. | 254 | list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) { |
312 | 255 | if (req->epoch != expect_epoch) | |
313 | Also there could have been a barrier ack out of sequence, overtaking | 256 | break; |
314 | the write acks - which would be a bug and violating write ordering. | 257 | _req_mod(req, BARRIER_ACKED); |
315 | To not deadlock in case we lose connection while such requests are | ||
316 | still pending, we need some way to find them for the | ||
317 | _req_mode(connection_lost_while_pending). | ||
318 | |||
319 | These have been list_move'd to the out_of_sequence_requests list in | ||
320 | _req_mod(, barrier_acked) above. | ||
321 | */ | ||
322 | list_splice_init(&b->requests, &mdev->barrier_acked_requests); | ||
323 | |||
324 | nob = b->next; | ||
325 | if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) { | ||
326 | _tl_add_barrier(mdev, b); | ||
327 | if (nob) | ||
328 | mdev->oldest_tle = nob; | ||
329 | /* if nob == NULL b was the only barrier, and becomes the new | ||
330 | barrier. Therefore mdev->oldest_tle points already to b */ | ||
331 | } else { | ||
332 | D_ASSERT(nob != NULL); | ||
333 | mdev->oldest_tle = nob; | ||
334 | kfree(b); | ||
335 | } | 258 | } |
336 | 259 | spin_unlock_irq(&tconn->req_lock); | |
337 | spin_unlock_irq(&mdev->req_lock); | ||
338 | dec_ap_pending(mdev); | ||
339 | 260 | ||
340 | return; | 261 | return; |
341 | 262 | ||
342 | bail: | 263 | bail: |
343 | spin_unlock_irq(&mdev->req_lock); | 264 | spin_unlock_irq(&tconn->req_lock); |
344 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | 265 | conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
345 | } | 266 | } |
346 | 267 | ||
347 | 268 | ||
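Editor's note on the hunk above: the reworked barrier-ack path no longer pops a per-epoch drbd_tl_epoch object, it scans the single transfer log for the run of requests carrying the expected epoch number and acks exactly that run. The following userspace sketch only illustrates that idea; the toy_req array, its fields and the numbers are invented and are not DRBD code:

    #include <stdio.h>

    struct toy_req { unsigned epoch; int is_write; int barrier_acked; };

    /* Check a barrier ack against the oldest un-acked epoch, then mark
     * every request of that epoch as acked.  Returns 0 on success. */
    static int toy_tl_release(struct toy_req *log, int n,
                              unsigned barrier_nr, unsigned set_size)
    {
            unsigned expect_epoch = log[0].epoch;   /* oldest epoch in the log */
            unsigned expect_size = 0;
            int i;

            if (expect_epoch != barrier_nr) {
                    fprintf(stderr, "BarrierAck #%u, expected #%u\n",
                            barrier_nr, expect_epoch);
                    return -1;
            }
            for (i = 0; i < n && log[i].epoch == expect_epoch; i++)
                    expect_size += log[i].is_write;
            if (expect_size != set_size) {
                    fprintf(stderr, "n_writes=%u, expected %u\n",
                            set_size, expect_size);
                    return -1;
            }
            /* ack the whole run belonging to that epoch */
            for (i = 0; i < n && log[i].epoch == expect_epoch; i++)
                    log[i].barrier_acked = 1;
            return 0;
    }

    int main(void)
    {
            struct toy_req log[] = { {7, 1, 0}, {7, 0, 0}, {7, 1, 0}, {8, 1, 0} };
            return toy_tl_release(log, 4, 7, 2) ? 1 : 0;
    }

The two paranoia branches correspond to the conn_err() checks above; only when both the epoch number and the write count match is the run acked.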
@@ -350,85 +271,24 @@ bail: | |||
350 | * @mdev: DRBD device. | 271 | * @mdev: DRBD device. |
351 | * @what: The action/event to perform with all request objects | 272 | * @what: The action/event to perform with all request objects |
352 | * | 273 | * |
353 | * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, | 274 | * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, |
354 | * restart_frozen_disk_io. | 275 | * RESTART_FROZEN_DISK_IO. |
355 | */ | 276 | */ |
356 | static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | 277 | /* must hold resource->req_lock */ |
357 | { | 278 | void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) |
358 | struct drbd_tl_epoch *b, *tmp, **pn; | 279 | { |
359 | struct list_head *le, *tle, carry_reads; | 280 | struct drbd_request *req, *r; |
360 | struct drbd_request *req; | ||
361 | int rv, n_writes, n_reads; | ||
362 | |||
363 | b = mdev->oldest_tle; | ||
364 | pn = &mdev->oldest_tle; | ||
365 | while (b) { | ||
366 | n_writes = 0; | ||
367 | n_reads = 0; | ||
368 | INIT_LIST_HEAD(&carry_reads); | ||
369 | list_for_each_safe(le, tle, &b->requests) { | ||
370 | req = list_entry(le, struct drbd_request, tl_requests); | ||
371 | rv = _req_mod(req, what); | ||
372 | |||
373 | n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; | ||
374 | n_reads += (rv & MR_READ) >> MR_READ_SHIFT; | ||
375 | } | ||
376 | tmp = b->next; | ||
377 | |||
378 | if (n_writes) { | ||
379 | if (what == resend) { | ||
380 | b->n_writes = n_writes; | ||
381 | if (b->w.cb == NULL) { | ||
382 | b->w.cb = w_send_barrier; | ||
383 | inc_ap_pending(mdev); | ||
384 | drbd_set_flag(mdev, CREATE_BARRIER); | ||
385 | } | ||
386 | |||
387 | drbd_queue_work(&mdev->data.work, &b->w); | ||
388 | } | ||
389 | pn = &b->next; | ||
390 | } else { | ||
391 | if (n_reads) | ||
392 | list_add(&carry_reads, &b->requests); | ||
393 | /* there could still be requests on that ring list, | ||
394 | * in case local io is still pending */ | ||
395 | list_del(&b->requests); | ||
396 | |||
397 | /* dec_ap_pending corresponding to queue_barrier. | ||
398 | * the newest barrier may not have been queued yet, | ||
399 | * in which case w.cb is still NULL. */ | ||
400 | if (b->w.cb != NULL) | ||
401 | dec_ap_pending(mdev); | ||
402 | |||
403 | if (b == mdev->newest_tle) { | ||
404 | /* recycle, but reinit! */ | ||
405 | D_ASSERT(tmp == NULL); | ||
406 | INIT_LIST_HEAD(&b->requests); | ||
407 | list_splice(&carry_reads, &b->requests); | ||
408 | INIT_LIST_HEAD(&b->w.list); | ||
409 | b->w.cb = NULL; | ||
410 | b->br_number = net_random(); | ||
411 | b->n_writes = 0; | ||
412 | |||
413 | *pn = b; | ||
414 | break; | ||
415 | } | ||
416 | *pn = tmp; | ||
417 | kfree(b); | ||
418 | } | ||
419 | b = tmp; | ||
420 | list_splice(&carry_reads, &b->requests); | ||
421 | } | ||
422 | |||
423 | /* Actions operating on the disk state, also want to work on | ||
424 | requests that got barrier acked. */ | ||
425 | 281 | ||
426 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | 282 | list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) |
427 | req = list_entry(le, struct drbd_request, tl_requests); | ||
428 | _req_mod(req, what); | 283 | _req_mod(req, what); |
429 | } | ||
430 | } | 284 | } |
431 | 285 | ||
286 | void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) | ||
287 | { | ||
288 | spin_lock_irq(&tconn->req_lock); | ||
289 | _tl_restart(tconn, what); | ||
290 | spin_unlock_irq(&tconn->req_lock); | ||
291 | } | ||
432 | 292 | ||
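For readers puzzled by the deleted _tl_restart(): it counted surviving reads and writes by decoding flag bits in the value returned by _req_mod(). A minimal standalone sketch of that decoding pattern follows; the MR_* values here are chosen for illustration only (the real constants are defined in drbd_req.h):

    #include <stdio.h>

    /* illustrative values only; DRBD defines its own MR_* constants */
    #define MR_WRITE_SHIFT  0
    #define MR_WRITE        (1 << MR_WRITE_SHIFT)
    #define MR_READ_SHIFT   1
    #define MR_READ         (1 << MR_READ_SHIFT)

    int main(void)
    {
            int results[] = { MR_WRITE, MR_READ, MR_WRITE | MR_READ, 0 };
            int n_writes = 0, n_reads = 0;
            unsigned i;

            for (i = 0; i < sizeof(results) / sizeof(results[0]); i++) {
                    int rv = results[i];
                    /* each flag contributes exactly 0 or 1 after the shift */
                    n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
                    n_reads  += (rv & MR_READ)  >> MR_READ_SHIFT;
            }
            printf("n_writes=%d n_reads=%d\n", n_writes, n_reads);   /* 2 2 */
            return 0;
    }

With the per-epoch ring gone, the replacement simply walks tconn->transfer_log and applies the event to every request, so this counting is no longer needed.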
433 | /** | 293 | /** |
434 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL | 294 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL |
@@ -438,43 +298,9 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
438 | * by the requests on the transfer log gets marked as out of sync. Called from the | 298 | * by the requests on the transfer log gets marked as out of sync. Called from the |
439 | * receiver thread and the worker thread. | 299 | * receiver thread and the worker thread. |
440 | */ | 300 | */ |
441 | void tl_clear(struct drbd_conf *mdev) | 301 | void tl_clear(struct drbd_tconn *tconn) |
442 | { | 302 | { |
443 | spin_lock_irq(&mdev->req_lock); | 303 | tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); |
444 | _tl_clear(mdev); | ||
445 | spin_unlock_irq(&mdev->req_lock); | ||
446 | } | ||
447 | |||
448 | static void _tl_clear(struct drbd_conf *mdev) | ||
449 | { | ||
450 | struct list_head *le, *tle; | ||
451 | struct drbd_request *r; | ||
452 | |||
453 | _tl_restart(mdev, connection_lost_while_pending); | ||
454 | |||
455 | /* we expect this list to be empty. */ | ||
456 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
457 | |||
458 | /* but just in case, clean it up anyways! */ | ||
459 | list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { | ||
460 | r = list_entry(le, struct drbd_request, tl_requests); | ||
461 | /* It would be nice to complete outside of spinlock. | ||
462 | * But this is easier for now. */ | ||
463 | _req_mod(r, connection_lost_while_pending); | ||
464 | } | ||
465 | |||
466 | /* ensure bit indicating barrier is required is clear */ | ||
467 | drbd_clear_flag(mdev, CREATE_BARRIER); | ||
468 | |||
469 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | ||
470 | |||
471 | } | ||
472 | |||
473 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | ||
474 | { | ||
475 | spin_lock_irq(&mdev->req_lock); | ||
476 | _tl_restart(mdev, what); | ||
477 | spin_unlock_irq(&mdev->req_lock); | ||
478 | } | 304 | } |
479 | 305 | ||
480 | /** | 306 | /** |
@@ -483,1392 +309,131 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
483 | */ | 309 | */ |
484 | void tl_abort_disk_io(struct drbd_conf *mdev) | 310 | void tl_abort_disk_io(struct drbd_conf *mdev) |
485 | { | 311 | { |
486 | struct drbd_tl_epoch *b; | 312 | struct drbd_tconn *tconn = mdev->tconn; |
487 | struct list_head *le, *tle; | 313 | struct drbd_request *req, *r; |
488 | struct drbd_request *req; | ||
489 | 314 | ||
490 | spin_lock_irq(&mdev->req_lock); | 315 | spin_lock_irq(&tconn->req_lock); |
491 | b = mdev->oldest_tle; | 316 | list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) { |
492 | while (b) { | ||
493 | list_for_each_safe(le, tle, &b->requests) { | ||
494 | req = list_entry(le, struct drbd_request, tl_requests); | ||
495 | if (!(req->rq_state & RQ_LOCAL_PENDING)) | ||
496 | continue; | ||
497 | _req_mod(req, abort_disk_io); | ||
498 | } | ||
499 | b = b->next; | ||
500 | } | ||
501 | |||
502 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | ||
503 | req = list_entry(le, struct drbd_request, tl_requests); | ||
504 | if (!(req->rq_state & RQ_LOCAL_PENDING)) | 317 | if (!(req->rq_state & RQ_LOCAL_PENDING)) |
505 | continue; | 318 | continue; |
506 | _req_mod(req, abort_disk_io); | 319 | if (req->w.mdev != mdev) |
507 | } | 320 | continue; |
508 | 321 | _req_mod(req, ABORT_DISK_IO); | |
509 | spin_unlock_irq(&mdev->req_lock); | ||
510 | } | ||
511 | |||
512 | /** | ||
513 | * cl_wide_st_chg() - true if the state change is a cluster wide one | ||
514 | * @mdev: DRBD device. | ||
515 | * @os: old (current) state. | ||
516 | * @ns: new (wanted) state. | ||
517 | */ | ||
518 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
519 | union drbd_state os, union drbd_state ns) | ||
520 | { | ||
521 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
522 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
523 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
524 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
525 | (os.disk != D_FAILED && ns.disk == D_FAILED))) || | ||
526 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
527 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); | ||
528 | } | ||
529 | |||
530 | enum drbd_state_rv | ||
531 | drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
532 | union drbd_state mask, union drbd_state val) | ||
533 | { | ||
534 | unsigned long flags; | ||
535 | union drbd_state os, ns; | ||
536 | enum drbd_state_rv rv; | ||
537 | |||
538 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
539 | os = mdev->state; | ||
540 | ns.i = (os.i & ~mask.i) | val.i; | ||
541 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
542 | ns = mdev->state; | ||
543 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
544 | |||
545 | return rv; | ||
546 | } | ||
547 | |||
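drbd_change_state() composes the wanted state with ns.i = (os.i & ~mask.i) | val.i: every field of union drbd_state is a bitfield overlaid on one integer, so a single mask/value pair can update several fields at once. Below is a self-contained sketch of the idiom with a made-up two-field union, not the real drbd_state layout:

    #include <stdio.h>

    /* toy two-field state; the real union drbd_state has many more bitfields */
    union toy_state {
            struct {
                    unsigned role:2;
                    unsigned conn:5;
            } f;
            unsigned int i;
    };

    int main(void)
    {
            union toy_state os   = { .f = { .role = 1, .conn = 10 } };
            union toy_state mask = { .i = 0 };
            union toy_state val  = { .i = 0 };
            union toy_state ns;

            mask.f.role = 3;   /* select the role field for change...          */
            val.f.role  = 2;   /* ...and give it the new value; conn untouched */

            ns.i = (os.i & ~mask.i) | val.i;
            printf("role %u -> %u, conn stays %u\n",
                   os.f.role, ns.f.role, ns.f.conn);
            return 0;
    }

Only the bits selected by mask are replaced; every other field keeps its old value, which is what lets callers write NS(conn, C_PROTOCOL_ERROR) without touching role or disk.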
548 | /** | ||
549 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
550 | * @mdev: DRBD device. | ||
551 | * @mask: mask of state bits to change. | ||
552 | * @val: value of new state bits. | ||
553 | */ | ||
554 | void drbd_force_state(struct drbd_conf *mdev, | ||
555 | union drbd_state mask, union drbd_state val) | ||
556 | { | ||
557 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
558 | } | ||
559 | |||
560 | static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); | ||
561 | static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, | ||
562 | union drbd_state, | ||
563 | union drbd_state); | ||
564 | enum sanitize_state_warnings { | ||
565 | NO_WARNING, | ||
566 | ABORTED_ONLINE_VERIFY, | ||
567 | ABORTED_RESYNC, | ||
568 | CONNECTION_LOST_NEGOTIATING, | ||
569 | IMPLICITLY_UPGRADED_DISK, | ||
570 | IMPLICITLY_UPGRADED_PDSK, | ||
571 | }; | ||
572 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
573 | union drbd_state ns, enum sanitize_state_warnings *warn); | ||
574 | int drbd_send_state_req(struct drbd_conf *, | ||
575 | union drbd_state, union drbd_state); | ||
576 | |||
577 | static enum drbd_state_rv | ||
578 | _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, | ||
579 | union drbd_state val) | ||
580 | { | ||
581 | union drbd_state os, ns; | ||
582 | unsigned long flags; | ||
583 | enum drbd_state_rv rv; | ||
584 | |||
585 | if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS)) | ||
586 | return SS_CW_SUCCESS; | ||
587 | |||
588 | if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL)) | ||
589 | return SS_CW_FAILED_BY_PEER; | ||
590 | |||
591 | rv = 0; | ||
592 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
593 | os = mdev->state; | ||
594 | ns.i = (os.i & ~mask.i) | val.i; | ||
595 | ns = sanitize_state(mdev, os, ns, NULL); | ||
596 | |||
597 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
598 | rv = SS_CW_NO_NEED; | ||
599 | if (!rv) { | ||
600 | rv = is_valid_state(mdev, ns); | ||
601 | if (rv == SS_SUCCESS) { | ||
602 | rv = is_valid_state_transition(mdev, ns, os); | ||
603 | if (rv == SS_SUCCESS) | ||
604 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
605 | } | ||
606 | } | ||
607 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
608 | |||
609 | return rv; | ||
610 | } | ||
611 | |||
612 | /** | ||
613 | * drbd_req_state() - Perform an eventually cluster wide state change | ||
614 | * @mdev: DRBD device. | ||
615 | * @mask: mask of state bits to change. | ||
616 | * @val: value of new state bits. | ||
617 | * @f: flags | ||
618 | * | ||
619 | * Should not be called directly, use drbd_request_state() or | ||
620 | * _drbd_request_state(). | ||
621 | */ | ||
622 | static enum drbd_state_rv | ||
623 | drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, | ||
624 | union drbd_state val, enum chg_state_flags f) | ||
625 | { | ||
626 | struct completion done; | ||
627 | unsigned long flags; | ||
628 | union drbd_state os, ns; | ||
629 | enum drbd_state_rv rv; | ||
630 | |||
631 | init_completion(&done); | ||
632 | |||
633 | if (f & CS_SERIALIZE) | ||
634 | mutex_lock(&mdev->state_mutex); | ||
635 | |||
636 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
637 | os = mdev->state; | ||
638 | ns.i = (os.i & ~mask.i) | val.i; | ||
639 | ns = sanitize_state(mdev, os, ns, NULL); | ||
640 | |||
641 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
642 | rv = is_valid_state(mdev, ns); | ||
643 | if (rv == SS_SUCCESS) | ||
644 | rv = is_valid_state_transition(mdev, ns, os); | ||
645 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
646 | |||
647 | if (rv < SS_SUCCESS) { | ||
648 | if (f & CS_VERBOSE) | ||
649 | print_st_err(mdev, os, ns, rv); | ||
650 | goto abort; | ||
651 | } | ||
652 | |||
653 | drbd_state_lock(mdev); | ||
654 | if (!drbd_send_state_req(mdev, mask, val)) { | ||
655 | drbd_state_unlock(mdev); | ||
656 | rv = SS_CW_FAILED_BY_PEER; | ||
657 | if (f & CS_VERBOSE) | ||
658 | print_st_err(mdev, os, ns, rv); | ||
659 | goto abort; | ||
660 | } | ||
661 | |||
662 | if (mask.conn == C_MASK && val.conn == C_DISCONNECTING) | ||
663 | drbd_set_flag(mdev, DISCONNECT_SENT); | ||
664 | |||
665 | wait_event(mdev->state_wait, | ||
666 | (rv = _req_st_cond(mdev, mask, val))); | ||
667 | |||
668 | if (rv < SS_SUCCESS) { | ||
669 | drbd_state_unlock(mdev); | ||
670 | if (f & CS_VERBOSE) | ||
671 | print_st_err(mdev, os, ns, rv); | ||
672 | goto abort; | ||
673 | } | ||
674 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
675 | os = mdev->state; | ||
676 | ns.i = (os.i & ~mask.i) | val.i; | ||
677 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
678 | drbd_state_unlock(mdev); | ||
679 | } else { | ||
680 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
681 | } | ||
682 | |||
683 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
684 | |||
685 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
686 | D_ASSERT(current != mdev->worker.task); | ||
687 | wait_for_completion(&done); | ||
688 | } | ||
689 | |||
690 | abort: | ||
691 | if (f & CS_SERIALIZE) | ||
692 | mutex_unlock(&mdev->state_mutex); | ||
693 | |||
694 | return rv; | ||
695 | } | ||
696 | |||
697 | /** | ||
698 | * _drbd_request_state() - Request a state change (with flags) | ||
699 | * @mdev: DRBD device. | ||
700 | * @mask: mask of state bits to change. | ||
701 | * @val: value of new state bits. | ||
702 | * @f: flags | ||
703 | * | ||
704 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
705 | * flag, or when logging of failed state change requests is not desired. | ||
706 | */ | ||
707 | enum drbd_state_rv | ||
708 | _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
709 | union drbd_state val, enum chg_state_flags f) | ||
710 | { | ||
711 | enum drbd_state_rv rv; | ||
712 | |||
713 | wait_event(mdev->state_wait, | ||
714 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
715 | |||
716 | return rv; | ||
717 | } | ||
718 | |||
719 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
720 | { | ||
721 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", | ||
722 | name, | ||
723 | drbd_conn_str(ns.conn), | ||
724 | drbd_role_str(ns.role), | ||
725 | drbd_role_str(ns.peer), | ||
726 | drbd_disk_str(ns.disk), | ||
727 | drbd_disk_str(ns.pdsk), | ||
728 | is_susp(ns) ? 's' : 'r', | ||
729 | ns.aftr_isp ? 'a' : '-', | ||
730 | ns.peer_isp ? 'p' : '-', | ||
731 | ns.user_isp ? 'u' : '-' | ||
732 | ); | ||
733 | } | ||
734 | |||
735 | void print_st_err(struct drbd_conf *mdev, union drbd_state os, | ||
736 | union drbd_state ns, enum drbd_state_rv err) | ||
737 | { | ||
738 | if (err == SS_IN_TRANSIENT_STATE) | ||
739 | return; | ||
740 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
741 | print_st(mdev, " state", os); | ||
742 | print_st(mdev, "wanted", ns); | ||
743 | } | ||
744 | |||
745 | |||
746 | /** | ||
747 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
748 | * @mdev: DRBD device. | ||
749 | * @ns: State to consider. | ||
750 | */ | ||
751 | static enum drbd_state_rv | ||
752 | is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
753 | { | ||
754 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
755 | |||
756 | enum drbd_fencing_p fp; | ||
757 | enum drbd_state_rv rv = SS_SUCCESS; | ||
758 | |||
759 | fp = FP_DONT_CARE; | ||
760 | if (get_ldev(mdev)) { | ||
761 | fp = mdev->ldev->dc.fencing; | ||
762 | put_ldev(mdev); | ||
763 | } | ||
764 | |||
765 | if (get_net_conf(mdev)) { | ||
766 | if (!mdev->net_conf->two_primaries && | ||
767 | ns.role == R_PRIMARY && ns.peer == R_PRIMARY) | ||
768 | rv = SS_TWO_PRIMARIES; | ||
769 | put_net_conf(mdev); | ||
770 | } | ||
771 | |||
772 | if (rv <= 0) | ||
773 | /* already found a reason to abort */; | ||
774 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
775 | rv = SS_DEVICE_IN_USE; | ||
776 | |||
777 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
778 | rv = SS_NO_UP_TO_DATE_DISK; | ||
779 | |||
780 | else if (fp >= FP_RESOURCE && | ||
781 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
782 | rv = SS_PRIMARY_NOP; | ||
783 | |||
784 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
785 | rv = SS_NO_UP_TO_DATE_DISK; | ||
786 | |||
787 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
788 | rv = SS_NO_LOCAL_DISK; | ||
789 | |||
790 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
791 | rv = SS_NO_REMOTE_DISK; | ||
792 | |||
793 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
794 | rv = SS_NO_UP_TO_DATE_DISK; | ||
795 | |||
796 | else if ((ns.conn == C_CONNECTED || | ||
797 | ns.conn == C_WF_BITMAP_S || | ||
798 | ns.conn == C_SYNC_SOURCE || | ||
799 | ns.conn == C_PAUSED_SYNC_S) && | ||
800 | ns.disk == D_OUTDATED) | ||
801 | rv = SS_CONNECTED_OUTDATES; | ||
802 | |||
803 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
804 | (mdev->sync_conf.verify_alg[0] == 0)) | ||
805 | rv = SS_NO_VERIFY_ALG; | ||
806 | |||
807 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
808 | mdev->agreed_pro_version < 88) | ||
809 | rv = SS_NOT_SUPPORTED; | ||
810 | |||
811 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | ||
812 | rv = SS_CONNECTED_OUTDATES; | ||
813 | |||
814 | return rv; | ||
815 | } | ||
816 | |||
817 | /** | ||
818 | * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible | ||
819 | * @mdev: DRBD device. | ||
820 | * @ns: new state. | ||
821 | * @os: old state. | ||
822 | */ | ||
823 | static enum drbd_state_rv | ||
824 | is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, | ||
825 | union drbd_state os) | ||
826 | { | ||
827 | enum drbd_state_rv rv = SS_SUCCESS; | ||
828 | |||
829 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
830 | os.conn > C_CONNECTED) | ||
831 | rv = SS_RESYNC_RUNNING; | ||
832 | |||
833 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
834 | rv = SS_ALREADY_STANDALONE; | ||
835 | |||
836 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
837 | rv = SS_IS_DISKLESS; | ||
838 | |||
839 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
840 | rv = SS_NO_NET_CONFIG; | ||
841 | |||
842 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
843 | rv = SS_LOWER_THAN_OUTDATED; | ||
844 | |||
845 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
846 | rv = SS_IN_TRANSIENT_STATE; | ||
847 | |||
848 | if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
849 | rv = SS_IN_TRANSIENT_STATE; | ||
850 | |||
851 | /* While establishing a connection only allow cstate to change. | ||
852 | Delay/refuse role changes, detach attach etc... */ | ||
853 | if (drbd_test_flag(mdev, STATE_SENT) && | ||
854 | !(os.conn == C_WF_REPORT_PARAMS || | ||
855 | (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) | ||
856 | rv = SS_IN_TRANSIENT_STATE; | ||
857 | |||
858 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
859 | rv = SS_NEED_CONNECTION; | ||
860 | |||
861 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
862 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
863 | rv = SS_RESYNC_RUNNING; | ||
864 | |||
865 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
866 | os.conn < C_CONNECTED) | ||
867 | rv = SS_NEED_CONNECTION; | ||
868 | |||
869 | if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) | ||
870 | && os.conn < C_WF_REPORT_PARAMS) | ||
871 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | ||
872 | |||
873 | return rv; | ||
874 | } | ||
875 | |||
876 | static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) | ||
877 | { | ||
878 | static const char *msg_table[] = { | ||
879 | [NO_WARNING] = "", | ||
880 | [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", | ||
881 | [ABORTED_RESYNC] = "Resync aborted.", | ||
882 | [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", | ||
883 | [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", | ||
884 | [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", | ||
885 | }; | ||
886 | |||
887 | if (warn != NO_WARNING) | ||
888 | dev_warn(DEV, "%s\n", msg_table[warn]); | ||
889 | } | ||
890 | |||
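print_sanitize_warnings() maps the warning enum to text through a designated-initializer array, which keeps the table consistent with the enum even if enumerators are reordered. A stripped-down sketch of the same pattern, with a shorter enum and message strings invented for illustration:

    #include <stdio.h>

    enum toy_warning { NO_WARN, ABORTED_VERIFY, ABORTED_RESYNC };

    static void print_warning(enum toy_warning warn)
    {
            static const char *msg_table[] = {
                    [NO_WARN]        = "",
                    [ABORTED_VERIFY] = "Online-verify aborted.",
                    [ABORTED_RESYNC] = "Resync aborted.",
            };

            if (warn != NO_WARN)
                    printf("warning: %s\n", msg_table[warn]);
    }

    int main(void)
    {
            print_warning(ABORTED_RESYNC);
            return 0;
    }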
891 | /** | ||
892 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
893 | * @mdev: DRBD device. | ||
894 | * @os: old state. | ||
895 | * @ns: new state. | ||
896 | * @warn_sync_abort: | ||
897 | * | ||
898 | * When we lose the connection, we have to set the state of the peer's disk (pdsk) | ||
899 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
900 | */ | ||
901 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
902 | union drbd_state ns, enum sanitize_state_warnings *warn) | ||
903 | { | ||
904 | enum drbd_fencing_p fp; | ||
905 | enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; | ||
906 | |||
907 | if (warn) | ||
908 | *warn = NO_WARNING; | ||
909 | |||
910 | fp = FP_DONT_CARE; | ||
911 | if (get_ldev(mdev)) { | ||
912 | fp = mdev->ldev->dc.fencing; | ||
913 | put_ldev(mdev); | ||
914 | } | ||
915 | |||
916 | /* Disallow Network errors to configure a device's network part */ | ||
917 | if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && | ||
918 | os.conn <= C_DISCONNECTING) | ||
919 | ns.conn = os.conn; | ||
920 | |||
921 | /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. | ||
922 | * If you try to go into some Sync* state, that shall fail (elsewhere). */ | ||
923 | if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && | ||
924 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) | ||
925 | ns.conn = os.conn; | ||
926 | |||
927 | /* we cannot fail (again) if we already detached */ | ||
928 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
929 | ns.disk = D_DISKLESS; | ||
930 | |||
931 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
932 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | ||
933 | ns.conn = os.conn; | ||
934 | |||
935 | if (ns.conn < C_CONNECTED) { | ||
936 | ns.peer_isp = 0; | ||
937 | ns.peer = R_UNKNOWN; | ||
938 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
939 | ns.pdsk = D_UNKNOWN; | ||
940 | } | ||
941 | |||
942 | /* Clear the aftr_isp when becoming unconfigured */ | ||
943 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
944 | ns.aftr_isp = 0; | ||
945 | |||
946 | /* Abort resync if a disk fails/detaches */ | ||
947 | if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && | ||
948 | (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
949 | if (warn) | ||
950 | *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? | ||
951 | ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; | ||
952 | ns.conn = C_CONNECTED; | ||
953 | } | ||
954 | |||
955 | /* Connection breaks down before we finished "Negotiating" */ | ||
956 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
957 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
958 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
959 | ns.disk = mdev->new_state_tmp.disk; | ||
960 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
961 | } else { | ||
962 | if (warn) | ||
963 | *warn = CONNECTION_LOST_NEGOTIATING; | ||
964 | ns.disk = D_DISKLESS; | ||
965 | ns.pdsk = D_UNKNOWN; | ||
966 | } | ||
967 | put_ldev(mdev); | ||
968 | } | ||
969 | |||
970 | /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ | ||
971 | if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { | ||
972 | if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) | ||
973 | ns.disk = D_UP_TO_DATE; | ||
974 | if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) | ||
975 | ns.pdsk = D_UP_TO_DATE; | ||
976 | } | ||
977 | |||
978 | /* Implications of the connection state on the disk states */ | ||
979 | disk_min = D_DISKLESS; | ||
980 | disk_max = D_UP_TO_DATE; | ||
981 | pdsk_min = D_INCONSISTENT; | ||
982 | pdsk_max = D_UNKNOWN; | ||
983 | switch ((enum drbd_conns)ns.conn) { | ||
984 | case C_WF_BITMAP_T: | ||
985 | case C_PAUSED_SYNC_T: | ||
986 | case C_STARTING_SYNC_T: | ||
987 | case C_WF_SYNC_UUID: | ||
988 | case C_BEHIND: | ||
989 | disk_min = D_INCONSISTENT; | ||
990 | disk_max = D_OUTDATED; | ||
991 | pdsk_min = D_UP_TO_DATE; | ||
992 | pdsk_max = D_UP_TO_DATE; | ||
993 | break; | ||
994 | case C_VERIFY_S: | ||
995 | case C_VERIFY_T: | ||
996 | disk_min = D_UP_TO_DATE; | ||
997 | disk_max = D_UP_TO_DATE; | ||
998 | pdsk_min = D_UP_TO_DATE; | ||
999 | pdsk_max = D_UP_TO_DATE; | ||
1000 | break; | ||
1001 | case C_CONNECTED: | ||
1002 | disk_min = D_DISKLESS; | ||
1003 | disk_max = D_UP_TO_DATE; | ||
1004 | pdsk_min = D_DISKLESS; | ||
1005 | pdsk_max = D_UP_TO_DATE; | ||
1006 | break; | ||
1007 | case C_WF_BITMAP_S: | ||
1008 | case C_PAUSED_SYNC_S: | ||
1009 | case C_STARTING_SYNC_S: | ||
1010 | case C_AHEAD: | ||
1011 | disk_min = D_UP_TO_DATE; | ||
1012 | disk_max = D_UP_TO_DATE; | ||
1013 | pdsk_min = D_INCONSISTENT; | ||
1014 | pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ | ||
1015 | break; | ||
1016 | case C_SYNC_TARGET: | ||
1017 | disk_min = D_INCONSISTENT; | ||
1018 | disk_max = D_INCONSISTENT; | ||
1019 | pdsk_min = D_UP_TO_DATE; | ||
1020 | pdsk_max = D_UP_TO_DATE; | ||
1021 | break; | ||
1022 | case C_SYNC_SOURCE: | ||
1023 | disk_min = D_UP_TO_DATE; | ||
1024 | disk_max = D_UP_TO_DATE; | ||
1025 | pdsk_min = D_INCONSISTENT; | ||
1026 | pdsk_max = D_INCONSISTENT; | ||
1027 | break; | ||
1028 | case C_STANDALONE: | ||
1029 | case C_DISCONNECTING: | ||
1030 | case C_UNCONNECTED: | ||
1031 | case C_TIMEOUT: | ||
1032 | case C_BROKEN_PIPE: | ||
1033 | case C_NETWORK_FAILURE: | ||
1034 | case C_PROTOCOL_ERROR: | ||
1035 | case C_TEAR_DOWN: | ||
1036 | case C_WF_CONNECTION: | ||
1037 | case C_WF_REPORT_PARAMS: | ||
1038 | case C_MASK: | ||
1039 | break; | ||
1040 | } | ||
1041 | if (ns.disk > disk_max) | ||
1042 | ns.disk = disk_max; | ||
1043 | |||
1044 | if (ns.disk < disk_min) { | ||
1045 | if (warn) | ||
1046 | *warn = IMPLICITLY_UPGRADED_DISK; | ||
1047 | ns.disk = disk_min; | ||
1048 | } | ||
1049 | if (ns.pdsk > pdsk_max) | ||
1050 | ns.pdsk = pdsk_max; | ||
1051 | |||
1052 | if (ns.pdsk < pdsk_min) { | ||
1053 | if (warn) | ||
1054 | *warn = IMPLICITLY_UPGRADED_PDSK; | ||
1055 | ns.pdsk = pdsk_min; | ||
1056 | } | ||
1057 | |||
1058 | if (fp == FP_STONITH && | ||
1059 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && | ||
1060 | !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) | ||
1061 | ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ | ||
1062 | |||
1063 | if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && | ||
1064 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && | ||
1065 | !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) | ||
1066 | ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ | ||
1067 | |||
1068 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
1069 | if (ns.conn == C_SYNC_SOURCE) | ||
1070 | ns.conn = C_PAUSED_SYNC_S; | ||
1071 | if (ns.conn == C_SYNC_TARGET) | ||
1072 | ns.conn = C_PAUSED_SYNC_T; | ||
1073 | } else { | ||
1074 | if (ns.conn == C_PAUSED_SYNC_S) | ||
1075 | ns.conn = C_SYNC_SOURCE; | ||
1076 | if (ns.conn == C_PAUSED_SYNC_T) | ||
1077 | ns.conn = C_SYNC_TARGET; | ||
1078 | } | ||
1079 | |||
1080 | return ns; | ||
1081 | } | ||
1082 | |||
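The switch at the end of sanitize_state() derives, per connection state, the allowed window for the local and the peer disk state and then clamps both into [min, max], warning only when a state has to be upgraded implicitly. A compact userspace model of that clamp, using a made-up three-value disk scale instead of the real D_* enum:

    #include <stdio.h>

    enum toy_disk { DISKLESS, INCONSISTENT, UP_TO_DATE };

    /* clamp a disk state into the window allowed by the connection state */
    static enum toy_disk clamp_disk(enum toy_disk d,
                                    enum toy_disk min, enum toy_disk max,
                                    int *warned)
    {
            if (d > max)
                    d = max;
            if (d < min) {
                    *warned = 1;   /* mirrors IMPLICITLY_UPGRADED_DISK above */
                    d = min;
            }
            return d;
    }

    int main(void)
    {
            int warned = 0;
            /* e.g. as sync target the local disk may not claim UP_TO_DATE */
            enum toy_disk disk = clamp_disk(UP_TO_DATE, INCONSISTENT,
                                            INCONSISTENT, &warned);
            printf("disk=%d warned=%d\n", disk, warned);   /* 1 0 */
            return 0;
    }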
1083 | /* helper for __drbd_set_state */ | ||
1084 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
1085 | { | ||
1086 | if (mdev->agreed_pro_version < 90) | ||
1087 | mdev->ov_start_sector = 0; | ||
1088 | mdev->rs_total = drbd_bm_bits(mdev); | ||
1089 | mdev->ov_position = 0; | ||
1090 | if (cs == C_VERIFY_T) { | ||
1091 | /* starting online verify from an arbitrary position | ||
1092 | * does not fit well into the existing protocol. | ||
1093 | * on C_VERIFY_T, we initialize ov_left and friends | ||
1094 | * implicitly in receive_DataRequest once the | ||
1095 | * first P_OV_REQUEST is received */ | ||
1096 | mdev->ov_start_sector = ~(sector_t)0; | ||
1097 | } else { | ||
1098 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
1099 | if (bit >= mdev->rs_total) { | ||
1100 | mdev->ov_start_sector = | ||
1101 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
1102 | mdev->rs_total = 1; | ||
1103 | } else | ||
1104 | mdev->rs_total -= bit; | ||
1105 | mdev->ov_position = mdev->ov_start_sector; | ||
1106 | } | ||
1107 | mdev->ov_left = mdev->rs_total; | ||
1108 | } | ||
1109 | |||
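set_ov_position() converts between 512-byte sectors and bitmap bits with BM_SECT_TO_BIT()/BM_BIT_TO_SECT(). Assuming the usual 4 KiB of storage per bitmap bit (the real constants live in drbd_int.h, so treat the shift below as an assumption of this sketch), the conversion is a shift by three:

    #include <stdio.h>

    /* assumption for this sketch: one bitmap bit covers 4 KiB = 8 sectors */
    #define SECTORS_PER_BIT_SHIFT 3
    #define sect_to_bit(s) ((unsigned long)((s) >> SECTORS_PER_BIT_SHIFT))
    #define bit_to_sect(b) ((unsigned long long)(b) << SECTORS_PER_BIT_SHIFT)

    int main(void)
    {
            unsigned long long ov_start_sector = 123457;   /* arbitrary start */
            unsigned long bit = sect_to_bit(ov_start_sector);

            /* going back rounds down to the first sector covered by that bit */
            printf("sector %llu -> bit %lu -> sector %llu\n",
                   ov_start_sector, bit, bit_to_sect(bit));
            return 0;
    }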
1110 | static void drbd_resume_al(struct drbd_conf *mdev) | ||
1111 | { | ||
1112 | if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED)) | ||
1113 | dev_info(DEV, "Resumed AL updates\n"); | ||
1114 | } | ||
1115 | |||
1116 | /** | ||
1117 | * __drbd_set_state() - Set a new DRBD state | ||
1118 | * @mdev: DRBD device. | ||
1119 | * @ns: new state. | ||
1120 | * @flags: Flags | ||
1121 | * @done: Optional completion, that will get completed after the after_state_ch() finished | ||
1122 | * | ||
1123 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
1124 | */ | ||
1125 | enum drbd_state_rv | ||
1126 | __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | ||
1127 | enum chg_state_flags flags, struct completion *done) | ||
1128 | { | ||
1129 | union drbd_state os; | ||
1130 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1131 | enum sanitize_state_warnings ssw; | ||
1132 | struct after_state_chg_work *ascw; | ||
1133 | |||
1134 | os = mdev->state; | ||
1135 | |||
1136 | ns = sanitize_state(mdev, os, ns, &ssw); | ||
1137 | |||
1138 | if (ns.i == os.i) | ||
1139 | return SS_NOTHING_TO_DO; | ||
1140 | |||
1141 | if (!(flags & CS_HARD)) { | ||
1142 | /* pre-state-change checks ; only look at ns */ | ||
1143 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
1144 | |||
1145 | rv = is_valid_state(mdev, ns); | ||
1146 | if (rv < SS_SUCCESS) { | ||
1147 | /* If the old state was illegal as well, then let | ||
1148 | this happen...*/ | ||
1149 | |||
1150 | if (is_valid_state(mdev, os) == rv) | ||
1151 | rv = is_valid_state_transition(mdev, ns, os); | ||
1152 | } else | ||
1153 | rv = is_valid_state_transition(mdev, ns, os); | ||
1154 | } | ||
1155 | |||
1156 | if (rv < SS_SUCCESS) { | ||
1157 | if (flags & CS_VERBOSE) | ||
1158 | print_st_err(mdev, os, ns, rv); | ||
1159 | return rv; | ||
1160 | } | ||
1161 | |||
1162 | print_sanitize_warnings(mdev, ssw); | ||
1163 | |||
1164 | { | ||
1165 | char *pbp, pb[300]; | ||
1166 | pbp = pb; | ||
1167 | *pbp = 0; | ||
1168 | if (ns.role != os.role) | ||
1169 | pbp += sprintf(pbp, "role( %s -> %s ) ", | ||
1170 | drbd_role_str(os.role), | ||
1171 | drbd_role_str(ns.role)); | ||
1172 | if (ns.peer != os.peer) | ||
1173 | pbp += sprintf(pbp, "peer( %s -> %s ) ", | ||
1174 | drbd_role_str(os.peer), | ||
1175 | drbd_role_str(ns.peer)); | ||
1176 | if (ns.conn != os.conn) | ||
1177 | pbp += sprintf(pbp, "conn( %s -> %s ) ", | ||
1178 | drbd_conn_str(os.conn), | ||
1179 | drbd_conn_str(ns.conn)); | ||
1180 | if (ns.disk != os.disk) | ||
1181 | pbp += sprintf(pbp, "disk( %s -> %s ) ", | ||
1182 | drbd_disk_str(os.disk), | ||
1183 | drbd_disk_str(ns.disk)); | ||
1184 | if (ns.pdsk != os.pdsk) | ||
1185 | pbp += sprintf(pbp, "pdsk( %s -> %s ) ", | ||
1186 | drbd_disk_str(os.pdsk), | ||
1187 | drbd_disk_str(ns.pdsk)); | ||
1188 | if (is_susp(ns) != is_susp(os)) | ||
1189 | pbp += sprintf(pbp, "susp( %d -> %d ) ", | ||
1190 | is_susp(os), | ||
1191 | is_susp(ns)); | ||
1192 | if (ns.aftr_isp != os.aftr_isp) | ||
1193 | pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", | ||
1194 | os.aftr_isp, | ||
1195 | ns.aftr_isp); | ||
1196 | if (ns.peer_isp != os.peer_isp) | ||
1197 | pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", | ||
1198 | os.peer_isp, | ||
1199 | ns.peer_isp); | ||
1200 | if (ns.user_isp != os.user_isp) | ||
1201 | pbp += sprintf(pbp, "user_isp( %d -> %d ) ", | ||
1202 | os.user_isp, | ||
1203 | ns.user_isp); | ||
1204 | dev_info(DEV, "%s\n", pb); | ||
1205 | } | ||
1206 | |||
1207 | /* solve the race between becoming unconfigured, | ||
1208 | * worker doing the cleanup, and | ||
1209 | * admin reconfiguring us: | ||
1210 | * on (re)configure, first set CONFIG_PENDING, | ||
1211 | * then wait for a potentially exiting worker, | ||
1212 | * start the worker, and schedule one no_op. | ||
1213 | * then proceed with configuration. | ||
1214 | */ | ||
1215 | if (ns.disk == D_DISKLESS && | ||
1216 | ns.conn == C_STANDALONE && | ||
1217 | ns.role == R_SECONDARY && | ||
1218 | !drbd_test_and_set_flag(mdev, CONFIG_PENDING)) | ||
1219 | drbd_set_flag(mdev, DEVICE_DYING); | ||
1220 | |||
1221 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference | ||
1222 | * on the ldev here, to be sure the transition -> D_DISKLESS resp. | ||
1223 | * drbd_ldev_destroy() won't happen before our corresponding | ||
1224 | * after_state_ch works run, where we put_ldev again. */ | ||
1225 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
1226 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
1227 | atomic_inc(&mdev->local_cnt); | ||
1228 | |||
1229 | mdev->state = ns; | ||
1230 | |||
1231 | if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) | ||
1232 | drbd_print_uuids(mdev, "attached to UUIDs"); | ||
1233 | |||
1234 | wake_up(&mdev->misc_wait); | ||
1235 | wake_up(&mdev->state_wait); | ||
1236 | |||
1237 | /* Aborted verify run, or we reached the stop sector. | ||
1238 | * Log the last position, unless end-of-device. */ | ||
1239 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1240 | ns.conn <= C_CONNECTED) { | ||
1241 | mdev->ov_start_sector = | ||
1242 | BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); | ||
1243 | if (mdev->ov_left) | ||
1244 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1245 | (unsigned long long)mdev->ov_start_sector); | ||
1246 | } | ||
1247 | |||
1248 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1249 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1250 | dev_info(DEV, "Syncer continues.\n"); | ||
1251 | mdev->rs_paused += (long)jiffies | ||
1252 | -(long)mdev->rs_mark_time[mdev->rs_last_mark]; | ||
1253 | if (ns.conn == C_SYNC_TARGET) | ||
1254 | mod_timer(&mdev->resync_timer, jiffies); | ||
1255 | } | ||
1256 | |||
1257 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1258 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1259 | dev_info(DEV, "Resync suspended\n"); | ||
1260 | mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; | ||
1261 | } | ||
1262 | |||
1263 | if (os.conn == C_CONNECTED && | ||
1264 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1265 | unsigned long now = jiffies; | ||
1266 | int i; | ||
1267 | |||
1268 | set_ov_position(mdev, ns.conn); | ||
1269 | mdev->rs_start = now; | ||
1270 | mdev->rs_last_events = 0; | ||
1271 | mdev->rs_last_sect_ev = 0; | ||
1272 | mdev->ov_last_oos_size = 0; | ||
1273 | mdev->ov_last_oos_start = 0; | ||
1274 | |||
1275 | for (i = 0; i < DRBD_SYNC_MARKS; i++) { | ||
1276 | mdev->rs_mark_left[i] = mdev->ov_left; | ||
1277 | mdev->rs_mark_time[i] = now; | ||
1278 | } | ||
1279 | |||
1280 | drbd_rs_controller_reset(mdev); | ||
1281 | |||
1282 | if (ns.conn == C_VERIFY_S) { | ||
1283 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1284 | (unsigned long long)mdev->ov_position); | ||
1285 | mod_timer(&mdev->resync_timer, jiffies); | ||
1286 | } | ||
1287 | } | ||
1288 | |||
1289 | if (get_ldev(mdev)) { | ||
1290 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1291 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1292 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1293 | |||
1294 | if (drbd_test_flag(mdev, CRASHED_PRIMARY)) | ||
1295 | mdf |= MDF_CRASHED_PRIMARY; | ||
1296 | if (mdev->state.role == R_PRIMARY || | ||
1297 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1298 | mdf |= MDF_PRIMARY_IND; | ||
1299 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1300 | mdf |= MDF_CONNECTED_IND; | ||
1301 | if (mdev->state.disk > D_INCONSISTENT) | ||
1302 | mdf |= MDF_CONSISTENT; | ||
1303 | if (mdev->state.disk > D_OUTDATED) | ||
1304 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1305 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1306 | mdf |= MDF_PEER_OUT_DATED; | ||
1307 | if (mdf != mdev->ldev->md.flags) { | ||
1308 | mdev->ldev->md.flags = mdf; | ||
1309 | drbd_md_mark_dirty(mdev); | ||
1310 | } | ||
1311 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1312 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1313 | put_ldev(mdev); | ||
1314 | } | ||
1315 | |||
1316 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1317 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1318 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1319 | drbd_set_flag(mdev, CONSIDER_RESYNC); | ||
1320 | |||
1321 | /* Receiver should clean up itself */ | ||
1322 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1323 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1324 | |||
1325 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1326 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1327 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1328 | |||
1329 | /* Upon network failure, we need to restart the receiver. */ | ||
1330 | if (os.conn > C_WF_CONNECTION && | ||
1331 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1332 | drbd_thread_restart_nowait(&mdev->receiver); | ||
1333 | |||
1334 | /* Resume AL writing if we get a connection */ | ||
1335 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1336 | drbd_resume_al(mdev); | ||
1337 | |||
1338 | /* remember last connect and attach times so request_timer_fn() won't | ||
1339 | * kill newly established sessions while we are still trying to thaw | ||
1340 | * previously frozen IO */ | ||
1341 | if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) | ||
1342 | mdev->last_reconnect_jif = jiffies; | ||
1343 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1344 | ns.disk > D_NEGOTIATING) | ||
1345 | mdev->last_reattach_jif = jiffies; | ||
1346 | |||
1347 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1348 | if (ascw) { | ||
1349 | ascw->os = os; | ||
1350 | ascw->ns = ns; | ||
1351 | ascw->flags = flags; | ||
1352 | ascw->w.cb = w_after_state_ch; | ||
1353 | ascw->done = done; | ||
1354 | drbd_queue_work(&mdev->data.work, &ascw->w); | ||
1355 | } else { | ||
1356 | dev_warn(DEV, "Could not kmalloc an ascw\n"); | ||
1357 | } | ||
1358 | |||
1359 | return rv; | ||
1360 | } | ||
1361 | |||
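__drbd_set_state() assembles its one-line change report by advancing a pointer with pbp += sprintf(pbp, ...), so only the fields that actually changed end up in the log line. A small standalone version of that pattern, with the field set and buffer size reduced for illustration:

    #include <stdio.h>
    #include <string.h>

    struct toy_state { const char *role; const char *conn; };

    int main(void)
    {
            struct toy_state os = { "Secondary", "Connected" };
            struct toy_state ns = { "Primary",   "Connected" };
            char pb[300], *pbp = pb;

            *pbp = 0;
            if (strcmp(os.role, ns.role))
                    pbp += sprintf(pbp, "role( %s -> %s ) ", os.role, ns.role);
            if (strcmp(os.conn, ns.conn))
                    pbp += sprintf(pbp, "conn( %s -> %s ) ", os.conn, ns.conn);

            /* only the changed fields end up in the report */
            printf("%s\n", pb);   /* "role( Secondary -> Primary ) " */
            return 0;
    }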
1362 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1363 | { | ||
1364 | struct after_state_chg_work *ascw = | ||
1365 | container_of(w, struct after_state_chg_work, w); | ||
1366 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1367 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1368 | D_ASSERT(ascw->done != NULL); | ||
1369 | complete(ascw->done); | ||
1370 | } | ||
1371 | kfree(ascw); | ||
1372 | |||
1373 | return 1; | ||
1374 | } | ||
1375 | |||
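w_after_state_ch() recovers its context from the embedded work item via container_of(). A userspace sketch of the idiom; the macro below matches the common definition, while the two structs are invented stand-ins for after_state_chg_work:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work { void (*cb)(struct work *w); };

    struct after_state_work {
            struct work w;   /* embedded item handed to the worker   */
            int new_state;   /* private payload of the outer struct  */
    };

    static void after_cb(struct work *w)
    {
            /* step back from the member to the enclosing structure */
            struct after_state_work *a =
                    container_of(w, struct after_state_work, w);
            printf("after state change: %d\n", a->new_state);
    }

    int main(void)
    {
            struct after_state_work a = { .w = { .cb = after_cb }, .new_state = 3 };

            a.w.cb(&a.w);   /* the worker only ever sees the inner struct work */
            return 0;
    }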
1376 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1377 | { | ||
1378 | if (rv) { | ||
1379 | dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); | ||
1380 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1381 | return; | ||
1382 | } | ||
1383 | |||
1384 | switch (mdev->state.conn) { | ||
1385 | case C_STARTING_SYNC_T: | ||
1386 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1387 | break; | ||
1388 | case C_STARTING_SYNC_S: | ||
1389 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1390 | break; | ||
1391 | } | ||
1392 | } | ||
1393 | |||
1394 | int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1395 | int (*io_fn)(struct drbd_conf *), | ||
1396 | char *why, enum bm_flag flags) | ||
1397 | { | ||
1398 | int rv; | ||
1399 | |||
1400 | D_ASSERT(current == mdev->worker.task); | ||
1401 | |||
1402 | /* open coded non-blocking drbd_suspend_io(mdev); */ | ||
1403 | drbd_set_flag(mdev, SUSPEND_IO); | ||
1404 | |||
1405 | drbd_bm_lock(mdev, why, flags); | ||
1406 | rv = io_fn(mdev); | ||
1407 | drbd_bm_unlock(mdev); | ||
1408 | |||
1409 | drbd_resume_io(mdev); | ||
1410 | |||
1411 | return rv; | ||
1412 | } | ||
1413 | |||
1414 | /** | ||
1415 | * after_state_ch() - Perform after state change actions that may sleep | ||
1416 | * @mdev: DRBD device. | ||
1417 | * @os: old state. | ||
1418 | * @ns: new state. | ||
1419 | * @flags: Flags | ||
1420 | */ | ||
1421 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1422 | union drbd_state ns, enum chg_state_flags flags) | ||
1423 | { | ||
1424 | enum drbd_fencing_p fp; | ||
1425 | enum drbd_req_event what = nothing; | ||
1426 | union drbd_state nsm = (union drbd_state){ .i = -1 }; | ||
1427 | |||
1428 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1429 | drbd_clear_flag(mdev, CRASHED_PRIMARY); | ||
1430 | if (mdev->p_uuid) | ||
1431 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1432 | } | ||
1433 | |||
1434 | fp = FP_DONT_CARE; | ||
1435 | if (get_ldev(mdev)) { | ||
1436 | fp = mdev->ldev->dc.fencing; | ||
1437 | put_ldev(mdev); | ||
1438 | } | ||
1439 | |||
1440 | /* Inform userspace about the change... */ | ||
1441 | drbd_bcast_state(mdev, ns); | ||
1442 | |||
1443 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1444 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1445 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1446 | |||
1447 | /* Here we have the actions that are performed after a | ||
1448 | state change. This function might sleep */ | ||
1449 | |||
1450 | if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) | ||
1451 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1452 | |||
1453 | nsm.i = -1; | ||
1454 | if (ns.susp_nod) { | ||
1455 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1456 | what = resend; | ||
1457 | |||
1458 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1459 | ns.disk > D_NEGOTIATING) | ||
1460 | what = restart_frozen_disk_io; | ||
1461 | |||
1462 | if (what != nothing) | ||
1463 | nsm.susp_nod = 0; | ||
1464 | } | ||
1465 | |||
1466 | if (ns.susp_fen) { | ||
1467 | /* case1: The outdate peer handler is successful: */ | ||
1468 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { | ||
1469 | if (drbd_test_flag(mdev, NEW_CUR_UUID)) { | ||
1470 | drbd_uuid_new_current(mdev); | ||
1471 | drbd_clear_flag(mdev, NEW_CUR_UUID); | ||
1472 | } | ||
1473 | spin_lock_irq(&mdev->req_lock); | ||
1474 | _tl_clear(mdev); | ||
1475 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); | ||
1476 | spin_unlock_irq(&mdev->req_lock); | ||
1477 | } | ||
1478 | /* case2: The connection was established again: */ | ||
1479 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { | ||
1480 | drbd_clear_flag(mdev, NEW_CUR_UUID); | ||
1481 | what = resend; | ||
1482 | nsm.susp_fen = 0; | ||
1483 | } | ||
1484 | } | ||
1485 | |||
1486 | if (what != nothing) { | ||
1487 | spin_lock_irq(&mdev->req_lock); | ||
1488 | _tl_restart(mdev, what); | ||
1489 | nsm.i &= mdev->state.i; | ||
1490 | _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); | ||
1491 | spin_unlock_irq(&mdev->req_lock); | ||
1492 | } | ||
1493 | |||
1494 | /* Became sync source. With protocol >= 96, we still need to send out | ||
1495 | * the sync uuid now. Need to do that before any drbd_send_state, or | ||
1496 | * the other side may go "paused sync" before receiving the sync uuids, | ||
1497 | * which is unexpected. */ | ||
1498 | if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && | ||
1499 | (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && | ||
1500 | mdev->agreed_pro_version >= 96 && get_ldev(mdev)) { | ||
1501 | drbd_gen_and_send_sync_uuid(mdev); | ||
1502 | put_ldev(mdev); | ||
1503 | } | ||
1504 | |||
1505 | /* Do not change the order of the if above and the two below... */ | ||
1506 | if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ | ||
1507 | /* we probably will start a resync soon. | ||
1508 | * make sure those things are properly reset. */ | ||
1509 | mdev->rs_total = 0; | ||
1510 | mdev->rs_failed = 0; | ||
1511 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1512 | drbd_rs_cancel_all(mdev); | ||
1513 | |||
1514 | drbd_send_uuids(mdev); | ||
1515 | drbd_send_state(mdev, ns); | ||
1516 | } | ||
1517 | /* No point in queuing send_bitmap if we don't have a connection | ||
1518 | * anymore, so check also the _current_ state, not only the new state | ||
1519 | * at the time this work was queued. */ | ||
1520 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && | ||
1521 | mdev->state.conn == C_WF_BITMAP_S) | ||
1522 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, | ||
1523 | "send_bitmap (WFBitMapS)", | ||
1524 | BM_LOCKED_TEST_ALLOWED); | ||
1525 | |||
1526 | /* Lost contact to peer's copy of the data */ | ||
1527 | if ((os.pdsk >= D_INCONSISTENT && | ||
1528 | os.pdsk != D_UNKNOWN && | ||
1529 | os.pdsk != D_OUTDATED) | ||
1530 | && (ns.pdsk < D_INCONSISTENT || | ||
1531 | ns.pdsk == D_UNKNOWN || | ||
1532 | ns.pdsk == D_OUTDATED)) { | ||
1533 | if (get_ldev(mdev)) { | ||
1534 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1535 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1536 | if (is_susp(mdev->state)) { | ||
1537 | drbd_set_flag(mdev, NEW_CUR_UUID); | ||
1538 | } else { | ||
1539 | drbd_uuid_new_current(mdev); | ||
1540 | drbd_send_uuids(mdev); | ||
1541 | } | ||
1542 | } | ||
1543 | put_ldev(mdev); | ||
1544 | } | ||
1545 | } | ||
1546 | |||
1547 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1548 | if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && | ||
1549 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1550 | drbd_uuid_new_current(mdev); | ||
1551 | drbd_send_uuids(mdev); | ||
1552 | } | ||
1553 | /* D_DISKLESS Peer becomes secondary */ | ||
1554 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1555 | /* We may still be Primary ourselves. | ||
1556 | * No harm done if the bitmap still changes, | ||
1557 | * redirtied pages will follow later. */ | ||
1558 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1559 | "demote diskless peer", BM_LOCKED_SET_ALLOWED); | ||
1560 | put_ldev(mdev); | ||
1561 | } | ||
1562 | |||
1563 | /* Write out all changed bits on demote. | ||
1564 | * Though, no need to do that just yet | ||
1565 | * if there is a resync going on still */ | ||
1566 | if (os.role == R_PRIMARY && ns.role == R_SECONDARY && | ||
1567 | mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1568 | /* No changes to the bitmap expected this time, so assert that, | ||
1569 | * even though no harm was done if it did change. */ | ||
1570 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1571 | "demote", BM_LOCKED_TEST_ALLOWED); | ||
1572 | put_ldev(mdev); | ||
1573 | } | ||
1574 | |||
1575 | /* Last part of the attaching process ... */ | ||
1576 | if (ns.conn >= C_CONNECTED && | ||
1577 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1578 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ | ||
1579 | drbd_send_uuids(mdev); | ||
1580 | drbd_send_state(mdev, ns); | ||
1581 | } | ||
1582 | |||
1583 | /* We want to pause/continue resync, tell peer. */ | ||
1584 | if (ns.conn >= C_CONNECTED && | ||
1585 | ((os.aftr_isp != ns.aftr_isp) || | ||
1586 | (os.user_isp != ns.user_isp))) | ||
1587 | drbd_send_state(mdev, ns); | ||
1588 | |||
1589 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1590 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1591 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1592 | suspend_other_sg(mdev); | ||
1593 | |||
1594 | /* Make sure the peer gets informed about eventual state | ||
1595 | changes (ISP bits) while we were in WFReportParams. */ | ||
1596 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1597 | drbd_send_state(mdev, ns); | ||
1598 | |||
1599 | if (os.conn != C_AHEAD && ns.conn == C_AHEAD) | ||
1600 | drbd_send_state(mdev, ns); | ||
1601 | |||
1602 | /* We are in the process of starting a full sync... */ | ||
1603 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1604 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1605 | /* no other bitmap changes expected during this phase */ | ||
1606 | drbd_queue_bitmap_io(mdev, | ||
1607 | &drbd_bmio_set_n_write, &abw_start_sync, | ||
1608 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | ||
1609 | |||
1610 | /* We are invalidating ourselves... */ | ||
1611 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1612 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1613 | /* other bitmap operation expected during this phase */ | ||
1614 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1615 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1616 | |||
1617 | /* first half of local IO error, failure to attach, | ||
1618 | * or administrative detach */ | ||
1619 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | ||
1620 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1621 | * our cleanup here with the transition to D_DISKLESS. | ||
1622 | * But it is still not safe to dereference ldev here, we may end | ||
1623 | * up here from a failed attach, before ldev was even set. */ | ||
1624 | if (mdev->ldev) { | ||
1625 | enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error; | ||
1626 | |||
1627 | /* In some setups, this handler triggers a suicide, | ||
1628 | * basically mapping IO error to node failure, to | ||
1629 | * reduce the number of different failure scenarios. | ||
1630 | * | ||
1631 | * This handler intentionally runs before we abort IO, | ||
1632 | * notify the peer, or try to update our meta data. */ | ||
1633 | if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR)) | ||
1634 | drbd_khelper(mdev, "local-io-error"); | ||
1635 | |||
1636 | /* Immediately allow completion of all application IO, | ||
1637 | * that waits for completion from the local disk, | ||
1638 | * if this was a force-detach due to disk_timeout | ||
1639 | * or administrator request (drbdsetup detach --force). | ||
1640 | * Do NOT abort otherwise. | ||
1641 | * Aborting local requests may cause serious problems, | ||
1642 | * if requests are completed to upper layers already, | ||
1643 | * and then later the already submitted local bio completes. | ||
1644 | * This can cause DMA into former bio pages that meanwhile | ||
1645 | * have been re-used for other things. | ||
1646 | * So aborting local requests may cause crashes, | ||
1647 | * or even worse, silent data corruption. | ||
1648 | */ | ||
1649 | if (drbd_test_flag(mdev, FORCE_DETACH)) | ||
1650 | tl_abort_disk_io(mdev); | ||
1651 | |||
1652 | /* current state still has to be D_FAILED, | ||
1653 | * there is only one way out: to D_DISKLESS, | ||
1654 | * and that may only happen after our put_ldev below. */ | ||
1655 | if (mdev->state.disk != D_FAILED) | ||
1656 | dev_err(DEV, | ||
1657 | "ASSERT FAILED: disk is %s during detach\n", | ||
1658 | drbd_disk_str(mdev->state.disk)); | ||
1659 | |||
1660 | if (ns.conn >= C_CONNECTED) | ||
1661 | drbd_send_state(mdev, ns); | ||
1662 | |||
1663 | drbd_rs_cancel_all(mdev); | ||
1664 | |||
1665 | /* In case we want to get something to stable storage still, | ||
1666 | * this may be the last chance. | ||
1667 | * Following put_ldev may transition to D_DISKLESS. */ | ||
1668 | drbd_md_sync(mdev); | ||
1669 | } | ||
1670 | put_ldev(mdev); | ||
1671 | } | ||
1672 | |||
1673 | /* second half of local IO error, failure to attach, | ||
1674 | * or administrative detach, | ||
1675 | * after local_cnt references have reached zero again */ | ||
1676 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1677 | /* We must still be diskless, | ||
1678 | * re-attach has to be serialized with this! */ | ||
1679 | if (mdev->state.disk != D_DISKLESS) | ||
1680 | dev_err(DEV, | ||
1681 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1682 | drbd_disk_str(mdev->state.disk)); | ||
1683 | |||
1684 | if (ns.conn >= C_CONNECTED) | ||
1685 | drbd_send_state(mdev, ns); | ||
1686 | |||
1687 | /* corresponding get_ldev in __drbd_set_state | ||
1688 | * this may finally trigger drbd_ldev_destroy. */ | ||
1689 | put_ldev(mdev); | ||
1690 | } | ||
1691 | |||
1692 | /* Notify peer that I had a local IO error, and did not detach. */ | ||
1693 | if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) | ||
1694 | drbd_send_state(mdev, ns); | ||
1695 | |||
1696 | /* Disks got bigger while they were detached */ | ||
1697 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1698 | drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) { | ||
1699 | if (ns.conn == C_CONNECTED) | ||
1700 | resync_after_online_grow(mdev); | ||
1701 | } | ||
1702 | |||
1703 | /* A resync finished or aborted, wake paused devices... */ | ||
1704 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1705 | (os.peer_isp && !ns.peer_isp) || | ||
1706 | (os.user_isp && !ns.user_isp)) | ||
1707 | resume_next_sg(mdev); | ||
1708 | |||
1709 | /* sync target done with resync. Explicitly notify peer, even though | ||
1710 | * it should (at least for non-empty resyncs) already know itself. */ | ||
1711 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1712 | drbd_send_state(mdev, ns); | ||
1713 | |||
1714 | /* Verify finished, or reached stop sector. Peer did not know about | ||
1715 | * the stop sector, and we may even have changed the stop sector during | ||
1716 | * verify to interrupt/stop early. Send the new state. */ | ||
1717 | if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED | ||
1718 | && mdev->agreed_pro_version >= 97) | ||
1719 | drbd_send_state(mdev, ns); | ||
1720 | |||
1721 | /* Wake up role changes, that were delayed because of connection establishing */ | ||
1722 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { | ||
1723 | drbd_clear_flag(mdev, STATE_SENT); | ||
1724 | wake_up(&mdev->state_wait); | ||
1725 | } | ||
1726 | |||
1727 | /* This triggers bitmap writeout of potentially still unwritten pages | ||
1728 | * if the resync finished cleanly, or aborted because of peer disk | ||
1729 | * failure, or because of connection loss. | ||
1730 | * For resync aborted because of local disk failure, we cannot do | ||
1731 | * any bitmap writeout anymore. | ||
1732 | * No harm done if some bits change during this phase. | ||
1733 | */ | ||
1734 | if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1735 | drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, | ||
1736 | "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); | ||
1737 | put_ldev(mdev); | ||
1738 | } | ||
1739 | |||
1740 | /* free tl_hash if we got thawed and are C_STANDALONE */ | ||
1741 | if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) | ||
1742 | drbd_free_tl_hash(mdev); | ||
1743 | |||
1744 | /* Upon network connection, we need to start the receiver */ | ||
1745 | if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) | ||
1746 | drbd_thread_start(&mdev->receiver); | ||
1747 | |||
1748 | /* Terminate worker thread if we are unconfigured - it will be | ||
1749 | restarted as needed... */ | ||
1750 | if (ns.disk == D_DISKLESS && | ||
1751 | ns.conn == C_STANDALONE && | ||
1752 | ns.role == R_SECONDARY) { | ||
1753 | if (os.aftr_isp != ns.aftr_isp) | ||
1754 | resume_next_sg(mdev); | ||
1755 | /* set in __drbd_set_state, unless CONFIG_PENDING was set */ | ||
1756 | if (drbd_test_flag(mdev, DEVICE_DYING)) | ||
1757 | drbd_thread_stop_nowait(&mdev->worker); | ||
1758 | } | 322 | } |
1759 | 323 | spin_unlock_irq(&tconn->req_lock); | |
1760 | drbd_md_sync(mdev); | ||
1761 | } | 324 | } |
1762 | 325 | ||
1763 | |||
1764 | static int drbd_thread_setup(void *arg) | 326 | static int drbd_thread_setup(void *arg) |
1765 | { | 327 | { |
1766 | struct drbd_thread *thi = (struct drbd_thread *) arg; | 328 | struct drbd_thread *thi = (struct drbd_thread *) arg; |
1767 | struct drbd_conf *mdev = thi->mdev; | 329 | struct drbd_tconn *tconn = thi->tconn; |
1768 | unsigned long flags; | 330 | unsigned long flags; |
1769 | int retval; | 331 | int retval; |
1770 | 332 | ||
333 | snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", | ||
334 | thi->name[0], thi->tconn->name); | ||
335 | |||
1771 | restart: | 336 | restart: |
1772 | retval = thi->function(thi); | 337 | retval = thi->function(thi); |
1773 | 338 | ||
1774 | spin_lock_irqsave(&thi->t_lock, flags); | 339 | spin_lock_irqsave(&thi->t_lock, flags); |
1775 | 340 | ||
1776 | /* if the receiver has been "Exiting", the last thing it did | 341 | /* if the receiver has been "EXITING", the last thing it did |
1777 | * was set the conn state to "StandAlone", | 342 | * was set the conn state to "StandAlone", |
1778 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, | 343 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, |
1779 | * and receiver thread will be "started". | 344 | * and receiver thread will be "started". |
1780 | * drbd_thread_start needs to set "Restarting" in that case. | 345 | * drbd_thread_start needs to set "RESTARTING" in that case. |
1781 | * t_state check and assignment needs to be within the same spinlock, | 346 | * t_state check and assignment needs to be within the same spinlock, |
1782 | * so either thread_start sees Exiting, and can remap to Restarting, | 347 | * so either thread_start sees EXITING, and can remap to RESTARTING, |
1783 | * or thread_start sees None, and can proceed as normal. | 348 | * or thread_start sees NONE, and can proceed as normal. |
1784 | */ | 349 | */ |
1785 | 350 | ||
1786 | if (thi->t_state == Restarting) { | 351 | if (thi->t_state == RESTARTING) { |
1787 | dev_info(DEV, "Restarting %s\n", current->comm); | 352 | conn_info(tconn, "Restarting %s thread\n", thi->name); |
1788 | thi->t_state = Running; | 353 | thi->t_state = RUNNING; |
1789 | spin_unlock_irqrestore(&thi->t_lock, flags); | 354 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1790 | goto restart; | 355 | goto restart; |
1791 | } | 356 | } |
1792 | 357 | ||
1793 | thi->task = NULL; | 358 | thi->task = NULL; |
1794 | thi->t_state = None; | 359 | thi->t_state = NONE; |
1795 | smp_mb(); | 360 | smp_mb(); |
1796 | complete(&thi->stop); | 361 | complete_all(&thi->stop); |
1797 | spin_unlock_irqrestore(&thi->t_lock, flags); | 362 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1798 | 363 | ||
1799 | dev_info(DEV, "Terminating %s\n", current->comm); | 364 | conn_info(tconn, "Terminating %s\n", current->comm); |
1800 | 365 | ||
1801 | /* Release mod reference taken when thread was started */ | 366 | /* Release mod reference taken when thread was started */ |
367 | |||
368 | kref_put(&tconn->kref, &conn_destroy); | ||
1802 | module_put(THIS_MODULE); | 369 | module_put(THIS_MODULE); |
1803 | return retval; | 370 | return retval; |
1804 | } | 371 | } |
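
The restart handshake in drbd_thread_setup() is a small per-thread state machine (NONE, RUNNING, EXITING, RESTARTING) evaluated under t_lock. Below is a minimal userspace sketch of just that decision; the names are illustrative, only the state names mirror the diff:

#include <stdio.h>

enum drbd_thread_state_sketch { NONE, RUNNING, EXITING, RESTARTING };

/* Called by the exiting thread (with t_lock held in the real code):
 * decide whether to loop back into the work function or terminate. */
static int should_restart(enum drbd_thread_state_sketch *t_state)
{
	if (*t_state == RESTARTING) {	/* a start request raced with our exit */
		*t_state = RUNNING;
		return 1;		/* "goto restart" in drbd_thread_setup() */
	}
	*t_state = NONE;		/* really done; waiters get completed */
	return 0;
}

int main(void)
{
	enum drbd_thread_state_sketch s = RESTARTING;
	int again = should_restart(&s);

	printf("restart=%d state=%d\n", again, s);	/* 1, RUNNING */
	again = should_restart(&s);
	printf("restart=%d state=%d\n", again, s);	/* 0, NONE */
	return 0;
}
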
1805 | 372 | ||
1806 | static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, | 373 | static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi, |
1807 | int (*func) (struct drbd_thread *)) | 374 | int (*func) (struct drbd_thread *), char *name) |
1808 | { | 375 | { |
1809 | spin_lock_init(&thi->t_lock); | 376 | spin_lock_init(&thi->t_lock); |
1810 | thi->task = NULL; | 377 | thi->task = NULL; |
1811 | thi->t_state = None; | 378 | thi->t_state = NONE; |
1812 | thi->function = func; | 379 | thi->function = func; |
1813 | thi->mdev = mdev; | 380 | thi->tconn = tconn; |
381 | strncpy(thi->name, name, ARRAY_SIZE(thi->name)); | ||
1814 | } | 382 | } |
1815 | 383 | ||
1816 | int drbd_thread_start(struct drbd_thread *thi) | 384 | int drbd_thread_start(struct drbd_thread *thi) |
1817 | { | 385 | { |
1818 | struct drbd_conf *mdev = thi->mdev; | 386 | struct drbd_tconn *tconn = thi->tconn; |
1819 | struct task_struct *nt; | 387 | struct task_struct *nt; |
1820 | unsigned long flags; | 388 | unsigned long flags; |
1821 | 389 | ||
1822 | const char *me = | ||
1823 | thi == &mdev->receiver ? "receiver" : | ||
1824 | thi == &mdev->asender ? "asender" : | ||
1825 | thi == &mdev->worker ? "worker" : "NONSENSE"; | ||
1826 | |||
1827 | /* is used from state engine doing drbd_thread_stop_nowait, | 390 | /* is used from state engine doing drbd_thread_stop_nowait, |
1828 | * while holding the req lock irqsave */ | 391 | * while holding the req lock irqsave */ |
1829 | spin_lock_irqsave(&thi->t_lock, flags); | 392 | spin_lock_irqsave(&thi->t_lock, flags); |
1830 | 393 | ||
1831 | switch (thi->t_state) { | 394 | switch (thi->t_state) { |
1832 | case None: | 395 | case NONE: |
1833 | dev_info(DEV, "Starting %s thread (from %s [%d])\n", | 396 | conn_info(tconn, "Starting %s thread (from %s [%d])\n", |
1834 | me, current->comm, current->pid); | 397 | thi->name, current->comm, current->pid); |
1835 | 398 | ||
1836 | /* Get ref on module for thread - this is released when thread exits */ | 399 | /* Get ref on module for thread - this is released when thread exits */ |
1837 | if (!try_module_get(THIS_MODULE)) { | 400 | if (!try_module_get(THIS_MODULE)) { |
1838 | dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); | 401 | conn_err(tconn, "Failed to get module reference in drbd_thread_start\n"); |
1839 | spin_unlock_irqrestore(&thi->t_lock, flags); | 402 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1840 | return false; | 403 | return false; |
1841 | } | 404 | } |
1842 | 405 | ||
406 | kref_get(&thi->tconn->kref); | ||
407 | |||
1843 | init_completion(&thi->stop); | 408 | init_completion(&thi->stop); |
1844 | D_ASSERT(thi->task == NULL); | ||
1845 | thi->reset_cpu_mask = 1; | 409 | thi->reset_cpu_mask = 1; |
1846 | thi->t_state = Running; | 410 | thi->t_state = RUNNING; |
1847 | spin_unlock_irqrestore(&thi->t_lock, flags); | 411 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1848 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ | 412 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ |
1849 | 413 | ||
1850 | nt = kthread_create(drbd_thread_setup, (void *) thi, | 414 | nt = kthread_create(drbd_thread_setup, (void *) thi, |
1851 | "drbd%d_%s", mdev_to_minor(mdev), me); | 415 | "drbd_%c_%s", thi->name[0], thi->tconn->name); |
1852 | 416 | ||
1853 | if (IS_ERR(nt)) { | 417 | if (IS_ERR(nt)) { |
1854 | dev_err(DEV, "Couldn't start thread\n"); | 418 | conn_err(tconn, "Couldn't start thread\n"); |
1855 | 419 | ||
420 | kref_put(&tconn->kref, &conn_destroy); | ||
1856 | module_put(THIS_MODULE); | 421 | module_put(THIS_MODULE); |
1857 | return false; | 422 | return false; |
1858 | } | 423 | } |
1859 | spin_lock_irqsave(&thi->t_lock, flags); | 424 | spin_lock_irqsave(&thi->t_lock, flags); |
1860 | thi->task = nt; | 425 | thi->task = nt; |
1861 | thi->t_state = Running; | 426 | thi->t_state = RUNNING; |
1862 | spin_unlock_irqrestore(&thi->t_lock, flags); | 427 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1863 | wake_up_process(nt); | 428 | wake_up_process(nt); |
1864 | break; | 429 | break; |
1865 | case Exiting: | 430 | case EXITING: |
1866 | thi->t_state = Restarting; | 431 | thi->t_state = RESTARTING; |
1867 | dev_info(DEV, "Restarting %s thread (from %s [%d])\n", | 432 | conn_info(tconn, "Restarting %s thread (from %s [%d])\n", |
1868 | me, current->comm, current->pid); | 433 | thi->name, current->comm, current->pid); |
1869 | /* fall through */ | 434 | /* fall through */ |
1870 | case Running: | 435 | case RUNNING: |
1871 | case Restarting: | 436 | case RESTARTING: |
1872 | default: | 437 | default: |
1873 | spin_unlock_irqrestore(&thi->t_lock, flags); | 438 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1874 | break; | 439 | break; |
@@ -1882,12 +447,12 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1882 | { | 447 | { |
1883 | unsigned long flags; | 448 | unsigned long flags; |
1884 | 449 | ||
1885 | enum drbd_thread_state ns = restart ? Restarting : Exiting; | 450 | enum drbd_thread_state ns = restart ? RESTARTING : EXITING; |
1886 | 451 | ||
1887 | /* may be called from state engine, holding the req lock irqsave */ | 452 | /* may be called from state engine, holding the req lock irqsave */ |
1888 | spin_lock_irqsave(&thi->t_lock, flags); | 453 | spin_lock_irqsave(&thi->t_lock, flags); |
1889 | 454 | ||
1890 | if (thi->t_state == None) { | 455 | if (thi->t_state == NONE) { |
1891 | spin_unlock_irqrestore(&thi->t_lock, flags); | 456 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1892 | if (restart) | 457 | if (restart) |
1893 | drbd_thread_start(thi); | 458 | drbd_thread_start(thi); |
@@ -1905,7 +470,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1905 | init_completion(&thi->stop); | 470 | init_completion(&thi->stop); |
1906 | if (thi->task != current) | 471 | if (thi->task != current) |
1907 | force_sig(DRBD_SIGKILL, thi->task); | 472 | force_sig(DRBD_SIGKILL, thi->task); |
1908 | |||
1909 | } | 473 | } |
1910 | 474 | ||
1911 | spin_unlock_irqrestore(&thi->t_lock, flags); | 475 | spin_unlock_irqrestore(&thi->t_lock, flags); |
@@ -1914,6 +478,35 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1914 | wait_for_completion(&thi->stop); | 478 | wait_for_completion(&thi->stop); |
1915 | } | 479 | } |
1916 | 480 | ||
481 | static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task) | ||
482 | { | ||
483 | struct drbd_thread *thi = | ||
484 | task == tconn->receiver.task ? &tconn->receiver : | ||
485 | task == tconn->asender.task ? &tconn->asender : | ||
486 | task == tconn->worker.task ? &tconn->worker : NULL; | ||
487 | |||
488 | return thi; | ||
489 | } | ||
490 | |||
491 | char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task) | ||
492 | { | ||
493 | struct drbd_thread *thi = drbd_task_to_thread(tconn, task); | ||
494 | return thi ? thi->name : task->comm; | ||
495 | } | ||
496 | |||
497 | int conn_lowest_minor(struct drbd_tconn *tconn) | ||
498 | { | ||
499 | struct drbd_conf *mdev; | ||
500 | int vnr = 0, m; | ||
501 | |||
502 | rcu_read_lock(); | ||
503 | mdev = idr_get_next(&tconn->volumes, &vnr); | ||
504 | m = mdev ? mdev_to_minor(mdev) : -1; | ||
505 | rcu_read_unlock(); | ||
506 | |||
507 | return m; | ||
508 | } | ||
509 | |||
1917 | #ifdef CONFIG_SMP | 510 | #ifdef CONFIG_SMP |
1918 | /** | 511 | /** |
1919 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs | 512 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs |
@@ -1922,240 +515,345 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1922 | * Forces all threads of a device onto the same CPU. This is beneficial for | 515 | * Forces all threads of a device onto the same CPU. This is beneficial for |
1923 | * DRBD's performance. May be overwritten by user's configuration. | 516 | * DRBD's performance. May be overwritten by user's configuration. |
1924 | */ | 517 | */ |
1925 | void drbd_calc_cpu_mask(struct drbd_conf *mdev) | 518 | void drbd_calc_cpu_mask(struct drbd_tconn *tconn) |
1926 | { | 519 | { |
1927 | int ord, cpu; | 520 | int ord, cpu; |
1928 | 521 | ||
1929 | /* user override. */ | 522 | /* user override. */ |
1930 | if (cpumask_weight(mdev->cpu_mask)) | 523 | if (cpumask_weight(tconn->cpu_mask)) |
1931 | return; | 524 | return; |
1932 | 525 | ||
1933 | ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); | 526 | ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask); |
1934 | for_each_online_cpu(cpu) { | 527 | for_each_online_cpu(cpu) { |
1935 | if (ord-- == 0) { | 528 | if (ord-- == 0) { |
1936 | cpumask_set_cpu(cpu, mdev->cpu_mask); | 529 | cpumask_set_cpu(cpu, tconn->cpu_mask); |
1937 | return; | 530 | return; |
1938 | } | 531 | } |
1939 | } | 532 | } |
1940 | /* should not be reached */ | 533 | /* should not be reached */ |
1941 | cpumask_setall(mdev->cpu_mask); | 534 | cpumask_setall(tconn->cpu_mask); |
1942 | } | 535 | } |
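
The pinning rule above boils down to: take the lowest minor number modulo the count of online CPUs, then walk the online mask to that ordinal. A userspace sketch of that selection, using a plain 32-bit mask in place of a real cpumask:

#include <stdio.h>

static int pick_cpu(unsigned int online_mask, int lowest_minor)
{
	int ncpus = __builtin_popcount(online_mask);
	int ord, cpu;

	if (!ncpus)
		return -1;
	ord = lowest_minor % ncpus;
	for (cpu = 0; cpu < 32; cpu++) {
		if (!(online_mask & (1u << cpu)))
			continue;
		if (ord-- == 0)
			return cpu;		/* the ord-th online CPU */
	}
	return -1;	/* not reached */
}

int main(void)
{
	/* CPUs 0-3 online, lowest minor 6: ordinal 2, so pin to CPU 2 */
	printf("pinned to CPU %d\n", pick_cpu(0xf, 6));
	return 0;
}
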
1943 | 536 | ||
1944 | /** | 537 | /** |
1945 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread | 538 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread |
1946 | * @mdev: DRBD device. | 539 | * @mdev: DRBD device. |
540 | * @thi: drbd_thread object | ||
1947 | * | 541 | * |
1948 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die | 542 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die |
1949 | * prematurely. | 543 | * prematurely. |
1950 | */ | 544 | */ |
1951 | void drbd_thread_current_set_cpu(struct drbd_conf *mdev) | 545 | void drbd_thread_current_set_cpu(struct drbd_thread *thi) |
1952 | { | 546 | { |
1953 | struct task_struct *p = current; | 547 | struct task_struct *p = current; |
1954 | struct drbd_thread *thi = | 548 | |
1955 | p == mdev->asender.task ? &mdev->asender : | ||
1956 | p == mdev->receiver.task ? &mdev->receiver : | ||
1957 | p == mdev->worker.task ? &mdev->worker : | ||
1958 | NULL; | ||
1959 | ERR_IF(thi == NULL) | ||
1960 | return; | ||
1961 | if (!thi->reset_cpu_mask) | 549 | if (!thi->reset_cpu_mask) |
1962 | return; | 550 | return; |
1963 | thi->reset_cpu_mask = 0; | 551 | thi->reset_cpu_mask = 0; |
1964 | set_cpus_allowed_ptr(p, mdev->cpu_mask); | 552 | set_cpus_allowed_ptr(p, thi->tconn->cpu_mask); |
1965 | } | 553 | } |
1966 | #endif | 554 | #endif |
1967 | 555 | ||
1968 | /* the appropriate socket mutex must be held already */ | 556 | /** |
1969 | int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 557 | * drbd_header_size - size of a packet header |
1970 | enum drbd_packets cmd, struct p_header80 *h, | 558 | * |
1971 | size_t size, unsigned msg_flags) | 559 | * The header size is a multiple of 8, so any payload following the header is |
560 | * word aligned on 64-bit architectures. (The bitmap send and receive code | ||
561 | * relies on this.) | ||
562 | */ | ||
563 | unsigned int drbd_header_size(struct drbd_tconn *tconn) | ||
1972 | { | 564 | { |
1973 | int sent, ok; | 565 | if (tconn->agreed_pro_version >= 100) { |
566 | BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); | ||
567 | return sizeof(struct p_header100); | ||
568 | } else { | ||
569 | BUILD_BUG_ON(sizeof(struct p_header80) != | ||
570 | sizeof(struct p_header95)); | ||
571 | BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); | ||
572 | return sizeof(struct p_header80); | ||
573 | } | ||
574 | } | ||
1974 | 575 | ||
1975 | ERR_IF(!h) return false; | 576 | static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) |
1976 | ERR_IF(!size) return false; | 577 | { |
578 | h->magic = cpu_to_be32(DRBD_MAGIC); | ||
579 | h->command = cpu_to_be16(cmd); | ||
580 | h->length = cpu_to_be16(size); | ||
581 | return sizeof(struct p_header80); | ||
582 | } | ||
1977 | 583 | ||
1978 | h->magic = BE_DRBD_MAGIC; | 584 | static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) |
585 | { | ||
586 | h->magic = cpu_to_be16(DRBD_MAGIC_BIG); | ||
1979 | h->command = cpu_to_be16(cmd); | 587 | h->command = cpu_to_be16(cmd); |
1980 | h->length = cpu_to_be16(size-sizeof(struct p_header80)); | 588 | h->length = cpu_to_be32(size); |
589 | return sizeof(struct p_header95); | ||
590 | } | ||
1981 | 591 | ||
1982 | sent = drbd_send(mdev, sock, h, size, msg_flags); | 592 | static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, |
593 | int size, int vnr) | ||
594 | { | ||
595 | h->magic = cpu_to_be32(DRBD_MAGIC_100); | ||
596 | h->volume = cpu_to_be16(vnr); | ||
597 | h->command = cpu_to_be16(cmd); | ||
598 | h->length = cpu_to_be32(size); | ||
599 | h->pad = 0; | ||
600 | return sizeof(struct p_header100); | ||
601 | } | ||
1983 | 602 | ||
1984 | ok = (sent == size); | 603 | static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr, |
1985 | if (!ok && !signal_pending(current)) | 604 | void *buffer, enum drbd_packet cmd, int size) |
1986 | dev_warn(DEV, "short sent %s size=%d sent=%d\n", | 605 | { |
1987 | cmdname(cmd), (int)size, sent); | 606 | if (tconn->agreed_pro_version >= 100) |
1988 | return ok; | 607 | return prepare_header100(buffer, cmd, size, vnr); |
608 | else if (tconn->agreed_pro_version >= 95 && | ||
609 | size > DRBD_MAX_SIZE_H80_PACKET) | ||
610 | return prepare_header95(buffer, cmd, size); | ||
611 | else | ||
612 | return prepare_header80(buffer, cmd, size); | ||
1989 | } | 613 | } |
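
prepare_header() chooses between three on-wire layouts: the 8-byte h80, the 8-byte h95 with a 32-bit length (protocol 95-99, payloads too large for h80), and the 16-byte h100 that adds a volume number. The userspace sketch below models those layouts; the field widths follow the helpers above, while the 4-byte pad in h100 and the h80 size limit are assumptions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct h80  { uint32_t magic; uint16_t command; uint16_t length; };
struct h95  { uint16_t magic; uint16_t command; uint32_t length; };
struct h100 { uint32_t magic; uint16_t volume; uint16_t command;
	      uint32_t length; uint32_t pad; };

/* H80_MAX is an assumed stand-in for DRBD_MAX_SIZE_H80_PACKET. */
#define H80_MAX 0x7fff

static size_t header_size(int agreed_pro_version, size_t payload)
{
	if (agreed_pro_version >= 100)
		return sizeof(struct h100);
	if (agreed_pro_version >= 95 && payload > H80_MAX)
		return sizeof(struct h95);
	return sizeof(struct h80);
}

int main(void)
{
	/* every layout is a multiple of 8, keeping the payload word aligned */
	assert(sizeof(struct h80) % 8 == 0);
	assert(sizeof(struct h95) % 8 == 0);
	assert(sizeof(struct h100) % 8 == 0);
	printf("%zu %zu %zu\n", header_size(89, 16),
	       header_size(96, 64 * 1024), header_size(101, 16));	/* 8 8 16 */
	return 0;
}
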
1990 | 614 | ||
1991 | /* don't pass the socket. we may only look at it | 615 | static void *__conn_prepare_command(struct drbd_tconn *tconn, |
1992 | * when we hold the appropriate socket mutex. | 616 | struct drbd_socket *sock) |
1993 | */ | ||
1994 | int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1995 | enum drbd_packets cmd, struct p_header80 *h, size_t size) | ||
1996 | { | 617 | { |
1997 | int ok = 0; | 618 | if (!sock->socket) |
1998 | struct socket *sock; | 619 | return NULL; |
620 | return sock->sbuf + drbd_header_size(tconn); | ||
621 | } | ||
1999 | 622 | ||
2000 | if (use_data_socket) { | 623 | void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock) |
2001 | mutex_lock(&mdev->data.mutex); | 624 | { |
2002 | sock = mdev->data.socket; | 625 | void *p; |
2003 | } else { | ||
2004 | mutex_lock(&mdev->meta.mutex); | ||
2005 | sock = mdev->meta.socket; | ||
2006 | } | ||
2007 | 626 | ||
2008 | /* drbd_disconnect() could have called drbd_free_sock() | 627 | mutex_lock(&sock->mutex); |
2009 | * while we were waiting in down()... */ | 628 | p = __conn_prepare_command(tconn, sock); |
2010 | if (likely(sock != NULL)) | 629 | if (!p) |
2011 | ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); | 630 | mutex_unlock(&sock->mutex); |
2012 | 631 | ||
2013 | if (use_data_socket) | 632 | return p; |
2014 | mutex_unlock(&mdev->data.mutex); | ||
2015 | else | ||
2016 | mutex_unlock(&mdev->meta.mutex); | ||
2017 | return ok; | ||
2018 | } | 633 | } |
2019 | 634 | ||
2020 | int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, | 635 | void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock) |
2021 | size_t size) | ||
2022 | { | 636 | { |
2023 | struct p_header80 h; | 637 | return conn_prepare_command(mdev->tconn, sock); |
2024 | int ok; | 638 | } |
2025 | 639 | ||
2026 | h.magic = BE_DRBD_MAGIC; | 640 | static int __send_command(struct drbd_tconn *tconn, int vnr, |
2027 | h.command = cpu_to_be16(cmd); | 641 | struct drbd_socket *sock, enum drbd_packet cmd, |
2028 | h.length = cpu_to_be16(size); | 642 | unsigned int header_size, void *data, |
643 | unsigned int size) | ||
644 | { | ||
645 | int msg_flags; | ||
646 | int err; | ||
2029 | 647 | ||
2030 | if (!drbd_get_data_sock(mdev)) | 648 | /* |
2031 | return 0; | 649 | * Called with @data == NULL and the size of the data blocks in @size |
650 | * for commands that send data blocks. For those commands, omit the | ||
651 | * MSG_MORE flag: this will increase the likelihood that data blocks | ||
652 | * which are page aligned on the sender will end up page aligned on the | ||
653 | * receiver. | ||
654 | */ | ||
655 | msg_flags = data ? MSG_MORE : 0; | ||
656 | |||
657 | header_size += prepare_header(tconn, vnr, sock->sbuf, cmd, | ||
658 | header_size + size); | ||
659 | err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size, | ||
660 | msg_flags); | ||
661 | if (data && !err) | ||
662 | err = drbd_send_all(tconn, sock->socket, data, size, 0); | ||
663 | return err; | ||
664 | } | ||
2032 | 665 | ||
2033 | ok = (sizeof(h) == | 666 | static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, |
2034 | drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); | 667 | enum drbd_packet cmd, unsigned int header_size, |
2035 | ok = ok && (size == | 668 | void *data, unsigned int size) |
2036 | drbd_send(mdev, mdev->data.socket, data, size, 0)); | 669 | { |
670 | return __send_command(tconn, 0, sock, cmd, header_size, data, size); | ||
671 | } | ||
2037 | 672 | ||
2038 | drbd_put_data_sock(mdev); | 673 | int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, |
674 | enum drbd_packet cmd, unsigned int header_size, | ||
675 | void *data, unsigned int size) | ||
676 | { | ||
677 | int err; | ||
2039 | 678 | ||
2040 | return ok; | 679 | err = __conn_send_command(tconn, sock, cmd, header_size, data, size); |
680 | mutex_unlock(&sock->mutex); | ||
681 | return err; | ||
2041 | } | 682 | } |
2042 | 683 | ||
2043 | int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) | 684 | int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock, |
685 | enum drbd_packet cmd, unsigned int header_size, | ||
686 | void *data, unsigned int size) | ||
2044 | { | 687 | { |
688 | int err; | ||
689 | |||
690 | err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size, | ||
691 | data, size); | ||
692 | mutex_unlock(&sock->mutex); | ||
693 | return err; | ||
694 | } | ||
695 | |||
696 | int drbd_send_ping(struct drbd_tconn *tconn) | ||
697 | { | ||
698 | struct drbd_socket *sock; | ||
699 | |||
700 | sock = &tconn->meta; | ||
701 | if (!conn_prepare_command(tconn, sock)) | ||
702 | return -EIO; | ||
703 | return conn_send_command(tconn, sock, P_PING, 0, NULL, 0); | ||
704 | } | ||
705 | |||
706 | int drbd_send_ping_ack(struct drbd_tconn *tconn) | ||
707 | { | ||
708 | struct drbd_socket *sock; | ||
709 | |||
710 | sock = &tconn->meta; | ||
711 | if (!conn_prepare_command(tconn, sock)) | ||
712 | return -EIO; | ||
713 | return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0); | ||
714 | } | ||
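
drbd_send_ping() and drbd_send_ping_ack() show the prepare/send pattern at its simplest: conn_prepare_command() takes the per-socket mutex and returns a pointer just past the header in the preallocated send buffer, and conn_send_command() fills the header, transmits, and releases the mutex, so the buffer is never built by two senders at once. A userspace mock of that locking discipline (all names assumed):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define HDR_SIZE 8
#define BUF_SIZE 4096

struct mock_socket {
	pthread_mutex_t mutex;
	unsigned char sbuf[BUF_SIZE];
};

static void *prepare_command(struct mock_socket *sock)
{
	pthread_mutex_lock(&sock->mutex);
	return sock->sbuf + HDR_SIZE;		/* payload area past the header */
}

static int send_command(struct mock_socket *sock, int cmd, size_t payload)
{
	/* the real code fills the header here and writes header + payload */
	printf("cmd %d: %d header + %zu payload bytes\n", cmd, HDR_SIZE, payload);
	pthread_mutex_unlock(&sock->mutex);
	return 0;
}

int main(void)
{
	struct mock_socket meta = { .mutex = PTHREAD_MUTEX_INITIALIZER };
	char *p = prepare_command(&meta);

	strcpy(p, "ping");			/* build the payload in place */
	return send_command(&meta, 1 /* a P_PING-like command */, strlen(p) + 1);
}
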
715 | |||
716 | int drbd_send_sync_param(struct drbd_conf *mdev) | ||
717 | { | ||
718 | struct drbd_socket *sock; | ||
2045 | struct p_rs_param_95 *p; | 719 | struct p_rs_param_95 *p; |
2046 | struct socket *sock; | 720 | int size; |
2047 | int size, rv; | 721 | const int apv = mdev->tconn->agreed_pro_version; |
2048 | const int apv = mdev->agreed_pro_version; | 722 | enum drbd_packet cmd; |
723 | struct net_conf *nc; | ||
724 | struct disk_conf *dc; | ||
725 | |||
726 | sock = &mdev->tconn->data; | ||
727 | p = drbd_prepare_command(mdev, sock); | ||
728 | if (!p) | ||
729 | return -EIO; | ||
730 | |||
731 | rcu_read_lock(); | ||
732 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2049 | 733 | ||
2050 | size = apv <= 87 ? sizeof(struct p_rs_param) | 734 | size = apv <= 87 ? sizeof(struct p_rs_param) |
2051 | : apv == 88 ? sizeof(struct p_rs_param) | 735 | : apv == 88 ? sizeof(struct p_rs_param) |
2052 | + strlen(mdev->sync_conf.verify_alg) + 1 | 736 | + strlen(nc->verify_alg) + 1 |
2053 | : apv <= 94 ? sizeof(struct p_rs_param_89) | 737 | : apv <= 94 ? sizeof(struct p_rs_param_89) |
2054 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); | 738 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); |
2055 | 739 | ||
2056 | /* used from admin command context and receiver/worker context. | 740 | cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; |
2057 | * to avoid kmalloc, grab the socket right here, | ||
2058 | * then use the pre-allocated sbuf there */ | ||
2059 | mutex_lock(&mdev->data.mutex); | ||
2060 | sock = mdev->data.socket; | ||
2061 | |||
2062 | if (likely(sock != NULL)) { | ||
2063 | enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; | ||
2064 | 741 | ||
2065 | p = &mdev->data.sbuf.rs_param_95; | 742 | /* initialize verify_alg and csums_alg */ |
743 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
2066 | 744 | ||
2067 | /* initialize verify_alg and csums_alg */ | 745 | if (get_ldev(mdev)) { |
2068 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | 746 | dc = rcu_dereference(mdev->ldev->disk_conf); |
2069 | 747 | p->resync_rate = cpu_to_be32(dc->resync_rate); | |
2070 | p->rate = cpu_to_be32(sc->rate); | 748 | p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); |
2071 | p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead); | 749 | p->c_delay_target = cpu_to_be32(dc->c_delay_target); |
2072 | p->c_delay_target = cpu_to_be32(sc->c_delay_target); | 750 | p->c_fill_target = cpu_to_be32(dc->c_fill_target); |
2073 | p->c_fill_target = cpu_to_be32(sc->c_fill_target); | 751 | p->c_max_rate = cpu_to_be32(dc->c_max_rate); |
2074 | p->c_max_rate = cpu_to_be32(sc->c_max_rate); | 752 | put_ldev(mdev); |
2075 | 753 | } else { | |
2076 | if (apv >= 88) | 754 | p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); |
2077 | strcpy(p->verify_alg, mdev->sync_conf.verify_alg); | 755 | p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); |
2078 | if (apv >= 89) | 756 | p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); |
2079 | strcpy(p->csums_alg, mdev->sync_conf.csums_alg); | 757 | p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); |
2080 | 758 | p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); | |
2081 | rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); | 759 | } |
2082 | } else | ||
2083 | rv = 0; /* not ok */ | ||
2084 | 760 | ||
2085 | mutex_unlock(&mdev->data.mutex); | 761 | if (apv >= 88) |
762 | strcpy(p->verify_alg, nc->verify_alg); | ||
763 | if (apv >= 89) | ||
764 | strcpy(p->csums_alg, nc->csums_alg); | ||
765 | rcu_read_unlock(); | ||
2086 | 766 | ||
2087 | return rv; | 767 | return drbd_send_command(mdev, sock, cmd, size, NULL, 0); |
2088 | } | 768 | } |
2089 | 769 | ||
2090 | int drbd_send_protocol(struct drbd_conf *mdev) | 770 | int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd) |
2091 | { | 771 | { |
772 | struct drbd_socket *sock; | ||
2092 | struct p_protocol *p; | 773 | struct p_protocol *p; |
2093 | int size, cf, rv; | 774 | struct net_conf *nc; |
775 | int size, cf; | ||
2094 | 776 | ||
2095 | size = sizeof(struct p_protocol); | 777 | sock = &tconn->data; |
778 | p = __conn_prepare_command(tconn, sock); | ||
779 | if (!p) | ||
780 | return -EIO; | ||
2096 | 781 | ||
2097 | if (mdev->agreed_pro_version >= 87) | 782 | rcu_read_lock(); |
2098 | size += strlen(mdev->net_conf->integrity_alg) + 1; | 783 | nc = rcu_dereference(tconn->net_conf); |
2099 | 784 | ||
2100 | /* we must not recurse into our own queue, | 785 | if (nc->tentative && tconn->agreed_pro_version < 92) { |
2101 | * as that is blocked during handshake */ | 786 | rcu_read_unlock(); |
2102 | p = kmalloc(size, GFP_NOIO); | 787 | mutex_unlock(&sock->mutex); |
2103 | if (p == NULL) | 788 | conn_err(tconn, "--dry-run is not supported by peer"); |
2104 | return 0; | 789 | return -EOPNOTSUPP; |
790 | } | ||
2105 | 791 | ||
2106 | p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); | 792 | size = sizeof(*p); |
2107 | p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); | 793 | if (tconn->agreed_pro_version >= 87) |
2108 | p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); | 794 | size += strlen(nc->integrity_alg) + 1; |
2109 | p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); | ||
2110 | p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); | ||
2111 | 795 | ||
796 | p->protocol = cpu_to_be32(nc->wire_protocol); | ||
797 | p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); | ||
798 | p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); | ||
799 | p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); | ||
800 | p->two_primaries = cpu_to_be32(nc->two_primaries); | ||
2112 | cf = 0; | 801 | cf = 0; |
2113 | if (mdev->net_conf->want_lose) | 802 | if (nc->discard_my_data) |
2114 | cf |= CF_WANT_LOSE; | 803 | cf |= CF_DISCARD_MY_DATA; |
2115 | if (mdev->net_conf->dry_run) { | 804 | if (nc->tentative) |
2116 | if (mdev->agreed_pro_version >= 92) | 805 | cf |= CF_DRY_RUN; |
2117 | cf |= CF_DRY_RUN; | ||
2118 | else { | ||
2119 | dev_err(DEV, "--dry-run is not supported by peer"); | ||
2120 | kfree(p); | ||
2121 | return -1; | ||
2122 | } | ||
2123 | } | ||
2124 | p->conn_flags = cpu_to_be32(cf); | 806 | p->conn_flags = cpu_to_be32(cf); |
2125 | 807 | ||
2126 | if (mdev->agreed_pro_version >= 87) | 808 | if (tconn->agreed_pro_version >= 87) |
2127 | strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); | 809 | strcpy(p->integrity_alg, nc->integrity_alg); |
810 | rcu_read_unlock(); | ||
2128 | 811 | ||
2129 | rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, | 812 | return __conn_send_command(tconn, sock, cmd, size, NULL, 0); |
2130 | (struct p_header80 *)p, size); | 813 | } |
2131 | kfree(p); | 814 | |
2132 | return rv; | 815 | int drbd_send_protocol(struct drbd_tconn *tconn) |
816 | { | ||
817 | int err; | ||
818 | |||
819 | mutex_lock(&tconn->data.mutex); | ||
820 | err = __drbd_send_protocol(tconn, P_PROTOCOL); | ||
821 | mutex_unlock(&tconn->data.mutex); | ||
822 | |||
823 | return err; | ||
2133 | } | 824 | } |
2134 | 825 | ||
2135 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) | 826 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) |
2136 | { | 827 | { |
2137 | struct p_uuids p; | 828 | struct drbd_socket *sock; |
829 | struct p_uuids *p; | ||
2138 | int i; | 830 | int i; |
2139 | 831 | ||
2140 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | 832 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) |
2141 | return 1; | 833 | return 0; |
2142 | 834 | ||
835 | sock = &mdev->tconn->data; | ||
836 | p = drbd_prepare_command(mdev, sock); | ||
837 | if (!p) { | ||
838 | put_ldev(mdev); | ||
839 | return -EIO; | ||
840 | } | ||
2143 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | 841 | spin_lock_irq(&mdev->ldev->md.uuid_lock); |
2144 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 842 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
2145 | p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; | 843 | p->uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; |
2146 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | 844 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); |
2147 | 845 | ||
2148 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); | 846 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); |
2149 | p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); | 847 | p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); |
2150 | uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; | 848 | rcu_read_lock(); |
2151 | uuid_flags |= drbd_test_flag(mdev, CRASHED_PRIMARY) ? 2 : 0; | 849 | uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0; |
850 | rcu_read_unlock(); | ||
851 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; | ||
2152 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; | 852 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; |
2153 | p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); | 853 | p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); |
2154 | 854 | ||
2155 | put_ldev(mdev); | 855 | put_ldev(mdev); |
2156 | 856 | return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0); | |
2157 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, | ||
2158 | (struct p_header80 *)&p, sizeof(p)); | ||
2159 | } | 857 | } |
2160 | 858 | ||
2161 | int drbd_send_uuids(struct drbd_conf *mdev) | 859 | int drbd_send_uuids(struct drbd_conf *mdev) |
@@ -2186,9 +884,10 @@ void drbd_print_uuids(struct drbd_conf *mdev, const char *text) | |||
2186 | } | 884 | } |
2187 | } | 885 | } |
2188 | 886 | ||
2189 | int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) | 887 | void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) |
2190 | { | 888 | { |
2191 | struct p_rs_uuid p; | 889 | struct drbd_socket *sock; |
890 | struct p_rs_uuid *p; | ||
2192 | u64 uuid; | 891 | u64 uuid; |
2193 | 892 | ||
2194 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); | 893 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); |
@@ -2201,24 +900,29 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) | |||
2201 | drbd_uuid_set(mdev, UI_BITMAP, uuid); | 900 | drbd_uuid_set(mdev, UI_BITMAP, uuid); |
2202 | drbd_print_uuids(mdev, "updated sync UUID"); | 901 | drbd_print_uuids(mdev, "updated sync UUID"); |
2203 | drbd_md_sync(mdev); | 902 | drbd_md_sync(mdev); |
2204 | p.uuid = cpu_to_be64(uuid); | ||
2205 | 903 | ||
2206 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, | 904 | sock = &mdev->tconn->data; |
2207 | (struct p_header80 *)&p, sizeof(p)); | 905 | p = drbd_prepare_command(mdev, sock); |
906 | if (p) { | ||
907 | p->uuid = cpu_to_be64(uuid); | ||
908 | drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); | ||
909 | } | ||
2208 | } | 910 | } |
2209 | 911 | ||
2210 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) | 912 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) |
2211 | { | 913 | { |
2212 | struct p_sizes p; | 914 | struct drbd_socket *sock; |
915 | struct p_sizes *p; | ||
2213 | sector_t d_size, u_size; | 916 | sector_t d_size, u_size; |
2214 | int q_order_type; | 917 | int q_order_type; |
2215 | unsigned int max_bio_size; | 918 | unsigned int max_bio_size; |
2216 | int ok; | ||
2217 | 919 | ||
2218 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | 920 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { |
2219 | D_ASSERT(mdev->ldev->backing_bdev); | 921 | D_ASSERT(mdev->ldev->backing_bdev); |
2220 | d_size = drbd_get_max_capacity(mdev->ldev); | 922 | d_size = drbd_get_max_capacity(mdev->ldev); |
2221 | u_size = mdev->ldev->dc.disk_size; | 923 | rcu_read_lock(); |
924 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
925 | rcu_read_unlock(); | ||
2222 | q_order_type = drbd_queue_order_type(mdev); | 926 | q_order_type = drbd_queue_order_type(mdev); |
2223 | max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; | 927 | max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; |
2224 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); | 928 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); |
@@ -2230,20 +934,23 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl | |||
2230 | max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ | 934 | max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ |
2231 | } | 935 | } |
2232 | 936 | ||
2233 | /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ | 937 | sock = &mdev->tconn->data; |
2234 | if (mdev->agreed_pro_version <= 94) | 938 | p = drbd_prepare_command(mdev, sock); |
2235 | max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 939 | if (!p) |
940 | return -EIO; | ||
2236 | 941 | ||
2237 | p.d_size = cpu_to_be64(d_size); | 942 | if (mdev->tconn->agreed_pro_version <= 94) |
2238 | p.u_size = cpu_to_be64(u_size); | 943 | max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
2239 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | 944 | else if (mdev->tconn->agreed_pro_version < 100) |
2240 | p.max_bio_size = cpu_to_be32(max_bio_size); | 945 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); |
2241 | p.queue_order_type = cpu_to_be16(q_order_type); | ||
2242 | p.dds_flags = cpu_to_be16(flags); | ||
2243 | 946 | ||
2244 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | 947 | p->d_size = cpu_to_be64(d_size); |
2245 | (struct p_header80 *)&p, sizeof(p)); | 948 | p->u_size = cpu_to_be64(u_size); |
2246 | return ok; | 949 | p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); |
950 | p->max_bio_size = cpu_to_be32(max_bio_size); | ||
951 | p->queue_order_type = cpu_to_be16(q_order_type); | ||
952 | p->dds_flags = cpu_to_be16(flags); | ||
953 | return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0); | ||
2247 | } | 954 | } |
2248 | 955 | ||
2249 | /** | 956 | /** |
@@ -2252,34 +959,21 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl | |||
2252 | */ | 959 | */ |
2253 | int drbd_send_current_state(struct drbd_conf *mdev) | 960 | int drbd_send_current_state(struct drbd_conf *mdev) |
2254 | { | 961 | { |
2255 | struct socket *sock; | 962 | struct drbd_socket *sock; |
2256 | struct p_state p; | 963 | struct p_state *p; |
2257 | int ok = 0; | ||
2258 | |||
2259 | /* Grab state lock so we wont send state if we're in the middle | ||
2260 | * of a cluster wide state change on another thread */ | ||
2261 | drbd_state_lock(mdev); | ||
2262 | |||
2263 | mutex_lock(&mdev->data.mutex); | ||
2264 | |||
2265 | p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | ||
2266 | sock = mdev->data.socket; | ||
2267 | |||
2268 | if (likely(sock != NULL)) { | ||
2269 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | ||
2270 | (struct p_header80 *)&p, sizeof(p), 0); | ||
2271 | } | ||
2272 | 964 | ||
2273 | mutex_unlock(&mdev->data.mutex); | 965 | sock = &mdev->tconn->data; |
2274 | 966 | p = drbd_prepare_command(mdev, sock); | |
2275 | drbd_state_unlock(mdev); | 967 | if (!p) |
2276 | return ok; | 968 | return -EIO; |
969 | p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | ||
970 | return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); | ||
2277 | } | 971 | } |
2278 | 972 | ||
2279 | /** | 973 | /** |
2280 | * drbd_send_state() - After a state change, sends the new state to the peer | 974 | * drbd_send_state() - After a state change, sends the new state to the peer |
2281 | * @mdev: DRBD device. | 975 | * @mdev: DRBD device. |
2282 | * @state: the state to send, not necessarily the current state. | 976 | * @state: the state to send, not necessarily the current state. |
2283 | * | 977 | * |
2284 | * Each state change queues an "after_state_ch" work, which will eventually | 978 | * Each state change queues an "after_state_ch" work, which will eventually |
2285 | * send the resulting new state to the peer. If more state changes happen | 979 | * send the resulting new state to the peer. If more state changes happen |
@@ -2288,50 +982,95 @@ int drbd_send_current_state(struct drbd_conf *mdev) | |||
2288 | */ | 982 | */ |
2289 | int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) | 983 | int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) |
2290 | { | 984 | { |
2291 | struct socket *sock; | 985 | struct drbd_socket *sock; |
2292 | struct p_state p; | 986 | struct p_state *p; |
2293 | int ok = 0; | ||
2294 | 987 | ||
2295 | mutex_lock(&mdev->data.mutex); | 988 | sock = &mdev->tconn->data; |
989 | p = drbd_prepare_command(mdev, sock); | ||
990 | if (!p) | ||
991 | return -EIO; | ||
992 | p->state = cpu_to_be32(state.i); /* Within the send mutex */ | ||
993 | return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); | ||
994 | } | ||
2296 | 995 | ||
2297 | p.state = cpu_to_be32(state.i); | 996 | int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) |
2298 | sock = mdev->data.socket; | 997 | { |
998 | struct drbd_socket *sock; | ||
999 | struct p_req_state *p; | ||
2299 | 1000 | ||
2300 | if (likely(sock != NULL)) { | 1001 | sock = &mdev->tconn->data; |
2301 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | 1002 | p = drbd_prepare_command(mdev, sock); |
2302 | (struct p_header80 *)&p, sizeof(p), 0); | 1003 | if (!p) |
2303 | } | 1004 | return -EIO; |
1005 | p->mask = cpu_to_be32(mask.i); | ||
1006 | p->val = cpu_to_be32(val.i); | ||
1007 | return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); | ||
1008 | } | ||
2304 | 1009 | ||
2305 | mutex_unlock(&mdev->data.mutex); | 1010 | int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) |
1011 | { | ||
1012 | enum drbd_packet cmd; | ||
1013 | struct drbd_socket *sock; | ||
1014 | struct p_req_state *p; | ||
2306 | 1015 | ||
2307 | return ok; | 1016 | cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; |
1017 | sock = &tconn->data; | ||
1018 | p = conn_prepare_command(tconn, sock); | ||
1019 | if (!p) | ||
1020 | return -EIO; | ||
1021 | p->mask = cpu_to_be32(mask.i); | ||
1022 | p->val = cpu_to_be32(val.i); | ||
1023 | return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); | ||
2308 | } | 1024 | } |
2309 | 1025 | ||
2310 | int drbd_send_state_req(struct drbd_conf *mdev, | 1026 | void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) |
2311 | union drbd_state mask, union drbd_state val) | ||
2312 | { | 1027 | { |
2313 | struct p_req_state p; | 1028 | struct drbd_socket *sock; |
1029 | struct p_req_state_reply *p; | ||
2314 | 1030 | ||
2315 | p.mask = cpu_to_be32(mask.i); | 1031 | sock = &mdev->tconn->meta; |
2316 | p.val = cpu_to_be32(val.i); | 1032 | p = drbd_prepare_command(mdev, sock); |
1033 | if (p) { | ||
1034 | p->retcode = cpu_to_be32(retcode); | ||
1035 | drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); | ||
1036 | } | ||
1037 | } | ||
1038 | |||
1039 | void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode) | ||
1040 | { | ||
1041 | struct drbd_socket *sock; | ||
1042 | struct p_req_state_reply *p; | ||
1043 | enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; | ||
2317 | 1044 | ||
2318 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, | 1045 | sock = &tconn->meta; |
2319 | (struct p_header80 *)&p, sizeof(p)); | 1046 | p = conn_prepare_command(tconn, sock); |
1047 | if (p) { | ||
1048 | p->retcode = cpu_to_be32(retcode); | ||
1049 | conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); | ||
1050 | } | ||
2320 | } | 1051 | } |
2321 | 1052 | ||
2322 | int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) | 1053 | static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) |
2323 | { | 1054 | { |
2324 | struct p_req_state_reply p; | 1055 | BUG_ON(code & ~0xf); |
1056 | p->encoding = (p->encoding & ~0xf) | code; | ||
1057 | } | ||
2325 | 1058 | ||
2326 | p.retcode = cpu_to_be32(retcode); | 1059 | static void dcbp_set_start(struct p_compressed_bm *p, int set) |
1060 | { | ||
1061 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
1062 | } | ||
2327 | 1063 | ||
2328 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, | 1064 | static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) |
2329 | (struct p_header80 *)&p, sizeof(p)); | 1065 | { |
1066 | BUG_ON(n & ~0x7); | ||
1067 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
2330 | } | 1068 | } |
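
The dcbp_set_*() helpers pack three fields into the single encoding byte of the compressed-bitmap packet: the bitmap code in bits 0-3, the pad-bit count in bits 4-6, and the "first run is of set bits" flag in bit 7. A userspace sketch of that layout; unlike the kernel helpers, each setter here touches only its own field:

#include <assert.h>
#include <stdio.h>

static unsigned char set_code(unsigned char enc, unsigned int code)
{
	assert(!(code & ~0xfu));
	return (enc & ~0xf) | code;		/* bits 0-3 */
}

static unsigned char set_pad_bits(unsigned char enc, unsigned int n)
{
	assert(!(n & ~0x7u));
	return (enc & ~(0x7 << 4)) | (n << 4);	/* bits 4-6 */
}

static unsigned char set_start(unsigned char enc, int set)
{
	return (enc & ~0x80) | (set ? 0x80 : 0);	/* bit 7 */
}

int main(void)
{
	unsigned char enc = 0;

	enc = set_code(enc, 2);		/* e.g. an RLE_VLI_Bits-style code */
	enc = set_pad_bits(enc, 5);
	enc = set_start(enc, 1);
	printf("encoding byte: 0x%02x\n", enc);	/* 0x80 | 5<<4 | 2 = 0xd2 */
	return 0;
}
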
2331 | 1069 | ||
2332 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, | 1070 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, |
2333 | struct p_compressed_bm *p, | 1071 | struct p_compressed_bm *p, |
2334 | struct bm_xfer_ctx *c) | 1072 | unsigned int size, |
1073 | struct bm_xfer_ctx *c) | ||
2335 | { | 1074 | { |
2336 | struct bitstream bs; | 1075 | struct bitstream bs; |
2337 | unsigned long plain_bits; | 1076 | unsigned long plain_bits; |
@@ -2339,19 +1078,21 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2339 | unsigned long rl; | 1078 | unsigned long rl; |
2340 | unsigned len; | 1079 | unsigned len; |
2341 | unsigned toggle; | 1080 | unsigned toggle; |
2342 | int bits; | 1081 | int bits, use_rle; |
2343 | 1082 | ||
2344 | /* may we use this feature? */ | 1083 | /* may we use this feature? */ |
2345 | if ((mdev->sync_conf.use_rle == 0) || | 1084 | rcu_read_lock(); |
2346 | (mdev->agreed_pro_version < 90)) | 1085 | use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle; |
2347 | return 0; | 1086 | rcu_read_unlock(); |
1087 | if (!use_rle || mdev->tconn->agreed_pro_version < 90) | ||
1088 | return 0; | ||
2348 | 1089 | ||
2349 | if (c->bit_offset >= c->bm_bits) | 1090 | if (c->bit_offset >= c->bm_bits) |
2350 | return 0; /* nothing to do. */ | 1091 | return 0; /* nothing to do. */ |
2351 | 1092 | ||
2352 | /* use at most thus many bytes */ | 1093 | /* use at most thus many bytes */ |
2353 | bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); | 1094 | bitstream_init(&bs, p->code, size, 0); |
2354 | memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); | 1095 | memset(p->code, 0, size); |
2355 | /* plain bits covered in this code string */ | 1096 | /* plain bits covered in this code string */ |
2356 | plain_bits = 0; | 1097 | plain_bits = 0; |
2357 | 1098 | ||
@@ -2373,12 +1114,12 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2373 | if (rl == 0) { | 1114 | if (rl == 0) { |
2374 | /* the first checked bit was set, | 1115 | /* the first checked bit was set, |
2375 | * store start value, */ | 1116 | * store start value, */ |
2376 | DCBP_set_start(p, 1); | 1117 | dcbp_set_start(p, 1); |
2377 | /* but skip encoding of zero run length */ | 1118 | /* but skip encoding of zero run length */ |
2378 | toggle = !toggle; | 1119 | toggle = !toggle; |
2379 | continue; | 1120 | continue; |
2380 | } | 1121 | } |
2381 | DCBP_set_start(p, 0); | 1122 | dcbp_set_start(p, 0); |
2382 | } | 1123 | } |
2383 | 1124 | ||
2384 | /* paranoia: catch zero runlength. | 1125 | /* paranoia: catch zero runlength. |
@@ -2418,7 +1159,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2418 | bm_xfer_ctx_bit_to_word_offset(c); | 1159 | bm_xfer_ctx_bit_to_word_offset(c); |
2419 | 1160 | ||
2420 | /* store pad_bits */ | 1161 | /* store pad_bits */ |
2421 | DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); | 1162 | dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); |
2422 | 1163 | ||
2423 | return len; | 1164 | return len; |
2424 | } | 1165 | } |
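
fill_bitmap_rle_bits() walks the bitmap as a sequence of alternating runs of clear and set bits and VLI-encodes the run lengths; a zero-length first run is not encoded, only recorded via the start flag. A userspace sketch of the run-length pass over a plain bit array (the VLI step is omitted):

#include <stdio.h>

static int bit(const unsigned char *bm, unsigned int i)
{
	return (bm[i / 8] >> (i % 8)) & 1;
}

static void emit_runs(const unsigned char *bm, unsigned int nbits)
{
	unsigned int i = 0;
	int toggle = 0;			/* 0: currently counting clear bits */

	while (i < nbits) {
		unsigned int rl = 0;

		while (i < nbits && bit(bm, i) == toggle) {
			rl++;
			i++;
		}
		printf("%s run of %u\n", toggle ? "set" : "clear", rl);
		toggle = !toggle;
	}
}

int main(void)
{
	/* bits 0..15, LSB first per byte: 0000 1110 0000 0011 */
	const unsigned char bm[2] = { 0x70, 0xc0 };

	emit_runs(bm, 16);		/* clear 4, set 3, clear 7, set 2 */
	return 0;
}
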
@@ -2430,48 +1171,52 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2430 | * code upon failure. | 1171 | * code upon failure. |
2431 | */ | 1172 | */ |
2432 | static int | 1173 | static int |
2433 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, | 1174 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c) |
2434 | struct p_header80 *h, struct bm_xfer_ctx *c) | ||
2435 | { | 1175 | { |
2436 | struct p_compressed_bm *p = (void*)h; | 1176 | struct drbd_socket *sock = &mdev->tconn->data; |
2437 | unsigned long num_words; | 1177 | unsigned int header_size = drbd_header_size(mdev->tconn); |
2438 | int len; | 1178 | struct p_compressed_bm *p = sock->sbuf + header_size; |
2439 | int ok; | 1179 | int len, err; |
2440 | |||
2441 | len = fill_bitmap_rle_bits(mdev, p, c); | ||
2442 | 1180 | ||
1181 | len = fill_bitmap_rle_bits(mdev, p, | ||
1182 | DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); | ||
2443 | if (len < 0) | 1183 | if (len < 0) |
2444 | return -EIO; | 1184 | return -EIO; |
2445 | 1185 | ||
2446 | if (len) { | 1186 | if (len) { |
2447 | DCBP_set_code(p, RLE_VLI_Bits); | 1187 | dcbp_set_code(p, RLE_VLI_Bits); |
2448 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, | 1188 | err = __send_command(mdev->tconn, mdev->vnr, sock, |
2449 | sizeof(*p) + len, 0); | 1189 | P_COMPRESSED_BITMAP, sizeof(*p) + len, |
2450 | 1190 | NULL, 0); | |
2451 | c->packets[0]++; | 1191 | c->packets[0]++; |
2452 | c->bytes[0] += sizeof(*p) + len; | 1192 | c->bytes[0] += header_size + sizeof(*p) + len; |
2453 | 1193 | ||
2454 | if (c->bit_offset >= c->bm_bits) | 1194 | if (c->bit_offset >= c->bm_bits) |
2455 | len = 0; /* DONE */ | 1195 | len = 0; /* DONE */ |
2456 | } else { | 1196 | } else { |
2457 | /* was not compressible. | 1197 | /* was not compressible. |
2458 | * send a buffer full of plain text bits instead. */ | 1198 | * send a buffer full of plain text bits instead. */ |
2459 | num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | 1199 | unsigned int data_size; |
2460 | len = num_words * sizeof(long); | 1200 | unsigned long num_words; |
1201 | unsigned long *p = sock->sbuf + header_size; | ||
1202 | |||
1203 | data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; | ||
1204 | num_words = min_t(size_t, data_size / sizeof(*p), | ||
1205 | c->bm_words - c->word_offset); | ||
1206 | len = num_words * sizeof(*p); | ||
2461 | if (len) | 1207 | if (len) |
2462 | drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); | 1208 | drbd_bm_get_lel(mdev, c->word_offset, num_words, p); |
2463 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, | 1209 | err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0); |
2464 | h, sizeof(struct p_header80) + len, 0); | ||
2465 | c->word_offset += num_words; | 1210 | c->word_offset += num_words; |
2466 | c->bit_offset = c->word_offset * BITS_PER_LONG; | 1211 | c->bit_offset = c->word_offset * BITS_PER_LONG; |
2467 | 1212 | ||
2468 | c->packets[1]++; | 1213 | c->packets[1]++; |
2469 | c->bytes[1] += sizeof(struct p_header80) + len; | 1214 | c->bytes[1] += header_size + len; |
2470 | 1215 | ||
2471 | if (c->bit_offset > c->bm_bits) | 1216 | if (c->bit_offset > c->bm_bits) |
2472 | c->bit_offset = c->bm_bits; | 1217 | c->bit_offset = c->bm_bits; |
2473 | } | 1218 | } |
2474 | if (ok) { | 1219 | if (!err) { |
2475 | if (len == 0) { | 1220 | if (len == 0) { |
2476 | INFO_bm_xfer_stats(mdev, "send", c); | 1221 | INFO_bm_xfer_stats(mdev, "send", c); |
2477 | return 0; | 1222 | return 0; |
@@ -2482,21 +1227,13 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, | |||
2482 | } | 1227 | } |
2483 | 1228 | ||
2484 | /* See the comment at receive_bitmap() */ | 1229 | /* See the comment at receive_bitmap() */ |
2485 | int _drbd_send_bitmap(struct drbd_conf *mdev) | 1230 | static int _drbd_send_bitmap(struct drbd_conf *mdev) |
2486 | { | 1231 | { |
2487 | struct bm_xfer_ctx c; | 1232 | struct bm_xfer_ctx c; |
2488 | struct p_header80 *p; | ||
2489 | int err; | 1233 | int err; |
2490 | 1234 | ||
2491 | ERR_IF(!mdev->bitmap) return false; | 1235 | if (!expect(mdev->bitmap)) |
2492 | |||
2493 | /* maybe we should use some per thread scratch page, | ||
2494 | * and allocate that during initial device creation? */ | ||
2495 | p = (struct p_header80 *) __get_free_page(GFP_NOIO); | ||
2496 | if (!p) { | ||
2497 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
2498 | return false; | 1236 | return false; |
2499 | } | ||
2500 | 1237 | ||
2501 | if (get_ldev(mdev)) { | 1238 | if (get_ldev(mdev)) { |
2502 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | 1239 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { |
@@ -2521,37 +1258,39 @@ int _drbd_send_bitmap(struct drbd_conf *mdev) | |||
2521 | }; | 1258 | }; |
2522 | 1259 | ||
2523 | do { | 1260 | do { |
2524 | err = send_bitmap_rle_or_plain(mdev, p, &c); | 1261 | err = send_bitmap_rle_or_plain(mdev, &c); |
2525 | } while (err > 0); | 1262 | } while (err > 0); |
2526 | 1263 | ||
2527 | free_page((unsigned long) p); | ||
2528 | return err == 0; | 1264 | return err == 0; |
2529 | } | 1265 | } |
2530 | 1266 | ||
2531 | int drbd_send_bitmap(struct drbd_conf *mdev) | 1267 | int drbd_send_bitmap(struct drbd_conf *mdev) |
2532 | { | 1268 | { |
2533 | int err; | 1269 | struct drbd_socket *sock = &mdev->tconn->data; |
1270 | int err = -1; | ||
2534 | 1271 | ||
2535 | if (!drbd_get_data_sock(mdev)) | 1272 | mutex_lock(&sock->mutex); |
2536 | return -1; | 1273 | if (sock->socket) |
2537 | err = !_drbd_send_bitmap(mdev); | 1274 | err = !_drbd_send_bitmap(mdev); |
2538 | drbd_put_data_sock(mdev); | 1275 | mutex_unlock(&sock->mutex); |
2539 | return err; | 1276 | return err; |
2540 | } | 1277 | } |
2541 | 1278 | ||
2542 | int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | 1279 | void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size) |
2543 | { | 1280 | { |
2544 | int ok; | 1281 | struct drbd_socket *sock; |
2545 | struct p_barrier_ack p; | 1282 | struct p_barrier_ack *p; |
2546 | 1283 | ||
2547 | p.barrier = barrier_nr; | 1284 | if (tconn->cstate < C_WF_REPORT_PARAMS) |
2548 | p.set_size = cpu_to_be32(set_size); | 1285 | return; |
2549 | 1286 | ||
2550 | if (mdev->state.conn < C_CONNECTED) | 1287 | sock = &tconn->meta; |
2551 | return false; | 1288 | p = conn_prepare_command(tconn, sock); |
2552 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, | 1289 | if (!p) |
2553 | (struct p_header80 *)&p, sizeof(p)); | 1290 | return; |
2554 | return ok; | 1291 | p->barrier = barrier_nr; |
1292 | p->set_size = cpu_to_be32(set_size); | ||
1293 | conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); | ||
2555 | } | 1294 | } |
2556 | 1295 | ||
2557 | /** | 1296 | /** |
@@ -2562,62 +1301,62 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | |||
2562 | * @blksize: size in byte, needs to be in big endian byte order | 1301 | * @blksize: size in byte, needs to be in big endian byte order |
2563 | * @block_id: Id, big endian byte order | 1302 | * @block_id: Id, big endian byte order |
2564 | */ | 1303 | */ |
2565 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | 1304 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, |
2566 | u64 sector, | 1305 | u64 sector, u32 blksize, u64 block_id) |
2567 | u32 blksize, | ||
2568 | u64 block_id) | ||
2569 | { | 1306 | { |
2570 | int ok; | 1307 | struct drbd_socket *sock; |
2571 | struct p_block_ack p; | 1308 | struct p_block_ack *p; |
2572 | 1309 | ||
2573 | p.sector = sector; | 1310 | if (mdev->state.conn < C_CONNECTED) |
2574 | p.block_id = block_id; | 1311 | return -EIO; |
2575 | p.blksize = blksize; | ||
2576 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2577 | 1312 | ||
2578 | if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) | 1313 | sock = &mdev->tconn->meta; |
2579 | return false; | 1314 | p = drbd_prepare_command(mdev, sock); |
2580 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, | 1315 | if (!p) |
2581 | (struct p_header80 *)&p, sizeof(p)); | 1316 | return -EIO; |
2582 | return ok; | 1317 | p->sector = sector; |
1318 | p->block_id = block_id; | ||
1319 | p->blksize = blksize; | ||
1320 | p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); | ||
1321 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); | ||
2583 | } | 1322 | } |
2584 | 1323 | ||
2585 | /* dp->sector and dp->block_id already/still in network byte order, | 1324 | /* dp->sector and dp->block_id already/still in network byte order, |
2586 | * data_size is payload size according to dp->head, | 1325 | * data_size is payload size according to dp->head, |
2587 | * and may need to be corrected for digest size. */ | 1326 | * and may need to be corrected for digest size. */ |
2588 | int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | 1327 | void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, |
2589 | struct p_data *dp, int data_size) | 1328 | struct p_data *dp, int data_size) |
2590 | { | 1329 | { |
2591 | data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1330 | if (mdev->tconn->peer_integrity_tfm) |
2592 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1331 | data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); |
2593 | return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), | 1332 | _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), |
2594 | dp->block_id); | 1333 | dp->block_id); |
2595 | } | 1334 | } |
2596 | 1335 | ||
2597 | int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | 1336 | void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, |
2598 | struct p_block_req *rp) | 1337 | struct p_block_req *rp) |
2599 | { | 1338 | { |
2600 | return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); | 1339 | _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); |
2601 | } | 1340 | } |
2602 | 1341 | ||
2603 | /** | 1342 | /** |
2604 | * drbd_send_ack() - Sends an ack packet | 1343 | * drbd_send_ack() - Sends an ack packet |
2605 | * @mdev: DRBD device. | 1344 | * @mdev: DRBD device |
2606 | * @cmd: Packet command code. | 1345 | * @cmd: packet command code |
2607 | * @e: Epoch entry. | 1346 | * @peer_req: peer request |
2608 | */ | 1347 | */ |
2609 | int drbd_send_ack(struct drbd_conf *mdev, | 1348 | int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, |
2610 | enum drbd_packets cmd, struct drbd_epoch_entry *e) | 1349 | struct drbd_peer_request *peer_req) |
2611 | { | 1350 | { |
2612 | return _drbd_send_ack(mdev, cmd, | 1351 | return _drbd_send_ack(mdev, cmd, |
2613 | cpu_to_be64(e->sector), | 1352 | cpu_to_be64(peer_req->i.sector), |
2614 | cpu_to_be32(e->size), | 1353 | cpu_to_be32(peer_req->i.size), |
2615 | e->block_id); | 1354 | peer_req->block_id); |
2616 | } | 1355 | } |
2617 | 1356 | ||
2618 | /* This function misuses the block_id field to signal if the blocks | 1357 | /* This function misuses the block_id field to signal if the blocks |
2619 | * are in sync or not. */ | 1358 | * are in sync or not. */ |
2620 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | 1359 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, |
2621 | sector_t sector, int blksize, u64 block_id) | 1360 | sector_t sector, int blksize, u64 block_id) |
2622 | { | 1361 | { |
2623 | return _drbd_send_ack(mdev, cmd, | 1362 | return _drbd_send_ack(mdev, cmd, |
@@ -2629,85 +1368,87 @@ int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | |||
2629 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | 1368 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, |
2630 | sector_t sector, int size, u64 block_id) | 1369 | sector_t sector, int size, u64 block_id) |
2631 | { | 1370 | { |
2632 | int ok; | 1371 | struct drbd_socket *sock; |
2633 | struct p_block_req p; | 1372 | struct p_block_req *p; |
2634 | 1373 | ||
2635 | p.sector = cpu_to_be64(sector); | 1374 | sock = &mdev->tconn->data; |
2636 | p.block_id = block_id; | 1375 | p = drbd_prepare_command(mdev, sock); |
2637 | p.blksize = cpu_to_be32(size); | 1376 | if (!p) |
2638 | 1377 | return -EIO; | |
2639 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, | 1378 | p->sector = cpu_to_be64(sector); |
2640 | (struct p_header80 *)&p, sizeof(p)); | 1379 | p->block_id = block_id; |
2641 | return ok; | 1380 | p->blksize = cpu_to_be32(size); |
1381 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); | ||
2642 | } | 1382 | } |
2643 | 1383 | ||
2644 | int drbd_send_drequest_csum(struct drbd_conf *mdev, | 1384 | int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size, |
2645 | sector_t sector, int size, | 1385 | void *digest, int digest_size, enum drbd_packet cmd) |
2646 | void *digest, int digest_size, | ||
2647 | enum drbd_packets cmd) | ||
2648 | { | 1386 | { |
2649 | int ok; | 1387 | struct drbd_socket *sock; |
2650 | struct p_block_req p; | 1388 | struct p_block_req *p; |
2651 | |||
2652 | p.sector = cpu_to_be64(sector); | ||
2653 | p.block_id = BE_DRBD_MAGIC + 0xbeef; | ||
2654 | p.blksize = cpu_to_be32(size); | ||
2655 | |||
2656 | p.head.magic = BE_DRBD_MAGIC; | ||
2657 | p.head.command = cpu_to_be16(cmd); | ||
2658 | p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size); | ||
2659 | |||
2660 | mutex_lock(&mdev->data.mutex); | ||
2661 | 1389 | ||
2662 | ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); | 1390 | /* FIXME: Put the digest into the preallocated socket buffer. */ |
2663 | ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); | ||
2664 | 1391 | ||
2665 | mutex_unlock(&mdev->data.mutex); | 1392 | sock = &mdev->tconn->data; |
2666 | 1393 | p = drbd_prepare_command(mdev, sock); | |
2667 | return ok; | 1394 | if (!p) |
1395 | return -EIO; | ||
1396 | p->sector = cpu_to_be64(sector); | ||
1397 | p->block_id = ID_SYNCER /* unused */; | ||
1398 | p->blksize = cpu_to_be32(size); | ||
1399 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), | ||
1400 | digest, digest_size); | ||
2668 | } | 1401 | } |
2669 | 1402 | ||
2670 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | 1403 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) |
2671 | { | 1404 | { |
2672 | int ok; | 1405 | struct drbd_socket *sock; |
2673 | struct p_block_req p; | 1406 | struct p_block_req *p; |
2674 | |||
2675 | p.sector = cpu_to_be64(sector); | ||
2676 | p.block_id = BE_DRBD_MAGIC + 0xbabe; | ||
2677 | p.blksize = cpu_to_be32(size); | ||
2678 | 1407 | ||
2679 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, | 1408 | sock = &mdev->tconn->data; |
2680 | (struct p_header80 *)&p, sizeof(p)); | 1409 | p = drbd_prepare_command(mdev, sock); |
2681 | return ok; | 1410 | if (!p) |
1411 | return -EIO; | ||
1412 | p->sector = cpu_to_be64(sector); | ||
1413 | p->block_id = ID_SYNCER /* unused */; | ||
1414 | p->blksize = cpu_to_be32(size); | ||
1415 | return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); | ||
2682 | } | 1416 | } |
2683 | 1417 | ||
2684 | /* called on sndtimeo | 1418 | /* called on sndtimeo |
2685 | * returns false if we should retry, | 1419 | * returns false if we should retry, |
2686 | * true if we think connection is dead | 1420 | * true if we think connection is dead |
2687 | */ | 1421 | */ |
2688 | static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) | 1422 | static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock) |
2689 | { | 1423 | { |
2690 | int drop_it; | 1424 | int drop_it; |
2691 | /* long elapsed = (long)(jiffies - mdev->last_received); */ | 1425 | /* long elapsed = (long)(jiffies - mdev->last_received); */ |
2692 | 1426 | ||
2693 | drop_it = mdev->meta.socket == sock | 1427 | drop_it = tconn->meta.socket == sock |
2694 | || !mdev->asender.task | 1428 | || !tconn->asender.task |
2695 | || get_t_state(&mdev->asender) != Running | 1429 | || get_t_state(&tconn->asender) != RUNNING |
2696 | || mdev->state.conn < C_CONNECTED; | 1430 | || tconn->cstate < C_WF_REPORT_PARAMS; |
2697 | 1431 | ||
2698 | if (drop_it) | 1432 | if (drop_it) |
2699 | return true; | 1433 | return true; |
2700 | 1434 | ||
2701 | drop_it = !--mdev->ko_count; | 1435 | drop_it = !--tconn->ko_count; |
2702 | if (!drop_it) { | 1436 | if (!drop_it) { |
2703 | dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", | 1437 | conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n", |
2704 | current->comm, current->pid, mdev->ko_count); | 1438 | current->comm, current->pid, tconn->ko_count); |
2705 | request_ping(mdev); | 1439 | request_ping(tconn); |
2706 | } | 1440 | } |
2707 | 1441 | ||
2708 | return drop_it; /* && (mdev->state == R_PRIMARY) */; | 1442 | return drop_it; /* && (mdev->state == R_PRIMARY) */; |
2709 | } | 1443 | } |
2710 | 1444 | ||
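In effect, every socket send timeout decrements tconn->ko_count (reloaded from net_conf->ko_count in drbd_send() below); only when the counter reaches zero is the peer declared dead, otherwise a ping is requested and the send retried.

    /* rough timing sketch: with ko-count K and a send timeout of T seconds,
     * a peer that never drains its receive buffer is given up on after
     * about K * T seconds, e.g. K = 7 and T = 6 give roughly 42 seconds;
     * the actual values come from the net configuration. */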
1445 | static void drbd_update_congested(struct drbd_tconn *tconn) | ||
1446 | { | ||
1447 | struct sock *sk = tconn->data.socket->sk; | ||
1448 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
1449 | set_bit(NET_CONGESTED, &tconn->flags); | ||
1450 | } | ||
1451 | |||
2711 | /* The idea of sendpage seems to be to put some kind of reference | 1452 | /* The idea of sendpage seems to be to put some kind of reference |
2712 | * to the page into the skb, and to hand it over to the NIC. In | 1453 | * to the page into the skb, and to hand it over to the NIC. In |
2713 | * this process get_page() gets called. | 1454 | * this process get_page() gets called. |
@@ -2730,21 +1471,28 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket * | |||
2730 | * with page_count == 0 or PageSlab. | 1471 | * with page_count == 0 or PageSlab. |
2731 | */ | 1472 | */ |
2732 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, | 1473 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, |
2733 | int offset, size_t size, unsigned msg_flags) | 1474 | int offset, size_t size, unsigned msg_flags) |
2734 | { | 1475 | { |
2735 | int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags); | 1476 | struct socket *socket; |
1477 | void *addr; | ||
1478 | int err; | ||
1479 | |||
1480 | socket = mdev->tconn->data.socket; | ||
1481 | addr = kmap(page) + offset; | ||
1482 | err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags); | ||
2736 | kunmap(page); | 1483 | kunmap(page); |
2737 | if (sent == size) | 1484 | if (!err) |
2738 | mdev->send_cnt += size>>9; | 1485 | mdev->send_cnt += size >> 9; |
2739 | return sent == size; | 1486 | return err; |
2740 | } | 1487 | } |
2741 | 1488 | ||
2742 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | 1489 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, |
2743 | int offset, size_t size, unsigned msg_flags) | 1490 | int offset, size_t size, unsigned msg_flags) |
2744 | { | 1491 | { |
1492 | struct socket *socket = mdev->tconn->data.socket; | ||
2745 | mm_segment_t oldfs = get_fs(); | 1493 | mm_segment_t oldfs = get_fs(); |
2746 | int sent, ok; | ||
2747 | int len = size; | 1494 | int len = size; |
1495 | int err = -EIO; | ||
2748 | 1496 | ||
2749 | /* e.g. XFS meta- & log-data is in slab pages, which have a | 1497 | /* e.g. XFS meta- & log-data is in slab pages, which have a |
2750 | * page_count of 0 and/or have PageSlab() set. | 1498 | * page_count of 0 and/or have PageSlab() set. |
@@ -2756,34 +1504,35 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | |||
2756 | return _drbd_no_send_page(mdev, page, offset, size, msg_flags); | 1504 | return _drbd_no_send_page(mdev, page, offset, size, msg_flags); |
2757 | 1505 | ||
2758 | msg_flags |= MSG_NOSIGNAL; | 1506 | msg_flags |= MSG_NOSIGNAL; |
2759 | drbd_update_congested(mdev); | 1507 | drbd_update_congested(mdev->tconn); |
2760 | set_fs(KERNEL_DS); | 1508 | set_fs(KERNEL_DS); |
2761 | do { | 1509 | do { |
2762 | sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, | 1510 | int sent; |
2763 | offset, len, | 1511 | |
2764 | msg_flags); | 1512 | sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); |
2765 | if (sent == -EAGAIN) { | ||
2766 | if (we_should_drop_the_connection(mdev, | ||
2767 | mdev->data.socket)) | ||
2768 | break; | ||
2769 | else | ||
2770 | continue; | ||
2771 | } | ||
2772 | if (sent <= 0) { | 1513 | if (sent <= 0) { |
1514 | if (sent == -EAGAIN) { | ||
1515 | if (we_should_drop_the_connection(mdev->tconn, socket)) | ||
1516 | break; | ||
1517 | continue; | ||
1518 | } | ||
2773 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", | 1519 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", |
2774 | __func__, (int)size, len, sent); | 1520 | __func__, (int)size, len, sent); |
1521 | if (sent < 0) | ||
1522 | err = sent; | ||
2775 | break; | 1523 | break; |
2776 | } | 1524 | } |
2777 | len -= sent; | 1525 | len -= sent; |
2778 | offset += sent; | 1526 | offset += sent; |
2779 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); | 1527 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); |
2780 | set_fs(oldfs); | 1528 | set_fs(oldfs); |
2781 | drbd_clear_flag(mdev, NET_CONGESTED); | 1529 | clear_bit(NET_CONGESTED, &mdev->tconn->flags); |
2782 | 1530 | ||
2783 | ok = (len == 0); | 1531 | if (len == 0) { |
2784 | if (likely(ok)) | 1532 | err = 0; |
2785 | mdev->send_cnt += size>>9; | 1533 | mdev->send_cnt += size >> 9; |
2786 | return ok; | 1534 | } |
1535 | return err; | ||
2787 | } | 1536 | } |
2788 | 1537 | ||
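The guard that routes such pages to the copying path sits just above this hunk in _drbd_send_page(); roughly, and leaving aside any module parameter that may also disable sendpage, it amounts to:

    /* sketch of the elided fast-path guard in _drbd_send_page() */
    if (page_count(page) < 1 || PageSlab(page))
            return _drbd_no_send_page(mdev, page, offset, size, msg_flags);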
2789 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | 1538 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) |
@@ -2792,12 +1541,15 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2792 | int i; | 1541 | int i; |
2793 | /* hint all but last page with MSG_MORE */ | 1542 | /* hint all but last page with MSG_MORE */ |
2794 | bio_for_each_segment(bvec, bio, i) { | 1543 | bio_for_each_segment(bvec, bio, i) { |
2795 | if (!_drbd_no_send_page(mdev, bvec->bv_page, | 1544 | int err; |
2796 | bvec->bv_offset, bvec->bv_len, | 1545 | |
2797 | i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) | 1546 | err = _drbd_no_send_page(mdev, bvec->bv_page, |
2798 | return 0; | 1547 | bvec->bv_offset, bvec->bv_len, |
1548 | i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); | ||
1549 | if (err) | ||
1550 | return err; | ||
2799 | } | 1551 | } |
2800 | return 1; | 1552 | return 0; |
2801 | } | 1553 | } |
2802 | 1554 | ||
2803 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | 1555 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) |
@@ -2806,32 +1558,40 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2806 | int i; | 1558 | int i; |
2807 | /* hint all but last page with MSG_MORE */ | 1559 | /* hint all but last page with MSG_MORE */ |
2808 | bio_for_each_segment(bvec, bio, i) { | 1560 | bio_for_each_segment(bvec, bio, i) { |
2809 | if (!_drbd_send_page(mdev, bvec->bv_page, | 1561 | int err; |
2810 | bvec->bv_offset, bvec->bv_len, | 1562 | |
2811 | i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) | 1563 | err = _drbd_send_page(mdev, bvec->bv_page, |
2812 | return 0; | 1564 | bvec->bv_offset, bvec->bv_len, |
1565 | i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); | ||
1566 | if (err) | ||
1567 | return err; | ||
2813 | } | 1568 | } |
2814 | return 1; | 1569 | return 0; |
2815 | } | 1570 | } |
2816 | 1571 | ||
2817 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 1572 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, |
1573 | struct drbd_peer_request *peer_req) | ||
2818 | { | 1574 | { |
2819 | struct page *page = e->pages; | 1575 | struct page *page = peer_req->pages; |
2820 | unsigned len = e->size; | 1576 | unsigned len = peer_req->i.size; |
1577 | int err; | ||
1578 | |||
2821 | /* hint all but last page with MSG_MORE */ | 1579 | /* hint all but last page with MSG_MORE */ |
2822 | page_chain_for_each(page) { | 1580 | page_chain_for_each(page) { |
2823 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | 1581 | unsigned l = min_t(unsigned, len, PAGE_SIZE); |
2824 | if (!_drbd_send_page(mdev, page, 0, l, | 1582 | |
2825 | page_chain_next(page) ? MSG_MORE : 0)) | 1583 | err = _drbd_send_page(mdev, page, 0, l, |
2826 | return 0; | 1584 | page_chain_next(page) ? MSG_MORE : 0); |
1585 | if (err) | ||
1586 | return err; | ||
2827 | len -= l; | 1587 | len -= l; |
2828 | } | 1588 | } |
2829 | return 1; | 1589 | return 0; |
2830 | } | 1590 | } |
2831 | 1591 | ||
2832 | static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) | 1592 | static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) |
2833 | { | 1593 | { |
2834 | if (mdev->agreed_pro_version >= 95) | 1594 | if (mdev->tconn->agreed_pro_version >= 95) |
2835 | return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | | 1595 | return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | |
2836 | (bi_rw & REQ_FUA ? DP_FUA : 0) | | 1596 | (bi_rw & REQ_FUA ? DP_FUA : 0) | |
2837 | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | | 1597 | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | |
@@ -2845,50 +1605,36 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) | |||
2845 | */ | 1605 | */ |
2846 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | 1606 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) |
2847 | { | 1607 | { |
2848 | int ok = 1; | 1608 | struct drbd_socket *sock; |
2849 | struct p_data p; | 1609 | struct p_data *p; |
2850 | unsigned int dp_flags = 0; | 1610 | unsigned int dp_flags = 0; |
2851 | void *dgb; | ||
2852 | int dgs; | 1611 | int dgs; |
1612 | int err; | ||
2853 | 1613 | ||
2854 | if (!drbd_get_data_sock(mdev)) | 1614 | sock = &mdev->tconn->data; |
2855 | return 0; | 1615 | p = drbd_prepare_command(mdev, sock); |
2856 | 1616 | dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; | |
2857 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2858 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2859 | |||
2860 | if (req->size <= DRBD_MAX_SIZE_H80_PACKET) { | ||
2861 | p.head.h80.magic = BE_DRBD_MAGIC; | ||
2862 | p.head.h80.command = cpu_to_be16(P_DATA); | ||
2863 | p.head.h80.length = | ||
2864 | cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size); | ||
2865 | } else { | ||
2866 | p.head.h95.magic = BE_DRBD_MAGIC_BIG; | ||
2867 | p.head.h95.command = cpu_to_be16(P_DATA); | ||
2868 | p.head.h95.length = | ||
2869 | cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size); | ||
2870 | } | ||
2871 | |||
2872 | p.sector = cpu_to_be64(req->sector); | ||
2873 | p.block_id = (unsigned long)req; | ||
2874 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2875 | 1617 | ||
1618 | if (!p) | ||
1619 | return -EIO; | ||
1620 | p->sector = cpu_to_be64(req->i.sector); | ||
1621 | p->block_id = (unsigned long)req; | ||
1622 | p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); | ||
2876 | dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); | 1623 | dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); |
2877 | |||
2878 | if (mdev->state.conn >= C_SYNC_SOURCE && | 1624 | if (mdev->state.conn >= C_SYNC_SOURCE && |
2879 | mdev->state.conn <= C_PAUSED_SYNC_T) | 1625 | mdev->state.conn <= C_PAUSED_SYNC_T) |
2880 | dp_flags |= DP_MAY_SET_IN_SYNC; | 1626 | dp_flags |= DP_MAY_SET_IN_SYNC; |
2881 | 1627 | if (mdev->tconn->agreed_pro_version >= 100) { | |
2882 | p.dp_flags = cpu_to_be32(dp_flags); | 1628 | if (req->rq_state & RQ_EXP_RECEIVE_ACK) |
2883 | drbd_set_flag(mdev, UNPLUG_REMOTE); | 1629 | dp_flags |= DP_SEND_RECEIVE_ACK; |
2884 | ok = (sizeof(p) == | 1630 | if (req->rq_state & RQ_EXP_WRITE_ACK) |
2885 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0)); | 1631 | dp_flags |= DP_SEND_WRITE_ACK; |
2886 | if (ok && dgs) { | 1632 | } |
2887 | dgb = mdev->int_dig_out; | 1633 | p->dp_flags = cpu_to_be32(dp_flags); |
2888 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | 1634 | if (dgs) |
2889 | ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); | 1635 | drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1); |
2890 | } | 1636 | err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); |
2891 | if (ok) { | 1637 | if (!err) { |
2892 | /* For protocol A, we have to memcpy the payload into | 1638 | /* For protocol A, we have to memcpy the payload into |
2893 | * socket buffers, as we may complete right away | 1639 | * socket buffers, as we may complete right away |
2894 | * as soon as we handed it over to tcp, at which point the data | 1640 | * as soon as we handed it over to tcp, at which point the data |
@@ -2900,92 +1646,76 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | |||
2900 | * out ok after sending on this side, but does not fit on the | 1646 | * out ok after sending on this side, but does not fit on the |
2901 | * receiving side, we sure have detected corruption elsewhere. | 1647 | * receiving side, we sure have detected corruption elsewhere. |
2902 | */ | 1648 | */ |
2903 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs) | 1649 | if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs) |
2904 | ok = _drbd_send_bio(mdev, req->master_bio); | 1650 | err = _drbd_send_bio(mdev, req->master_bio); |
2905 | else | 1651 | else |
2906 | ok = _drbd_send_zc_bio(mdev, req->master_bio); | 1652 | err = _drbd_send_zc_bio(mdev, req->master_bio); |
2907 | 1653 | ||
2908 | /* double check digest, sometimes buffers have been modified in flight. */ | 1654 | /* double check digest, sometimes buffers have been modified in flight. */ |
2909 | if (dgs > 0 && dgs <= 64) { | 1655 | if (dgs > 0 && dgs <= 64) { |
2910 | /* 64 byte, 512 bit, is the largest digest size | 1656 | /* 64 byte, 512 bit, is the largest digest size |
2911 | * currently supported in kernel crypto. */ | 1657 | * currently supported in kernel crypto. */ |
2912 | unsigned char digest[64]; | 1658 | unsigned char digest[64]; |
2913 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); | 1659 | drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest); |
2914 | if (memcmp(mdev->int_dig_out, digest, dgs)) { | 1660 | if (memcmp(p + 1, digest, dgs)) { |
2915 | dev_warn(DEV, | 1661 | dev_warn(DEV, |
2916 | "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", | 1662 | "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", |
2917 | (unsigned long long)req->sector, req->size); | 1663 | (unsigned long long)req->i.sector, req->i.size); |
2918 | } | 1664 | } |
2919 | } /* else if (dgs > 64) { | 1665 | } /* else if (dgs > 64) { |
2920 | ... Be noisy about digest too large ... | 1666 | ... Be noisy about digest too large ... |
2921 | } */ | 1667 | } */ |
2922 | } | 1668 | } |
1669 | mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ | ||
2923 | 1670 | ||
2924 | drbd_put_data_sock(mdev); | 1671 | return err; |
2925 | |||
2926 | return ok; | ||
2927 | } | 1672 | } |
2928 | 1673 | ||
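For orientation, the resulting P_DATA packet on the data socket is laid out as sketched below; the header details live in __send_command(), outside this hunk.

    /*  [ header | struct p_data | dgs digest bytes | req->i.size payload ]
     *
     * __send_command() transmits header + p_data + digest from the
     * preallocated socket buffer (sizeof(*p) + dgs) and announces another
     * req->i.size bytes, which then follow via _drbd_send_bio() (copied)
     * or _drbd_send_zc_bio() (zero-copy). */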
2929 | /* answer packet, used to send data back for read requests: | 1674 | /* answer packet, used to send data back for read requests: |
2930 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) | 1675 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) |
2931 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) | 1676 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) |
2932 | */ | 1677 | */ |
2933 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | 1678 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd, |
2934 | struct drbd_epoch_entry *e) | 1679 | struct drbd_peer_request *peer_req) |
2935 | { | 1680 | { |
2936 | int ok; | 1681 | struct drbd_socket *sock; |
2937 | struct p_data p; | 1682 | struct p_data *p; |
2938 | void *dgb; | 1683 | int err; |
2939 | int dgs; | 1684 | int dgs; |
2940 | 1685 | ||
2941 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | 1686 | sock = &mdev->tconn->data; |
2942 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | 1687 | p = drbd_prepare_command(mdev, sock); |
2943 | 1688 | ||
2944 | if (e->size <= DRBD_MAX_SIZE_H80_PACKET) { | 1689 | dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; |
2945 | p.head.h80.magic = BE_DRBD_MAGIC; | ||
2946 | p.head.h80.command = cpu_to_be16(cmd); | ||
2947 | p.head.h80.length = | ||
2948 | cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); | ||
2949 | } else { | ||
2950 | p.head.h95.magic = BE_DRBD_MAGIC_BIG; | ||
2951 | p.head.h95.command = cpu_to_be16(cmd); | ||
2952 | p.head.h95.length = | ||
2953 | cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); | ||
2954 | } | ||
2955 | 1690 | ||
2956 | p.sector = cpu_to_be64(e->sector); | 1691 | if (!p) |
2957 | p.block_id = e->block_id; | 1692 | return -EIO; |
2958 | /* p.seq_num = 0; No sequence numbers here.. */ | 1693 | p->sector = cpu_to_be64(peer_req->i.sector); |
2959 | 1694 | p->block_id = peer_req->block_id; | |
2960 | /* Only called by our kernel thread. | 1695 | p->seq_num = 0; /* unused */ |
2961 | * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL | 1696 | p->dp_flags = 0; |
2962 | * in response to admin command or module unload. | 1697 | if (dgs) |
2963 | */ | 1698 | drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1); |
2964 | if (!drbd_get_data_sock(mdev)) | 1699 | err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size); |
2965 | return 0; | 1700 | if (!err) |
2966 | 1701 | err = _drbd_send_zc_ee(mdev, peer_req); | |
2967 | ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0); | 1702 | mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ |
2968 | if (ok && dgs) { | ||
2969 | dgb = mdev->int_dig_out; | ||
2970 | drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); | ||
2971 | ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); | ||
2972 | } | ||
2973 | if (ok) | ||
2974 | ok = _drbd_send_zc_ee(mdev, e); | ||
2975 | |||
2976 | drbd_put_data_sock(mdev); | ||
2977 | 1703 | ||
2978 | return ok; | 1704 | return err; |
2979 | } | 1705 | } |
2980 | 1706 | ||
2981 | int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) | 1707 | int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req) |
2982 | { | 1708 | { |
2983 | struct p_block_desc p; | 1709 | struct drbd_socket *sock; |
2984 | 1710 | struct p_block_desc *p; | |
2985 | p.sector = cpu_to_be64(req->sector); | ||
2986 | p.blksize = cpu_to_be32(req->size); | ||
2987 | 1711 | ||
2988 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p)); | 1712 | sock = &mdev->tconn->data; |
1713 | p = drbd_prepare_command(mdev, sock); | ||
1714 | if (!p) | ||
1715 | return -EIO; | ||
1716 | p->sector = cpu_to_be64(req->i.sector); | ||
1717 | p->blksize = cpu_to_be32(req->i.size); | ||
1718 | return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); | ||
2989 | } | 1719 | } |
2990 | 1720 | ||
2991 | /* | 1721 | /* |
@@ -3004,7 +1734,7 @@ int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) | |||
3004 | /* | 1734 | /* |
3005 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! | 1735 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! |
3006 | */ | 1736 | */ |
3007 | int drbd_send(struct drbd_conf *mdev, struct socket *sock, | 1737 | int drbd_send(struct drbd_tconn *tconn, struct socket *sock, |
3008 | void *buf, size_t size, unsigned msg_flags) | 1738 | void *buf, size_t size, unsigned msg_flags) |
3009 | { | 1739 | { |
3010 | struct kvec iov; | 1740 | struct kvec iov; |
@@ -3012,7 +1742,7 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3012 | int rv, sent = 0; | 1742 | int rv, sent = 0; |
3013 | 1743 | ||
3014 | if (!sock) | 1744 | if (!sock) |
3015 | return -1000; | 1745 | return -EBADR; |
3016 | 1746 | ||
3017 | /* THINK if (signal_pending) return ... ? */ | 1747 | /* THINK if (signal_pending) return ... ? */ |
3018 | 1748 | ||
@@ -3025,9 +1755,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3025 | msg.msg_controllen = 0; | 1755 | msg.msg_controllen = 0; |
3026 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; | 1756 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; |
3027 | 1757 | ||
3028 | if (sock == mdev->data.socket) { | 1758 | if (sock == tconn->data.socket) { |
3029 | mdev->ko_count = mdev->net_conf->ko_count; | 1759 | rcu_read_lock(); |
3030 | drbd_update_congested(mdev); | 1760 | tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count; |
1761 | rcu_read_unlock(); | ||
1762 | drbd_update_congested(tconn); | ||
3031 | } | 1763 | } |
3032 | do { | 1764 | do { |
3033 | /* STRANGE | 1765 | /* STRANGE |
@@ -3041,12 +1773,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3041 | */ | 1773 | */ |
3042 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); | 1774 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); |
3043 | if (rv == -EAGAIN) { | 1775 | if (rv == -EAGAIN) { |
3044 | if (we_should_drop_the_connection(mdev, sock)) | 1776 | if (we_should_drop_the_connection(tconn, sock)) |
3045 | break; | 1777 | break; |
3046 | else | 1778 | else |
3047 | continue; | 1779 | continue; |
3048 | } | 1780 | } |
3049 | D_ASSERT(rv != 0); | ||
3050 | if (rv == -EINTR) { | 1781 | if (rv == -EINTR) { |
3051 | flush_signals(current); | 1782 | flush_signals(current); |
3052 | rv = 0; | 1783 | rv = 0; |
@@ -3058,22 +1789,40 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3058 | iov.iov_len -= rv; | 1789 | iov.iov_len -= rv; |
3059 | } while (sent < size); | 1790 | } while (sent < size); |
3060 | 1791 | ||
3061 | if (sock == mdev->data.socket) | 1792 | if (sock == tconn->data.socket) |
3062 | drbd_clear_flag(mdev, NET_CONGESTED); | 1793 | clear_bit(NET_CONGESTED, &tconn->flags); |
3063 | 1794 | ||
3064 | if (rv <= 0) { | 1795 | if (rv <= 0) { |
3065 | if (rv != -EAGAIN) { | 1796 | if (rv != -EAGAIN) { |
3066 | dev_err(DEV, "%s_sendmsg returned %d\n", | 1797 | conn_err(tconn, "%s_sendmsg returned %d\n", |
3067 | sock == mdev->meta.socket ? "msock" : "sock", | 1798 | sock == tconn->meta.socket ? "msock" : "sock", |
3068 | rv); | 1799 | rv); |
3069 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | 1800 | conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); |
3070 | } else | 1801 | } else |
3071 | drbd_force_state(mdev, NS(conn, C_TIMEOUT)); | 1802 | conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD); |
3072 | } | 1803 | } |
3073 | 1804 | ||
3074 | return sent; | 1805 | return sent; |
3075 | } | 1806 | } |
3076 | 1807 | ||
1808 | /** | ||
1809 | * drbd_send_all - Send an entire buffer | ||
1810 | * | ||
1811 | * Returns 0 upon success and a negative error value otherwise. | ||
1812 | */ | ||
1813 | int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer, | ||
1814 | size_t size, unsigned msg_flags) | ||
1815 | { | ||
1816 | int err; | ||
1817 | |||
1818 | err = drbd_send(tconn, sock, buffer, size, msg_flags); | ||
1819 | if (err < 0) | ||
1820 | return err; | ||
1821 | if (err != size) | ||
1822 | return -EIO; | ||
1823 | return 0; | ||
1824 | } | ||
1825 | |||
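A minimal, hypothetical caller, to show the intended 0 / -errno contract (buf and size are placeholders; a short send is mapped to -EIO):

    err = drbd_send_all(tconn, tconn->data.socket, buf, size, 0);
    if (err)                /* 0 only if all 'size' bytes went out */
            return err;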
3077 | static int drbd_open(struct block_device *bdev, fmode_t mode) | 1826 | static int drbd_open(struct block_device *bdev, fmode_t mode) |
3078 | { | 1827 | { |
3079 | struct drbd_conf *mdev = bdev->bd_disk->private_data; | 1828 | struct drbd_conf *mdev = bdev->bd_disk->private_data; |
@@ -3081,7 +1830,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) | |||
3081 | int rv = 0; | 1830 | int rv = 0; |
3082 | 1831 | ||
3083 | mutex_lock(&drbd_main_mutex); | 1832 | mutex_lock(&drbd_main_mutex); |
3084 | spin_lock_irqsave(&mdev->req_lock, flags); | 1833 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
3085 | /* to have a stable mdev->state.role | 1834 | /* to have a stable mdev->state.role |
3086 | * and no race with updating open_cnt */ | 1835 | * and no race with updating open_cnt */ |
3087 | 1836 | ||
@@ -3094,7 +1843,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) | |||
3094 | 1843 | ||
3095 | if (!rv) | 1844 | if (!rv) |
3096 | mdev->open_cnt++; | 1845 | mdev->open_cnt++; |
3097 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 1846 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
3098 | mutex_unlock(&drbd_main_mutex); | 1847 | mutex_unlock(&drbd_main_mutex); |
3099 | 1848 | ||
3100 | return rv; | 1849 | return rv; |
@@ -3111,35 +1860,14 @@ static int drbd_release(struct gendisk *gd, fmode_t mode) | |||
3111 | 1860 | ||
3112 | static void drbd_set_defaults(struct drbd_conf *mdev) | 1861 | static void drbd_set_defaults(struct drbd_conf *mdev) |
3113 | { | 1862 | { |
3114 | /* This way we get a compile error when sync_conf grows, | 1863 | /* Beware! The actual layout differs |
3115 | and we forgot to initialize it here */ | 1864 | * between big endian and little endian */ |
3116 | mdev->sync_conf = (struct syncer_conf) { | 1865 | mdev->state = (union drbd_dev_state) { |
3117 | /* .rate = */ DRBD_RATE_DEF, | ||
3118 | /* .after = */ DRBD_AFTER_DEF, | ||
3119 | /* .al_extents = */ DRBD_AL_EXTENTS_DEF, | ||
3120 | /* .verify_alg = */ {}, 0, | ||
3121 | /* .cpu_mask = */ {}, 0, | ||
3122 | /* .csums_alg = */ {}, 0, | ||
3123 | /* .use_rle = */ 0, | ||
3124 | /* .on_no_data = */ DRBD_ON_NO_DATA_DEF, | ||
3125 | /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, | ||
3126 | /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, | ||
3127 | /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, | ||
3128 | /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, | ||
3129 | /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF | ||
3130 | }; | ||
3131 | |||
3132 | /* Have to use that way, because the layout differs between | ||
3133 | big endian and little endian */ | ||
3134 | mdev->state = (union drbd_state) { | ||
3135 | { .role = R_SECONDARY, | 1866 | { .role = R_SECONDARY, |
3136 | .peer = R_UNKNOWN, | 1867 | .peer = R_UNKNOWN, |
3137 | .conn = C_STANDALONE, | 1868 | .conn = C_STANDALONE, |
3138 | .disk = D_DISKLESS, | 1869 | .disk = D_DISKLESS, |
3139 | .pdsk = D_UNKNOWN, | 1870 | .pdsk = D_UNKNOWN, |
3140 | .susp = 0, | ||
3141 | .susp_nod = 0, | ||
3142 | .susp_fen = 0 | ||
3143 | } }; | 1871 | } }; |
3144 | } | 1872 | } |
3145 | 1873 | ||
@@ -3155,28 +1883,17 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3155 | atomic_set(&mdev->rs_pending_cnt, 0); | 1883 | atomic_set(&mdev->rs_pending_cnt, 0); |
3156 | atomic_set(&mdev->unacked_cnt, 0); | 1884 | atomic_set(&mdev->unacked_cnt, 0); |
3157 | atomic_set(&mdev->local_cnt, 0); | 1885 | atomic_set(&mdev->local_cnt, 0); |
3158 | atomic_set(&mdev->net_cnt, 0); | ||
3159 | atomic_set(&mdev->packet_seq, 0); | ||
3160 | atomic_set(&mdev->pp_in_use, 0); | ||
3161 | atomic_set(&mdev->pp_in_use_by_net, 0); | 1886 | atomic_set(&mdev->pp_in_use_by_net, 0); |
3162 | atomic_set(&mdev->rs_sect_in, 0); | 1887 | atomic_set(&mdev->rs_sect_in, 0); |
3163 | atomic_set(&mdev->rs_sect_ev, 0); | 1888 | atomic_set(&mdev->rs_sect_ev, 0); |
3164 | atomic_set(&mdev->ap_in_flight, 0); | 1889 | atomic_set(&mdev->ap_in_flight, 0); |
3165 | atomic_set(&mdev->md_io_in_use, 0); | 1890 | atomic_set(&mdev->md_io_in_use, 0); |
3166 | 1891 | ||
3167 | mutex_init(&mdev->data.mutex); | 1892 | mutex_init(&mdev->own_state_mutex); |
3168 | mutex_init(&mdev->meta.mutex); | 1893 | mdev->state_mutex = &mdev->own_state_mutex; |
3169 | sema_init(&mdev->data.work.s, 0); | ||
3170 | sema_init(&mdev->meta.work.s, 0); | ||
3171 | mutex_init(&mdev->state_mutex); | ||
3172 | |||
3173 | spin_lock_init(&mdev->data.work.q_lock); | ||
3174 | spin_lock_init(&mdev->meta.work.q_lock); | ||
3175 | 1894 | ||
3176 | spin_lock_init(&mdev->al_lock); | 1895 | spin_lock_init(&mdev->al_lock); |
3177 | spin_lock_init(&mdev->req_lock); | ||
3178 | spin_lock_init(&mdev->peer_seq_lock); | 1896 | spin_lock_init(&mdev->peer_seq_lock); |
3179 | spin_lock_init(&mdev->epoch_lock); | ||
3180 | 1897 | ||
3181 | INIT_LIST_HEAD(&mdev->active_ee); | 1898 | INIT_LIST_HEAD(&mdev->active_ee); |
3182 | INIT_LIST_HEAD(&mdev->sync_ee); | 1899 | INIT_LIST_HEAD(&mdev->sync_ee); |
@@ -3184,8 +1901,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3184 | INIT_LIST_HEAD(&mdev->read_ee); | 1901 | INIT_LIST_HEAD(&mdev->read_ee); |
3185 | INIT_LIST_HEAD(&mdev->net_ee); | 1902 | INIT_LIST_HEAD(&mdev->net_ee); |
3186 | INIT_LIST_HEAD(&mdev->resync_reads); | 1903 | INIT_LIST_HEAD(&mdev->resync_reads); |
3187 | INIT_LIST_HEAD(&mdev->data.work.q); | ||
3188 | INIT_LIST_HEAD(&mdev->meta.work.q); | ||
3189 | INIT_LIST_HEAD(&mdev->resync_work.list); | 1904 | INIT_LIST_HEAD(&mdev->resync_work.list); |
3190 | INIT_LIST_HEAD(&mdev->unplug_work.list); | 1905 | INIT_LIST_HEAD(&mdev->unplug_work.list); |
3191 | INIT_LIST_HEAD(&mdev->go_diskless.list); | 1906 | INIT_LIST_HEAD(&mdev->go_diskless.list); |
@@ -3199,6 +1914,14 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3199 | mdev->md_sync_work.cb = w_md_sync; | 1914 | mdev->md_sync_work.cb = w_md_sync; |
3200 | mdev->bm_io_work.w.cb = w_bitmap_io; | 1915 | mdev->bm_io_work.w.cb = w_bitmap_io; |
3201 | mdev->start_resync_work.cb = w_start_resync; | 1916 | mdev->start_resync_work.cb = w_start_resync; |
1917 | |||
1918 | mdev->resync_work.mdev = mdev; | ||
1919 | mdev->unplug_work.mdev = mdev; | ||
1920 | mdev->go_diskless.mdev = mdev; | ||
1921 | mdev->md_sync_work.mdev = mdev; | ||
1922 | mdev->bm_io_work.w.mdev = mdev; | ||
1923 | mdev->start_resync_work.mdev = mdev; | ||
1924 | |||
3202 | init_timer(&mdev->resync_timer); | 1925 | init_timer(&mdev->resync_timer); |
3203 | init_timer(&mdev->md_sync_timer); | 1926 | init_timer(&mdev->md_sync_timer); |
3204 | init_timer(&mdev->start_resync_timer); | 1927 | init_timer(&mdev->start_resync_timer); |
@@ -3214,17 +1937,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3214 | 1937 | ||
3215 | init_waitqueue_head(&mdev->misc_wait); | 1938 | init_waitqueue_head(&mdev->misc_wait); |
3216 | init_waitqueue_head(&mdev->state_wait); | 1939 | init_waitqueue_head(&mdev->state_wait); |
3217 | init_waitqueue_head(&mdev->net_cnt_wait); | ||
3218 | init_waitqueue_head(&mdev->ee_wait); | 1940 | init_waitqueue_head(&mdev->ee_wait); |
3219 | init_waitqueue_head(&mdev->al_wait); | 1941 | init_waitqueue_head(&mdev->al_wait); |
3220 | init_waitqueue_head(&mdev->seq_wait); | 1942 | init_waitqueue_head(&mdev->seq_wait); |
3221 | 1943 | ||
3222 | drbd_thread_init(mdev, &mdev->receiver, drbdd_init); | ||
3223 | drbd_thread_init(mdev, &mdev->worker, drbd_worker); | ||
3224 | drbd_thread_init(mdev, &mdev->asender, drbd_asender); | ||
3225 | |||
3226 | mdev->agreed_pro_version = PRO_VERSION_MAX; | ||
3227 | mdev->write_ordering = WO_bdev_flush; | ||
3228 | mdev->resync_wenr = LC_FREE; | 1944 | mdev->resync_wenr = LC_FREE; |
3229 | mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; | 1945 | mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; |
3230 | mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; | 1946 | mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; |
@@ -3233,13 +1949,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3233 | void drbd_mdev_cleanup(struct drbd_conf *mdev) | 1949 | void drbd_mdev_cleanup(struct drbd_conf *mdev) |
3234 | { | 1950 | { |
3235 | int i; | 1951 | int i; |
3236 | if (mdev->receiver.t_state != None) | 1952 | if (mdev->tconn->receiver.t_state != NONE) |
3237 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", | 1953 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", |
3238 | mdev->receiver.t_state); | 1954 | mdev->tconn->receiver.t_state); |
3239 | 1955 | ||
3240 | /* no need to lock it, I'm the only thread alive */ | ||
3241 | if (atomic_read(&mdev->current_epoch->epoch_size) != 0) | ||
3242 | dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); | ||
3243 | mdev->al_writ_cnt = | 1956 | mdev->al_writ_cnt = |
3244 | mdev->bm_writ_cnt = | 1957 | mdev->bm_writ_cnt = |
3245 | mdev->read_cnt = | 1958 | mdev->read_cnt = |
@@ -3256,7 +1969,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
3256 | mdev->rs_mark_left[i] = 0; | 1969 | mdev->rs_mark_left[i] = 0; |
3257 | mdev->rs_mark_time[i] = 0; | 1970 | mdev->rs_mark_time[i] = 0; |
3258 | } | 1971 | } |
3259 | D_ASSERT(mdev->net_conf == NULL); | 1972 | D_ASSERT(mdev->tconn->net_conf == NULL); |
3260 | 1973 | ||
3261 | drbd_set_my_capacity(mdev, 0); | 1974 | drbd_set_my_capacity(mdev, 0); |
3262 | if (mdev->bitmap) { | 1975 | if (mdev->bitmap) { |
@@ -3265,21 +1978,18 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
3265 | drbd_bm_cleanup(mdev); | 1978 | drbd_bm_cleanup(mdev); |
3266 | } | 1979 | } |
3267 | 1980 | ||
3268 | drbd_free_resources(mdev); | 1981 | drbd_free_bc(mdev->ldev); |
3269 | drbd_clear_flag(mdev, AL_SUSPENDED); | 1982 | mdev->ldev = NULL; |
1983 | |||
1984 | clear_bit(AL_SUSPENDED, &mdev->flags); | ||
3270 | 1985 | ||
3271 | /* | ||
3272 | * currently we drbd_init_ee only on module load, so | ||
3273 | * we may do drbd_release_ee only on module unload! | ||
3274 | */ | ||
3275 | D_ASSERT(list_empty(&mdev->active_ee)); | 1986 | D_ASSERT(list_empty(&mdev->active_ee)); |
3276 | D_ASSERT(list_empty(&mdev->sync_ee)); | 1987 | D_ASSERT(list_empty(&mdev->sync_ee)); |
3277 | D_ASSERT(list_empty(&mdev->done_ee)); | 1988 | D_ASSERT(list_empty(&mdev->done_ee)); |
3278 | D_ASSERT(list_empty(&mdev->read_ee)); | 1989 | D_ASSERT(list_empty(&mdev->read_ee)); |
3279 | D_ASSERT(list_empty(&mdev->net_ee)); | 1990 | D_ASSERT(list_empty(&mdev->net_ee)); |
3280 | D_ASSERT(list_empty(&mdev->resync_reads)); | 1991 | D_ASSERT(list_empty(&mdev->resync_reads)); |
3281 | D_ASSERT(list_empty(&mdev->data.work.q)); | 1992 | D_ASSERT(list_empty(&mdev->tconn->sender_work.q)); |
3282 | D_ASSERT(list_empty(&mdev->meta.work.q)); | ||
3283 | D_ASSERT(list_empty(&mdev->resync_work.list)); | 1993 | D_ASSERT(list_empty(&mdev->resync_work.list)); |
3284 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | 1994 | D_ASSERT(list_empty(&mdev->unplug_work.list)); |
3285 | D_ASSERT(list_empty(&mdev->go_diskless.list)); | 1995 | D_ASSERT(list_empty(&mdev->go_diskless.list)); |
@@ -3353,7 +2063,7 @@ static int drbd_create_mempools(void) | |||
3353 | goto Enomem; | 2063 | goto Enomem; |
3354 | 2064 | ||
3355 | drbd_ee_cache = kmem_cache_create( | 2065 | drbd_ee_cache = kmem_cache_create( |
3356 | "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); | 2066 | "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); |
3357 | if (drbd_ee_cache == NULL) | 2067 | if (drbd_ee_cache == NULL) |
3358 | goto Enomem; | 2068 | goto Enomem; |
3359 | 2069 | ||
@@ -3368,11 +2078,9 @@ static int drbd_create_mempools(void) | |||
3368 | goto Enomem; | 2078 | goto Enomem; |
3369 | 2079 | ||
3370 | /* mempools */ | 2080 | /* mempools */ |
3371 | #ifdef COMPAT_HAVE_BIOSET_CREATE | ||
3372 | drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); | 2081 | drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); |
3373 | if (drbd_md_io_bio_set == NULL) | 2082 | if (drbd_md_io_bio_set == NULL) |
3374 | goto Enomem; | 2083 | goto Enomem; |
3375 | #endif | ||
3376 | 2084 | ||
3377 | drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); | 2085 | drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); |
3378 | if (drbd_md_io_page_pool == NULL) | 2086 | if (drbd_md_io_page_pool == NULL) |
@@ -3421,73 +2129,53 @@ static struct notifier_block drbd_notifier = { | |||
3421 | .notifier_call = drbd_notify_sys, | 2129 | .notifier_call = drbd_notify_sys, |
3422 | }; | 2130 | }; |
3423 | 2131 | ||
3424 | static void drbd_release_ee_lists(struct drbd_conf *mdev) | 2132 | static void drbd_release_all_peer_reqs(struct drbd_conf *mdev) |
3425 | { | 2133 | { |
3426 | int rr; | 2134 | int rr; |
3427 | 2135 | ||
3428 | rr = drbd_release_ee(mdev, &mdev->active_ee); | 2136 | rr = drbd_free_peer_reqs(mdev, &mdev->active_ee); |
3429 | if (rr) | 2137 | if (rr) |
3430 | dev_err(DEV, "%d EEs in active list found!\n", rr); | 2138 | dev_err(DEV, "%d EEs in active list found!\n", rr); |
3431 | 2139 | ||
3432 | rr = drbd_release_ee(mdev, &mdev->sync_ee); | 2140 | rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee); |
3433 | if (rr) | 2141 | if (rr) |
3434 | dev_err(DEV, "%d EEs in sync list found!\n", rr); | 2142 | dev_err(DEV, "%d EEs in sync list found!\n", rr); |
3435 | 2143 | ||
3436 | rr = drbd_release_ee(mdev, &mdev->read_ee); | 2144 | rr = drbd_free_peer_reqs(mdev, &mdev->read_ee); |
3437 | if (rr) | 2145 | if (rr) |
3438 | dev_err(DEV, "%d EEs in read list found!\n", rr); | 2146 | dev_err(DEV, "%d EEs in read list found!\n", rr); |
3439 | 2147 | ||
3440 | rr = drbd_release_ee(mdev, &mdev->done_ee); | 2148 | rr = drbd_free_peer_reqs(mdev, &mdev->done_ee); |
3441 | if (rr) | 2149 | if (rr) |
3442 | dev_err(DEV, "%d EEs in done list found!\n", rr); | 2150 | dev_err(DEV, "%d EEs in done list found!\n", rr); |
3443 | 2151 | ||
3444 | rr = drbd_release_ee(mdev, &mdev->net_ee); | 2152 | rr = drbd_free_peer_reqs(mdev, &mdev->net_ee); |
3445 | if (rr) | 2153 | if (rr) |
3446 | dev_err(DEV, "%d EEs in net list found!\n", rr); | 2154 | dev_err(DEV, "%d EEs in net list found!\n", rr); |
3447 | } | 2155 | } |
3448 | 2156 | ||
3449 | /* caution. no locking. | 2157 | /* caution. no locking. */ |
3450 | * currently only used from module cleanup code. */ | 2158 | void drbd_minor_destroy(struct kref *kref) |
3451 | static void drbd_delete_device(unsigned int minor) | ||
3452 | { | 2159 | { |
3453 | struct drbd_conf *mdev = minor_to_mdev(minor); | 2160 | struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref); |
3454 | 2161 | struct drbd_tconn *tconn = mdev->tconn; | |
3455 | if (!mdev) | ||
3456 | return; | ||
3457 | 2162 | ||
3458 | del_timer_sync(&mdev->request_timer); | 2163 | del_timer_sync(&mdev->request_timer); |
3459 | 2164 | ||
3460 | /* paranoia asserts */ | 2165 | /* paranoia asserts */ |
3461 | if (mdev->open_cnt != 0) | 2166 | D_ASSERT(mdev->open_cnt == 0); |
3462 | dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, | ||
3463 | __FILE__ , __LINE__); | ||
3464 | |||
3465 | ERR_IF (!list_empty(&mdev->data.work.q)) { | ||
3466 | struct list_head *lp; | ||
3467 | list_for_each(lp, &mdev->data.work.q) { | ||
3468 | dev_err(DEV, "lp = %p\n", lp); | ||
3469 | } | ||
3470 | }; | ||
3471 | /* end paranoia asserts */ | 2167 | /* end paranoia asserts */ |
3472 | 2168 | ||
3473 | del_gendisk(mdev->vdisk); | ||
3474 | |||
3475 | /* cleanup stuff that may have been allocated during | 2169 | /* cleanup stuff that may have been allocated during |
3476 | * device (re-)configuration or state changes */ | 2170 | * device (re-)configuration or state changes */ |
3477 | 2171 | ||
3478 | if (mdev->this_bdev) | 2172 | if (mdev->this_bdev) |
3479 | bdput(mdev->this_bdev); | 2173 | bdput(mdev->this_bdev); |
3480 | 2174 | ||
3481 | drbd_free_resources(mdev); | 2175 | drbd_free_bc(mdev->ldev); |
2176 | mdev->ldev = NULL; | ||
3482 | 2177 | ||
3483 | drbd_release_ee_lists(mdev); | 2178 | drbd_release_all_peer_reqs(mdev); |
3484 | |||
3485 | /* should be freed on disconnect? */ | ||
3486 | kfree(mdev->ee_hash); | ||
3487 | /* | ||
3488 | mdev->ee_hash_s = 0; | ||
3489 | mdev->ee_hash = NULL; | ||
3490 | */ | ||
3491 | 2179 | ||
3492 | lc_destroy(mdev->act_log); | 2180 | lc_destroy(mdev->act_log); |
3493 | lc_destroy(mdev->resync); | 2181 | lc_destroy(mdev->resync); |
@@ -3495,19 +2183,101 @@ static void drbd_delete_device(unsigned int minor) | |||
3495 | kfree(mdev->p_uuid); | 2183 | kfree(mdev->p_uuid); |
3496 | /* mdev->p_uuid = NULL; */ | 2184 | /* mdev->p_uuid = NULL; */ |
3497 | 2185 | ||
3498 | kfree(mdev->int_dig_out); | 2186 | if (mdev->bitmap) /* should no longer be there. */ |
3499 | kfree(mdev->int_dig_in); | 2187 | drbd_bm_cleanup(mdev); |
3500 | kfree(mdev->int_dig_vv); | 2188 | __free_page(mdev->md_io_page); |
2189 | put_disk(mdev->vdisk); | ||
2190 | blk_cleanup_queue(mdev->rq_queue); | ||
2191 | kfree(mdev->rs_plan_s); | ||
2192 | kfree(mdev); | ||
3501 | 2193 | ||
3502 | /* cleanup the rest that has been | 2194 | kref_put(&tconn->kref, &conn_destroy); |
3503 | * allocated from drbd_new_device | ||
3504 | * and actually free the mdev itself */ | ||
3505 | drbd_free_mdev(mdev); | ||
3506 | } | 2195 | } |
3507 | 2196 | ||
2197 | /* One global retry thread, if we need to push back some bio and have it | ||
2198 | * reinserted through our make request function. | ||
2199 | */ | ||
2200 | static struct retry_worker { | ||
2201 | struct workqueue_struct *wq; | ||
2202 | struct work_struct worker; | ||
2203 | |||
2204 | spinlock_t lock; | ||
2205 | struct list_head writes; | ||
2206 | } retry; | ||
2207 | |||
2208 | static void do_retry(struct work_struct *ws) | ||
2209 | { | ||
2210 | struct retry_worker *retry = container_of(ws, struct retry_worker, worker); | ||
2211 | LIST_HEAD(writes); | ||
2212 | struct drbd_request *req, *tmp; | ||
2213 | |||
2214 | spin_lock_irq(&retry->lock); | ||
2215 | list_splice_init(&retry->writes, &writes); | ||
2216 | spin_unlock_irq(&retry->lock); | ||
2217 | |||
2218 | list_for_each_entry_safe(req, tmp, &writes, tl_requests) { | ||
2219 | struct drbd_conf *mdev = req->w.mdev; | ||
2220 | struct bio *bio = req->master_bio; | ||
2221 | unsigned long start_time = req->start_time; | ||
2222 | bool expected; | ||
2223 | |||
2224 | expected = | ||
2225 | expect(atomic_read(&req->completion_ref) == 0) && | ||
2226 | expect(req->rq_state & RQ_POSTPONED) && | ||
2227 | expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || | ||
2228 | (req->rq_state & RQ_LOCAL_ABORTED) != 0); | ||
2229 | |||
2230 | if (!expected) | ||
2231 | dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n", | ||
2232 | req, atomic_read(&req->completion_ref), | ||
2233 | req->rq_state); | ||
2234 | |||
2235 | /* We still need to put one kref associated with the | ||
2236 | * "completion_ref" going zero in the code path that queued it | ||
2237 | * here. The request object may still be referenced by a | ||
2238 | * frozen local req->private_bio, in case we force-detached. | ||
2239 | */ | ||
2240 | kref_put(&req->kref, drbd_req_destroy); | ||
2241 | |||
2242 | /* A single suspended or otherwise blocking device may stall | ||
2243 | * all others as well. Fortunately, this code path is to | ||
2244 | * recover from a situation that "should not happen": | ||
2245 | * concurrent writes in multi-primary setup. | ||
2246 | * In a "normal" lifecycle, this workqueue is supposed to be | ||
2247 | * destroyed without ever doing anything. | ||
2248 | * If it turns out to be an issue anyways, we can do per | ||
2249 | * resource (replication group) or per device (minor) retry | ||
2250 | * workqueues instead. | ||
2251 | */ | ||
2252 | |||
2253 | /* We are not just doing generic_make_request(), | ||
2254 | * as we want to keep the start_time information. */ | ||
2255 | inc_ap_bio(mdev); | ||
2256 | __drbd_make_request(mdev, bio, start_time); | ||
2257 | } | ||
2258 | } | ||
2259 | |||
2260 | void drbd_restart_request(struct drbd_request *req) | ||
2261 | { | ||
2262 | unsigned long flags; | ||
2263 | spin_lock_irqsave(&retry.lock, flags); | ||
2264 | list_move_tail(&req->tl_requests, &retry.writes); | ||
2265 | spin_unlock_irqrestore(&retry.lock, flags); | ||
2266 | |||
2267 | /* Drop the extra reference that would otherwise | ||
2268 | * have been dropped by complete_master_bio. | ||
2269 | * do_retry() needs to grab a new one. */ | ||
2270 | dec_ap_bio(req->w.mdev); | ||
2271 | |||
2272 | queue_work(retry.wq, &retry.worker); | ||
2273 | } | ||
2274 | |||
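The one-time setup of this retry machinery happens at module init, outside this hunk; a sketch of what it presumably amounts to (the workqueue name is a placeholder):

    INIT_LIST_HEAD(&retry.writes);
    spin_lock_init(&retry.lock);
    INIT_WORK(&retry.worker, do_retry);
    retry.wq = create_singlethread_workqueue("drbd-reissue");
    if (!retry.wq)
            return -ENOMEM;         /* drbd_cleanup() below checks retry.wq */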
2275 | |||
3508 | static void drbd_cleanup(void) | 2276 | static void drbd_cleanup(void) |
3509 | { | 2277 | { |
3510 | unsigned int i; | 2278 | unsigned int i; |
2279 | struct drbd_conf *mdev; | ||
2280 | struct drbd_tconn *tconn, *tmp; | ||
3511 | 2281 | ||
3512 | unregister_reboot_notifier(&drbd_notifier); | 2282 | unregister_reboot_notifier(&drbd_notifier); |
3513 | 2283 | ||
@@ -3522,19 +2292,31 @@ static void drbd_cleanup(void) | |||
3522 | if (drbd_proc) | 2292 | if (drbd_proc) |
3523 | remove_proc_entry("drbd", NULL); | 2293 | remove_proc_entry("drbd", NULL); |
3524 | 2294 | ||
3525 | drbd_nl_cleanup(); | 2295 | if (retry.wq) |
2296 | destroy_workqueue(retry.wq); | ||
2297 | |||
2298 | drbd_genl_unregister(); | ||
3526 | 2299 | ||
3527 | if (minor_table) { | 2300 | idr_for_each_entry(&minors, mdev, i) { |
3528 | i = minor_count; | 2301 | idr_remove(&minors, mdev_to_minor(mdev)); |
3529 | while (i--) | 2302 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
3530 | drbd_delete_device(i); | 2303 | del_gendisk(mdev->vdisk); |
3531 | drbd_destroy_mempools(); | 2304 | /* synchronize_rcu(); No other threads running at this point */ |
2305 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
3532 | } | 2306 | } |
3533 | 2307 | ||
3534 | kfree(minor_table); | 2308 | /* not _rcu since no other updater is left; genl is already unregistered */ |
2309 | list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { | ||
2310 | list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */ | ||
2311 | /* synchronize_rcu(); */ | ||
2312 | kref_put(&tconn->kref, &conn_destroy); | ||
2313 | } | ||
3535 | 2314 | ||
2315 | drbd_destroy_mempools(); | ||
3536 | unregister_blkdev(DRBD_MAJOR, "drbd"); | 2316 | unregister_blkdev(DRBD_MAJOR, "drbd"); |
3537 | 2317 | ||
2318 | idr_destroy(&minors); | ||
2319 | |||
3538 | printk(KERN_INFO "drbd: module cleanup done.\n"); | 2320 | printk(KERN_INFO "drbd: module cleanup done.\n"); |
3539 | } | 2321 | } |
3540 | 2322 | ||
@@ -3559,7 +2341,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) | |||
3559 | goto out; | 2341 | goto out; |
3560 | } | 2342 | } |
3561 | 2343 | ||
3562 | if (drbd_test_flag(mdev, CALLBACK_PENDING)) { | 2344 | if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) { |
3563 | r |= (1 << BDI_async_congested); | 2345 | r |= (1 << BDI_async_congested); |
3564 | /* Without good local data, we would need to read from remote, | 2346 | /* Without good local data, we would need to read from remote, |
3565 | * and that would need the worker thread as well, which is | 2347 | * and that would need the worker thread as well, which is |
@@ -3583,7 +2365,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) | |||
3583 | reason = 'b'; | 2365 | reason = 'b'; |
3584 | } | 2366 | } |
3585 | 2367 | ||
3586 | if (bdi_bits & (1 << BDI_async_congested) && drbd_test_flag(mdev, NET_CONGESTED)) { | 2368 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) { |
3587 | r |= (1 << BDI_async_congested); | 2369 | r |= (1 << BDI_async_congested); |
3588 | reason = reason == 'b' ? 'a' : 'n'; | 2370 | reason = reason == 'b' ? 'a' : 'n'; |
3589 | } | 2371 | } |
@@ -3593,20 +2375,243 @@ out: | |||
3593 | return r; | 2375 | return r; |
3594 | } | 2376 | } |
3595 | 2377 | ||
3596 | struct drbd_conf *drbd_new_device(unsigned int minor) | 2378 | static void drbd_init_workqueue(struct drbd_work_queue* wq) |
2379 | { | ||
2380 | spin_lock_init(&wq->q_lock); | ||
2381 | INIT_LIST_HEAD(&wq->q); | ||
2382 | init_waitqueue_head(&wq->q_wait); | ||
2383 | } | ||
2384 | |||
2385 | struct drbd_tconn *conn_get_by_name(const char *name) | ||
2386 | { | ||
2387 | struct drbd_tconn *tconn; | ||
2388 | |||
2389 | if (!name || !name[0]) | ||
2390 | return NULL; | ||
2391 | |||
2392 | rcu_read_lock(); | ||
2393 | list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { | ||
2394 | if (!strcmp(tconn->name, name)) { | ||
2395 | kref_get(&tconn->kref); | ||
2396 | goto found; | ||
2397 | } | ||
2398 | } | ||
2399 | tconn = NULL; | ||
2400 | found: | ||
2401 | rcu_read_unlock(); | ||
2402 | return tconn; | ||
2403 | } | ||
2404 | |||
2405 | struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, | ||
2406 | void *peer_addr, int peer_addr_len) | ||
2407 | { | ||
2408 | struct drbd_tconn *tconn; | ||
2409 | |||
2410 | rcu_read_lock(); | ||
2411 | list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { | ||
2412 | if (tconn->my_addr_len == my_addr_len && | ||
2413 | tconn->peer_addr_len == peer_addr_len && | ||
2414 | !memcmp(&tconn->my_addr, my_addr, my_addr_len) && | ||
2415 | !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) { | ||
2416 | kref_get(&tconn->kref); | ||
2417 | goto found; | ||
2418 | } | ||
2419 | } | ||
2420 | tconn = NULL; | ||
2421 | found: | ||
2422 | rcu_read_unlock(); | ||
2423 | return tconn; | ||
2424 | } | ||
2425 | |||
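Both lookup helpers return with an extra kref held on the connection, so every successful lookup has to be balanced by the caller; a minimal, hypothetical user ("r0" is just an example resource name):

    struct drbd_tconn *tconn = conn_get_by_name("r0");
    if (tconn) {
            /* ... use the connection under the reference ... */
            kref_put(&tconn->kref, &conn_destroy);
    }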
2426 | static int drbd_alloc_socket(struct drbd_socket *socket) | ||
2427 | { | ||
2428 | socket->rbuf = (void *) __get_free_page(GFP_KERNEL); | ||
2429 | if (!socket->rbuf) | ||
2430 | return -ENOMEM; | ||
2431 | socket->sbuf = (void *) __get_free_page(GFP_KERNEL); | ||
2432 | if (!socket->sbuf) | ||
2433 | return -ENOMEM; | ||
2434 | return 0; | ||
2435 | } | ||
2436 | |||
2437 | static void drbd_free_socket(struct drbd_socket *socket) | ||
2438 | { | ||
2439 | free_page((unsigned long) socket->sbuf); | ||
2440 | free_page((unsigned long) socket->rbuf); | ||
2441 | } | ||
2442 | |||
2443 | void conn_free_crypto(struct drbd_tconn *tconn) | ||
2444 | { | ||
2445 | drbd_free_sock(tconn); | ||
2446 | |||
2447 | crypto_free_hash(tconn->csums_tfm); | ||
2448 | crypto_free_hash(tconn->verify_tfm); | ||
2449 | crypto_free_hash(tconn->cram_hmac_tfm); | ||
2450 | crypto_free_hash(tconn->integrity_tfm); | ||
2451 | crypto_free_hash(tconn->peer_integrity_tfm); | ||
2452 | kfree(tconn->int_dig_in); | ||
2453 | kfree(tconn->int_dig_vv); | ||
2454 | |||
2455 | tconn->csums_tfm = NULL; | ||
2456 | tconn->verify_tfm = NULL; | ||
2457 | tconn->cram_hmac_tfm = NULL; | ||
2458 | tconn->integrity_tfm = NULL; | ||
2459 | tconn->peer_integrity_tfm = NULL; | ||
2460 | tconn->int_dig_in = NULL; | ||
2461 | tconn->int_dig_vv = NULL; | ||
2462 | } | ||
2463 | |||
2464 | int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts) | ||
2465 | { | ||
2466 | cpumask_var_t new_cpu_mask; | ||
2467 | int err; | ||
2468 | |||
2469 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) | ||
2470 | return -ENOMEM; | ||
2471 | /* | ||
2472 | retcode = ERR_NOMEM; | ||
2473 | drbd_msg_put_info("unable to allocate cpumask"); | ||
2474 | */ | ||
2475 | |||
2476 | /* silently ignore cpu mask on UP kernel */ | ||
2477 | if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { | ||
2478 | /* FIXME: Get rid of constant 32 here */ | ||
2479 | err = bitmap_parse(res_opts->cpu_mask, 32, | ||
2480 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
2481 | if (err) { | ||
2482 | conn_warn(tconn, "bitmap_parse() failed with %d\n", err); | ||
2483 | /* retcode = ERR_CPU_MASK_PARSE; */ | ||
2484 | goto fail; | ||
2485 | } | ||
2486 | } | ||
2487 | tconn->res_opts = *res_opts; | ||
2488 | if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) { | ||
2489 | cpumask_copy(tconn->cpu_mask, new_cpu_mask); | ||
2490 | drbd_calc_cpu_mask(tconn); | ||
2491 | tconn->receiver.reset_cpu_mask = 1; | ||
2492 | tconn->asender.reset_cpu_mask = 1; | ||
2493 | tconn->worker.reset_cpu_mask = 1; | ||
2494 | } | ||
2495 | err = 0; | ||
2496 | |||
2497 | fail: | ||
2498 | free_cpumask_var(new_cpu_mask); | ||
2499 | return err; | ||
2500 | |||
2501 | } | ||
2502 | |||
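A minimal sketch of the cpumask round trip used above, assuming a hex mask string such as "3" (CPUs 0 and 1); bitmap_parse() returns 0 on success:

	cpumask_var_t mask;

	if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		if (!bitmap_parse("3", 1, cpumask_bits(mask), nr_cpu_ids))
			pr_info("mask covers %u cpus\n", cpumask_weight(mask));
		free_cpumask_var(mask);
	}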
2503 | /* caller must be under genl_lock() */ | ||
2504 | struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts) | ||
2505 | { | ||
2506 | struct drbd_tconn *tconn; | ||
2507 | |||
2508 | tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL); | ||
2509 | if (!tconn) | ||
2510 | return NULL; | ||
2511 | |||
2512 | tconn->name = kstrdup(name, GFP_KERNEL); | ||
2513 | if (!tconn->name) | ||
2514 | goto fail; | ||
2515 | |||
2516 | if (drbd_alloc_socket(&tconn->data)) | ||
2517 | goto fail; | ||
2518 | if (drbd_alloc_socket(&tconn->meta)) | ||
2519 | goto fail; | ||
2520 | |||
2521 | if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL)) | ||
2522 | goto fail; | ||
2523 | |||
2524 | if (set_resource_options(tconn, res_opts)) | ||
2525 | goto fail; | ||
2526 | |||
2527 | tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | ||
2528 | if (!tconn->current_epoch) | ||
2529 | goto fail; | ||
2530 | |||
2531 | INIT_LIST_HEAD(&tconn->transfer_log); | ||
2532 | |||
2533 | INIT_LIST_HEAD(&tconn->current_epoch->list); | ||
2534 | tconn->epochs = 1; | ||
2535 | spin_lock_init(&tconn->epoch_lock); | ||
2536 | tconn->write_ordering = WO_bdev_flush; | ||
2537 | |||
2538 | tconn->send.seen_any_write_yet = false; | ||
2539 | tconn->send.current_epoch_nr = 0; | ||
2540 | tconn->send.current_epoch_writes = 0; | ||
2541 | |||
2542 | tconn->cstate = C_STANDALONE; | ||
2543 | mutex_init(&tconn->cstate_mutex); | ||
2544 | spin_lock_init(&tconn->req_lock); | ||
2545 | mutex_init(&tconn->conf_update); | ||
2546 | init_waitqueue_head(&tconn->ping_wait); | ||
2547 | idr_init(&tconn->volumes); | ||
2548 | |||
2549 | drbd_init_workqueue(&tconn->sender_work); | ||
2550 | mutex_init(&tconn->data.mutex); | ||
2551 | mutex_init(&tconn->meta.mutex); | ||
2552 | |||
2553 | drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver"); | ||
2554 | drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker"); | ||
2555 | drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender"); | ||
2556 | |||
2557 | kref_init(&tconn->kref); | ||
2558 | list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns); | ||
2559 | |||
2560 | return tconn; | ||
2561 | |||
2562 | fail: | ||
2563 | kfree(tconn->current_epoch); | ||
2564 | free_cpumask_var(tconn->cpu_mask); | ||
2565 | drbd_free_socket(&tconn->meta); | ||
2566 | drbd_free_socket(&tconn->data); | ||
2567 | kfree(tconn->name); | ||
2568 | kfree(tconn); | ||
2569 | |||
2570 | return NULL; | ||
2571 | } | ||
2572 | |||
2573 | void conn_destroy(struct kref *kref) | ||
2574 | { | ||
2575 | struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref); | ||
2576 | |||
2577 | if (atomic_read(&tconn->current_epoch->epoch_size) != 0) | ||
2578 | conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size)); | ||
2579 | kfree(tconn->current_epoch); | ||
2580 | |||
2581 | idr_destroy(&tconn->volumes); | ||
2582 | |||
2583 | free_cpumask_var(tconn->cpu_mask); | ||
2584 | drbd_free_socket(&tconn->meta); | ||
2585 | drbd_free_socket(&tconn->data); | ||
2586 | kfree(tconn->name); | ||
2587 | kfree(tconn->int_dig_in); | ||
2588 | kfree(tconn->int_dig_vv); | ||
2589 | kfree(tconn); | ||
2590 | } | ||
2591 | |||
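conn_create() hands back a connection that already holds one reference (kref_init above) and is already linked into the global drbd_tconns list; conn_destroy() is only ever reached through the final kref_put(). A lifecycle sketch, assuming it mirrors the "new/del resource" administrative paths that are not part of this hunk ("r0" and res_opts are examples only):

	struct drbd_tconn *tconn;

	tconn = conn_create("r0", &res_opts);		/* refcount == 1 */
	if (tconn) {
		/* further users take kref_get(&tconn->kref) */
		list_del_rcu(&tconn->all_tconn);	/* unlink before the last put */
		synchronize_rcu();			/* let RCU readers drain      */
		kref_put(&tconn->kref, &conn_destroy);	/* may free tconn             */
	}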
2592 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) | ||
3597 | { | 2593 | { |
3598 | struct drbd_conf *mdev; | 2594 | struct drbd_conf *mdev; |
3599 | struct gendisk *disk; | 2595 | struct gendisk *disk; |
3600 | struct request_queue *q; | 2596 | struct request_queue *q; |
2597 | int vnr_got = vnr; | ||
2598 | int minor_got = minor; | ||
2599 | enum drbd_ret_code err = ERR_NOMEM; | ||
2600 | |||
2601 | mdev = minor_to_mdev(minor); | ||
2602 | if (mdev) | ||
2603 | return ERR_MINOR_EXISTS; | ||
3601 | 2604 | ||
3602 | /* GFP_KERNEL, we are outside of all write-out paths */ | 2605 | /* GFP_KERNEL, we are outside of all write-out paths */ |
3603 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); | 2606 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); |
3604 | if (!mdev) | 2607 | if (!mdev) |
3605 | return NULL; | 2608 | return ERR_NOMEM; |
3606 | if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) | 2609 | |
3607 | goto out_no_cpumask; | 2610 | kref_get(&tconn->kref); |
2611 | mdev->tconn = tconn; | ||
3608 | 2612 | ||
3609 | mdev->minor = minor; | 2613 | mdev->minor = minor; |
2614 | mdev->vnr = vnr; | ||
3610 | 2615 | ||
3611 | drbd_init_set_defaults(mdev); | 2616 | drbd_init_set_defaults(mdev); |
3612 | 2617 | ||
@@ -3644,7 +2649,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) | |||
3644 | blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); | 2649 | blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); |
3645 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); | 2650 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); |
3646 | blk_queue_merge_bvec(q, drbd_merge_bvec); | 2651 | blk_queue_merge_bvec(q, drbd_merge_bvec); |
3647 | q->queue_lock = &mdev->req_lock; | 2652 | q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */ |
3648 | 2653 | ||
3649 | mdev->md_io_page = alloc_page(GFP_KERNEL); | 2654 | mdev->md_io_page = alloc_page(GFP_KERNEL); |
3650 | if (!mdev->md_io_page) | 2655 | if (!mdev->md_io_page) |
@@ -3652,30 +2657,44 @@ struct drbd_conf *drbd_new_device(unsigned int minor) | |||
3652 | 2657 | ||
3653 | if (drbd_bm_init(mdev)) | 2658 | if (drbd_bm_init(mdev)) |
3654 | goto out_no_bitmap; | 2659 | goto out_no_bitmap; |
3655 | /* no need to lock access, we are still initializing this minor device. */ | 2660 | mdev->read_requests = RB_ROOT; |
3656 | if (!tl_init(mdev)) | 2661 | mdev->write_requests = RB_ROOT; |
3657 | goto out_no_tl; | 2662 | |
3658 | 2663 | if (!idr_pre_get(&minors, GFP_KERNEL)) | |
3659 | mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); | 2664 | goto out_no_minor_idr; |
3660 | if (!mdev->app_reads_hash) | 2665 | if (idr_get_new_above(&minors, mdev, minor, &minor_got)) |
3661 | goto out_no_app_reads; | 2666 | goto out_no_minor_idr; |
3662 | 2667 | if (minor_got != minor) { | |
3663 | mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | 2668 | err = ERR_MINOR_EXISTS; |
3664 | if (!mdev->current_epoch) | 2669 | drbd_msg_put_info("requested minor exists already"); |
3665 | goto out_no_epoch; | 2670 | goto out_idr_remove_minor; |
3666 | 2671 | } | |
3667 | INIT_LIST_HEAD(&mdev->current_epoch->list); | 2672 | |
3668 | mdev->epochs = 1; | 2673 | if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) |
3669 | 2674 | goto out_idr_remove_minor; | |
3670 | return mdev; | 2675 | if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) |
3671 | 2676 | goto out_idr_remove_minor; | |
3672 | /* out_whatever_else: | 2677 | if (vnr_got != vnr) { |
3673 | kfree(mdev->current_epoch); */ | 2678 | err = ERR_INVALID_REQUEST; |
3674 | out_no_epoch: | 2679 | drbd_msg_put_info("requested volume exists already"); |
3675 | kfree(mdev->app_reads_hash); | 2680 | goto out_idr_remove_vol; |
3676 | out_no_app_reads: | 2681 | } |
3677 | tl_cleanup(mdev); | 2682 | add_disk(disk); |
3678 | out_no_tl: | 2683 | kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */ | ||
2684 | |||
2685 | /* inherit the connection state */ | ||
2686 | mdev->state.conn = tconn->cstate; | ||
2687 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | ||
2688 | drbd_connected(mdev); | ||
2689 | |||
2690 | return NO_ERROR; | ||
2691 | |||
2692 | out_idr_remove_vol: | ||
2693 | idr_remove(&tconn->volumes, vnr_got); | ||
2694 | out_idr_remove_minor: | ||
2695 | idr_remove(&minors, minor_got); | ||
2696 | synchronize_rcu(); | ||
2697 | out_no_minor_idr: | ||
3679 | drbd_bm_cleanup(mdev); | 2698 | drbd_bm_cleanup(mdev); |
3680 | out_no_bitmap: | 2699 | out_no_bitmap: |
3681 | __free_page(mdev->md_io_page); | 2700 | __free_page(mdev->md_io_page); |
@@ -3684,55 +2703,25 @@ out_no_io_page: | |||
3684 | out_no_disk: | 2703 | out_no_disk: |
3685 | blk_cleanup_queue(q); | 2704 | blk_cleanup_queue(q); |
3686 | out_no_q: | 2705 | out_no_q: |
3687 | free_cpumask_var(mdev->cpu_mask); | ||
3688 | out_no_cpumask: | ||
3689 | kfree(mdev); | ||
3690 | return NULL; | ||
3691 | } | ||
3692 | |||
3693 | /* counterpart of drbd_new_device. | ||
3694 | * last part of drbd_delete_device. */ | ||
3695 | void drbd_free_mdev(struct drbd_conf *mdev) | ||
3696 | { | ||
3697 | kfree(mdev->current_epoch); | ||
3698 | kfree(mdev->app_reads_hash); | ||
3699 | tl_cleanup(mdev); | ||
3700 | if (mdev->bitmap) /* should no longer be there. */ | ||
3701 | drbd_bm_cleanup(mdev); | ||
3702 | __free_page(mdev->md_io_page); | ||
3703 | put_disk(mdev->vdisk); | ||
3704 | blk_cleanup_queue(mdev->rq_queue); | ||
3705 | free_cpumask_var(mdev->cpu_mask); | ||
3706 | drbd_free_tl_hash(mdev); | ||
3707 | kfree(mdev); | 2706 | kfree(mdev); |
2707 | kref_put(&tconn->kref, &conn_destroy); | ||
2708 | return err; | ||
3708 | } | 2709 | } |
3709 | 2710 | ||
3710 | |||
3711 | int __init drbd_init(void) | 2711 | int __init drbd_init(void) |
3712 | { | 2712 | { |
3713 | int err; | 2713 | int err; |
3714 | 2714 | ||
3715 | if (sizeof(struct p_handshake) != 80) { | ||
3716 | printk(KERN_ERR | ||
3717 | "drbd: never change the size or layout " | ||
3718 | "of the HandShake packet.\n"); | ||
3719 | return -EINVAL; | ||
3720 | } | ||
3721 | |||
3722 | if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { | 2715 | if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { |
3723 | printk(KERN_ERR | 2716 | printk(KERN_ERR |
3724 | "drbd: invalid minor_count (%d)\n", minor_count); | 2717 | "drbd: invalid minor_count (%d)\n", minor_count); |
3725 | #ifdef MODULE | 2718 | #ifdef MODULE |
3726 | return -EINVAL; | 2719 | return -EINVAL; |
3727 | #else | 2720 | #else |
3728 | minor_count = 8; | 2721 | minor_count = DRBD_MINOR_COUNT_DEF; |
3729 | #endif | 2722 | #endif |
3730 | } | 2723 | } |
3731 | 2724 | ||
3732 | err = drbd_nl_init(); | ||
3733 | if (err) | ||
3734 | return err; | ||
3735 | |||
3736 | err = register_blkdev(DRBD_MAJOR, "drbd"); | 2725 | err = register_blkdev(DRBD_MAJOR, "drbd"); |
3737 | if (err) { | 2726 | if (err) { |
3738 | printk(KERN_ERR | 2727 | printk(KERN_ERR |
@@ -3741,6 +2730,13 @@ int __init drbd_init(void) | |||
3741 | return err; | 2730 | return err; |
3742 | } | 2731 | } |
3743 | 2732 | ||
2733 | err = drbd_genl_register(); | ||
2734 | if (err) { | ||
2735 | printk(KERN_ERR "drbd: unable to register generic netlink family\n"); | ||
2736 | goto fail; | ||
2737 | } | ||
2738 | |||
2739 | |||
3744 | register_reboot_notifier(&drbd_notifier); | 2740 | register_reboot_notifier(&drbd_notifier); |
3745 | 2741 | ||
3746 | /* | 2742 | /* |
@@ -3751,22 +2747,29 @@ int __init drbd_init(void) | |||
3751 | init_waitqueue_head(&drbd_pp_wait); | 2747 | init_waitqueue_head(&drbd_pp_wait); |
3752 | 2748 | ||
3753 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | 2749 | drbd_proc = NULL; /* play safe for drbd_cleanup */ |
3754 | minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, | 2750 | idr_init(&minors); |
3755 | GFP_KERNEL); | ||
3756 | if (!minor_table) | ||
3757 | goto Enomem; | ||
3758 | 2751 | ||
3759 | err = drbd_create_mempools(); | 2752 | err = drbd_create_mempools(); |
3760 | if (err) | 2753 | if (err) |
3761 | goto Enomem; | 2754 | goto fail; |
3762 | 2755 | ||
3763 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); | 2756 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); |
3764 | if (!drbd_proc) { | 2757 | if (!drbd_proc) { |
3765 | printk(KERN_ERR "drbd: unable to register proc file\n"); | 2758 | printk(KERN_ERR "drbd: unable to register proc file\n"); |
3766 | goto Enomem; | 2759 | goto fail; |
3767 | } | 2760 | } |
3768 | 2761 | ||
3769 | rwlock_init(&global_state_lock); | 2762 | rwlock_init(&global_state_lock); |
2763 | INIT_LIST_HEAD(&drbd_tconns); | ||
2764 | |||
2765 | retry.wq = create_singlethread_workqueue("drbd-reissue"); | ||
2766 | if (!retry.wq) { | ||
2767 | printk(KERN_ERR "drbd: unable to create retry workqueue\n"); | ||
2768 | goto fail; | ||
2769 | } | ||
2770 | INIT_WORK(&retry.worker, do_retry); | ||
2771 | spin_lock_init(&retry.lock); | ||
2772 | INIT_LIST_HEAD(&retry.writes); | ||
3770 | 2773 | ||
3771 | printk(KERN_INFO "drbd: initialized. " | 2774 | printk(KERN_INFO "drbd: initialized. " |
3772 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", | 2775 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", |
@@ -3774,11 +2777,10 @@ int __init drbd_init(void) | |||
3774 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); | 2777 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); |
3775 | printk(KERN_INFO "drbd: registered as block device major %d\n", | 2778 | printk(KERN_INFO "drbd: registered as block device major %d\n", |
3776 | DRBD_MAJOR); | 2779 | DRBD_MAJOR); |
3777 | printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); | ||
3778 | 2780 | ||
3779 | return 0; /* Success! */ | 2781 | return 0; /* Success! */ |
3780 | 2782 | ||
3781 | Enomem: | 2783 | fail: |
3782 | drbd_cleanup(); | 2784 | drbd_cleanup(); |
3783 | if (err == -ENOMEM) | 2785 | if (err == -ENOMEM) |
3784 | /* currently always the case */ | 2786 | /* currently always the case */ |
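The retry machinery introduced above in drbd_init() is the usual ordered-workqueue setup: a dedicated single-threaded queue, one work item, and a lock-protected list of writes to reissue. The producer and teardown sides are not part of this hunk; a sketch of how they plug in (do_retry() is defined elsewhere in drbd):

	/* producer side: add the request to retry.writes under retry.lock,
	 * then kick the single-threaded workqueue */
	queue_work(retry.wq, &retry.worker);

	/* do_retry() runs on "drbd-reissue" and drains retry.writes;
	 * module unload tears the queue down again: */
	destroy_workqueue(retry.wq);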
@@ -3799,47 +2801,42 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) | |||
3799 | kfree(ldev); | 2801 | kfree(ldev); |
3800 | } | 2802 | } |
3801 | 2803 | ||
3802 | void drbd_free_sock(struct drbd_conf *mdev) | 2804 | void drbd_free_sock(struct drbd_tconn *tconn) |
3803 | { | 2805 | { |
3804 | if (mdev->data.socket) { | 2806 | if (tconn->data.socket) { |
3805 | mutex_lock(&mdev->data.mutex); | 2807 | mutex_lock(&tconn->data.mutex); |
3806 | kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); | 2808 | kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR); |
3807 | sock_release(mdev->data.socket); | 2809 | sock_release(tconn->data.socket); |
3808 | mdev->data.socket = NULL; | 2810 | tconn->data.socket = NULL; |
3809 | mutex_unlock(&mdev->data.mutex); | 2811 | mutex_unlock(&tconn->data.mutex); |
3810 | } | 2812 | } |
3811 | if (mdev->meta.socket) { | 2813 | if (tconn->meta.socket) { |
3812 | mutex_lock(&mdev->meta.mutex); | 2814 | mutex_lock(&tconn->meta.mutex); |
3813 | kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); | 2815 | kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR); |
3814 | sock_release(mdev->meta.socket); | 2816 | sock_release(tconn->meta.socket); |
3815 | mdev->meta.socket = NULL; | 2817 | tconn->meta.socket = NULL; |
3816 | mutex_unlock(&mdev->meta.mutex); | 2818 | mutex_unlock(&tconn->meta.mutex); |
3817 | } | 2819 | } |
3818 | } | 2820 | } |
3819 | 2821 | ||
2822 | /* meta data management */ | ||
3820 | 2823 | ||
3821 | void drbd_free_resources(struct drbd_conf *mdev) | 2824 | void conn_md_sync(struct drbd_tconn *tconn) |
3822 | { | 2825 | { |
3823 | crypto_free_hash(mdev->csums_tfm); | 2826 | struct drbd_conf *mdev; |
3824 | mdev->csums_tfm = NULL; | 2827 | int vnr; |
3825 | crypto_free_hash(mdev->verify_tfm); | ||
3826 | mdev->verify_tfm = NULL; | ||
3827 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3828 | mdev->cram_hmac_tfm = NULL; | ||
3829 | crypto_free_hash(mdev->integrity_w_tfm); | ||
3830 | mdev->integrity_w_tfm = NULL; | ||
3831 | crypto_free_hash(mdev->integrity_r_tfm); | ||
3832 | mdev->integrity_r_tfm = NULL; | ||
3833 | |||
3834 | drbd_free_sock(mdev); | ||
3835 | 2828 | ||
3836 | __no_warn(local, | 2829 | rcu_read_lock(); |
3837 | drbd_free_bc(mdev->ldev); | 2830 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
3838 | mdev->ldev = NULL;); | 2831 | kref_get(&mdev->kref); |
2832 | rcu_read_unlock(); | ||
2833 | drbd_md_sync(mdev); | ||
2834 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
2835 | rcu_read_lock(); | ||
2836 | } | ||
2837 | rcu_read_unlock(); | ||
3839 | } | 2838 | } |
3840 | 2839 | ||
3841 | /* meta data management */ | ||
3842 | |||
3843 | struct meta_data_on_disk { | 2840 | struct meta_data_on_disk { |
3844 | u64 la_size; /* last agreed size. */ | 2841 | u64 la_size; /* last agreed size. */ |
3845 | u64 uuid[UI_SIZE]; /* UUIDs. */ | 2842 | u64 uuid[UI_SIZE]; /* UUIDs. */ |
@@ -3850,7 +2847,7 @@ struct meta_data_on_disk { | |||
3850 | u32 md_size_sect; | 2847 | u32 md_size_sect; |
3851 | u32 al_offset; /* offset to this block */ | 2848 | u32 al_offset; /* offset to this block */ |
3852 | u32 al_nr_extents; /* important for restoring the AL */ | 2849 | u32 al_nr_extents; /* important for restoring the AL */ |
3853 | /* `-- act_log->nr_elements <-- sync_conf.al_extents */ | 2850 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ |
3854 | u32 bm_offset; /* offset to the bitmap, from here */ | 2851 | u32 bm_offset; /* offset to the bitmap, from here */ |
3855 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ | 2852 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ |
3856 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ | 2853 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ |
@@ -3870,7 +2867,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3870 | 2867 | ||
3871 | del_timer(&mdev->md_sync_timer); | 2868 | del_timer(&mdev->md_sync_timer); |
3872 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | 2869 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ |
3873 | if (!drbd_test_and_clear_flag(mdev, MD_DIRTY)) | 2870 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) |
3874 | return; | 2871 | return; |
3875 | 2872 | ||
3876 | /* We use here D_FAILED and not D_ATTACHING because we try to write | 2873 | /* We use here D_FAILED and not D_ATTACHING because we try to write |
@@ -3888,7 +2885,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3888 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2885 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
3889 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | 2886 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
3890 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | 2887 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); |
3891 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); | 2888 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); |
3892 | 2889 | ||
3893 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); | 2890 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); |
3894 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); | 2891 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); |
@@ -3902,7 +2899,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3902 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | 2899 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); |
3903 | sector = mdev->ldev->md.md_offset; | 2900 | sector = mdev->ldev->md.md_offset; |
3904 | 2901 | ||
3905 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 2902 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
3906 | /* this was a try anyways ... */ | 2903 | /* this was a try anyways ... */ |
3907 | dev_err(DEV, "meta data update failed!\n"); | 2904 | dev_err(DEV, "meta data update failed!\n"); |
3908 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 2905 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
@@ -3923,11 +2920,12 @@ out: | |||
3923 | * @bdev: Device from which the meta data should be read in. | 2920 | * @bdev: Device from which the meta data should be read in. |
3924 | * | 2921 | * |
3925 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case | 2922 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case |
3926 | * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. | 2923 | * something goes wrong. |
3927 | */ | 2924 | */ |
3928 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | 2925 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) |
3929 | { | 2926 | { |
3930 | struct meta_data_on_disk *buffer; | 2927 | struct meta_data_on_disk *buffer; |
2928 | u32 magic, flags; | ||
3931 | int i, rv = NO_ERROR; | 2929 | int i, rv = NO_ERROR; |
3932 | 2930 | ||
3933 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 2931 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
@@ -3937,7 +2935,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3937 | if (!buffer) | 2935 | if (!buffer) |
3938 | goto out; | 2936 | goto out; |
3939 | 2937 | ||
3940 | if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | 2938 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { |
3941 | /* NOTE: can't do normal error processing here as this is | 2939 | /* NOTE: can't do normal error processing here as this is |
3942 | called BEFORE disk is attached */ | 2940 | called BEFORE disk is attached */ |
3943 | dev_err(DEV, "Error while reading metadata.\n"); | 2941 | dev_err(DEV, "Error while reading metadata.\n"); |
@@ -3945,8 +2943,20 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3945 | goto err; | 2943 | goto err; |
3946 | } | 2944 | } |
3947 | 2945 | ||
3948 | if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { | 2946 | magic = be32_to_cpu(buffer->magic); |
3949 | dev_err(DEV, "Error while reading metadata, magic not found.\n"); | 2947 | flags = be32_to_cpu(buffer->flags); |
2948 | if (magic == DRBD_MD_MAGIC_84_UNCLEAN || | ||
2949 | (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { | ||
2950 | /* btw: that's Activity Log clean, not "all" clean. */ | ||
2951 | dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); | ||
2952 | rv = ERR_MD_UNCLEAN; | ||
2953 | goto err; | ||
2954 | } | ||
2955 | if (magic != DRBD_MD_MAGIC_08) { | ||
2956 | if (magic == DRBD_MD_MAGIC_07) | ||
2957 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); | ||
2958 | else | ||
2959 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); | ||
3950 | rv = ERR_MD_INVALID; | 2960 | rv = ERR_MD_INVALID; |
3951 | goto err; | 2961 | goto err; |
3952 | } | 2962 | } |
@@ -3980,20 +2990,16 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3980 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2990 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
3981 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | 2991 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); |
3982 | bdev->md.flags = be32_to_cpu(buffer->flags); | 2992 | bdev->md.flags = be32_to_cpu(buffer->flags); |
3983 | mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); | ||
3984 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | 2993 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); |
3985 | 2994 | ||
3986 | spin_lock_irq(&mdev->req_lock); | 2995 | spin_lock_irq(&mdev->tconn->req_lock); |
3987 | if (mdev->state.conn < C_CONNECTED) { | 2996 | if (mdev->state.conn < C_CONNECTED) { |
3988 | unsigned int peer; | 2997 | unsigned int peer; |
3989 | peer = be32_to_cpu(buffer->la_peer_max_bio_size); | 2998 | peer = be32_to_cpu(buffer->la_peer_max_bio_size); |
3990 | peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); | 2999 | peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); |
3991 | mdev->peer_max_bio_size = peer; | 3000 | mdev->peer_max_bio_size = peer; |
3992 | } | 3001 | } |
3993 | spin_unlock_irq(&mdev->req_lock); | 3002 | spin_unlock_irq(&mdev->tconn->req_lock); |
3994 | |||
3995 | if (mdev->sync_conf.al_extents < 7) | ||
3996 | mdev->sync_conf.al_extents = 127; | ||
3997 | 3003 | ||
3998 | err: | 3004 | err: |
3999 | drbd_md_put_buffer(mdev); | 3005 | drbd_md_put_buffer(mdev); |
@@ -4014,7 +3020,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
4014 | #ifdef DEBUG | 3020 | #ifdef DEBUG |
4015 | void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func) | 3021 | void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func) |
4016 | { | 3022 | { |
4017 | if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) { | 3023 | if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) { |
4018 | mod_timer(&mdev->md_sync_timer, jiffies + HZ); | 3024 | mod_timer(&mdev->md_sync_timer, jiffies + HZ); |
4019 | mdev->last_md_mark_dirty.line = line; | 3025 | mdev->last_md_mark_dirty.line = line; |
4020 | mdev->last_md_mark_dirty.func = func; | 3026 | mdev->last_md_mark_dirty.func = func; |
@@ -4023,7 +3029,7 @@ void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char * | |||
4023 | #else | 3029 | #else |
4024 | void drbd_md_mark_dirty(struct drbd_conf *mdev) | 3030 | void drbd_md_mark_dirty(struct drbd_conf *mdev) |
4025 | { | 3031 | { |
4026 | if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) | 3032 | if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) |
4027 | mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); | 3033 | mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); |
4028 | } | 3034 | } |
4029 | #endif | 3035 | #endif |
@@ -4171,9 +3177,10 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev) | |||
4171 | return rv; | 3177 | return rv; |
4172 | } | 3178 | } |
4173 | 3179 | ||
4174 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3180 | static int w_bitmap_io(struct drbd_work *w, int unused) |
4175 | { | 3181 | { |
4176 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); | 3182 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); |
3183 | struct drbd_conf *mdev = w->mdev; | ||
4177 | int rv = -EIO; | 3184 | int rv = -EIO; |
4178 | 3185 | ||
4179 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); | 3186 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); |
@@ -4185,18 +3192,17 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | |||
4185 | put_ldev(mdev); | 3192 | put_ldev(mdev); |
4186 | } | 3193 | } |
4187 | 3194 | ||
4188 | drbd_clear_flag(mdev, BITMAP_IO); | 3195 | clear_bit_unlock(BITMAP_IO, &mdev->flags); |
4189 | smp_mb__after_clear_bit(); | ||
4190 | wake_up(&mdev->misc_wait); | 3196 | wake_up(&mdev->misc_wait); |
4191 | 3197 | ||
4192 | if (work->done) | 3198 | if (work->done) |
4193 | work->done(mdev, rv); | 3199 | work->done(mdev, rv); |
4194 | 3200 | ||
4195 | drbd_clear_flag(mdev, BITMAP_IO_QUEUED); | 3201 | clear_bit(BITMAP_IO_QUEUED, &mdev->flags); |
4196 | work->why = NULL; | 3202 | work->why = NULL; |
4197 | work->flags = 0; | 3203 | work->flags = 0; |
4198 | 3204 | ||
4199 | return 1; | 3205 | return 0; |
4200 | } | 3206 | } |
4201 | 3207 | ||
4202 | void drbd_ldev_destroy(struct drbd_conf *mdev) | 3208 | void drbd_ldev_destroy(struct drbd_conf *mdev) |
@@ -4209,15 +3215,13 @@ void drbd_ldev_destroy(struct drbd_conf *mdev) | |||
4209 | drbd_free_bc(mdev->ldev); | 3215 | drbd_free_bc(mdev->ldev); |
4210 | mdev->ldev = NULL;); | 3216 | mdev->ldev = NULL;); |
4211 | 3217 | ||
4212 | if (mdev->md_io_tmpp) { | 3218 | clear_bit(GO_DISKLESS, &mdev->flags); |
4213 | __free_page(mdev->md_io_tmpp); | ||
4214 | mdev->md_io_tmpp = NULL; | ||
4215 | } | ||
4216 | drbd_clear_flag(mdev, GO_DISKLESS); | ||
4217 | } | 3219 | } |
4218 | 3220 | ||
4219 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3221 | static int w_go_diskless(struct drbd_work *w, int unused) |
4220 | { | 3222 | { |
3223 | struct drbd_conf *mdev = w->mdev; | ||
3224 | |||
4221 | D_ASSERT(mdev->state.disk == D_FAILED); | 3225 | D_ASSERT(mdev->state.disk == D_FAILED); |
4222 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will | 3226 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will |
4223 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch | 3227 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch |
@@ -4232,11 +3236,15 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused | |||
4232 | * (Do we want a specific meta data flag for this?) | 3236 | * (Do we want a specific meta data flag for this?) |
4233 | * | 3237 | * |
4234 | * If that does not make it to stable storage either, | 3238 | * If that does not make it to stable storage either, |
4235 | * we cannot do anything about that anymore. */ | 3239 | * we cannot do anything about that anymore. |
4236 | if (mdev->bitmap) { | 3240 | * |
3241 | * We still need to check if both bitmap and ldev are present, we may | ||
3242 | * end up here after a failed attach, before ldev was even assigned. | ||
3243 | */ | ||
3244 | if (mdev->bitmap && mdev->ldev) { | ||
4237 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, | 3245 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, |
4238 | "detach", BM_LOCKED_MASK)) { | 3246 | "detach", BM_LOCKED_MASK)) { |
4239 | if (drbd_test_flag(mdev, WAS_READ_ERROR)) { | 3247 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { |
4240 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | 3248 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); |
4241 | drbd_md_sync(mdev); | 3249 | drbd_md_sync(mdev); |
4242 | } | 3250 | } |
@@ -4244,14 +3252,14 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused | |||
4244 | } | 3252 | } |
4245 | 3253 | ||
4246 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | 3254 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
4247 | return 1; | 3255 | return 0; |
4248 | } | 3256 | } |
4249 | 3257 | ||
4250 | void drbd_go_diskless(struct drbd_conf *mdev) | 3258 | void drbd_go_diskless(struct drbd_conf *mdev) |
4251 | { | 3259 | { |
4252 | D_ASSERT(mdev->state.disk == D_FAILED); | 3260 | D_ASSERT(mdev->state.disk == D_FAILED); |
4253 | if (!drbd_test_and_set_flag(mdev, GO_DISKLESS)) | 3261 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) |
4254 | drbd_queue_work(&mdev->data.work, &mdev->go_diskless); | 3262 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); |
4255 | } | 3263 | } |
4256 | 3264 | ||
4257 | /** | 3265 | /** |
@@ -4271,10 +3279,10 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
4271 | void (*done)(struct drbd_conf *, int), | 3279 | void (*done)(struct drbd_conf *, int), |
4272 | char *why, enum bm_flag flags) | 3280 | char *why, enum bm_flag flags) |
4273 | { | 3281 | { |
4274 | D_ASSERT(current == mdev->worker.task); | 3282 | D_ASSERT(current == mdev->tconn->worker.task); |
4275 | 3283 | ||
4276 | D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO_QUEUED)); | 3284 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); |
4277 | D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO)); | 3285 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); |
4278 | D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); | 3286 | D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); |
4279 | if (mdev->bm_io_work.why) | 3287 | if (mdev->bm_io_work.why) |
4280 | dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", | 3288 | dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", |
@@ -4285,13 +3293,13 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
4285 | mdev->bm_io_work.why = why; | 3293 | mdev->bm_io_work.why = why; |
4286 | mdev->bm_io_work.flags = flags; | 3294 | mdev->bm_io_work.flags = flags; |
4287 | 3295 | ||
4288 | spin_lock_irq(&mdev->req_lock); | 3296 | spin_lock_irq(&mdev->tconn->req_lock); |
4289 | drbd_set_flag(mdev, BITMAP_IO); | 3297 | set_bit(BITMAP_IO, &mdev->flags); |
4290 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { | 3298 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { |
4291 | if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED)) | 3299 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) |
4292 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | 3300 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); |
4293 | } | 3301 | } |
4294 | spin_unlock_irq(&mdev->req_lock); | 3302 | spin_unlock_irq(&mdev->tconn->req_lock); |
4295 | } | 3303 | } |
4296 | 3304 | ||
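drbd_queue_bitmap_io() only records the work and sets BITMAP_IO; the bitmap I/O itself runs later from w_bitmap_io() once ap_bio_cnt has drained, and the D_ASSERT above requires the caller to be the worker thread. A hypothetical call site (the io function and reason string are chosen for illustration only):

	drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
			     "write bitmap after resync", BM_LOCKED_MASK);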
4297 | /** | 3305 | /** |
@@ -4308,7 +3316,7 @@ int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), | |||
4308 | { | 3316 | { |
4309 | int rv; | 3317 | int rv; |
4310 | 3318 | ||
4311 | D_ASSERT(current != mdev->worker.task); | 3319 | D_ASSERT(current != mdev->tconn->worker.task); |
4312 | 3320 | ||
4313 | if ((flags & BM_LOCKED_SET_ALLOWED) == 0) | 3321 | if ((flags & BM_LOCKED_SET_ALLOWED) == 0) |
4314 | drbd_suspend_io(mdev); | 3322 | drbd_suspend_io(mdev); |
@@ -4347,18 +3355,127 @@ static void md_sync_timer_fn(unsigned long data) | |||
4347 | { | 3355 | { |
4348 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 3356 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
4349 | 3357 | ||
4350 | drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); | 3358 | /* must not double-queue! */ |
3359 | if (list_empty(&mdev->md_sync_work.list)) | ||
3360 | drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work); | ||
4351 | } | 3361 | } |
4352 | 3362 | ||
4353 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3363 | static int w_md_sync(struct drbd_work *w, int unused) |
4354 | { | 3364 | { |
3365 | struct drbd_conf *mdev = w->mdev; | ||
3366 | |||
4355 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); | 3367 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); |
4356 | #ifdef DEBUG | 3368 | #ifdef DEBUG |
4357 | dev_warn(DEV, "last md_mark_dirty: %s:%u\n", | 3369 | dev_warn(DEV, "last md_mark_dirty: %s:%u\n", |
4358 | mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); | 3370 | mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); |
4359 | #endif | 3371 | #endif |
4360 | drbd_md_sync(mdev); | 3372 | drbd_md_sync(mdev); |
4361 | return 1; | 3373 | return 0; |
3374 | } | ||
3375 | |||
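Putting the pieces together, the metadata write-back path is: mark dirty, arm the timer, let the worker do the actual I/O. A condensed call-chain sketch using only names from this file (the direct calls stand in for timer and worker context):

	drbd_md_mark_dirty(mdev);		/* sets MD_DIRTY, arms md_sync_timer      */
	/* ~5 s later, in timer context: */
	md_sync_timer_fn((unsigned long)mdev);	/* queues md_sync_work, no double-queue   */
	/* then, in the worker thread: */
	w_md_sync(&mdev->md_sync_work, 0);	/* warns and calls drbd_md_sync()         */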
3376 | const char *cmdname(enum drbd_packet cmd) | ||
3377 | { | ||
3378 | /* THINK may need to become several global tables | ||
3379 | * when we want to support more than | ||
3380 | * one PRO_VERSION */ | ||
3381 | static const char *cmdnames[] = { | ||
3382 | [P_DATA] = "Data", | ||
3383 | [P_DATA_REPLY] = "DataReply", | ||
3384 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
3385 | [P_BARRIER] = "Barrier", | ||
3386 | [P_BITMAP] = "ReportBitMap", | ||
3387 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
3388 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
3389 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
3390 | [P_DATA_REQUEST] = "DataRequest", | ||
3391 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
3392 | [P_SYNC_PARAM] = "SyncParam", | ||
3393 | [P_SYNC_PARAM89] = "SyncParam89", | ||
3394 | [P_PROTOCOL] = "ReportProtocol", | ||
3395 | [P_UUIDS] = "ReportUUIDs", | ||
3396 | [P_SIZES] = "ReportSizes", | ||
3397 | [P_STATE] = "ReportState", | ||
3398 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
3399 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
3400 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
3401 | [P_PING] = "Ping", | ||
3402 | [P_PING_ACK] = "PingAck", | ||
3403 | [P_RECV_ACK] = "RecvAck", | ||
3404 | [P_WRITE_ACK] = "WriteAck", | ||
3405 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
3406 | [P_SUPERSEDED] = "Superseded", | ||
3407 | [P_NEG_ACK] = "NegAck", | ||
3408 | [P_NEG_DREPLY] = "NegDReply", | ||
3409 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
3410 | [P_BARRIER_ACK] = "BarrierAck", | ||
3411 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
3412 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
3413 | [P_OV_REQUEST] = "OVRequest", | ||
3414 | [P_OV_REPLY] = "OVReply", | ||
3415 | [P_OV_RESULT] = "OVResult", | ||
3416 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
3417 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
3418 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
3419 | [P_DELAY_PROBE] = "DelayProbe", | ||
3420 | [P_OUT_OF_SYNC] = "OutOfSync", | ||
3421 | [P_RETRY_WRITE] = "RetryWrite", | ||
3422 | [P_RS_CANCEL] = "RSCancel", | ||
3423 | [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", | ||
3424 | [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", | ||
3425 | [P_RETRY_WRITE] = "retry_write", | ||
3426 | [P_PROTOCOL_UPDATE] = "protocol_update", | ||
3427 | |||
3428 | /* enum drbd_packet, but not commands - obsoleted flags: | ||
3429 | * P_MAY_IGNORE | ||
3430 | * P_MAX_OPT_CMD | ||
3431 | */ | ||
3432 | }; | ||
3433 | |||
3434 | /* too big for the array: 0xfffX */ | ||
3435 | if (cmd == P_INITIAL_META) | ||
3436 | return "InitialMeta"; | ||
3437 | if (cmd == P_INITIAL_DATA) | ||
3438 | return "InitialData"; | ||
3439 | if (cmd == P_CONNECTION_FEATURES) | ||
3440 | return "ConnectionFeatures"; | ||
3441 | if (cmd >= ARRAY_SIZE(cmdnames)) | ||
3442 | return "Unknown"; | ||
3443 | return cmdnames[cmd]; | ||
3444 | } | ||
3445 | |||
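cmdname() is a plain designated-initializer lookup table with an explicit check for the 0xfffX handshake commands that live outside the array. A hypothetical call site (sketch; pi stands for the receiver's packet info and is not part of this hunk):

	conn_err(tconn, "unexpected packet type %s (0x%04x)\n",
		 cmdname(pi.cmd), pi.cmd);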
3446 | /** | ||
3447 | * drbd_wait_misc - wait for a request to make progress | ||
3448 | * @mdev: device associated with the request | ||
3449 | * @i: the struct drbd_interval embedded in struct drbd_request or | ||
3450 | * struct drbd_peer_request | ||
3451 | */ | ||
3452 | int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i) | ||
3453 | { | ||
3454 | struct net_conf *nc; | ||
3455 | DEFINE_WAIT(wait); | ||
3456 | long timeout; | ||
3457 | |||
3458 | rcu_read_lock(); | ||
3459 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
3460 | if (!nc) { | ||
3461 | rcu_read_unlock(); | ||
3462 | return -ETIMEDOUT; | ||
3463 | } | ||
3464 | timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; | ||
3465 | rcu_read_unlock(); | ||
3466 | |||
3467 | /* Indicate to wake up mdev->misc_wait on progress. */ | ||
3468 | i->waiting = true; | ||
3469 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); | ||
3470 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
3471 | timeout = schedule_timeout(timeout); | ||
3472 | finish_wait(&mdev->misc_wait, &wait); | ||
3473 | spin_lock_irq(&mdev->tconn->req_lock); | ||
3474 | if (!timeout || mdev->state.conn < C_CONNECTED) | ||
3475 | return -ETIMEDOUT; | ||
3476 | if (signal_pending(current)) | ||
3477 | return -ERESTARTSYS; | ||
3478 | return 0; | ||
4362 | } | 3479 | } |
4363 | 3480 | ||
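drbd_wait_misc() expects to be entered with tconn->req_lock held; it drops the lock around schedule_timeout() and re-takes it before returning, so the caller's locking picture is unchanged. Caller-side sketch (req is a hypothetical in-flight request whose interval is embedded as req->i):

	spin_lock_irq(&mdev->tconn->req_lock);
	err = drbd_wait_misc(mdev, &req->i);	/* drops and re-takes req_lock */
	spin_unlock_irq(&mdev->tconn->req_lock);
	if (err)
		return err;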
4364 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 3481 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c8dda4e8dfce..76bb3a684b86 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -29,159 +29,317 @@ | |||
29 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
30 | #include <linux/file.h> | 30 | #include <linux/file.h> |
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/connector.h> | ||
33 | #include <linux/blkpg.h> | 32 | #include <linux/blkpg.h> |
34 | #include <linux/cpumask.h> | 33 | #include <linux/cpumask.h> |
35 | #include "drbd_int.h" | 34 | #include "drbd_int.h" |
36 | #include "drbd_req.h" | 35 | #include "drbd_req.h" |
37 | #include "drbd_wrappers.h" | 36 | #include "drbd_wrappers.h" |
38 | #include <asm/unaligned.h> | 37 | #include <asm/unaligned.h> |
39 | #include <linux/drbd_tag_magic.h> | ||
40 | #include <linux/drbd_limits.h> | 38 | #include <linux/drbd_limits.h> |
41 | #include <linux/compiler.h> | ||
42 | #include <linux/kthread.h> | 39 | #include <linux/kthread.h> |
43 | 40 | ||
44 | static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); | 41 | #include <net/genetlink.h> |
45 | static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); | 42 | |
46 | static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); | 43 | /* .doit */ |
47 | 44 | // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); | |
48 | /* see get_sb_bdev and bd_claim */ | 45 | // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); |
46 | |||
47 | int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info); | ||
48 | int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info); | ||
49 | |||
50 | int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); | ||
51 | int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); | ||
52 | int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); | ||
53 | |||
54 | int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); | ||
55 | int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); | ||
56 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); | ||
57 | int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); | ||
58 | int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); | ||
59 | int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); | ||
60 | int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); | ||
61 | int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); | ||
62 | int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); | ||
63 | int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); | ||
64 | int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); | ||
65 | int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); | ||
66 | int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); | ||
67 | int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); | ||
68 | int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); | ||
69 | int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); | ||
70 | int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); | ||
71 | int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); | ||
72 | int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); | ||
73 | int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); | ||
74 | /* .dumpit */ | ||
75 | int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); | ||
76 | |||
77 | #include <linux/drbd_genl_api.h> | ||
78 | #include "drbd_nla.h" | ||
79 | #include <linux/genl_magic_func.h> | ||
80 | |||
81 | /* used blkdev_get_by_path, to claim our meta data device(s) */ | ||
49 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; | 82 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; |
50 | 83 | ||
51 | /* Generate the tag_list to struct functions */ | 84 | /* Configuration is strictly serialized, because generic netlink message |
52 | #define NL_PACKET(name, number, fields) \ | 85 | * processing is strictly serialized by the genl_lock(). |
53 | static int name ## _from_tags(struct drbd_conf *mdev, \ | 86 | * Which means we can use one static global drbd_config_context struct. |
54 | unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ | 87 | */ |
55 | static int name ## _from_tags(struct drbd_conf *mdev, \ | 88 | static struct drbd_config_context { |
56 | unsigned short *tags, struct name *arg) \ | 89 | /* assigned from drbd_genlmsghdr */ |
57 | { \ | 90 | unsigned int minor; |
58 | int tag; \ | 91 | /* assigned from request attributes, if present */ |
59 | int dlen; \ | 92 | unsigned int volume; |
60 | \ | 93 | #define VOLUME_UNSPECIFIED (-1U) |
61 | while ((tag = get_unaligned(tags++)) != TT_END) { \ | 94 | /* pointer into the request skb, |
62 | dlen = get_unaligned(tags++); \ | 95 | * limited lifetime! */ |
63 | switch (tag_number(tag)) { \ | 96 | char *resource_name; |
64 | fields \ | 97 | struct nlattr *my_addr; |
65 | default: \ | 98 | struct nlattr *peer_addr; |
66 | if (tag & T_MANDATORY) { \ | 99 | |
67 | dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ | 100 | /* reply buffer */ |
68 | return 0; \ | 101 | struct sk_buff *reply_skb; |
69 | } \ | 102 | /* pointer into reply buffer */ |
70 | } \ | 103 | struct drbd_genlmsghdr *reply_dh; |
71 | tags = (unsigned short *)((char *)tags + dlen); \ | 104 | /* resolved from attributes, if possible */ |
72 | } \ | 105 | struct drbd_conf *mdev; |
73 | return 1; \ | 106 | struct drbd_tconn *tconn; |
74 | } | 107 | } adm_ctx; |
75 | #define NL_INTEGER(pn, pr, member) \ | 108 | |
76 | case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ | 109 | static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) |
77 | arg->member = get_unaligned((int *)(tags)); \ | 110 | { |
78 | break; | 111 | genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); |
79 | #define NL_INT64(pn, pr, member) \ | 112 | if (genlmsg_reply(skb, info)) |
80 | case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ | 113 | printk(KERN_ERR "drbd: error sending genl reply\n"); |
81 | arg->member = get_unaligned((u64 *)(tags)); \ | 114 | } |
115 | |||
116 | /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only | ||
117 | * reason it could fail was no space in skb, and there are 4k available. */ | ||
118 | int drbd_msg_put_info(const char *info) | ||
119 | { | ||
120 | struct sk_buff *skb = adm_ctx.reply_skb; | ||
121 | struct nlattr *nla; | ||
122 | int err = -EMSGSIZE; | ||
123 | |||
124 | if (!info || !info[0]) | ||
125 | return 0; | ||
126 | |||
127 | nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); | ||
128 | if (!nla) | ||
129 | return err; | ||
130 | |||
131 | err = nla_put_string(skb, T_info_text, info); | ||
132 | if (err) { | ||
133 | nla_nest_cancel(skb, nla); | ||
134 | return err; | ||
135 | } else | ||
136 | nla_nest_end(skb, nla); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | /* This would be a good candidate for a "pre_doit" hook, | ||
141 | * and per-family private info->pointers. | ||
142 | * But we need to stay compatible with older kernels. | ||
143 | * If it returns successfully, adm_ctx members are valid. | ||
144 | */ | ||
145 | #define DRBD_ADM_NEED_MINOR 1 | ||
146 | #define DRBD_ADM_NEED_RESOURCE 2 | ||
147 | #define DRBD_ADM_NEED_CONNECTION 4 | ||
148 | static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, | ||
149 | unsigned flags) | ||
150 | { | ||
151 | struct drbd_genlmsghdr *d_in = info->userhdr; | ||
152 | const u8 cmd = info->genlhdr->cmd; | ||
153 | int err; | ||
154 | |||
155 | memset(&adm_ctx, 0, sizeof(adm_ctx)); | ||
156 | |||
157 | /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ | ||
158 | if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) | ||
159 | return -EPERM; | ||
160 | |||
161 | adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); | ||
162 | if (!adm_ctx.reply_skb) { | ||
163 | err = -ENOMEM; | ||
164 | goto fail; | ||
165 | } | ||
166 | |||
167 | adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, | ||
168 | info, &drbd_genl_family, 0, cmd); | ||
169 | /* put of a few bytes into a fresh skb of >= 4k will always succeed. | ||
170 | * but anyways */ | ||
171 | if (!adm_ctx.reply_dh) { | ||
172 | err = -ENOMEM; | ||
173 | goto fail; | ||
174 | } | ||
175 | |||
176 | adm_ctx.reply_dh->minor = d_in->minor; | ||
177 | adm_ctx.reply_dh->ret_code = NO_ERROR; | ||
178 | |||
179 | adm_ctx.volume = VOLUME_UNSPECIFIED; | ||
180 | if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { | ||
181 | struct nlattr *nla; | ||
182 | /* parse and validate only */ | ||
183 | err = drbd_cfg_context_from_attrs(NULL, info); | ||
184 | if (err) | ||
185 | goto fail; | ||
186 | |||
187 | /* It was present, and valid, | ||
188 | * copy it over to the reply skb. */ | ||
189 | err = nla_put_nohdr(adm_ctx.reply_skb, | ||
190 | info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, | ||
191 | info->attrs[DRBD_NLA_CFG_CONTEXT]); | ||
192 | if (err) | ||
193 | goto fail; | ||
194 | |||
195 | /* and assign stuff to the global adm_ctx */ | ||
196 | nla = nested_attr_tb[__nla_type(T_ctx_volume)]; | ||
197 | if (nla) | ||
198 | adm_ctx.volume = nla_get_u32(nla); | ||
199 | nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; | ||
200 | if (nla) | ||
201 | adm_ctx.resource_name = nla_data(nla); | ||
202 | adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; | ||
203 | adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; | ||
204 | if ((adm_ctx.my_addr && | ||
205 | nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) || | ||
206 | (adm_ctx.peer_addr && | ||
207 | nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) { | ||
208 | err = -EINVAL; | ||
209 | goto fail; | ||
210 | } | ||
211 | } | ||
212 | |||
213 | adm_ctx.minor = d_in->minor; | ||
214 | adm_ctx.mdev = minor_to_mdev(d_in->minor); | ||
215 | adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name); | ||
216 | |||
217 | if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) { | ||
218 | drbd_msg_put_info("unknown minor"); | ||
219 | return ERR_MINOR_INVALID; | ||
220 | } | ||
221 | if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) { | ||
222 | drbd_msg_put_info("unknown resource"); | ||
223 | return ERR_INVALID_REQUEST; | ||
224 | } | ||
225 | |||
226 | if (flags & DRBD_ADM_NEED_CONNECTION) { | ||
227 | if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) { | ||
228 | drbd_msg_put_info("no resource name expected"); | ||
229 | return ERR_INVALID_REQUEST; | ||
230 | } | ||
231 | if (adm_ctx.mdev) { | ||
232 | drbd_msg_put_info("no minor number expected"); | ||
233 | return ERR_INVALID_REQUEST; | ||
234 | } | ||
235 | if (adm_ctx.my_addr && adm_ctx.peer_addr) | ||
236 | adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr), | ||
237 | nla_len(adm_ctx.my_addr), | ||
238 | nla_data(adm_ctx.peer_addr), | ||
239 | nla_len(adm_ctx.peer_addr)); | ||
240 | if (!adm_ctx.tconn) { | ||
241 | drbd_msg_put_info("unknown connection"); | ||
242 | return ERR_INVALID_REQUEST; | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /* some more paranoia, if the request was over-determined */ | ||
247 | if (adm_ctx.mdev && adm_ctx.tconn && | ||
248 | adm_ctx.mdev->tconn != adm_ctx.tconn) { | ||
249 | pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n", | ||
250 | adm_ctx.minor, adm_ctx.resource_name, | ||
251 | adm_ctx.mdev->tconn->name); | ||
252 | drbd_msg_put_info("minor exists in different resource"); | ||
253 | return ERR_INVALID_REQUEST; | ||
254 | } | ||
255 | if (adm_ctx.mdev && | ||
256 | adm_ctx.volume != VOLUME_UNSPECIFIED && | ||
257 | adm_ctx.volume != adm_ctx.mdev->vnr) { | ||
258 | pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", | ||
259 | adm_ctx.minor, adm_ctx.volume, | ||
260 | adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name); | ||
261 | drbd_msg_put_info("minor exists as different volume"); | ||
262 | return ERR_INVALID_REQUEST; | ||
263 | } | ||
264 | |||
265 | return NO_ERROR; | ||
266 | |||
267 | fail: | ||
268 | nlmsg_free(adm_ctx.reply_skb); | ||
269 | adm_ctx.reply_skb = NULL; | ||
270 | return err; | ||
271 | } | ||
272 | |||
273 | static int drbd_adm_finish(struct genl_info *info, int retcode) | ||
274 | { | ||
275 | if (adm_ctx.tconn) { | ||
276 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
277 | adm_ctx.tconn = NULL; | ||
278 | } | ||
279 | |||
280 | if (!adm_ctx.reply_skb) | ||
281 | return -ENOMEM; | ||
282 | |||
283 | adm_ctx.reply_dh->ret_code = retcode; | ||
284 | drbd_adm_send_reply(adm_ctx.reply_skb, info); | ||
285 | return 0; | ||
286 | } | ||
287 | |||
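Every .doit handler declared above is built around this prepare/finish pair: if preparation fails before the reply skb exists, the handler returns that value directly; otherwise the drbd error code travels back to user space in reply_dh->ret_code. Shape of a typical handler (sketch; the handler name is illustrative only):

int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
{
	enum drbd_ret_code retcode;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto out;

	/* ... act on adm_ctx.mdev here ... */
out:
	drbd_adm_finish(info, retcode);
	return 0;
}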
288 | static void setup_khelper_env(struct drbd_tconn *tconn, char **envp) | ||
289 | { | ||
290 | char *afs; | ||
291 | |||
292 | /* FIXME: A future version will not allow this case. */ | ||
293 | if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0) | ||
294 | return; | ||
295 | |||
296 | switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) { | ||
297 | case AF_INET6: | ||
298 | afs = "ipv6"; | ||
299 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", | ||
300 | &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr); | ||
82 | break; | 301 | break; |
83 | #define NL_BIT(pn, pr, member) \ | 302 | case AF_INET: |
84 | case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ | 303 | afs = "ipv4"; |
85 | arg->member = *(char *)(tags) ? 1 : 0; \ | 304 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", |
305 | &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); | ||
86 | break; | 306 | break; |
87 | #define NL_STRING(pn, pr, member, len) \ | 307 | default: |
88 | case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ | 308 | afs = "ssocks"; |
89 | if (dlen > len) { \ | 309 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", |
90 | dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ | 310 | &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); |
91 | #member, dlen, (unsigned int)len); \ | 311 | } |
92 | return 0; \ | 312 | snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); |
93 | } \ | 313 | } |
94 | arg->member ## _len = dlen; \ | ||
95 | memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ | ||
96 | break; | ||
97 | #include <linux/drbd_nl.h> | ||
98 | |||
99 | /* Generate the struct to tag_list functions */ | ||
100 | #define NL_PACKET(name, number, fields) \ | ||
101 | static unsigned short* \ | ||
102 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
103 | struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ | ||
104 | static unsigned short* \ | ||
105 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
106 | struct name *arg, unsigned short *tags) \ | ||
107 | { \ | ||
108 | fields \ | ||
109 | return tags; \ | ||
110 | } | ||
111 | |||
112 | #define NL_INTEGER(pn, pr, member) \ | ||
113 | put_unaligned(pn | pr | TT_INTEGER, tags++); \ | ||
114 | put_unaligned(sizeof(int), tags++); \ | ||
115 | put_unaligned(arg->member, (int *)tags); \ | ||
116 | tags = (unsigned short *)((char *)tags+sizeof(int)); | ||
117 | #define NL_INT64(pn, pr, member) \ | ||
118 | put_unaligned(pn | pr | TT_INT64, tags++); \ | ||
119 | put_unaligned(sizeof(u64), tags++); \ | ||
120 | put_unaligned(arg->member, (u64 *)tags); \ | ||
121 | tags = (unsigned short *)((char *)tags+sizeof(u64)); | ||
122 | #define NL_BIT(pn, pr, member) \ | ||
123 | put_unaligned(pn | pr | TT_BIT, tags++); \ | ||
124 | put_unaligned(sizeof(char), tags++); \ | ||
125 | *(char *)tags = arg->member; \ | ||
126 | tags = (unsigned short *)((char *)tags+sizeof(char)); | ||
127 | #define NL_STRING(pn, pr, member, len) \ | ||
128 | put_unaligned(pn | pr | TT_STRING, tags++); \ | ||
129 | put_unaligned(arg->member ## _len, tags++); \ | ||
130 | memcpy(tags, arg->member, arg->member ## _len); \ | ||
131 | tags = (unsigned short *)((char *)tags + arg->member ## _len); | ||
132 | #include <linux/drbd_nl.h> | ||
133 | |||
134 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); | ||
135 | void drbd_nl_send_reply(struct cn_msg *, int); | ||
136 | 314 | ||
137 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) | 315 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) |
138 | { | 316 | { |
139 | char *envp[] = { "HOME=/", | 317 | char *envp[] = { "HOME=/", |
140 | "TERM=linux", | 318 | "TERM=linux", |
141 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | 319 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", |
142 | NULL, /* Will be set to address family */ | 320 | (char[20]) { }, /* address family */ |
143 | NULL, /* Will be set to address */ | 321 | (char[60]) { }, /* address */ |
144 | NULL }; | 322 | NULL }; |
145 | 323 | char mb[12]; | |
146 | char mb[12], af[20], ad[60], *afs; | ||
147 | char *argv[] = {usermode_helper, cmd, mb, NULL }; | 324 | char *argv[] = {usermode_helper, cmd, mb, NULL }; |
325 | struct drbd_tconn *tconn = mdev->tconn; | ||
326 | struct sib_info sib; | ||
148 | int ret; | 327 | int ret; |
149 | 328 | ||
150 | if (current == mdev->worker.task) | 329 | if (current == tconn->worker.task) |
151 | drbd_set_flag(mdev, CALLBACK_PENDING); | 330 | set_bit(CALLBACK_PENDING, &tconn->flags); |
152 | 331 | ||
153 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); | 332 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); |
154 | 333 | setup_khelper_env(tconn, envp); | |
155 | if (get_net_conf(mdev)) { | ||
156 | switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { | ||
157 | case AF_INET6: | ||
158 | afs = "ipv6"; | ||
159 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", | ||
160 | &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); | ||
161 | break; | ||
162 | case AF_INET: | ||
163 | afs = "ipv4"; | ||
164 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
165 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
166 | break; | ||
167 | default: | ||
168 | afs = "ssocks"; | ||
169 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
170 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
171 | } | ||
172 | snprintf(af, 20, "DRBD_PEER_AF=%s", afs); | ||
173 | envp[3]=af; | ||
174 | envp[4]=ad; | ||
175 | put_net_conf(mdev); | ||
176 | } | ||
177 | 334 | ||
178 | /* The helper may take some time. | 335 | /* The helper may take some time. |
179 | * write out any unsynced meta data changes now */ | 336 | * write out any unsynced meta data changes now */ |
180 | drbd_md_sync(mdev); | 337 | drbd_md_sync(mdev); |
181 | 338 | ||
182 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); | 339 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); |
183 | 340 | sib.sib_reason = SIB_HELPER_PRE; | |
184 | drbd_bcast_ev_helper(mdev, cmd); | 341 | sib.helper_name = cmd; |
342 | drbd_bcast_event(mdev, &sib); | ||
185 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); | 343 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); |
186 | if (ret) | 344 | if (ret) |
187 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | 345 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", |
@@ -191,9 +349,46 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) | |||
191 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | 349 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", |
192 | usermode_helper, cmd, mb, | 350 | usermode_helper, cmd, mb, |
193 | (ret >> 8) & 0xff, ret); | 351 | (ret >> 8) & 0xff, ret); |
352 | sib.sib_reason = SIB_HELPER_POST; | ||
353 | sib.helper_exit_code = ret; | ||
354 | drbd_bcast_event(mdev, &sib); | ||
355 | |||
356 | if (current == tconn->worker.task) | ||
357 | clear_bit(CALLBACK_PENDING, &tconn->flags); | ||
358 | |||
359 | if (ret < 0) /* Ignore any ERRNOs we got. */ | ||
360 | ret = 0; | ||
361 | |||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | int conn_khelper(struct drbd_tconn *tconn, char *cmd) | ||
366 | { | ||
367 | char *envp[] = { "HOME=/", | ||
368 | "TERM=linux", | ||
369 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
370 | (char[20]) { }, /* address family */ | ||
371 | (char[60]) { }, /* address */ | ||
372 | NULL }; | ||
373 | char *argv[] = {usermode_helper, cmd, tconn->name, NULL }; | ||
374 | int ret; | ||
375 | |||
376 | setup_khelper_env(tconn, envp); | ||
377 | conn_md_sync(tconn); | ||
194 | 378 | ||
195 | if (current == mdev->worker.task) | 379 | conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name); |
196 | drbd_clear_flag(mdev, CALLBACK_PENDING); | 380 | /* TODO: conn_bcast_event() ?? */ |
381 | |||
382 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); | ||
383 | if (ret) | ||
384 | conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
385 | usermode_helper, cmd, tconn->name, | ||
386 | (ret >> 8) & 0xff, ret); | ||
387 | else | ||
388 | conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
389 | usermode_helper, cmd, tconn->name, | ||
390 | (ret >> 8) & 0xff, ret); | ||
391 | /* TODO: conn_bcast_event() ?? */ | ||
197 | 392 | ||
198 | if (ret < 0) /* Ignore any ERRNOs we got. */ | 393 | if (ret < 0) /* Ignore any ERRNOs we got. */ |
199 | ret = 0; | 394 | ret = 0; |
@@ -201,116 +396,129 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) | |||
201 | return ret; | 396 | return ret; |
202 | } | 397 | } |
203 | 398 | ||
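Both helpers above run the configured usermode helper via call_usermodehelper() with UMH_WAIT_PROC; the helper's exit status lands in bits 8-15 of the return value, which is why the callers (and conn_try_outdate_peer() further down) extract it with (ret >> 8) & 0xff. A minimal sketch of that decoding — illustrative only, not part of the patch:

    /* Sketch: run a helper and return just its exit code,
     * or a negative errno if it could not be started at all. */
    static int run_helper_exit_code(char *path, char **argv, char **envp)
    {
            int ret = call_usermodehelper(path, argv, envp, UMH_WAIT_PROC);

            if (ret < 0)                 /* helper could not be started */
                    return ret;
            return (ret >> 8) & 0xff;    /* helper's exit() status */
    }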
204 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) | 399 | static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) |
205 | { | 400 | { |
401 | enum drbd_fencing_p fp = FP_NOT_AVAIL; | ||
402 | struct drbd_conf *mdev; | ||
403 | int vnr; | ||
404 | |||
405 | rcu_read_lock(); | ||
406 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
407 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | ||
408 | fp = max_t(enum drbd_fencing_p, fp, | ||
409 | rcu_dereference(mdev->ldev->disk_conf)->fencing); | ||
410 | put_ldev(mdev); | ||
411 | } | ||
412 | } | ||
413 | rcu_read_unlock(); | ||
414 | |||
415 | return fp; | ||
416 | } | ||
417 | |||
418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) | ||
419 | { | ||
420 | union drbd_state mask = { }; | ||
421 | union drbd_state val = { }; | ||
422 | enum drbd_fencing_p fp; | ||
206 | char *ex_to_string; | 423 | char *ex_to_string; |
207 | int r; | 424 | int r; |
208 | enum drbd_disk_state nps; | ||
209 | enum drbd_fencing_p fp; | ||
210 | 425 | ||
211 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | 426 | if (tconn->cstate >= C_WF_REPORT_PARAMS) { |
427 | conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n"); | ||
428 | return false; | ||
429 | } | ||
212 | 430 | ||
213 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | 431 | fp = highest_fencing_policy(tconn); |
214 | fp = mdev->ldev->dc.fencing; | 432 | switch (fp) { |
215 | put_ldev(mdev); | 433 | case FP_NOT_AVAIL: |
216 | } else { | 434 | conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n"); |
217 | dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); | ||
218 | nps = mdev->state.pdsk; | ||
219 | goto out; | 435 | goto out; |
436 | case FP_DONT_CARE: | ||
437 | return true; | ||
438 | default: ; | ||
220 | } | 439 | } |
221 | 440 | ||
222 | r = drbd_khelper(mdev, "fence-peer"); | 441 | r = conn_khelper(tconn, "fence-peer"); |
223 | 442 | ||
224 | switch ((r>>8) & 0xff) { | 443 | switch ((r>>8) & 0xff) { |
225 | case 3: /* peer is inconsistent */ | 444 | case 3: /* peer is inconsistent */ |
226 | ex_to_string = "peer is inconsistent or worse"; | 445 | ex_to_string = "peer is inconsistent or worse"; |
227 | nps = D_INCONSISTENT; | 446 | mask.pdsk = D_MASK; |
447 | val.pdsk = D_INCONSISTENT; | ||
228 | break; | 448 | break; |
229 | case 4: /* peer got outdated, or was already outdated */ | 449 | case 4: /* peer got outdated, or was already outdated */ |
230 | ex_to_string = "peer was fenced"; | 450 | ex_to_string = "peer was fenced"; |
231 | nps = D_OUTDATED; | 451 | mask.pdsk = D_MASK; |
452 | val.pdsk = D_OUTDATED; | ||
232 | break; | 453 | break; |
233 | case 5: /* peer was down */ | 454 | case 5: /* peer was down */ |
234 | if (mdev->state.disk == D_UP_TO_DATE) { | 455 | if (conn_highest_disk(tconn) == D_UP_TO_DATE) { |
235 | /* we will(have) create(d) a new UUID anyways... */ | 456 | /* we will(have) create(d) a new UUID anyways... */ |
236 | ex_to_string = "peer is unreachable, assumed to be dead"; | 457 | ex_to_string = "peer is unreachable, assumed to be dead"; |
237 | nps = D_OUTDATED; | 458 | mask.pdsk = D_MASK; |
459 | val.pdsk = D_OUTDATED; | ||
238 | } else { | 460 | } else { |
239 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; | 461 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; |
240 | nps = mdev->state.pdsk; | ||
241 | } | 462 | } |
242 | break; | 463 | break; |
243 | case 6: /* Peer is primary, voluntarily outdate myself. | 464 | case 6: /* Peer is primary, voluntarily outdate myself. |
244 | * This is useful when an unconnected R_SECONDARY is asked to | 465 | * This is useful when an unconnected R_SECONDARY is asked to |
245 | * become R_PRIMARY, but finds the other peer being active. */ | 466 | * become R_PRIMARY, but finds the other peer being active. */ |
246 | ex_to_string = "peer is active"; | 467 | ex_to_string = "peer is active"; |
247 | dev_warn(DEV, "Peer is primary, outdating myself.\n"); | 468 | conn_warn(tconn, "Peer is primary, outdating myself.\n"); |
248 | nps = D_UNKNOWN; | 469 | mask.disk = D_MASK; |
249 | _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); | 470 | val.disk = D_OUTDATED; |
250 | break; | 471 | break; |
251 | case 7: | 472 | case 7: |
252 | if (fp != FP_STONITH) | 473 | if (fp != FP_STONITH) |
253 | dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); | 474 | conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n"); |
254 | ex_to_string = "peer was stonithed"; | 475 | ex_to_string = "peer was stonithed"; |
255 | nps = D_OUTDATED; | 476 | mask.pdsk = D_MASK; |
477 | val.pdsk = D_OUTDATED; | ||
256 | break; | 478 | break; |
257 | default: | 479 | default: |
258 | /* The script is broken ... */ | 480 | /* The script is broken ... */ |
259 | nps = D_UNKNOWN; | 481 | conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); |
260 | dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); | 482 | return false; /* Eventually leave IO frozen */ |
261 | return nps; | ||
262 | } | 483 | } |
263 | 484 | ||
264 | dev_info(DEV, "fence-peer helper returned %d (%s)\n", | 485 | conn_info(tconn, "fence-peer helper returned %d (%s)\n", |
265 | (r>>8) & 0xff, ex_to_string); | 486 | (r>>8) & 0xff, ex_to_string); |
266 | 487 | ||
267 | out: | 488 | out: |
268 | if (mdev->state.susp_fen && nps >= D_UNKNOWN) { | ||
269 | /* The handler was not successful... unfreeze here, the | ||
270 | state engine can not unfreeze... */ | ||
271 | _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE); | ||
272 | } | ||
273 | 489 | ||
274 | return nps; | 490 | /* Not using |
491 | conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
492 | here, because we might have been able to re-establish the connection in the | ||

493 | meantime. */ | ||
494 | spin_lock_irq(&tconn->req_lock); | ||
495 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) | ||
496 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
497 | spin_unlock_irq(&tconn->req_lock); | ||
498 | |||
499 | return conn_highest_pdsk(tconn) <= D_OUTDATED; | ||
275 | } | 500 | } |
276 | 501 | ||
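The mask/val pair used above is how partial state changes are requested throughout this patch: union drbd_state packs all state fields into one word, mask marks the fields that may change (set to the field's *_MASK value), and val carries the new values for exactly those fields. A minimal sketch using only identifiers that appear in the hunk above; as in conn_try_outdate_peer(), _conn_request_state() is called with tconn->req_lock held:

    union drbd_state mask = { }, val = { };

    mask.pdsk = D_MASK;        /* only the peer-disk field may change */
    val.pdsk  = D_OUTDATED;    /* ... and it should become Outdated */
    _conn_request_state(tconn, mask, val, CS_VERBOSE);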
277 | static int _try_outdate_peer_async(void *data) | 502 | static int _try_outdate_peer_async(void *data) |
278 | { | 503 | { |
279 | struct drbd_conf *mdev = (struct drbd_conf *)data; | 504 | struct drbd_tconn *tconn = (struct drbd_tconn *)data; |
280 | enum drbd_disk_state nps; | ||
281 | union drbd_state ns; | ||
282 | 505 | ||
283 | nps = drbd_try_outdate_peer(mdev); | 506 | conn_try_outdate_peer(tconn); |
284 | |||
285 | /* Not using | ||
286 | drbd_request_state(mdev, NS(pdsk, nps)); | ||
287 | here, because we might have been able to re-establish the connection | ||
288 | in the meantime. This can only partially be solved in the state | ||
289 | engine's is_valid_state() and is_valid_state_transition() | ||
290 | functions. | ||
291 | |||
292 | nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN. | ||
293 | pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid, | ||
294 | therefore we have to have the pre state change check here. | ||
295 | */ | ||
296 | spin_lock_irq(&mdev->req_lock); | ||
297 | ns = mdev->state; | ||
298 | if (ns.conn < C_WF_REPORT_PARAMS && !drbd_test_flag(mdev, STATE_SENT)) { | ||
299 | ns.pdsk = nps; | ||
300 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
301 | } | ||
302 | spin_unlock_irq(&mdev->req_lock); | ||
303 | 507 | ||
508 | kref_put(&tconn->kref, &conn_destroy); | ||
304 | return 0; | 509 | return 0; |
305 | } | 510 | } |
306 | 511 | ||
307 | void drbd_try_outdate_peer_async(struct drbd_conf *mdev) | 512 | void conn_try_outdate_peer_async(struct drbd_tconn *tconn) |
308 | { | 513 | { |
309 | struct task_struct *opa; | 514 | struct task_struct *opa; |
310 | 515 | ||
311 | opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev)); | 516 | kref_get(&tconn->kref); |
312 | if (IS_ERR(opa)) | 517 | opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); |
313 | dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); | 518 | if (IS_ERR(opa)) { |
519 | conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); | ||
520 | kref_put(&tconn->kref, &conn_destroy); | ||
521 | } | ||
314 | } | 522 | } |
315 | 523 | ||
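conn_try_outdate_peer_async() hands a reference-counted tconn to a freshly spawned kthread: the caller takes a kref before kthread_run(), and the reference is dropped either by the thread itself or by the error path if the thread could not be created. The same pattern, reduced to a sketch with hypothetical names (struct my_obj, do_work, my_obj_release):

    static int async_work(void *data)
    {
            struct my_obj *obj = data;

            do_work(obj);
            kref_put(&obj->kref, my_obj_release);   /* thread owns one ref */
            return 0;
    }

    static void start_async_work(struct my_obj *obj)
    {
            struct task_struct *t;

            kref_get(&obj->kref);                   /* ref for the thread */
            t = kthread_run(async_work, obj, "my_async_worker");
            if (IS_ERR(t))
                    kref_put(&obj->kref, my_obj_release);  /* spawn failed */
    }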
316 | enum drbd_state_rv | 524 | enum drbd_state_rv |
@@ -318,15 +526,15 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
318 | { | 526 | { |
319 | const int max_tries = 4; | 527 | const int max_tries = 4; |
320 | enum drbd_state_rv rv = SS_UNKNOWN_ERROR; | 528 | enum drbd_state_rv rv = SS_UNKNOWN_ERROR; |
529 | struct net_conf *nc; | ||
321 | int try = 0; | 530 | int try = 0; |
322 | int forced = 0; | 531 | int forced = 0; |
323 | union drbd_state mask, val; | 532 | union drbd_state mask, val; |
324 | enum drbd_disk_state nps; | ||
325 | 533 | ||
326 | if (new_role == R_PRIMARY) | 534 | if (new_role == R_PRIMARY) |
327 | request_ping(mdev); /* Detect a dead peer ASAP */ | 535 | request_ping(mdev->tconn); /* Detect a dead peer ASAP */ |
328 | 536 | ||
329 | mutex_lock(&mdev->state_mutex); | 537 | mutex_lock(mdev->state_mutex); |
330 | 538 | ||
331 | mask.i = 0; mask.role = R_MASK; | 539 | mask.i = 0; mask.role = R_MASK; |
332 | val.i = 0; val.role = new_role; | 540 | val.i = 0; val.role = new_role; |
@@ -354,38 +562,34 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
354 | if (rv == SS_NO_UP_TO_DATE_DISK && | 562 | if (rv == SS_NO_UP_TO_DATE_DISK && |
355 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { | 563 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { |
356 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | 564 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); |
357 | nps = drbd_try_outdate_peer(mdev); | ||
358 | 565 | ||
359 | if (nps == D_OUTDATED || nps == D_INCONSISTENT) { | 566 | if (conn_try_outdate_peer(mdev->tconn)) { |
360 | val.disk = D_UP_TO_DATE; | 567 | val.disk = D_UP_TO_DATE; |
361 | mask.disk = D_MASK; | 568 | mask.disk = D_MASK; |
362 | } | 569 | } |
363 | |||
364 | val.pdsk = nps; | ||
365 | mask.pdsk = D_MASK; | ||
366 | |||
367 | continue; | 570 | continue; |
368 | } | 571 | } |
369 | 572 | ||
370 | if (rv == SS_NOTHING_TO_DO) | 573 | if (rv == SS_NOTHING_TO_DO) |
371 | goto fail; | 574 | goto out; |
372 | if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { | 575 | if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { |
373 | nps = drbd_try_outdate_peer(mdev); | 576 | if (!conn_try_outdate_peer(mdev->tconn) && force) { |
374 | |||
375 | if (force && nps > D_OUTDATED) { | ||
376 | dev_warn(DEV, "Forced into split brain situation!\n"); | 577 | dev_warn(DEV, "Forced into split brain situation!\n"); |
377 | nps = D_OUTDATED; | 578 | mask.pdsk = D_MASK; |
378 | } | 579 | val.pdsk = D_OUTDATED; |
379 | |||
380 | mask.pdsk = D_MASK; | ||
381 | val.pdsk = nps; | ||
382 | 580 | ||
581 | } | ||
383 | continue; | 582 | continue; |
384 | } | 583 | } |
385 | if (rv == SS_TWO_PRIMARIES) { | 584 | if (rv == SS_TWO_PRIMARIES) { |
386 | /* Maybe the peer is detected as dead very soon... | 585 | /* Maybe the peer is detected as dead very soon... |
387 | retry at most once more in this case. */ | 586 | retry at most once more in this case. */ |
388 | schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10); | 587 | int timeo; |
588 | rcu_read_lock(); | ||
589 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
590 | timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; | ||
591 | rcu_read_unlock(); | ||
592 | schedule_timeout_interruptible(timeo); | ||
389 | if (try < max_tries) | 593 | if (try < max_tries) |
390 | try = max_tries - 1; | 594 | try = max_tries - 1; |
391 | continue; | 595 | continue; |
@@ -394,13 +598,13 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
394 | rv = _drbd_request_state(mdev, mask, val, | 598 | rv = _drbd_request_state(mdev, mask, val, |
395 | CS_VERBOSE + CS_WAIT_COMPLETE); | 599 | CS_VERBOSE + CS_WAIT_COMPLETE); |
396 | if (rv < SS_SUCCESS) | 600 | if (rv < SS_SUCCESS) |
397 | goto fail; | 601 | goto out; |
398 | } | 602 | } |
399 | break; | 603 | break; |
400 | } | 604 | } |
401 | 605 | ||
402 | if (rv < SS_SUCCESS) | 606 | if (rv < SS_SUCCESS) |
403 | goto fail; | 607 | goto out; |
404 | 608 | ||
405 | if (forced) | 609 | if (forced) |
406 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); | 610 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); |
@@ -408,6 +612,8 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
408 | /* Wait until nothing is on the fly :) */ | 612 | /* Wait until nothing is on the fly :) */ |
409 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); | 613 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); |
410 | 614 | ||
615 | /* FIXME also wait for all pending P_BARRIER_ACK? */ | ||
616 | |||
411 | if (new_role == R_SECONDARY) { | 617 | if (new_role == R_SECONDARY) { |
412 | set_disk_ro(mdev->vdisk, true); | 618 | set_disk_ro(mdev->vdisk, true); |
413 | if (get_ldev(mdev)) { | 619 | if (get_ldev(mdev)) { |
@@ -415,10 +621,12 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
415 | put_ldev(mdev); | 621 | put_ldev(mdev); |
416 | } | 622 | } |
417 | } else { | 623 | } else { |
418 | if (get_net_conf(mdev)) { | 624 | mutex_lock(&mdev->tconn->conf_update); |
419 | mdev->net_conf->want_lose = 0; | 625 | nc = mdev->tconn->net_conf; |
420 | put_net_conf(mdev); | 626 | if (nc) |
421 | } | 627 | nc->discard_my_data = 0; /* without copy; single bit op is atomic */ |
628 | mutex_unlock(&mdev->tconn->conf_update); | ||
629 | |||
422 | set_disk_ro(mdev->vdisk, false); | 630 | set_disk_ro(mdev->vdisk, false); |
423 | if (get_ldev(mdev)) { | 631 | if (get_ldev(mdev)) { |
424 | if (((mdev->state.conn < C_CONNECTED || | 632 | if (((mdev->state.conn < C_CONNECTED || |
@@ -444,67 +652,47 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
444 | drbd_md_sync(mdev); | 652 | drbd_md_sync(mdev); |
445 | 653 | ||
446 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 654 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); |
447 | fail: | 655 | out: |
448 | mutex_unlock(&mdev->state_mutex); | 656 | mutex_unlock(mdev->state_mutex); |
449 | return rv; | 657 | return rv; |
450 | } | 658 | } |
451 | 659 | ||
452 | static struct drbd_conf *ensure_mdev(int minor, int create) | 660 | static const char *from_attrs_err_to_txt(int err) |
453 | { | 661 | { |
454 | struct drbd_conf *mdev; | 662 | return err == -ENOMSG ? "required attribute missing" : |
455 | 663 | err == -EOPNOTSUPP ? "unknown mandatory attribute" : | |
456 | if (minor >= minor_count) | 664 | err == -EEXIST ? "can not change invariant setting" : |
457 | return NULL; | 665 | "invalid attribute value"; |
458 | |||
459 | mdev = minor_to_mdev(minor); | ||
460 | |||
461 | if (!mdev && create) { | ||
462 | struct gendisk *disk = NULL; | ||
463 | mdev = drbd_new_device(minor); | ||
464 | |||
465 | spin_lock_irq(&drbd_pp_lock); | ||
466 | if (minor_table[minor] == NULL) { | ||
467 | minor_table[minor] = mdev; | ||
468 | disk = mdev->vdisk; | ||
469 | mdev = NULL; | ||
470 | } /* else: we lost the race */ | ||
471 | spin_unlock_irq(&drbd_pp_lock); | ||
472 | |||
473 | if (disk) /* we won the race above */ | ||
474 | /* in case we ever add a drbd_delete_device(), | ||
475 | * don't forget the del_gendisk! */ | ||
476 | add_disk(disk); | ||
477 | else /* we lost the race above */ | ||
478 | drbd_free_mdev(mdev); | ||
479 | |||
480 | mdev = minor_to_mdev(minor); | ||
481 | } | ||
482 | |||
483 | return mdev; | ||
484 | } | 666 | } |
485 | 667 | ||
486 | static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 668 | int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) |
487 | struct drbd_nl_cfg_reply *reply) | ||
488 | { | 669 | { |
489 | struct primary primary_args; | 670 | struct set_role_parms parms; |
490 | 671 | int err; | |
491 | memset(&primary_args, 0, sizeof(struct primary)); | 672 | enum drbd_ret_code retcode; |
492 | if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { | ||
493 | reply->ret_code = ERR_MANDATORY_TAG; | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | reply->ret_code = | ||
498 | drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force); | ||
499 | 673 | ||
500 | return 0; | 674 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
501 | } | 675 | if (!adm_ctx.reply_skb) |
676 | return retcode; | ||
677 | if (retcode != NO_ERROR) | ||
678 | goto out; | ||
502 | 679 | ||
503 | static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 680 | memset(&parms, 0, sizeof(parms)); |
504 | struct drbd_nl_cfg_reply *reply) | 681 | if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { |
505 | { | 682 | err = set_role_parms_from_attrs(&parms, info); |
506 | reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); | 683 | if (err) { |
684 | retcode = ERR_MANDATORY_TAG; | ||
685 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
686 | goto out; | ||
687 | } | ||
688 | } | ||
507 | 689 | ||
690 | if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) | ||
691 | retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate); | ||
692 | else | ||
693 | retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0); | ||
694 | out: | ||
695 | drbd_adm_finish(info, retcode); | ||
508 | return 0; | 696 | return 0; |
509 | } | 697 | } |
510 | 698 | ||
@@ -514,7 +702,12 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
514 | struct drbd_backing_dev *bdev) | 702 | struct drbd_backing_dev *bdev) |
515 | { | 703 | { |
516 | sector_t md_size_sect = 0; | 704 | sector_t md_size_sect = 0; |
517 | switch (bdev->dc.meta_dev_idx) { | 705 | int meta_dev_idx; |
706 | |||
707 | rcu_read_lock(); | ||
708 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
709 | |||
710 | switch (meta_dev_idx) { | ||
518 | default: | 711 | default: |
519 | /* v07 style fixed size indexed meta data */ | 712 | /* v07 style fixed size indexed meta data */ |
520 | bdev->md.md_size_sect = MD_RESERVED_SECT; | 713 | bdev->md.md_size_sect = MD_RESERVED_SECT; |
@@ -533,7 +726,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
533 | case DRBD_MD_INDEX_FLEX_INT: | 726 | case DRBD_MD_INDEX_FLEX_INT: |
534 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | 727 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); |
535 | /* al size is still fixed */ | 728 | /* al size is still fixed */ |
536 | bdev->md.al_offset = -MD_AL_MAX_SIZE; | 729 | bdev->md.al_offset = -MD_AL_SECTORS; |
537 | /* we need (slightly less than) ~ this much bitmap sectors: */ | 730 | /* we need (slightly less than) ~ this much bitmap sectors: */ |
538 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | 731 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); |
539 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | 732 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); |
@@ -549,6 +742,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
549 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | 742 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; |
550 | break; | 743 | break; |
551 | } | 744 | } |
745 | rcu_read_unlock(); | ||
552 | } | 746 | } |
553 | 747 | ||
554 | /* input size is expected to be in KB */ | 748 | /* input size is expected to be in KB */ |
@@ -581,17 +775,23 @@ char *ppsize(char *buf, unsigned long long size) | |||
581 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: | 775 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: |
582 | * peer may not initiate a resize. | 776 | * peer may not initiate a resize. |
583 | */ | 777 | */ |
778 | /* Note these are not to be confused with | ||
779 | * drbd_adm_suspend_io/drbd_adm_resume_io, | ||
780 | * which are (sub) state changes triggered by admin (drbdsetup), | ||
781 | * and can be long lived. | ||
782 | * This changes an mdev->flag, is triggered by drbd internals, | ||
783 | * and should be short-lived. */ | ||
584 | void drbd_suspend_io(struct drbd_conf *mdev) | 784 | void drbd_suspend_io(struct drbd_conf *mdev) |
585 | { | 785 | { |
586 | drbd_set_flag(mdev, SUSPEND_IO); | 786 | set_bit(SUSPEND_IO, &mdev->flags); |
587 | if (is_susp(mdev->state)) | 787 | if (drbd_suspended(mdev)) |
588 | return; | 788 | return; |
589 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | 789 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); |
590 | } | 790 | } |
591 | 791 | ||
592 | void drbd_resume_io(struct drbd_conf *mdev) | 792 | void drbd_resume_io(struct drbd_conf *mdev) |
593 | { | 793 | { |
594 | drbd_clear_flag(mdev, SUSPEND_IO); | 794 | clear_bit(SUSPEND_IO, &mdev->flags); |
595 | wake_up(&mdev->misc_wait); | 795 | wake_up(&mdev->misc_wait); |
596 | } | 796 | } |
597 | 797 | ||
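The pair above is used to bracket operations that must not race with new application I/O; drbd_adm_disk_opts() below does exactly this around resizing the activity log. Usage sketch:

    drbd_suspend_io(mdev);
    /* ... change the activity log, resize, etc., while no new
     *     application bios are admitted ... */
    drbd_resume_io(mdev);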
@@ -605,7 +805,7 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
605 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 805 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) |
606 | { | 806 | { |
607 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 807 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
608 | sector_t la_size; | 808 | sector_t la_size, u_size; |
609 | sector_t size; | 809 | sector_t size; |
610 | char ppb[10]; | 810 | char ppb[10]; |
611 | 811 | ||
@@ -633,7 +833,10 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
633 | /* TODO: should only be some assert here, not (re)init... */ | 833 | /* TODO: should only be some assert here, not (re)init... */ |
634 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 834 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
635 | 835 | ||
636 | size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); | 836 | rcu_read_lock(); |
837 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
838 | rcu_read_unlock(); | ||
839 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); | ||
637 | 840 | ||
638 | if (drbd_get_capacity(mdev->this_bdev) != size || | 841 | if (drbd_get_capacity(mdev->this_bdev) != size || |
639 | drbd_bm_capacity(mdev) != size) { | 842 | drbd_bm_capacity(mdev) != size) { |
@@ -696,12 +899,12 @@ out: | |||
696 | } | 899 | } |
697 | 900 | ||
698 | sector_t | 901 | sector_t |
699 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) | 902 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
903 | sector_t u_size, int assume_peer_has_space) | ||
700 | { | 904 | { |
701 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | 905 | sector_t p_size = mdev->p_size; /* partner's disk size. */ |
702 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | 906 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ |
703 | sector_t m_size; /* my size */ | 907 | sector_t m_size; /* my size */ |
704 | sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ | ||
705 | sector_t size = 0; | 908 | sector_t size = 0; |
706 | 909 | ||
707 | m_size = drbd_get_max_capacity(bdev); | 910 | m_size = drbd_get_max_capacity(bdev); |
@@ -750,24 +953,21 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int ass | |||
750 | * failed, and 0 on success. You should call drbd_md_sync() after you called | 953 | * failed, and 0 on success. You should call drbd_md_sync() after you called |
751 | * this function. | 954 | * this function. |
752 | */ | 955 | */ |
753 | static int drbd_check_al_size(struct drbd_conf *mdev) | 956 | static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc) |
754 | { | 957 | { |
755 | struct lru_cache *n, *t; | 958 | struct lru_cache *n, *t; |
756 | struct lc_element *e; | 959 | struct lc_element *e; |
757 | unsigned int in_use; | 960 | unsigned int in_use; |
758 | int i; | 961 | int i; |
759 | 962 | ||
760 | ERR_IF(mdev->sync_conf.al_extents < 7) | ||
761 | mdev->sync_conf.al_extents = 127; | ||
762 | |||
763 | if (mdev->act_log && | 963 | if (mdev->act_log && |
764 | mdev->act_log->nr_elements == mdev->sync_conf.al_extents) | 964 | mdev->act_log->nr_elements == dc->al_extents) |
765 | return 0; | 965 | return 0; |
766 | 966 | ||
767 | in_use = 0; | 967 | in_use = 0; |
768 | t = mdev->act_log; | 968 | t = mdev->act_log; |
769 | n = lc_create("act_log", drbd_al_ext_cache, | 969 | n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, |
770 | mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); | 970 | dc->al_extents, sizeof(struct lc_element), 0); |
771 | 971 | ||
772 | if (n == NULL) { | 972 | if (n == NULL) { |
773 | dev_err(DEV, "Cannot allocate act_log lru!\n"); | 973 | dev_err(DEV, "Cannot allocate act_log lru!\n"); |
@@ -808,7 +1008,9 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ | |||
808 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | 1008 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; |
809 | 1009 | ||
810 | max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); | 1010 | max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); |
811 | max_segments = mdev->ldev->dc.max_bio_bvecs; | 1011 | rcu_read_lock(); |
1012 | max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs; | ||
1013 | rcu_read_unlock(); | ||
812 | put_ldev(mdev); | 1014 | put_ldev(mdev); |
813 | } | 1015 | } |
814 | 1016 | ||
@@ -852,12 +1054,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
852 | Because new from 8.3.8 onwards the peer can use multiple | 1054 | Because new from 8.3.8 onwards the peer can use multiple |
853 | BIOs for a single peer_request */ | 1055 | BIOs for a single peer_request */ |
854 | if (mdev->state.conn >= C_CONNECTED) { | 1056 | if (mdev->state.conn >= C_CONNECTED) { |
855 | if (mdev->agreed_pro_version < 94) { | 1057 | if (mdev->tconn->agreed_pro_version < 94) |
856 | peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 1058 | peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
857 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ | 1059 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ |
858 | } else if (mdev->agreed_pro_version == 94) | 1060 | else if (mdev->tconn->agreed_pro_version == 94) |
859 | peer = DRBD_MAX_SIZE_H80_PACKET; | 1061 | peer = DRBD_MAX_SIZE_H80_PACKET; |
860 | else /* drbd 8.3.8 onwards */ | 1062 | else if (mdev->tconn->agreed_pro_version < 100) |
1063 | peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ | ||
1064 | else | ||
861 | peer = DRBD_MAX_BIO_SIZE; | 1065 | peer = DRBD_MAX_BIO_SIZE; |
862 | } | 1066 | } |
863 | 1067 | ||
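The version checks above map the negotiated protocol version to the largest BIO size the peer can be assumed to handle. Expressed as a stand-alone helper — a sketch only, using the constants from the hunk above; the helper itself does not exist in the patch:

    static unsigned int peer_max_bio_size(int agreed_pro_version,
                                          unsigned int peer_reported)
    {
            if (agreed_pro_version < 94)   /* old peers: trust their report, capped */
                    return min_t(unsigned int, peer_reported,
                                 DRBD_MAX_SIZE_H80_PACKET);
            if (agreed_pro_version == 94)
                    return DRBD_MAX_SIZE_H80_PACKET;
            if (agreed_pro_version < 100)  /* drbd 8.3.8 .. 8.3.x */
                    return DRBD_MAX_BIO_SIZE_P95;
            return DRBD_MAX_BIO_SIZE;      /* drbd 8.4.0 and later */
    }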
@@ -872,36 +1076,27 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
872 | drbd_setup_queue_param(mdev, new); | 1076 | drbd_setup_queue_param(mdev, new); |
873 | } | 1077 | } |
874 | 1078 | ||
875 | /* serialize deconfig (worker exiting, doing cleanup) | 1079 | /* Starts the worker thread */ |
876 | * and reconfig (drbdsetup disk, drbdsetup net) | 1080 | static void conn_reconfig_start(struct drbd_tconn *tconn) |
877 | * | ||
878 | * Wait for a potentially exiting worker, then restart it, | ||
879 | * or start a new one. Flush any pending work, there may still be an | ||
880 | * after_state_change queued. | ||
881 | */ | ||
882 | static void drbd_reconfig_start(struct drbd_conf *mdev) | ||
883 | { | 1081 | { |
884 | wait_event(mdev->state_wait, !drbd_test_and_set_flag(mdev, CONFIG_PENDING)); | 1082 | drbd_thread_start(&tconn->worker); |
885 | wait_event(mdev->state_wait, !drbd_test_flag(mdev, DEVICE_DYING)); | 1083 | conn_flush_workqueue(tconn); |
886 | drbd_thread_start(&mdev->worker); | ||
887 | drbd_flush_workqueue(mdev); | ||
888 | } | 1084 | } |
889 | 1085 | ||
890 | /* if still unconfigured, stops worker again. | 1086 | /* if still unconfigured, stops worker again. */ |
891 | * if configured now, clears CONFIG_PENDING. | 1087 | static void conn_reconfig_done(struct drbd_tconn *tconn) |
892 | * wakes potential waiters */ | ||
893 | static void drbd_reconfig_done(struct drbd_conf *mdev) | ||
894 | { | 1088 | { |
895 | spin_lock_irq(&mdev->req_lock); | 1089 | bool stop_threads; |
896 | if (mdev->state.disk == D_DISKLESS && | 1090 | spin_lock_irq(&tconn->req_lock); |
897 | mdev->state.conn == C_STANDALONE && | 1091 | stop_threads = conn_all_vols_unconf(tconn) && |
898 | mdev->state.role == R_SECONDARY) { | 1092 | tconn->cstate == C_STANDALONE; |
899 | drbd_set_flag(mdev, DEVICE_DYING); | 1093 | spin_unlock_irq(&tconn->req_lock); |
900 | drbd_thread_stop_nowait(&mdev->worker); | 1094 | if (stop_threads) { |
901 | } else | 1095 | /* asender is implicitly stopped by receiver |
902 | drbd_clear_flag(mdev, CONFIG_PENDING); | 1096 | * in conn_disconnect() */ |
903 | spin_unlock_irq(&mdev->req_lock); | 1097 | drbd_thread_stop(&tconn->receiver); |
904 | wake_up(&mdev->state_wait); | 1098 | drbd_thread_stop(&tconn->worker); |
1099 | } | ||
905 | } | 1100 | } |
906 | 1101 | ||
907 | /* Make sure IO is suspended before calling this function(). */ | 1102 | /* Make sure IO is suspended before calling this function(). */ |
@@ -909,42 +1104,182 @@ static void drbd_suspend_al(struct drbd_conf *mdev) | |||
909 | { | 1104 | { |
910 | int s = 0; | 1105 | int s = 0; |
911 | 1106 | ||
912 | if (lc_try_lock(mdev->act_log)) { | 1107 | if (!lc_try_lock(mdev->act_log)) { |
913 | drbd_al_shrink(mdev); | ||
914 | lc_unlock(mdev->act_log); | ||
915 | } else { | ||
916 | dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); | 1108 | dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); |
917 | return; | 1109 | return; |
918 | } | 1110 | } |
919 | 1111 | ||
920 | spin_lock_irq(&mdev->req_lock); | 1112 | drbd_al_shrink(mdev); |
1113 | spin_lock_irq(&mdev->tconn->req_lock); | ||
921 | if (mdev->state.conn < C_CONNECTED) | 1114 | if (mdev->state.conn < C_CONNECTED) |
922 | s = !drbd_test_and_set_flag(mdev, AL_SUSPENDED); | 1115 | s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); |
923 | 1116 | spin_unlock_irq(&mdev->tconn->req_lock); | |
924 | spin_unlock_irq(&mdev->req_lock); | 1117 | lc_unlock(mdev->act_log); |
925 | 1118 | ||
926 | if (s) | 1119 | if (s) |
927 | dev_info(DEV, "Suspended AL updates\n"); | 1120 | dev_info(DEV, "Suspended AL updates\n"); |
928 | } | 1121 | } |
929 | 1122 | ||
930 | /* does always return 0; | 1123 | |
931 | * interesting return code is in reply->ret_code */ | 1124 | static bool should_set_defaults(struct genl_info *info) |
932 | static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 1125 | { |
933 | struct drbd_nl_cfg_reply *reply) | 1126 | unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; |
1127 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); | ||
1128 | } | ||
1129 | |||
1130 | static void enforce_disk_conf_limits(struct disk_conf *dc) | ||
1131 | { | ||
1132 | if (dc->al_extents < DRBD_AL_EXTENTS_MIN) | ||
1133 | dc->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1134 | if (dc->al_extents > DRBD_AL_EXTENTS_MAX) | ||
1135 | dc->al_extents = DRBD_AL_EXTENTS_MAX; | ||
1136 | |||
1137 | if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | ||
1138 | dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1139 | } | ||
1140 | |||
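enforce_disk_conf_limits() clamps the user-supplied values into the range the activity-log code can handle. The same clamping could be written with the kernel's clamp_t()/min_t() helpers — a sketch only, assuming the fields are plain unsigned integers:

    dc->al_extents   = clamp_t(unsigned int, dc->al_extents,
                               DRBD_AL_EXTENTS_MIN, DRBD_AL_EXTENTS_MAX);
    dc->c_plan_ahead = min_t(unsigned int, dc->c_plan_ahead,
                             DRBD_C_PLAN_AHEAD_MAX);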
1141 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | ||
934 | { | 1142 | { |
935 | enum drbd_ret_code retcode; | 1143 | enum drbd_ret_code retcode; |
1144 | struct drbd_conf *mdev; | ||
1145 | struct disk_conf *new_disk_conf, *old_disk_conf; | ||
1146 | struct fifo_buffer *old_plan = NULL, *new_plan = NULL; | ||
1147 | int err, fifo_size; | ||
1148 | |||
1149 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
1150 | if (!adm_ctx.reply_skb) | ||
1151 | return retcode; | ||
1152 | if (retcode != NO_ERROR) | ||
1153 | goto out; | ||
1154 | |||
1155 | mdev = adm_ctx.mdev; | ||
1156 | |||
1157 | /* we also need a disk | ||
1158 | * to change the options on */ | ||
1159 | if (!get_ldev(mdev)) { | ||
1160 | retcode = ERR_NO_DISK; | ||
1161 | goto out; | ||
1162 | } | ||
1163 | |||
1164 | new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
1165 | if (!new_disk_conf) { | ||
1166 | retcode = ERR_NOMEM; | ||
1167 | goto fail; | ||
1168 | } | ||
1169 | |||
1170 | mutex_lock(&mdev->tconn->conf_update); | ||
1171 | old_disk_conf = mdev->ldev->disk_conf; | ||
1172 | *new_disk_conf = *old_disk_conf; | ||
1173 | if (should_set_defaults(info)) | ||
1174 | set_disk_conf_defaults(new_disk_conf); | ||
1175 | |||
1176 | err = disk_conf_from_attrs_for_change(new_disk_conf, info); | ||
1177 | if (err && err != -ENOMSG) { | ||
1178 | retcode = ERR_MANDATORY_TAG; | ||
1179 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1180 | } | ||
1181 | |||
1182 | if (!expect(new_disk_conf->resync_rate >= 1)) | ||
1183 | new_disk_conf->resync_rate = 1; | ||
1184 | |||
1185 | enforce_disk_conf_limits(new_disk_conf); | ||
1186 | |||
1187 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | ||
1188 | if (fifo_size != mdev->rs_plan_s->size) { | ||
1189 | new_plan = fifo_alloc(fifo_size); | ||
1190 | if (!new_plan) { | ||
1191 | dev_err(DEV, "kmalloc of fifo_buffer failed\n"); | ||
1192 | retcode = ERR_NOMEM; | ||
1193 | goto fail_unlock; | ||
1194 | } | ||
1195 | } | ||
1196 | |||
1197 | drbd_suspend_io(mdev); | ||
1198 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
1199 | drbd_al_shrink(mdev); | ||
1200 | err = drbd_check_al_size(mdev, new_disk_conf); | ||
1201 | lc_unlock(mdev->act_log); | ||
1202 | wake_up(&mdev->al_wait); | ||
1203 | drbd_resume_io(mdev); | ||
1204 | |||
1205 | if (err) { | ||
1206 | retcode = ERR_NOMEM; | ||
1207 | goto fail_unlock; | ||
1208 | } | ||
1209 | |||
1210 | write_lock_irq(&global_state_lock); | ||
1211 | retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); | ||
1212 | if (retcode == NO_ERROR) { | ||
1213 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
1214 | drbd_resync_after_changed(mdev); | ||
1215 | } | ||
1216 | write_unlock_irq(&global_state_lock); | ||
1217 | |||
1218 | if (retcode != NO_ERROR) | ||
1219 | goto fail_unlock; | ||
1220 | |||
1221 | if (new_plan) { | ||
1222 | old_plan = mdev->rs_plan_s; | ||
1223 | rcu_assign_pointer(mdev->rs_plan_s, new_plan); | ||
1224 | } | ||
1225 | |||
1226 | mutex_unlock(&mdev->tconn->conf_update); | ||
1227 | |||
1228 | if (new_disk_conf->al_updates) | ||
1229 | mdev->ldev->md.flags &= ~MDF_AL_DISABLED; | ||
1230 | else | ||
1231 | mdev->ldev->md.flags |= MDF_AL_DISABLED; | ||
1232 | |||
1233 | drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); | ||
1234 | |||
1235 | drbd_md_sync(mdev); | ||
1236 | |||
1237 | if (mdev->state.conn >= C_CONNECTED) | ||
1238 | drbd_send_sync_param(mdev); | ||
1239 | |||
1240 | synchronize_rcu(); | ||
1241 | kfree(old_disk_conf); | ||
1242 | kfree(old_plan); | ||
1243 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1244 | goto success; | ||
1245 | |||
1246 | fail_unlock: | ||
1247 | mutex_unlock(&mdev->tconn->conf_update); | ||
1248 | fail: | ||
1249 | kfree(new_disk_conf); | ||
1250 | kfree(new_plan); | ||
1251 | success: | ||
1252 | put_ldev(mdev); | ||
1253 | out: | ||
1254 | drbd_adm_finish(info, retcode); | ||
1255 | return 0; | ||
1256 | } | ||
1257 | |||
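drbd_adm_disk_opts() above follows the classic RCU publish pattern for both disk_conf and rs_plan_s: allocate a copy, modify it under the conf_update mutex, publish it with rcu_assign_pointer(), wait one grace period with synchronize_rcu(), and only then free the old object; readers only ever take rcu_read_lock() and rcu_dereference(). Reduced to its essentials (struct cfg, obj, update_lock and some_value are hypothetical names):

    struct cfg *new, *old;

    new = kmalloc(sizeof(*new), GFP_KERNEL);
    if (!new)
            return -ENOMEM;                    /* in a real function */

    mutex_lock(&obj->update_lock);             /* serialize updaters */
    old = obj->cfg;
    *new = *old;                               /* start from the current settings */
    new->some_value = 42;                      /* modify the private copy */
    rcu_assign_pointer(obj->cfg, new);         /* publish atomically */
    mutex_unlock(&obj->update_lock);

    synchronize_rcu();                         /* wait out pre-existing readers */
    kfree(old);

    /* reader side */
    rcu_read_lock();
    v = rcu_dereference(obj->cfg)->some_value;
    rcu_read_unlock();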
1258 | int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | ||
1259 | { | ||
1260 | struct drbd_conf *mdev; | ||
1261 | int err; | ||
1262 | enum drbd_ret_code retcode; | ||
936 | enum determine_dev_size dd; | 1263 | enum determine_dev_size dd; |
937 | sector_t max_possible_sectors; | 1264 | sector_t max_possible_sectors; |
938 | sector_t min_md_device_sectors; | 1265 | sector_t min_md_device_sectors; |
939 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ | 1266 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ |
1267 | struct disk_conf *new_disk_conf = NULL; | ||
940 | struct block_device *bdev; | 1268 | struct block_device *bdev; |
941 | struct lru_cache *resync_lru = NULL; | 1269 | struct lru_cache *resync_lru = NULL; |
1270 | struct fifo_buffer *new_plan = NULL; | ||
942 | union drbd_state ns, os; | 1271 | union drbd_state ns, os; |
943 | enum drbd_state_rv rv; | 1272 | enum drbd_state_rv rv; |
944 | int cp_discovered = 0; | 1273 | struct net_conf *nc; |
945 | int logical_block_size; | ||
946 | 1274 | ||
947 | drbd_reconfig_start(mdev); | 1275 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1276 | if (!adm_ctx.reply_skb) | ||
1277 | return retcode; | ||
1278 | if (retcode != NO_ERROR) | ||
1279 | goto finish; | ||
1280 | |||
1281 | mdev = adm_ctx.mdev; | ||
1282 | conn_reconfig_start(mdev->tconn); | ||
948 | 1283 | ||
949 | /* if you want to reconfigure, please tear down first */ | 1284 | /* if you want to reconfigure, please tear down first */ |
950 | if (mdev->state.disk > D_DISKLESS) { | 1285 | if (mdev->state.disk > D_DISKLESS) { |
@@ -958,52 +1293,66 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
958 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | 1293 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); |
959 | 1294 | ||
960 | /* make sure there is no leftover from previous force-detach attempts */ | 1295 | /* make sure there is no leftover from previous force-detach attempts */ |
961 | drbd_clear_flag(mdev, FORCE_DETACH); | 1296 | clear_bit(FORCE_DETACH, &mdev->flags); |
962 | drbd_clear_flag(mdev, WAS_IO_ERROR); | 1297 | clear_bit(WAS_IO_ERROR, &mdev->flags); |
963 | drbd_clear_flag(mdev, WAS_READ_ERROR); | 1298 | clear_bit(WAS_READ_ERROR, &mdev->flags); |
964 | 1299 | ||
965 | /* and no leftover from previously aborted resync or verify, either */ | 1300 | /* and no leftover from previously aborted resync or verify, either */ |
966 | mdev->rs_total = 0; | 1301 | mdev->rs_total = 0; |
967 | mdev->rs_failed = 0; | 1302 | mdev->rs_failed = 0; |
968 | atomic_set(&mdev->rs_pending_cnt, 0); | 1303 | atomic_set(&mdev->rs_pending_cnt, 0); |
969 | 1304 | ||
970 | /* allocation not in the IO path, cqueue thread context */ | 1305 | /* allocation not in the IO path, drbdsetup context */ |
971 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | 1306 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); |
972 | if (!nbc) { | 1307 | if (!nbc) { |
973 | retcode = ERR_NOMEM; | 1308 | retcode = ERR_NOMEM; |
974 | goto fail; | 1309 | goto fail; |
975 | } | 1310 | } |
976 | |||
977 | nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; | ||
978 | nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; | ||
979 | nbc->dc.fencing = DRBD_FENCING_DEF; | ||
980 | nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; | ||
981 | |||
982 | spin_lock_init(&nbc->md.uuid_lock); | 1311 | spin_lock_init(&nbc->md.uuid_lock); |
983 | 1312 | ||
984 | if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { | 1313 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); |
1314 | if (!new_disk_conf) { | ||
1315 | retcode = ERR_NOMEM; | ||
1316 | goto fail; | ||
1317 | } | ||
1318 | nbc->disk_conf = new_disk_conf; | ||
1319 | |||
1320 | set_disk_conf_defaults(new_disk_conf); | ||
1321 | err = disk_conf_from_attrs(new_disk_conf, info); | ||
1322 | if (err) { | ||
985 | retcode = ERR_MANDATORY_TAG; | 1323 | retcode = ERR_MANDATORY_TAG; |
1324 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
986 | goto fail; | 1325 | goto fail; |
987 | } | 1326 | } |
988 | 1327 | ||
989 | if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | 1328 | enforce_disk_conf_limits(new_disk_conf); |
1329 | |||
1330 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); | ||
1331 | if (!new_plan) { | ||
1332 | retcode = ERR_NOMEM; | ||
1333 | goto fail; | ||
1334 | } | ||
1335 | |||
1336 | if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | ||
990 | retcode = ERR_MD_IDX_INVALID; | 1337 | retcode = ERR_MD_IDX_INVALID; |
991 | goto fail; | 1338 | goto fail; |
992 | } | 1339 | } |
993 | 1340 | ||
994 | if (get_net_conf(mdev)) { | 1341 | rcu_read_lock(); |
995 | int prot = mdev->net_conf->wire_protocol; | 1342 | nc = rcu_dereference(mdev->tconn->net_conf); |
996 | put_net_conf(mdev); | 1343 | if (nc) { |
997 | if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) { | 1344 | if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { |
1345 | rcu_read_unlock(); | ||
998 | retcode = ERR_STONITH_AND_PROT_A; | 1346 | retcode = ERR_STONITH_AND_PROT_A; |
999 | goto fail; | 1347 | goto fail; |
1000 | } | 1348 | } |
1001 | } | 1349 | } |
1350 | rcu_read_unlock(); | ||
1002 | 1351 | ||
1003 | bdev = blkdev_get_by_path(nbc->dc.backing_dev, | 1352 | bdev = blkdev_get_by_path(new_disk_conf->backing_dev, |
1004 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); | 1353 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); |
1005 | if (IS_ERR(bdev)) { | 1354 | if (IS_ERR(bdev)) { |
1006 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, | 1355 | dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, |
1007 | PTR_ERR(bdev)); | 1356 | PTR_ERR(bdev)); |
1008 | retcode = ERR_OPEN_DISK; | 1357 | retcode = ERR_OPEN_DISK; |
1009 | goto fail; | 1358 | goto fail; |
@@ -1018,12 +1367,12 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1018 | * should check it for you already; but if you don't, or | 1367 | * should check it for you already; but if you don't, or |
1019 | * someone fooled it, we need to double check here) | 1368 | * someone fooled it, we need to double check here) |
1020 | */ | 1369 | */ |
1021 | bdev = blkdev_get_by_path(nbc->dc.meta_dev, | 1370 | bdev = blkdev_get_by_path(new_disk_conf->meta_dev, |
1022 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, | 1371 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
1023 | (nbc->dc.meta_dev_idx < 0) ? | 1372 | (new_disk_conf->meta_dev_idx < 0) ? |
1024 | (void *)mdev : (void *)drbd_m_holder); | 1373 | (void *)mdev : (void *)drbd_m_holder); |
1025 | if (IS_ERR(bdev)) { | 1374 | if (IS_ERR(bdev)) { |
1026 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, | 1375 | dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, |
1027 | PTR_ERR(bdev)); | 1376 | PTR_ERR(bdev)); |
1028 | retcode = ERR_OPEN_MD_DISK; | 1377 | retcode = ERR_OPEN_MD_DISK; |
1029 | goto fail; | 1378 | goto fail; |
@@ -1031,14 +1380,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1031 | nbc->md_bdev = bdev; | 1380 | nbc->md_bdev = bdev; |
1032 | 1381 | ||
1033 | if ((nbc->backing_bdev == nbc->md_bdev) != | 1382 | if ((nbc->backing_bdev == nbc->md_bdev) != |
1034 | (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | 1383 | (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || |
1035 | nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { | 1384 | new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { |
1036 | retcode = ERR_MD_IDX_INVALID; | 1385 | retcode = ERR_MD_IDX_INVALID; |
1037 | goto fail; | 1386 | goto fail; |
1038 | } | 1387 | } |
1039 | 1388 | ||
1040 | resync_lru = lc_create("resync", drbd_bm_ext_cache, | 1389 | resync_lru = lc_create("resync", drbd_bm_ext_cache, |
1041 | 61, sizeof(struct bm_extent), | 1390 | 1, 61, sizeof(struct bm_extent), |
1042 | offsetof(struct bm_extent, lce)); | 1391 | offsetof(struct bm_extent, lce)); |
1043 | if (!resync_lru) { | 1392 | if (!resync_lru) { |
1044 | retcode = ERR_NOMEM; | 1393 | retcode = ERR_NOMEM; |
@@ -1048,21 +1397,21 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1048 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | 1397 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ |
1049 | drbd_md_set_sector_offsets(mdev, nbc); | 1398 | drbd_md_set_sector_offsets(mdev, nbc); |
1050 | 1399 | ||
1051 | if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { | 1400 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { |
1052 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | 1401 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", |
1053 | (unsigned long long) drbd_get_max_capacity(nbc), | 1402 | (unsigned long long) drbd_get_max_capacity(nbc), |
1054 | (unsigned long long) nbc->dc.disk_size); | 1403 | (unsigned long long) new_disk_conf->disk_size); |
1055 | retcode = ERR_DISK_TOO_SMALL; | 1404 | retcode = ERR_DISK_TOO_SMALL; |
1056 | goto fail; | 1405 | goto fail; |
1057 | } | 1406 | } |
1058 | 1407 | ||
1059 | if (nbc->dc.meta_dev_idx < 0) { | 1408 | if (new_disk_conf->meta_dev_idx < 0) { |
1060 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; | 1409 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; |
1061 | /* at least one MB, otherwise it does not make sense */ | 1410 | /* at least one MB, otherwise it does not make sense */ |
1062 | min_md_device_sectors = (2<<10); | 1411 | min_md_device_sectors = (2<<10); |
1063 | } else { | 1412 | } else { |
1064 | max_possible_sectors = DRBD_MAX_SECTORS; | 1413 | max_possible_sectors = DRBD_MAX_SECTORS; |
1065 | min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); | 1414 | min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); |
1066 | } | 1415 | } |
1067 | 1416 | ||
1068 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | 1417 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { |
@@ -1087,14 +1436,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1087 | dev_warn(DEV, "==> truncating very big lower level device " | 1436 | dev_warn(DEV, "==> truncating very big lower level device " |
1088 | "to currently maximum possible %llu sectors <==\n", | 1437 | "to currently maximum possible %llu sectors <==\n", |
1089 | (unsigned long long) max_possible_sectors); | 1438 | (unsigned long long) max_possible_sectors); |
1090 | if (nbc->dc.meta_dev_idx >= 0) | 1439 | if (new_disk_conf->meta_dev_idx >= 0) |
1091 | dev_warn(DEV, "==>> using internal or flexible " | 1440 | dev_warn(DEV, "==>> using internal or flexible " |
1092 | "meta data may help <<==\n"); | 1441 | "meta data may help <<==\n"); |
1093 | } | 1442 | } |
1094 | 1443 | ||
1095 | drbd_suspend_io(mdev); | 1444 | drbd_suspend_io(mdev); |
1096 | /* also wait for the last barrier ack. */ | 1445 | /* also wait for the last barrier ack. */ |
1097 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state)); | 1446 | /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 |
1447 | * We need a way to either ignore barrier acks for barriers sent before a device | ||
1448 | * was attached, or a way to wait for all pending barrier acks to come in. | ||
1449 | * As barriers are counted per resource, | ||
1450 | * we'd need to suspend io on all devices of a resource. | ||
1451 | */ | ||
1452 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev)); | ||
1098 | /* and for any other previously queued work */ | 1453 | /* and for any other previously queued work */ |
1099 | drbd_flush_workqueue(mdev); | 1454 | drbd_flush_workqueue(mdev); |
1100 | 1455 | ||
@@ -1109,25 +1464,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1109 | 1464 | ||
1110 | drbd_md_set_sector_offsets(mdev, nbc); | 1465 | drbd_md_set_sector_offsets(mdev, nbc); |
1111 | 1466 | ||
1112 | /* allocate a second IO page if logical_block_size != 512 */ | ||
1113 | logical_block_size = bdev_logical_block_size(nbc->md_bdev); | ||
1114 | if (logical_block_size == 0) | ||
1115 | logical_block_size = MD_SECTOR_SIZE; | ||
1116 | |||
1117 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
1118 | if (!mdev->md_io_tmpp) { | ||
1119 | struct page *page = alloc_page(GFP_NOIO); | ||
1120 | if (!page) | ||
1121 | goto force_diskless_dec; | ||
1122 | |||
1123 | dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", | ||
1124 | logical_block_size, MD_SECTOR_SIZE); | ||
1125 | dev_warn(DEV, "Workaround engaged (has performance impact).\n"); | ||
1126 | |||
1127 | mdev->md_io_tmpp = page; | ||
1128 | } | ||
1129 | } | ||
1130 | |||
1131 | if (!mdev->bitmap) { | 1467 | if (!mdev->bitmap) { |
1132 | if (drbd_bm_init(mdev)) { | 1468 | if (drbd_bm_init(mdev)) { |
1133 | retcode = ERR_NOMEM; | 1469 | retcode = ERR_NOMEM; |
@@ -1149,30 +1485,25 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1149 | } | 1485 | } |
1150 | 1486 | ||
1151 | /* Since we are diskless, fix the activity log first... */ | 1487 | /* Since we are diskless, fix the activity log first... */ |
1152 | if (drbd_check_al_size(mdev)) { | 1488 | if (drbd_check_al_size(mdev, new_disk_conf)) { |
1153 | retcode = ERR_NOMEM; | 1489 | retcode = ERR_NOMEM; |
1154 | goto force_diskless_dec; | 1490 | goto force_diskless_dec; |
1155 | } | 1491 | } |
1156 | 1492 | ||
1157 | /* Prevent shrinking of consistent devices ! */ | 1493 | /* Prevent shrinking of consistent devices ! */ |
1158 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && | 1494 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && |
1159 | drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { | 1495 | drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { |
1160 | dev_warn(DEV, "refusing to truncate a consistent device\n"); | 1496 | dev_warn(DEV, "refusing to truncate a consistent device\n"); |
1161 | retcode = ERR_DISK_TOO_SMALL; | 1497 | retcode = ERR_DISK_TOO_SMALL; |
1162 | goto force_diskless_dec; | 1498 | goto force_diskless_dec; |
1163 | } | 1499 | } |
1164 | 1500 | ||
1165 | if (!drbd_al_read_log(mdev, nbc)) { | ||
1166 | retcode = ERR_IO_MD_DISK; | ||
1167 | goto force_diskless_dec; | ||
1168 | } | ||
1169 | |||
1170 | /* Reset the "barriers don't work" bits here, then force meta data to | 1501 | /* Reset the "barriers don't work" bits here, then force meta data to |
1171 | * be written, to ensure we determine if barriers are supported. */ | 1502 | * be written, to ensure we determine if barriers are supported. */ |
1172 | if (nbc->dc.no_md_flush) | 1503 | if (new_disk_conf->md_flushes) |
1173 | drbd_set_flag(mdev, MD_NO_FUA); | 1504 | clear_bit(MD_NO_FUA, &mdev->flags); |
1174 | else | 1505 | else |
1175 | drbd_clear_flag(mdev, MD_NO_FUA); | 1506 | set_bit(MD_NO_FUA, &mdev->flags); |
1176 | 1507 | ||
1177 | /* Point of no return reached. | 1508 | /* Point of no return reached. |
1178 | * Devices and memory are no longer released by error cleanup below. | 1509 | * Devices and memory are no longer released by error cleanup below. |
@@ -1181,22 +1512,22 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1181 | D_ASSERT(mdev->ldev == NULL); | 1512 | D_ASSERT(mdev->ldev == NULL); |
1182 | mdev->ldev = nbc; | 1513 | mdev->ldev = nbc; |
1183 | mdev->resync = resync_lru; | 1514 | mdev->resync = resync_lru; |
1515 | mdev->rs_plan_s = new_plan; | ||
1184 | nbc = NULL; | 1516 | nbc = NULL; |
1185 | resync_lru = NULL; | 1517 | resync_lru = NULL; |
1518 | new_disk_conf = NULL; | ||
1519 | new_plan = NULL; | ||
1186 | 1520 | ||
1187 | mdev->write_ordering = WO_bdev_flush; | 1521 | drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); |
1188 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | ||
1189 | 1522 | ||
1190 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) | 1523 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) |
1191 | drbd_set_flag(mdev, CRASHED_PRIMARY); | 1524 | set_bit(CRASHED_PRIMARY, &mdev->flags); |
1192 | else | 1525 | else |
1193 | drbd_clear_flag(mdev, CRASHED_PRIMARY); | 1526 | clear_bit(CRASHED_PRIMARY, &mdev->flags); |
1194 | 1527 | ||
1195 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | 1528 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && |
1196 | !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) { | 1529 | !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod)) |
1197 | drbd_set_flag(mdev, CRASHED_PRIMARY); | 1530 | set_bit(CRASHED_PRIMARY, &mdev->flags); |
1198 | cp_discovered = 1; | ||
1199 | } | ||
1200 | 1531 | ||
1201 | mdev->send_cnt = 0; | 1532 | mdev->send_cnt = 0; |
1202 | mdev->recv_cnt = 0; | 1533 | mdev->recv_cnt = 0; |
@@ -1219,20 +1550,22 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1219 | * so we can automatically recover from a crash of a | 1550 | * so we can automatically recover from a crash of a |
1220 | * degraded but active "cluster" after a certain timeout. | 1551 | * degraded but active "cluster" after a certain timeout. |
1221 | */ | 1552 | */ |
1222 | drbd_clear_flag(mdev, USE_DEGR_WFC_T); | 1553 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); |
1223 | if (mdev->state.role != R_PRIMARY && | 1554 | if (mdev->state.role != R_PRIMARY && |
1224 | drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | 1555 | drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && |
1225 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) | 1556 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) |
1226 | drbd_set_flag(mdev, USE_DEGR_WFC_T); | 1557 | set_bit(USE_DEGR_WFC_T, &mdev->flags); |
1227 | 1558 | ||
1228 | dd = drbd_determine_dev_size(mdev, 0); | 1559 | dd = drbd_determine_dev_size(mdev, 0); |
1229 | if (dd == dev_size_error) { | 1560 | if (dd == dev_size_error) { |
1230 | retcode = ERR_NOMEM_BITMAP; | 1561 | retcode = ERR_NOMEM_BITMAP; |
1231 | goto force_diskless_dec; | 1562 | goto force_diskless_dec; |
1232 | } else if (dd == grew) | 1563 | } else if (dd == grew) |
1233 | drbd_set_flag(mdev, RESYNC_AFTER_NEG); | 1564 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
1234 | 1565 | ||
1235 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | 1566 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || |
1567 | (test_bit(CRASHED_PRIMARY, &mdev->flags) && | ||
1568 | drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) { | ||
1236 | dev_info(DEV, "Assuming that all blocks are out of sync " | 1569 | dev_info(DEV, "Assuming that all blocks are out of sync " |
1237 | "(aka FullSync)\n"); | 1570 | "(aka FullSync)\n"); |
1238 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, | 1571 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, |
@@ -1242,16 +1575,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1242 | } | 1575 | } |
1243 | } else { | 1576 | } else { |
1244 | if (drbd_bitmap_io(mdev, &drbd_bm_read, | 1577 | if (drbd_bitmap_io(mdev, &drbd_bm_read, |
1245 | "read from attaching", BM_LOCKED_MASK) < 0) { | 1578 | "read from attaching", BM_LOCKED_MASK)) { |
1246 | retcode = ERR_IO_MD_DISK; | ||
1247 | goto force_diskless_dec; | ||
1248 | } | ||
1249 | } | ||
1250 | |||
1251 | if (cp_discovered) { | ||
1252 | drbd_al_apply_to_bm(mdev); | ||
1253 | if (drbd_bitmap_io(mdev, &drbd_bm_write, | ||
1254 | "crashed primary apply AL", BM_LOCKED_MASK)) { | ||
1255 | retcode = ERR_IO_MD_DISK; | 1579 | retcode = ERR_IO_MD_DISK; |
1256 | goto force_diskless_dec; | 1580 | goto force_diskless_dec; |
1257 | } | 1581 | } |
@@ -1260,9 +1584,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1260 | if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) | 1584 | if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) |
1261 | drbd_suspend_al(mdev); /* IO is still suspended here... */ | 1585 | drbd_suspend_al(mdev); /* IO is still suspended here... */ |
1262 | 1586 | ||
1263 | spin_lock_irq(&mdev->req_lock); | 1587 | spin_lock_irq(&mdev->tconn->req_lock); |
1264 | os = mdev->state; | 1588 | os = drbd_read_state(mdev); |
1265 | ns.i = os.i; | 1589 | ns = os; |
1266 | /* If MDF_CONSISTENT is not set go into inconsistent state, | 1590 | /* If MDF_CONSISTENT is not set go into inconsistent state, |
1267 | otherwise investigate MDF_WasUpToDate... | 1591 | otherwise investigate MDF_WasUpToDate... |
1268 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, | 1592 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, |
@@ -1280,8 +1604,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1280 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) | 1604 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) |
1281 | ns.pdsk = D_OUTDATED; | 1605 | ns.pdsk = D_OUTDATED; |
1282 | 1606 | ||
1283 | if ( ns.disk == D_CONSISTENT && | 1607 | rcu_read_lock(); |
1284 | (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) | 1608 | if (ns.disk == D_CONSISTENT && |
1609 | (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE)) | ||
1285 | ns.disk = D_UP_TO_DATE; | 1610 | ns.disk = D_UP_TO_DATE; |
1286 | 1611 | ||
1287 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, | 1612 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, |
@@ -1289,6 +1614,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1289 | this point, because drbd_request_state() modifies these | 1614 | this point, because drbd_request_state() modifies these |
1290 | flags. */ | 1615 | flags. */ |
1291 | 1616 | ||
1617 | if (rcu_dereference(mdev->ldev->disk_conf)->al_updates) | ||
1618 | mdev->ldev->md.flags &= ~MDF_AL_DISABLED; | ||
1619 | else | ||
1620 | mdev->ldev->md.flags |= MDF_AL_DISABLED; | ||
1621 | |||
1622 | rcu_read_unlock(); | ||
1623 | |||
1292 | /* In case we are C_CONNECTED postpone any decision on the new disk | 1624 | /* In case we are C_CONNECTED postpone any decision on the new disk |
1293 | state after the negotiation phase. */ | 1625 | state after the negotiation phase. */ |
1294 | if (mdev->state.conn == C_CONNECTED) { | 1626 | if (mdev->state.conn == C_CONNECTED) { |
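The fencing and al_updates checks above read mdev->ldev->disk_conf through RCU: the pointer is only dereferenced between rcu_read_lock() and rcu_read_unlock(), and the snapshot must not be used after the unlock. A reduced sketch of that read side, with a hypothetical config struct standing in for disk_conf:

```c
#include <linux/rcupdate.h>
#include <linux/types.h>

/* Stand-in for the RCU-managed per-disk configuration. */
struct ex_disk_conf {
	int fencing;
	bool al_updates;
};

struct ex_backing_dev {
	struct ex_disk_conf __rcu *disk_conf;
};

static bool ex_al_updates_enabled(struct ex_backing_dev *ldev)
{
	struct ex_disk_conf *conf;
	bool enabled;

	rcu_read_lock();
	conf = rcu_dereference(ldev->disk_conf);  /* snapshot of the current config */
	enabled = conf->al_updates;
	rcu_read_unlock();                        /* conf may be freed after this point */

	return enabled;
}
```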
@@ -1304,12 +1636,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1304 | } | 1636 | } |
1305 | 1637 | ||
1306 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 1638 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
1307 | ns = mdev->state; | 1639 | spin_unlock_irq(&mdev->tconn->req_lock); |
1308 | spin_unlock_irq(&mdev->req_lock); | ||
1309 | 1640 | ||
1310 | if (rv < SS_SUCCESS) | 1641 | if (rv < SS_SUCCESS) |
1311 | goto force_diskless_dec; | 1642 | goto force_diskless_dec; |
1312 | 1643 | ||
1644 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1645 | |||
1313 | if (mdev->state.role == R_PRIMARY) | 1646 | if (mdev->state.role == R_PRIMARY) |
1314 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | 1647 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; |
1315 | else | 1648 | else |
@@ -1320,16 +1653,17 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1320 | 1653 | ||
1321 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 1654 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); |
1322 | put_ldev(mdev); | 1655 | put_ldev(mdev); |
1323 | reply->ret_code = retcode; | 1656 | conn_reconfig_done(mdev->tconn); |
1324 | drbd_reconfig_done(mdev); | 1657 | drbd_adm_finish(info, retcode); |
1325 | return 0; | 1658 | return 0; |
1326 | 1659 | ||
1327 | force_diskless_dec: | 1660 | force_diskless_dec: |
1328 | put_ldev(mdev); | 1661 | put_ldev(mdev); |
1329 | force_diskless: | 1662 | force_diskless: |
1330 | drbd_force_state(mdev, NS(disk, D_FAILED)); | 1663 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
1331 | drbd_md_sync(mdev); | 1664 | drbd_md_sync(mdev); |
1332 | fail: | 1665 | fail: |
1666 | conn_reconfig_done(mdev->tconn); | ||
1333 | if (nbc) { | 1667 | if (nbc) { |
1334 | if (nbc->backing_bdev) | 1668 | if (nbc->backing_bdev) |
1335 | blkdev_put(nbc->backing_bdev, | 1669 | blkdev_put(nbc->backing_bdev, |
@@ -1339,34 +1673,24 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1339 | FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 1673 | FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1340 | kfree(nbc); | 1674 | kfree(nbc); |
1341 | } | 1675 | } |
1676 | kfree(new_disk_conf); | ||
1342 | lc_destroy(resync_lru); | 1677 | lc_destroy(resync_lru); |
1678 | kfree(new_plan); | ||
1343 | 1679 | ||
1344 | reply->ret_code = retcode; | 1680 | finish: |
1345 | drbd_reconfig_done(mdev); | 1681 | drbd_adm_finish(info, retcode); |
1346 | return 0; | 1682 | return 0; |
1347 | } | 1683 | } |
1348 | 1684 | ||
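The attach error path that ends here is the usual kernel cascading-goto cleanup: force_diskless_dec drops the ldev reference, force_diskless resets the disk state, fail releases whatever was allocated, and finish only reports the result. A tiny self-contained sketch of the idiom with placeholder resources and a hypothetical final check:

```c
#include <linux/slab.h>
#include <linux/errno.h>

static int ex_final_check(void)
{
	return 0;                    /* placeholder for a last setup step that may fail */
}

/* Later failures jump to earlier labels so each resource is released exactly once. */
static int ex_attach(void)
{
	void *meta, *lru;
	int err;

	meta = kzalloc(128, GFP_KERNEL);
	if (!meta)
		return -ENOMEM;

	lru = kzalloc(256, GFP_KERNEL);
	if (!lru) {
		err = -ENOMEM;
		goto free_meta;
	}

	err = ex_final_check();
	if (err)
		goto free_lru;

	return 0;                    /* success: ownership passes to the device */

free_lru:
	kfree(lru);
free_meta:
	kfree(meta);
	return err;
}
```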
1349 | /* Detaching the disk is a process in multiple stages. First we need to lock | 1685 | static int adm_detach(struct drbd_conf *mdev, int force) |
1350 | * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. | ||
1351 | * Then we transition to D_DISKLESS, and wait for put_ldev() to return all | ||
1352 | * internal references as well. | ||
1353 | * Only then we have finally detached. */ | ||
1354 | static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1355 | struct drbd_nl_cfg_reply *reply) | ||
1356 | { | 1686 | { |
1357 | enum drbd_ret_code retcode; | 1687 | enum drbd_state_rv retcode; |
1358 | int ret; | 1688 | int ret; |
1359 | struct detach dt = {}; | ||
1360 | 1689 | ||
1361 | if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { | 1690 | if (force) { |
1362 | reply->ret_code = ERR_MANDATORY_TAG; | 1691 | set_bit(FORCE_DETACH, &mdev->flags); |
1363 | goto out; | ||
1364 | } | ||
1365 | |||
1366 | if (dt.detach_force) { | ||
1367 | drbd_set_flag(mdev, FORCE_DETACH); | ||
1368 | drbd_force_state(mdev, NS(disk, D_FAILED)); | 1692 | drbd_force_state(mdev, NS(disk, D_FAILED)); |
1369 | reply->ret_code = SS_SUCCESS; | 1693 | retcode = SS_SUCCESS; |
1370 | goto out; | 1694 | goto out; |
1371 | } | 1695 | } |
1372 | 1696 | ||
@@ -1378,326 +1702,529 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1378 | ret = wait_event_interruptible(mdev->misc_wait, | 1702 | ret = wait_event_interruptible(mdev->misc_wait, |
1379 | mdev->state.disk != D_FAILED); | 1703 | mdev->state.disk != D_FAILED); |
1380 | drbd_resume_io(mdev); | 1704 | drbd_resume_io(mdev); |
1381 | |||
1382 | if ((int)retcode == (int)SS_IS_DISKLESS) | 1705 | if ((int)retcode == (int)SS_IS_DISKLESS) |
1383 | retcode = SS_NOTHING_TO_DO; | 1706 | retcode = SS_NOTHING_TO_DO; |
1384 | if (ret) | 1707 | if (ret) |
1385 | retcode = ERR_INTR; | 1708 | retcode = ERR_INTR; |
1386 | reply->ret_code = retcode; | ||
1387 | out: | 1709 | out: |
1388 | return 0; | 1710 | return retcode; |
1389 | } | 1711 | } |
1390 | 1712 | ||
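adm_detach() above forces the disk into D_FAILED and then sleeps on mdev->misc_wait until the state machine has moved on, turning an interrupting signal into ERR_INTR. A self-contained sketch of that wait; the device struct and state value are hypothetical:

```c
#include <linux/wait.h>
#include <linux/sched.h>

/* Hypothetical device: a waitqueue plus the state the waiter watches. */
struct ex_dev {
	wait_queue_head_t misc_wait;
	int disk_state;                    /* updated elsewhere, waiters woken afterwards */
};

#define EX_D_FAILED 1

static int ex_wait_until_detached(struct ex_dev *dev)
{
	/* Sleeps until disk_state leaves EX_D_FAILED; returns -ERESTARTSYS
	 * if a signal arrives first, 0 otherwise. */
	return wait_event_interruptible(dev->misc_wait,
					dev->disk_state != EX_D_FAILED);
}
```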
1391 | static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 1713 | /* Detaching the disk is a process in multiple stages. First we need to lock |
1392 | struct drbd_nl_cfg_reply *reply) | 1714 | * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. |
1715 | * Then we transition to D_DISKLESS, and wait for put_ldev() to return all | ||
1716 | * internal references as well. | ||
1717 | * Only then we have finally detached. */ | ||
1718 | int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) | ||
1393 | { | 1719 | { |
1394 | int i, ns; | ||
1395 | enum drbd_ret_code retcode; | 1720 | enum drbd_ret_code retcode; |
1396 | struct net_conf *new_conf = NULL; | 1721 | struct detach_parms parms = { }; |
1397 | struct crypto_hash *tfm = NULL; | 1722 | int err; |
1398 | struct crypto_hash *integrity_w_tfm = NULL; | ||
1399 | struct crypto_hash *integrity_r_tfm = NULL; | ||
1400 | struct hlist_head *new_tl_hash = NULL; | ||
1401 | struct hlist_head *new_ee_hash = NULL; | ||
1402 | struct drbd_conf *odev; | ||
1403 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1404 | void *int_dig_out = NULL; | ||
1405 | void *int_dig_in = NULL; | ||
1406 | void *int_dig_vv = NULL; | ||
1407 | struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; | ||
1408 | 1723 | ||
1409 | drbd_reconfig_start(mdev); | 1724 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1725 | if (!adm_ctx.reply_skb) | ||
1726 | return retcode; | ||
1727 | if (retcode != NO_ERROR) | ||
1728 | goto out; | ||
1410 | 1729 | ||
1411 | if (mdev->state.conn > C_STANDALONE) { | 1730 | if (info->attrs[DRBD_NLA_DETACH_PARMS]) { |
1412 | retcode = ERR_NET_CONFIGURED; | 1731 | err = detach_parms_from_attrs(&parms, info); |
1413 | goto fail; | 1732 | if (err) { |
1733 | retcode = ERR_MANDATORY_TAG; | ||
1734 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1735 | goto out; | ||
1736 | } | ||
1737 | } | ||
1738 | |||
1739 | retcode = adm_detach(adm_ctx.mdev, parms.force_detach); | ||
1740 | out: | ||
1741 | drbd_adm_finish(info, retcode); | ||
1742 | return 0; | ||
1743 | } | ||
1744 | |||
1745 | static bool conn_resync_running(struct drbd_tconn *tconn) | ||
1746 | { | ||
1747 | struct drbd_conf *mdev; | ||
1748 | bool rv = false; | ||
1749 | int vnr; | ||
1750 | |||
1751 | rcu_read_lock(); | ||
1752 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1753 | if (mdev->state.conn == C_SYNC_SOURCE || | ||
1754 | mdev->state.conn == C_SYNC_TARGET || | ||
1755 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1756 | mdev->state.conn == C_PAUSED_SYNC_T) { | ||
1757 | rv = true; | ||
1758 | break; | ||
1759 | } | ||
1760 | } | ||
1761 | rcu_read_unlock(); | ||
1762 | |||
1763 | return rv; | ||
1764 | } | ||
1765 | |||
1766 | static bool conn_ov_running(struct drbd_tconn *tconn) | ||
1767 | { | ||
1768 | struct drbd_conf *mdev; | ||
1769 | bool rv = false; | ||
1770 | int vnr; | ||
1771 | |||
1772 | rcu_read_lock(); | ||
1773 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1774 | if (mdev->state.conn == C_VERIFY_S || | ||
1775 | mdev->state.conn == C_VERIFY_T) { | ||
1776 | rv = true; | ||
1777 | break; | ||
1778 | } | ||
1779 | } | ||
1780 | rcu_read_unlock(); | ||
1781 | |||
1782 | return rv; | ||
1783 | } | ||
1784 | |||
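conn_resync_running() and conn_ov_running() above both walk every volume of a connection with idr_for_each_entry() under rcu_read_lock() and stop at the first match. The same scan in isolation, with a stand-in volume type:

```c
#include <linux/idr.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct ex_volume {
	int conn_state;                    /* stand-in for mdev->state.conn */
};

/* Returns true if any volume registered in @volumes is in @state. */
static bool ex_any_volume_in_state(struct idr *volumes, int state)
{
	struct ex_volume *vol;
	bool found = false;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(volumes, vol, vnr) {
		if (vol->conn_state == state) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
```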
1785 | static enum drbd_ret_code | ||
1786 | _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) | ||
1787 | { | ||
1788 | struct drbd_conf *mdev; | ||
1789 | int i; | ||
1790 | |||
1791 | if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { | ||
1792 | if (new_conf->wire_protocol != old_conf->wire_protocol) | ||
1793 | return ERR_NEED_APV_100; | ||
1794 | |||
1795 | if (new_conf->two_primaries != old_conf->two_primaries) | ||
1796 | return ERR_NEED_APV_100; | ||
1797 | |||
1798 | if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) | ||
1799 | return ERR_NEED_APV_100; | ||
1800 | } | ||
1801 | |||
1802 | if (!new_conf->two_primaries && | ||
1803 | conn_highest_role(tconn) == R_PRIMARY && | ||
1804 | conn_highest_peer(tconn) == R_PRIMARY) | ||
1805 | return ERR_NEED_ALLOW_TWO_PRI; | ||
1806 | |||
1807 | if (new_conf->two_primaries && | ||
1808 | (new_conf->wire_protocol != DRBD_PROT_C)) | ||
1809 | return ERR_NOT_PROTO_C; | ||
1810 | |||
1811 | idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
1812 | if (get_ldev(mdev)) { | ||
1813 | enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
1814 | put_ldev(mdev); | ||
1815 | if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) | ||
1816 | return ERR_STONITH_AND_PROT_A; | ||
1817 | } | ||
1818 | if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) | ||
1819 | return ERR_DISCARD_IMPOSSIBLE; | ||
1820 | } | ||
1821 | |||
1822 | if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) | ||
1823 | return ERR_CONG_NOT_PROTO_A; | ||
1824 | |||
1825 | return NO_ERROR; | ||
1826 | } | ||
1827 | |||
1828 | static enum drbd_ret_code | ||
1829 | check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) | ||
1830 | { | ||
1831 | static enum drbd_ret_code rv; | ||
1832 | struct drbd_conf *mdev; | ||
1833 | int i; | ||
1834 | |||
1835 | rcu_read_lock(); | ||
1836 | rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf); | ||
1837 | rcu_read_unlock(); | ||
1838 | |||
1839 | /* tconn->volumes protected by genl_lock() here */ | ||
1840 | idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
1841 | if (!mdev->bitmap) { | ||
1842 | if(drbd_bm_init(mdev)) | ||
1843 | return ERR_NOMEM; | ||
1844 | } | ||
1845 | } | ||
1846 | |||
1847 | return rv; | ||
1848 | } | ||
1849 | |||
1850 | struct crypto { | ||
1851 | struct crypto_hash *verify_tfm; | ||
1852 | struct crypto_hash *csums_tfm; | ||
1853 | struct crypto_hash *cram_hmac_tfm; | ||
1854 | struct crypto_hash *integrity_tfm; | ||
1855 | }; | ||
1856 | |||
1857 | static int | ||
1858 | alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) | ||
1859 | { | ||
1860 | if (!tfm_name[0]) | ||
1861 | return NO_ERROR; | ||
1862 | |||
1863 | *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); | ||
1864 | if (IS_ERR(*tfm)) { | ||
1865 | *tfm = NULL; | ||
1866 | return err_alg; | ||
1414 | } | 1867 | } |
1415 | 1868 | ||
1416 | /* allocation not in the IO path, cqueue thread context */ | 1869 | return NO_ERROR; |
1870 | } | ||
1871 | |||
1872 | static enum drbd_ret_code | ||
1873 | alloc_crypto(struct crypto *crypto, struct net_conf *new_conf) | ||
1874 | { | ||
1875 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1876 | enum drbd_ret_code rv; | ||
1877 | |||
1878 | rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg, | ||
1879 | ERR_CSUMS_ALG); | ||
1880 | if (rv != NO_ERROR) | ||
1881 | return rv; | ||
1882 | rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg, | ||
1883 | ERR_VERIFY_ALG); | ||
1884 | if (rv != NO_ERROR) | ||
1885 | return rv; | ||
1886 | rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg, | ||
1887 | ERR_INTEGRITY_ALG); | ||
1888 | if (rv != NO_ERROR) | ||
1889 | return rv; | ||
1890 | if (new_conf->cram_hmac_alg[0] != 0) { | ||
1891 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | ||
1892 | new_conf->cram_hmac_alg); | ||
1893 | |||
1894 | rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, | ||
1895 | ERR_AUTH_ALG); | ||
1896 | } | ||
1897 | |||
1898 | return rv; | ||
1899 | } | ||
1900 | |||
1901 | static void free_crypto(struct crypto *crypto) | ||
1902 | { | ||
1903 | crypto_free_hash(crypto->cram_hmac_tfm); | ||
1904 | crypto_free_hash(crypto->integrity_tfm); | ||
1905 | crypto_free_hash(crypto->csums_tfm); | ||
1906 | crypto_free_hash(crypto->verify_tfm); | ||
1907 | } | ||
1908 | |||
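alloc_hash()/alloc_crypto() above factor the repeated crypto_alloc_hash() error handling into one helper: an empty algorithm name means "not configured", an allocation failure maps to a caller-chosen error code, and the cram-hmac transform is requested through the "hmac(<alg>)" template name. A sketch of the same shape; the error code below is a placeholder, not one of DRBD's ERR_* values:

```c
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/kernel.h>

#define EX_ERR_AUTH_ALG 1                        /* placeholder error code */

static int ex_alloc_hash(struct crypto_hash **tfm, const char *name, int err_code)
{
	if (!name[0])
		return 0;                        /* algorithm not configured: nothing to do */

	*tfm = crypto_alloc_hash(name, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(*tfm)) {
		*tfm = NULL;                     /* keep the pointer safe for a later free */
		return err_code;
	}
	return 0;
}

static int ex_alloc_cram_hmac(struct crypto_hash **tfm, const char *base_alg)
{
	char hmac_name[CRYPTO_MAX_ALG_NAME];

	/* The crypto API composes templates by name, e.g. "hmac(sha1)". */
	snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", base_alg);
	return ex_alloc_hash(tfm, hmac_name, EX_ERR_AUTH_ALG);
}
```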
1909 | int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) | ||
1910 | { | ||
1911 | enum drbd_ret_code retcode; | ||
1912 | struct drbd_tconn *tconn; | ||
1913 | struct net_conf *old_conf, *new_conf = NULL; | ||
1914 | int err; | ||
1915 | int ovr; /* online verify running */ | ||
1916 | int rsr; /* re-sync running */ | ||
1917 | struct crypto crypto = { }; | ||
1918 | |||
1919 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); | ||
1920 | if (!adm_ctx.reply_skb) | ||
1921 | return retcode; | ||
1922 | if (retcode != NO_ERROR) | ||
1923 | goto out; | ||
1924 | |||
1925 | tconn = adm_ctx.tconn; | ||
1926 | |||
1417 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); | 1927 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
1418 | if (!new_conf) { | 1928 | if (!new_conf) { |
1419 | retcode = ERR_NOMEM; | 1929 | retcode = ERR_NOMEM; |
1930 | goto out; | ||
1931 | } | ||
1932 | |||
1933 | conn_reconfig_start(tconn); | ||
1934 | |||
1935 | mutex_lock(&tconn->data.mutex); | ||
1936 | mutex_lock(&tconn->conf_update); | ||
1937 | old_conf = tconn->net_conf; | ||
1938 | |||
1939 | if (!old_conf) { | ||
1940 | drbd_msg_put_info("net conf missing, try connect"); | ||
1941 | retcode = ERR_INVALID_REQUEST; | ||
1420 | goto fail; | 1942 | goto fail; |
1421 | } | 1943 | } |
1422 | 1944 | ||
1423 | new_conf->timeout = DRBD_TIMEOUT_DEF; | 1945 | *new_conf = *old_conf; |
1424 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | 1946 | if (should_set_defaults(info)) |
1425 | new_conf->ping_int = DRBD_PING_INT_DEF; | 1947 | set_net_conf_defaults(new_conf); |
1426 | new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; | 1948 | |
1427 | new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; | 1949 | err = net_conf_from_attrs_for_change(new_conf, info); |
1428 | new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; | 1950 | if (err && err != -ENOMSG) { |
1429 | new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; | ||
1430 | new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; | ||
1431 | new_conf->ko_count = DRBD_KO_COUNT_DEF; | ||
1432 | new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; | ||
1433 | new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; | ||
1434 | new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; | ||
1435 | new_conf->want_lose = 0; | ||
1436 | new_conf->two_primaries = 0; | ||
1437 | new_conf->wire_protocol = DRBD_PROT_C; | ||
1438 | new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; | ||
1439 | new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; | ||
1440 | new_conf->on_congestion = DRBD_ON_CONGESTION_DEF; | ||
1441 | new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF; | ||
1442 | |||
1443 | if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { | ||
1444 | retcode = ERR_MANDATORY_TAG; | 1951 | retcode = ERR_MANDATORY_TAG; |
1952 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1445 | goto fail; | 1953 | goto fail; |
1446 | } | 1954 | } |
1447 | 1955 | ||
1448 | if (new_conf->two_primaries | 1956 | retcode = check_net_options(tconn, new_conf); |
1449 | && (new_conf->wire_protocol != DRBD_PROT_C)) { | 1957 | if (retcode != NO_ERROR) |
1450 | retcode = ERR_NOT_PROTO_C; | ||
1451 | goto fail; | 1958 | goto fail; |
1452 | } | ||
1453 | 1959 | ||
1454 | if (get_ldev(mdev)) { | 1960 | /* re-sync running */ |
1455 | enum drbd_fencing_p fp = mdev->ldev->dc.fencing; | 1961 | rsr = conn_resync_running(tconn); |
1456 | put_ldev(mdev); | 1962 | if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) { |
1457 | if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { | 1963 | retcode = ERR_CSUMS_RESYNC_RUNNING; |
1458 | retcode = ERR_STONITH_AND_PROT_A; | 1964 | goto fail; |
1459 | goto fail; | ||
1460 | } | ||
1461 | } | 1965 | } |
1462 | 1966 | ||
1463 | if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) { | 1967 | /* online verify running */ |
1464 | retcode = ERR_CONG_NOT_PROTO_A; | 1968 | ovr = conn_ov_running(tconn); |
1969 | if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) { | ||
1970 | retcode = ERR_VERIFY_RUNNING; | ||
1465 | goto fail; | 1971 | goto fail; |
1466 | } | 1972 | } |
1467 | 1973 | ||
1468 | if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { | 1974 | retcode = alloc_crypto(&crypto, new_conf); |
1469 | retcode = ERR_DISCARD; | 1975 | if (retcode != NO_ERROR) |
1470 | goto fail; | 1976 | goto fail; |
1471 | } | ||
1472 | 1977 | ||
1473 | retcode = NO_ERROR; | 1978 | rcu_assign_pointer(tconn->net_conf, new_conf); |
1474 | 1979 | ||
1475 | new_my_addr = (struct sockaddr *)&new_conf->my_addr; | 1980 | if (!rsr) { |
1476 | new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; | 1981 | crypto_free_hash(tconn->csums_tfm); |
1477 | for (i = 0; i < minor_count; i++) { | 1982 | tconn->csums_tfm = crypto.csums_tfm; |
1478 | odev = minor_to_mdev(i); | 1983 | crypto.csums_tfm = NULL; |
1479 | if (!odev || odev == mdev) | 1984 | } |
1480 | continue; | 1985 | if (!ovr) { |
1481 | if (get_net_conf(odev)) { | 1986 | crypto_free_hash(tconn->verify_tfm); |
1482 | taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; | 1987 | tconn->verify_tfm = crypto.verify_tfm; |
1483 | if (new_conf->my_addr_len == odev->net_conf->my_addr_len && | 1988 | crypto.verify_tfm = NULL; |
1484 | !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) | ||
1485 | retcode = ERR_LOCAL_ADDR; | ||
1486 | |||
1487 | taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; | ||
1488 | if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && | ||
1489 | !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) | ||
1490 | retcode = ERR_PEER_ADDR; | ||
1491 | |||
1492 | put_net_conf(odev); | ||
1493 | if (retcode != NO_ERROR) | ||
1494 | goto fail; | ||
1495 | } | ||
1496 | } | 1989 | } |
1497 | 1990 | ||
1498 | if (new_conf->cram_hmac_alg[0] != 0) { | 1991 | crypto_free_hash(tconn->integrity_tfm); |
1499 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | 1992 | tconn->integrity_tfm = crypto.integrity_tfm; |
1500 | new_conf->cram_hmac_alg); | 1993 | if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100) |
1501 | tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); | 1994 | /* Do this without trying to take tconn->data.mutex again. */ |
1502 | if (IS_ERR(tfm)) { | 1995 | __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE); |
1503 | tfm = NULL; | ||
1504 | retcode = ERR_AUTH_ALG; | ||
1505 | goto fail; | ||
1506 | } | ||
1507 | 1996 | ||
1508 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | 1997 | crypto_free_hash(tconn->cram_hmac_tfm); |
1509 | retcode = ERR_AUTH_ALG_ND; | 1998 | tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; |
1510 | goto fail; | ||
1511 | } | ||
1512 | } | ||
1513 | 1999 | ||
1514 | if (new_conf->integrity_alg[0]) { | 2000 | mutex_unlock(&tconn->conf_update); |
1515 | integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | 2001 | mutex_unlock(&tconn->data.mutex); |
1516 | if (IS_ERR(integrity_w_tfm)) { | 2002 | synchronize_rcu(); |
1517 | integrity_w_tfm = NULL; | 2003 | kfree(old_conf); |
1518 | retcode=ERR_INTEGRITY_ALG; | ||
1519 | goto fail; | ||
1520 | } | ||
1521 | 2004 | ||
1522 | if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { | 2005 | if (tconn->cstate >= C_WF_REPORT_PARAMS) |
1523 | retcode=ERR_INTEGRITY_ALG_ND; | 2006 | drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn))); |
1524 | goto fail; | ||
1525 | } | ||
1526 | 2007 | ||
1527 | integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | 2008 | goto done; |
1528 | if (IS_ERR(integrity_r_tfm)) { | 2009 | |
1529 | integrity_r_tfm = NULL; | 2010 | fail: |
1530 | retcode=ERR_INTEGRITY_ALG; | 2011 | mutex_unlock(&tconn->conf_update); |
1531 | goto fail; | 2012 | mutex_unlock(&tconn->data.mutex); |
1532 | } | 2013 | free_crypto(&crypto); |
2014 | kfree(new_conf); | ||
2015 | done: | ||
2016 | conn_reconfig_done(tconn); | ||
2017 | out: | ||
2018 | drbd_adm_finish(info, retcode); | ||
2019 | return 0; | ||
2020 | } | ||
2021 | |||
2022 | int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) | ||
2023 | { | ||
2024 | struct drbd_conf *mdev; | ||
2025 | struct net_conf *old_conf, *new_conf = NULL; | ||
2026 | struct crypto crypto = { }; | ||
2027 | struct drbd_tconn *tconn; | ||
2028 | enum drbd_ret_code retcode; | ||
2029 | int i; | ||
2030 | int err; | ||
2031 | |||
2032 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); | ||
2033 | |||
2034 | if (!adm_ctx.reply_skb) | ||
2035 | return retcode; | ||
2036 | if (retcode != NO_ERROR) | ||
2037 | goto out; | ||
2038 | if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { | ||
2039 | drbd_msg_put_info("connection endpoint(s) missing"); | ||
2040 | retcode = ERR_INVALID_REQUEST; | ||
2041 | goto out; | ||
1533 | } | 2042 | } |
1534 | 2043 | ||
1535 | ns = new_conf->max_epoch_size/8; | 2044 | /* No need for _rcu here. All reconfiguration is |
1536 | if (mdev->tl_hash_s != ns) { | 2045 | * strictly serialized on genl_lock(). We are protected against |
1537 | new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | 2046 | * concurrent reconfiguration/addition/deletion */ |
1538 | if (!new_tl_hash) { | 2047 | list_for_each_entry(tconn, &drbd_tconns, all_tconn) { |
1539 | retcode = ERR_NOMEM; | 2048 | if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len && |
1540 | goto fail; | 2049 | !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) { |
2050 | retcode = ERR_LOCAL_ADDR; | ||
2051 | goto out; | ||
1541 | } | 2052 | } |
1542 | } | ||
1543 | 2053 | ||
1544 | ns = new_conf->max_buffers/8; | 2054 | if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len && |
1545 | if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { | 2055 | !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) { |
1546 | new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | 2056 | retcode = ERR_PEER_ADDR; |
1547 | if (!new_ee_hash) { | 2057 | goto out; |
1548 | retcode = ERR_NOMEM; | ||
1549 | goto fail; | ||
1550 | } | 2058 | } |
1551 | } | 2059 | } |
1552 | 2060 | ||
1553 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; | 2061 | tconn = adm_ctx.tconn; |
2062 | conn_reconfig_start(tconn); | ||
1554 | 2063 | ||
1555 | if (integrity_w_tfm) { | 2064 | if (tconn->cstate > C_STANDALONE) { |
1556 | i = crypto_hash_digestsize(integrity_w_tfm); | 2065 | retcode = ERR_NET_CONFIGURED; |
1557 | int_dig_out = kmalloc(i, GFP_KERNEL); | 2066 | goto fail; |
1558 | if (!int_dig_out) { | ||
1559 | retcode = ERR_NOMEM; | ||
1560 | goto fail; | ||
1561 | } | ||
1562 | int_dig_in = kmalloc(i, GFP_KERNEL); | ||
1563 | if (!int_dig_in) { | ||
1564 | retcode = ERR_NOMEM; | ||
1565 | goto fail; | ||
1566 | } | ||
1567 | int_dig_vv = kmalloc(i, GFP_KERNEL); | ||
1568 | if (!int_dig_vv) { | ||
1569 | retcode = ERR_NOMEM; | ||
1570 | goto fail; | ||
1571 | } | ||
1572 | } | 2067 | } |
1573 | 2068 | ||
1574 | if (!mdev->bitmap) { | 2069 | /* allocation not in the IO path, drbdsetup / netlink process context */ |
1575 | if(drbd_bm_init(mdev)) { | 2070 | new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL); |
1576 | retcode = ERR_NOMEM; | 2071 | if (!new_conf) { |
1577 | goto fail; | 2072 | retcode = ERR_NOMEM; |
1578 | } | 2073 | goto fail; |
1579 | } | 2074 | } |
1580 | 2075 | ||
1581 | drbd_flush_workqueue(mdev); | 2076 | set_net_conf_defaults(new_conf); |
1582 | spin_lock_irq(&mdev->req_lock); | 2077 | |
1583 | if (mdev->net_conf != NULL) { | 2078 | err = net_conf_from_attrs(new_conf, info); |
1584 | retcode = ERR_NET_CONFIGURED; | 2079 | if (err && err != -ENOMSG) { |
1585 | spin_unlock_irq(&mdev->req_lock); | 2080 | retcode = ERR_MANDATORY_TAG; |
2081 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1586 | goto fail; | 2082 | goto fail; |
1587 | } | 2083 | } |
1588 | mdev->net_conf = new_conf; | ||
1589 | 2084 | ||
1590 | mdev->send_cnt = 0; | 2085 | retcode = check_net_options(tconn, new_conf); |
1591 | mdev->recv_cnt = 0; | 2086 | if (retcode != NO_ERROR) |
2087 | goto fail; | ||
1592 | 2088 | ||
1593 | if (new_tl_hash) { | 2089 | retcode = alloc_crypto(&crypto, new_conf); |
1594 | kfree(mdev->tl_hash); | 2090 | if (retcode != NO_ERROR) |
1595 | mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; | 2091 | goto fail; |
1596 | mdev->tl_hash = new_tl_hash; | ||
1597 | } | ||
1598 | 2092 | ||
1599 | if (new_ee_hash) { | 2093 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; |
1600 | kfree(mdev->ee_hash); | 2094 | |
1601 | mdev->ee_hash_s = mdev->net_conf->max_buffers/8; | 2095 | conn_flush_workqueue(tconn); |
1602 | mdev->ee_hash = new_ee_hash; | 2096 | |
2097 | mutex_lock(&tconn->conf_update); | ||
2098 | old_conf = tconn->net_conf; | ||
2099 | if (old_conf) { | ||
2100 | retcode = ERR_NET_CONFIGURED; | ||
2101 | mutex_unlock(&tconn->conf_update); | ||
2102 | goto fail; | ||
1603 | } | 2103 | } |
2104 | rcu_assign_pointer(tconn->net_conf, new_conf); | ||
1604 | 2105 | ||
1605 | crypto_free_hash(mdev->cram_hmac_tfm); | 2106 | conn_free_crypto(tconn); |
1606 | mdev->cram_hmac_tfm = tfm; | 2107 | tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; |
2108 | tconn->integrity_tfm = crypto.integrity_tfm; | ||
2109 | tconn->csums_tfm = crypto.csums_tfm; | ||
2110 | tconn->verify_tfm = crypto.verify_tfm; | ||
1607 | 2111 | ||
1608 | crypto_free_hash(mdev->integrity_w_tfm); | 2112 | tconn->my_addr_len = nla_len(adm_ctx.my_addr); |
1609 | mdev->integrity_w_tfm = integrity_w_tfm; | 2113 | memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len); |
2114 | tconn->peer_addr_len = nla_len(adm_ctx.peer_addr); | ||
2115 | memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len); | ||
1610 | 2116 | ||
1611 | crypto_free_hash(mdev->integrity_r_tfm); | 2117 | mutex_unlock(&tconn->conf_update); |
1612 | mdev->integrity_r_tfm = integrity_r_tfm; | ||
1613 | 2118 | ||
1614 | kfree(mdev->int_dig_out); | 2119 | rcu_read_lock(); |
1615 | kfree(mdev->int_dig_in); | 2120 | idr_for_each_entry(&tconn->volumes, mdev, i) { |
1616 | kfree(mdev->int_dig_vv); | 2121 | mdev->send_cnt = 0; |
1617 | mdev->int_dig_out=int_dig_out; | 2122 | mdev->recv_cnt = 0; |
1618 | mdev->int_dig_in=int_dig_in; | 2123 | } |
1619 | mdev->int_dig_vv=int_dig_vv; | 2124 | rcu_read_unlock(); |
1620 | retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL); | ||
1621 | spin_unlock_irq(&mdev->req_lock); | ||
1622 | 2125 | ||
1623 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 2126 | retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); |
1624 | reply->ret_code = retcode; | 2127 | |
1625 | drbd_reconfig_done(mdev); | 2128 | conn_reconfig_done(tconn); |
2129 | drbd_adm_finish(info, retcode); | ||
1626 | return 0; | 2130 | return 0; |
1627 | 2131 | ||
1628 | fail: | 2132 | fail: |
1629 | kfree(int_dig_out); | 2133 | free_crypto(&crypto); |
1630 | kfree(int_dig_in); | ||
1631 | kfree(int_dig_vv); | ||
1632 | crypto_free_hash(tfm); | ||
1633 | crypto_free_hash(integrity_w_tfm); | ||
1634 | crypto_free_hash(integrity_r_tfm); | ||
1635 | kfree(new_tl_hash); | ||
1636 | kfree(new_ee_hash); | ||
1637 | kfree(new_conf); | 2134 | kfree(new_conf); |
1638 | 2135 | ||
1639 | reply->ret_code = retcode; | 2136 | conn_reconfig_done(tconn); |
1640 | drbd_reconfig_done(mdev); | 2137 | out: |
2138 | drbd_adm_finish(info, retcode); | ||
1641 | return 0; | 2139 | return 0; |
1642 | } | 2140 | } |
1643 | 2141 | ||
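drbd_adm_net_opts() above shows the update side of the RCU scheme used for tconn->net_conf: copy the current struct into a freshly allocated one, modify the copy, publish it with rcu_assign_pointer() while holding conf_update, and free the old struct only after synchronize_rcu(). A compact sketch of that sequence with stand-in types:

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/errno.h>

struct ex_conf {
	int timeout;
};

struct ex_conn {
	struct mutex conf_update;          /* serializes writers */
	struct ex_conf __rcu *conf;        /* readers use rcu_dereference() */
};

static int ex_change_timeout(struct ex_conn *conn, int timeout)
{
	struct ex_conf *new_conf, *old_conf;

	new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
	if (!new_conf)
		return -ENOMEM;

	mutex_lock(&conn->conf_update);
	old_conf = rcu_dereference_protected(conn->conf,
			lockdep_is_held(&conn->conf_update));
	if (old_conf)
		*new_conf = *old_conf;     /* start from the currently active settings */
	new_conf->timeout = timeout;
	rcu_assign_pointer(conn->conf, new_conf);
	mutex_unlock(&conn->conf_update);

	synchronize_rcu();                 /* wait until no reader can still see old_conf */
	kfree(old_conf);
	return 0;
}
```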
1644 | static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2142 | static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force) |
1645 | struct drbd_nl_cfg_reply *reply) | ||
1646 | { | 2143 | { |
1647 | int retcode; | 2144 | enum drbd_state_rv rv; |
1648 | struct disconnect dc; | ||
1649 | |||
1650 | memset(&dc, 0, sizeof(struct disconnect)); | ||
1651 | if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) { | ||
1652 | retcode = ERR_MANDATORY_TAG; | ||
1653 | goto fail; | ||
1654 | } | ||
1655 | |||
1656 | if (dc.force) { | ||
1657 | spin_lock_irq(&mdev->req_lock); | ||
1658 | if (mdev->state.conn >= C_WF_CONNECTION) | ||
1659 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL); | ||
1660 | spin_unlock_irq(&mdev->req_lock); | ||
1661 | goto done; | ||
1662 | } | ||
1663 | 2145 | ||
1664 | retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); | 2146 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), |
2147 | force ? CS_HARD : 0); | ||
1665 | 2148 | ||
1666 | if (retcode == SS_NOTHING_TO_DO) | 2149 | switch (rv) { |
1667 | goto done; | 2150 | case SS_NOTHING_TO_DO: |
1668 | else if (retcode == SS_ALREADY_STANDALONE) | 2151 | break; |
1669 | goto done; | 2152 | case SS_ALREADY_STANDALONE: |
1670 | else if (retcode == SS_PRIMARY_NOP) { | 2153 | return SS_SUCCESS; |
1671 | /* Our statche checking code wants to see the peer outdated. */ | 2154 | case SS_PRIMARY_NOP: |
1672 | retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | 2155 | /* Our state checking code wants to see the peer outdated. */ |
1673 | pdsk, D_OUTDATED)); | 2156 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, |
1674 | } else if (retcode == SS_CW_FAILED_BY_PEER) { | 2157 | pdsk, D_OUTDATED), CS_VERBOSE); |
2158 | break; | ||
2159 | case SS_CW_FAILED_BY_PEER: | ||
1675 | /* The peer probably wants to see us outdated. */ | 2160 | /* The peer probably wants to see us outdated. */ |
1676 | retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | 2161 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, |
1677 | disk, D_OUTDATED), | 2162 | disk, D_OUTDATED), 0); |
1678 | CS_ORDERED); | 2163 | if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { |
1679 | if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { | 2164 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), |
1680 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 2165 | CS_HARD); |
1681 | retcode = SS_SUCCESS; | ||
1682 | } | 2166 | } |
2167 | break; | ||
2168 | default:; | ||
2169 | /* no special handling necessary */ | ||
2170 | } | ||
2171 | |||
2172 | if (rv >= SS_SUCCESS) { | ||
2173 | enum drbd_state_rv rv2; | ||
2174 | /* No one else can reconfigure the network while I am here. | ||
2175 | * The state handling only uses drbd_thread_stop_nowait(), | ||
2176 | * we want to really wait here until the receiver is no more. | ||
2177 | */ | ||
2178 | drbd_thread_stop(&adm_ctx.tconn->receiver); | ||
2179 | |||
2180 | /* Race breaker. This additional state change request may be | ||
2181 | * necessary, if this was a forced disconnect during a receiver | ||
2182 | * restart. We may have "killed" the receiver thread just | ||
2183 | * after drbdd_init() returned. Typically, we should be | ||
2184 | * C_STANDALONE already, now, and this becomes a no-op. | ||
2185 | */ | ||
2186 | rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE), | ||
2187 | CS_VERBOSE | CS_HARD); | ||
2188 | if (rv2 < SS_SUCCESS) | ||
2189 | conn_err(tconn, | ||
2190 | "unexpected rv2=%d in conn_try_disconnect()\n", | ||
2191 | rv2); | ||
1683 | } | 2192 | } |
2193 | return rv; | ||
2194 | } | ||
1684 | 2195 | ||
1685 | if (retcode < SS_SUCCESS) | 2196 | int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) |
1686 | goto fail; | 2197 | { |
2198 | struct disconnect_parms parms; | ||
2199 | struct drbd_tconn *tconn; | ||
2200 | enum drbd_state_rv rv; | ||
2201 | enum drbd_ret_code retcode; | ||
2202 | int err; | ||
1687 | 2203 | ||
1688 | if (wait_event_interruptible(mdev->state_wait, | 2204 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); |
1689 | mdev->state.conn != C_DISCONNECTING)) { | 2205 | if (!adm_ctx.reply_skb) |
1690 | /* Do not test for mdev->state.conn == C_STANDALONE, since | 2206 | return retcode; |
1691 | someone else might connect us in the mean time! */ | 2207 | if (retcode != NO_ERROR) |
1692 | retcode = ERR_INTR; | ||
1693 | goto fail; | 2208 | goto fail; |
2209 | |||
2210 | tconn = adm_ctx.tconn; | ||
2211 | memset(&parms, 0, sizeof(parms)); | ||
2212 | if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { | ||
2213 | err = disconnect_parms_from_attrs(&parms, info); | ||
2214 | if (err) { | ||
2215 | retcode = ERR_MANDATORY_TAG; | ||
2216 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2217 | goto fail; | ||
2218 | } | ||
1694 | } | 2219 | } |
1695 | 2220 | ||
1696 | done: | 2221 | rv = conn_try_disconnect(tconn, parms.force_disconnect); |
1697 | retcode = NO_ERROR; | 2222 | if (rv < SS_SUCCESS) |
2223 | retcode = rv; /* FIXME: Type mismatch. */ | ||
2224 | else | ||
2225 | retcode = NO_ERROR; | ||
1698 | fail: | 2226 | fail: |
1699 | drbd_md_sync(mdev); | 2227 | drbd_adm_finish(info, retcode); |
1700 | reply->ret_code = retcode; | ||
1701 | return 0; | 2228 | return 0; |
1702 | } | 2229 | } |
1703 | 2230 | ||
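drbd_adm_connect() above rejects an endpoint pair that is already in use by comparing the netlink address attributes byte-wise against each configured connection (nla_len()/nla_data() plus memcmp()). The comparison itself, pulled out with hypothetical parameters:

```c
#include <net/netlink.h>
#include <linux/socket.h>
#include <linux/string.h>

/*
 * True if the sockaddr carried in @addr_attr matches an address that is
 * already configured (@used, @used_len).  In the handler above this check
 * runs once per existing connection, serialized by genl_lock().
 */
static bool ex_addr_already_used(const struct nlattr *addr_attr,
				 const struct sockaddr_storage *used,
				 int used_len)
{
	return nla_len(addr_attr) == used_len &&
	       !memcmp(nla_data(addr_attr), used, used_len);
}
```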
@@ -1709,7 +2236,7 @@ void resync_after_online_grow(struct drbd_conf *mdev) | |||
1709 | if (mdev->state.role != mdev->state.peer) | 2236 | if (mdev->state.role != mdev->state.peer) |
1710 | iass = (mdev->state.role == R_PRIMARY); | 2237 | iass = (mdev->state.role == R_PRIMARY); |
1711 | else | 2238 | else |
1712 | iass = drbd_test_flag(mdev, DISCARD_CONCURRENT); | 2239 | iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); |
1713 | 2240 | ||
1714 | if (iass) | 2241 | if (iass) |
1715 | drbd_start_resync(mdev, C_SYNC_SOURCE); | 2242 | drbd_start_resync(mdev, C_SYNC_SOURCE); |
@@ -1717,20 +2244,34 @@ void resync_after_online_grow(struct drbd_conf *mdev) | |||
1717 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); | 2244 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); |
1718 | } | 2245 | } |
1719 | 2246 | ||
1720 | static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2247 | int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) |
1721 | struct drbd_nl_cfg_reply *reply) | ||
1722 | { | 2248 | { |
1723 | struct resize rs; | 2249 | struct disk_conf *old_disk_conf, *new_disk_conf = NULL; |
1724 | int retcode = NO_ERROR; | 2250 | struct resize_parms rs; |
2251 | struct drbd_conf *mdev; | ||
2252 | enum drbd_ret_code retcode; | ||
1725 | enum determine_dev_size dd; | 2253 | enum determine_dev_size dd; |
1726 | enum dds_flags ddsf; | 2254 | enum dds_flags ddsf; |
2255 | sector_t u_size; | ||
2256 | int err; | ||
1727 | 2257 | ||
1728 | memset(&rs, 0, sizeof(struct resize)); | 2258 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1729 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | 2259 | if (!adm_ctx.reply_skb) |
1730 | retcode = ERR_MANDATORY_TAG; | 2260 | return retcode; |
2261 | if (retcode != NO_ERROR) | ||
1731 | goto fail; | 2262 | goto fail; |
2263 | |||
2264 | memset(&rs, 0, sizeof(struct resize_parms)); | ||
2265 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { | ||
2266 | err = resize_parms_from_attrs(&rs, info); | ||
2267 | if (err) { | ||
2268 | retcode = ERR_MANDATORY_TAG; | ||
2269 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2270 | goto fail; | ||
2271 | } | ||
1732 | } | 2272 | } |
1733 | 2273 | ||
2274 | mdev = adm_ctx.mdev; | ||
1734 | if (mdev->state.conn > C_CONNECTED) { | 2275 | if (mdev->state.conn > C_CONNECTED) { |
1735 | retcode = ERR_RESIZE_RESYNC; | 2276 | retcode = ERR_RESIZE_RESYNC; |
1736 | goto fail; | 2277 | goto fail; |
@@ -1747,15 +2288,36 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1747 | goto fail; | 2288 | goto fail; |
1748 | } | 2289 | } |
1749 | 2290 | ||
1750 | if (rs.no_resync && mdev->agreed_pro_version < 93) { | 2291 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { |
1751 | retcode = ERR_NEED_APV_93; | 2292 | retcode = ERR_NEED_APV_93; |
1752 | goto fail_ldev; | 2293 | goto fail_ldev; |
1753 | } | 2294 | } |
1754 | 2295 | ||
2296 | rcu_read_lock(); | ||
2297 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
2298 | rcu_read_unlock(); | ||
2299 | if (u_size != (sector_t)rs.resize_size) { | ||
2300 | new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
2301 | if (!new_disk_conf) { | ||
2302 | retcode = ERR_NOMEM; | ||
2303 | goto fail_ldev; | ||
2304 | } | ||
2305 | } | ||
2306 | |||
1755 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) | 2307 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) |
1756 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 2308 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); |
1757 | 2309 | ||
1758 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | 2310 | if (new_disk_conf) { |
2311 | mutex_lock(&mdev->tconn->conf_update); | ||
2312 | old_disk_conf = mdev->ldev->disk_conf; | ||
2313 | *new_disk_conf = *old_disk_conf; | ||
2314 | new_disk_conf->disk_size = (sector_t)rs.resize_size; | ||
2315 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
2316 | mutex_unlock(&mdev->tconn->conf_update); | ||
2317 | synchronize_rcu(); | ||
2318 | kfree(old_disk_conf); | ||
2319 | } | ||
2320 | |||
1759 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); | 2321 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); |
1760 | dd = drbd_determine_dev_size(mdev, ddsf); | 2322 | dd = drbd_determine_dev_size(mdev, ddsf); |
1761 | drbd_md_sync(mdev); | 2323 | drbd_md_sync(mdev); |
@@ -1767,14 +2329,14 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1767 | 2329 | ||
1768 | if (mdev->state.conn == C_CONNECTED) { | 2330 | if (mdev->state.conn == C_CONNECTED) { |
1769 | if (dd == grew) | 2331 | if (dd == grew) |
1770 | drbd_set_flag(mdev, RESIZE_PENDING); | 2332 | set_bit(RESIZE_PENDING, &mdev->flags); |
1771 | 2333 | ||
1772 | drbd_send_uuids(mdev); | 2334 | drbd_send_uuids(mdev); |
1773 | drbd_send_sizes(mdev, 1, ddsf); | 2335 | drbd_send_sizes(mdev, 1, ddsf); |
1774 | } | 2336 | } |
1775 | 2337 | ||
1776 | fail: | 2338 | fail: |
1777 | reply->ret_code = retcode; | 2339 | drbd_adm_finish(info, retcode); |
1778 | return 0; | 2340 | return 0; |
1779 | 2341 | ||
1780 | fail_ldev: | 2342 | fail_ldev: |
@@ -1782,210 +2344,61 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1782 | goto fail; | 2344 | goto fail; |
1783 | } | 2345 | } |
1784 | 2346 | ||
1785 | static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2347 | int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) |
1786 | struct drbd_nl_cfg_reply *reply) | ||
1787 | { | 2348 | { |
1788 | int retcode = NO_ERROR; | 2349 | enum drbd_ret_code retcode; |
2350 | struct drbd_tconn *tconn; | ||
2351 | struct res_opts res_opts; | ||
1789 | int err; | 2352 | int err; |
1790 | int ovr; /* online verify running */ | ||
1791 | int rsr; /* re-sync running */ | ||
1792 | struct crypto_hash *verify_tfm = NULL; | ||
1793 | struct crypto_hash *csums_tfm = NULL; | ||
1794 | struct syncer_conf sc; | ||
1795 | cpumask_var_t new_cpu_mask; | ||
1796 | int *rs_plan_s = NULL; | ||
1797 | int fifo_size; | ||
1798 | |||
1799 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { | ||
1800 | retcode = ERR_NOMEM; | ||
1801 | goto fail; | ||
1802 | } | ||
1803 | |||
1804 | if (nlp->flags & DRBD_NL_SET_DEFAULTS) { | ||
1805 | memset(&sc, 0, sizeof(struct syncer_conf)); | ||
1806 | sc.rate = DRBD_RATE_DEF; | ||
1807 | sc.after = DRBD_AFTER_DEF; | ||
1808 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | ||
1809 | sc.on_no_data = DRBD_ON_NO_DATA_DEF; | ||
1810 | sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; | ||
1811 | sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; | ||
1812 | sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; | ||
1813 | sc.c_max_rate = DRBD_C_MAX_RATE_DEF; | ||
1814 | sc.c_min_rate = DRBD_C_MIN_RATE_DEF; | ||
1815 | } else | ||
1816 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | ||
1817 | 2353 | ||
1818 | if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { | 2354 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
1819 | retcode = ERR_MANDATORY_TAG; | 2355 | if (!adm_ctx.reply_skb) |
1820 | goto fail; | 2356 | return retcode; |
1821 | } | ||
1822 | |||
1823 | /* re-sync running */ | ||
1824 | rsr = ( mdev->state.conn == C_SYNC_SOURCE || | ||
1825 | mdev->state.conn == C_SYNC_TARGET || | ||
1826 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1827 | mdev->state.conn == C_PAUSED_SYNC_T ); | ||
1828 | |||
1829 | if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { | ||
1830 | retcode = ERR_CSUMS_RESYNC_RUNNING; | ||
1831 | goto fail; | ||
1832 | } | ||
1833 | |||
1834 | if (!rsr && sc.csums_alg[0]) { | ||
1835 | csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); | ||
1836 | if (IS_ERR(csums_tfm)) { | ||
1837 | csums_tfm = NULL; | ||
1838 | retcode = ERR_CSUMS_ALG; | ||
1839 | goto fail; | ||
1840 | } | ||
1841 | |||
1842 | if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { | ||
1843 | retcode = ERR_CSUMS_ALG_ND; | ||
1844 | goto fail; | ||
1845 | } | ||
1846 | } | ||
1847 | |||
1848 | /* online verify running */ | ||
1849 | ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); | ||
1850 | |||
1851 | if (ovr) { | ||
1852 | if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { | ||
1853 | retcode = ERR_VERIFY_RUNNING; | ||
1854 | goto fail; | ||
1855 | } | ||
1856 | } | ||
1857 | |||
1858 | if (!ovr && sc.verify_alg[0]) { | ||
1859 | verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); | ||
1860 | if (IS_ERR(verify_tfm)) { | ||
1861 | verify_tfm = NULL; | ||
1862 | retcode = ERR_VERIFY_ALG; | ||
1863 | goto fail; | ||
1864 | } | ||
1865 | |||
1866 | if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { | ||
1867 | retcode = ERR_VERIFY_ALG_ND; | ||
1868 | goto fail; | ||
1869 | } | ||
1870 | } | ||
1871 | |||
1872 | /* silently ignore cpu mask on UP kernel */ | ||
1873 | if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { | ||
1874 | err = bitmap_parse(sc.cpu_mask, 32, | ||
1875 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
1876 | if (err) { | ||
1877 | dev_warn(DEV, "bitmap_parse() failed with %d\n", err); | ||
1878 | retcode = ERR_CPU_MASK_PARSE; | ||
1879 | goto fail; | ||
1880 | } | ||
1881 | } | ||
1882 | |||
1883 | ERR_IF (sc.rate < 1) sc.rate = 1; | ||
1884 | ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ | ||
1885 | #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) | ||
1886 | if (sc.al_extents > AL_MAX) { | ||
1887 | dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); | ||
1888 | sc.al_extents = AL_MAX; | ||
1889 | } | ||
1890 | #undef AL_MAX | ||
1891 | |||
1892 | /* to avoid spurious errors when configuring minors before configuring | ||
1893 | * the minors they depend on: if necessary, first create the minor we | ||
1894 | * depend on */ | ||
1895 | if (sc.after >= 0) | ||
1896 | ensure_mdev(sc.after, 1); | ||
1897 | |||
1898 | /* most sanity checks done, try to assign the new sync-after | ||
1899 | * dependency. need to hold the global lock in there, | ||
1900 | * to avoid a race in the dependency loop check. */ | ||
1901 | retcode = drbd_alter_sa(mdev, sc.after); | ||
1902 | if (retcode != NO_ERROR) | 2357 | if (retcode != NO_ERROR) |
1903 | goto fail; | 2358 | goto fail; |
2359 | tconn = adm_ctx.tconn; | ||
1904 | 2360 | ||
1905 | fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 2361 | res_opts = tconn->res_opts; |
1906 | if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { | 2362 | if (should_set_defaults(info)) |
1907 | rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); | 2363 | set_res_opts_defaults(&res_opts); |
1908 | if (!rs_plan_s) { | ||
1909 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | ||
1910 | retcode = ERR_NOMEM; | ||
1911 | goto fail; | ||
1912 | } | ||
1913 | } | ||
1914 | 2364 | ||
1915 | /* ok, assign the rest of it as well. | 2365 | err = res_opts_from_attrs(&res_opts, info); |
1916 | * lock against receive_SyncParam() */ | 2366 | if (err && err != -ENOMSG) { |
1917 | spin_lock(&mdev->peer_seq_lock); | 2367 | retcode = ERR_MANDATORY_TAG; |
1918 | mdev->sync_conf = sc; | 2368 | drbd_msg_put_info(from_attrs_err_to_txt(err)); |
1919 | 2369 | goto fail; | |
1920 | if (!rsr) { | ||
1921 | crypto_free_hash(mdev->csums_tfm); | ||
1922 | mdev->csums_tfm = csums_tfm; | ||
1923 | csums_tfm = NULL; | ||
1924 | } | ||
1925 | |||
1926 | if (!ovr) { | ||
1927 | crypto_free_hash(mdev->verify_tfm); | ||
1928 | mdev->verify_tfm = verify_tfm; | ||
1929 | verify_tfm = NULL; | ||
1930 | } | ||
1931 | |||
1932 | if (fifo_size != mdev->rs_plan_s.size) { | ||
1933 | kfree(mdev->rs_plan_s.values); | ||
1934 | mdev->rs_plan_s.values = rs_plan_s; | ||
1935 | mdev->rs_plan_s.size = fifo_size; | ||
1936 | mdev->rs_planed = 0; | ||
1937 | rs_plan_s = NULL; | ||
1938 | } | 2370 | } |
1939 | 2371 | ||
1940 | spin_unlock(&mdev->peer_seq_lock); | 2372 | err = set_resource_options(tconn, &res_opts); |
1941 | 2373 | if (err) { | |
1942 | if (get_ldev(mdev)) { | 2374 | retcode = ERR_INVALID_REQUEST; |
1943 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | 2375 | if (err == -ENOMEM) |
1944 | drbd_al_shrink(mdev); | ||
1945 | err = drbd_check_al_size(mdev); | ||
1946 | lc_unlock(mdev->act_log); | ||
1947 | wake_up(&mdev->al_wait); | ||
1948 | |||
1949 | put_ldev(mdev); | ||
1950 | drbd_md_sync(mdev); | ||
1951 | |||
1952 | if (err) { | ||
1953 | retcode = ERR_NOMEM; | 2376 | retcode = ERR_NOMEM; |
1954 | goto fail; | ||
1955 | } | ||
1956 | } | ||
1957 | |||
1958 | if (mdev->state.conn >= C_CONNECTED) | ||
1959 | drbd_send_sync_param(mdev, &sc); | ||
1960 | |||
1961 | if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { | ||
1962 | cpumask_copy(mdev->cpu_mask, new_cpu_mask); | ||
1963 | drbd_calc_cpu_mask(mdev); | ||
1964 | mdev->receiver.reset_cpu_mask = 1; | ||
1965 | mdev->asender.reset_cpu_mask = 1; | ||
1966 | mdev->worker.reset_cpu_mask = 1; | ||
1967 | } | 2377 | } |
1968 | 2378 | ||
1969 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1970 | fail: | 2379 | fail: |
1971 | kfree(rs_plan_s); | 2380 | drbd_adm_finish(info, retcode); |
1972 | free_cpumask_var(new_cpu_mask); | ||
1973 | crypto_free_hash(csums_tfm); | ||
1974 | crypto_free_hash(verify_tfm); | ||
1975 | reply->ret_code = retcode; | ||
1976 | return 0; | 2381 | return 0; |
1977 | } | 2382 | } |
1978 | 2383 | ||
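drbd_adm_resource_opts() above (like the net-options handler earlier) works on a copy of the live options: optionally reset the copy to defaults, overlay whatever attributes the request carried, and treat a parser result of -ENOMSG ("nothing sent") as success. A generic sketch of that flow; the option struct and the parser are hypothetical, not the generated netlink helpers:

```c
#include <linux/errno.h>
#include <linux/types.h>

struct ex_res_opts {
	unsigned int on_no_data;           /* stand-in for a real policy knob */
};

static void ex_set_res_opts_defaults(struct ex_res_opts *opts)
{
	opts->on_no_data = 0;
}

/* Hypothetical parser: 0 on success, -ENOMSG if the request carried no
 * options at all, another negative value on malformed input. */
static int ex_res_opts_from_request(struct ex_res_opts *opts, const void *req)
{
	return req ? 0 : -ENOMSG;
}

static int ex_apply_res_opts(struct ex_res_opts *live, const void *req, bool set_defaults)
{
	struct ex_res_opts opts = *live;   /* never edit the live copy directly */
	int err;

	if (set_defaults)
		ex_set_res_opts_defaults(&opts);

	err = ex_res_opts_from_request(&opts, req);
	if (err && err != -ENOMSG)
		return err;                /* malformed request: change nothing */

	*live = opts;                      /* commit only after validation */
	return 0;
}
```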
1979 | static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2384 | int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) |
1980 | struct drbd_nl_cfg_reply *reply) | ||
1981 | { | 2385 | { |
1982 | int retcode; | 2386 | struct drbd_conf *mdev; |
2387 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ | ||
2388 | |||
2389 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2390 | if (!adm_ctx.reply_skb) | ||
2391 | return retcode; | ||
2392 | if (retcode != NO_ERROR) | ||
2393 | goto out; | ||
2394 | |||
2395 | mdev = adm_ctx.mdev; | ||
1983 | 2396 | ||
1984 | /* If there is still bitmap IO pending, probably because of a previous | 2397 | /* If there is still bitmap IO pending, probably because of a previous |
1985 | * resync just being finished, wait for it before requesting a new resync. | 2398 | * resync just being finished, wait for it before requesting a new resync. |
1986 | * Also wait for it's after_state_ch(). */ | 2399 | * Also wait for it's after_state_ch(). */ |
1987 | drbd_suspend_io(mdev); | 2400 | drbd_suspend_io(mdev); |
1988 | wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO)); | 2401 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
1989 | drbd_flush_workqueue(mdev); | 2402 | drbd_flush_workqueue(mdev); |
1990 | 2403 | ||
1991 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); | 2404 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); |
@@ -1994,10 +2407,10 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
1994 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2407 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); |
1995 | 2408 | ||
1996 | while (retcode == SS_NEED_CONNECTION) { | 2409 | while (retcode == SS_NEED_CONNECTION) { |
1997 | spin_lock_irq(&mdev->req_lock); | 2410 | spin_lock_irq(&mdev->tconn->req_lock); |
1998 | if (mdev->state.conn < C_CONNECTED) | 2411 | if (mdev->state.conn < C_CONNECTED) |
1999 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | 2412 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); |
2000 | spin_unlock_irq(&mdev->req_lock); | 2413 | spin_unlock_irq(&mdev->tconn->req_lock); |
2001 | 2414 | ||
2002 | if (retcode != SS_NEED_CONNECTION) | 2415 | if (retcode != SS_NEED_CONNECTION) |
2003 | break; | 2416 | break; |
@@ -2006,7 +2419,25 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2006 | } | 2419 | } |
2007 | drbd_resume_io(mdev); | 2420 | drbd_resume_io(mdev); |
2008 | 2421 | ||
2009 | reply->ret_code = retcode; | 2422 | out: |
2423 | drbd_adm_finish(info, retcode); | ||
2424 | return 0; | ||
2425 | } | ||
2426 | |||
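The invalidate handler above retries while the state engine answers SS_NEED_CONNECTION: under the request spinlock it re-checks that the device is really not connected and only then forces the disk to D_INCONSISTENT. A reduced sketch of that lock-then-recheck step, with placeholder state values:

```c
#include <linux/spinlock.h>
#include <linux/types.h>

struct ex_mdev {
	spinlock_t req_lock;               /* protects the state fields below */
	int conn_state;
	int disk_state;
};

#define EX_C_CONNECTED     10
#define EX_D_INCONSISTENT   4

/* Returns true if the fallback state change was applied. */
static bool ex_force_inconsistent_if_unconnected(struct ex_mdev *mdev)
{
	bool changed = false;

	spin_lock_irq(&mdev->req_lock);    /* same lock the state engine takes */
	if (mdev->conn_state < EX_C_CONNECTED) {
		mdev->disk_state = EX_D_INCONSISTENT;
		changed = true;
	}
	spin_unlock_irq(&mdev->req_lock);

	return changed;
}
```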
2427 | static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, | ||
2428 | union drbd_state mask, union drbd_state val) | ||
2429 | { | ||
2430 | enum drbd_ret_code retcode; | ||
2431 | |||
2432 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2433 | if (!adm_ctx.reply_skb) | ||
2434 | return retcode; | ||
2435 | if (retcode != NO_ERROR) | ||
2436 | goto out; | ||
2437 | |||
2438 | retcode = drbd_request_state(adm_ctx.mdev, mask, val); | ||
2439 | out: | ||
2440 | drbd_adm_finish(info, retcode); | ||
2010 | return 0; | 2441 | return 0; |
2011 | } | 2442 | } |
2012 | 2443 | ||
@@ -2019,29 +2450,36 @@ static int drbd_bmio_set_susp_al(struct drbd_conf *mdev) | |||
2019 | return rv; | 2450 | return rv; |
2020 | } | 2451 | } |
2021 | 2452 | ||
2022 | static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2453 | int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) |
2023 | struct drbd_nl_cfg_reply *reply) | ||
2024 | { | 2454 | { |
2025 | int retcode; | 2455 | int retcode; /* drbd_ret_code, drbd_state_rv */ |
2456 | struct drbd_conf *mdev; | ||
2457 | |||
2458 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2459 | if (!adm_ctx.reply_skb) | ||
2460 | return retcode; | ||
2461 | if (retcode != NO_ERROR) | ||
2462 | goto out; | ||
2463 | |||
2464 | mdev = adm_ctx.mdev; | ||
2026 | 2465 | ||
2027 | /* If there is still bitmap IO pending, probably because of a previous | 2466 | /* If there is still bitmap IO pending, probably because of a previous |
2028 | * resync just being finished, wait for it before requesting a new resync. | 2467 | * resync just being finished, wait for it before requesting a new resync. |
2029 | * Also wait for it's after_state_ch(). */ | 2468 | * Also wait for it's after_state_ch(). */ |
2030 | drbd_suspend_io(mdev); | 2469 | drbd_suspend_io(mdev); |
2031 | wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO)); | 2470 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2032 | drbd_flush_workqueue(mdev); | 2471 | drbd_flush_workqueue(mdev); |
2033 | 2472 | ||
2034 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); | 2473 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); |
2035 | |||
2036 | if (retcode < SS_SUCCESS) { | 2474 | if (retcode < SS_SUCCESS) { |
2037 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { | 2475 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { |
2038 | /* The peer will get a resync upon connect anyways. Just make that | 2476 | /* The peer will get a resync upon connect anyways. |
2039 | into a full resync. */ | 2477 | * Just make that into a full resync. */ |
2040 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); | 2478 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); |
2041 | if (retcode >= SS_SUCCESS) { | 2479 | if (retcode >= SS_SUCCESS) { |
2042 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, | 2480 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, |
2043 | "set_n_write from invalidate_peer", | 2481 | "set_n_write from invalidate_peer", |
2044 | BM_LOCKED_SET_ALLOWED)) | 2482 | BM_LOCKED_SET_ALLOWED)) |
2045 | retcode = ERR_IO_MD_DISK; | 2483 | retcode = ERR_IO_MD_DISK; |
2046 | } | 2484 | } |
2047 | } else | 2485 | } else |
@@ -2049,30 +2487,41 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re | |||
2049 | } | 2487 | } |
2050 | drbd_resume_io(mdev); | 2488 | drbd_resume_io(mdev); |
2051 | 2489 | ||
2052 | reply->ret_code = retcode; | 2490 | out: |
2491 | drbd_adm_finish(info, retcode); | ||
2053 | return 0; | 2492 | return 0; |
2054 | } | 2493 | } |
2055 | 2494 | ||
2056 | static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2495 | int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) |
2057 | struct drbd_nl_cfg_reply *reply) | ||
2058 | { | 2496 | { |
2059 | int retcode = NO_ERROR; | 2497 | enum drbd_ret_code retcode; |
2060 | 2498 | ||
2061 | if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) | 2499 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2062 | retcode = ERR_PAUSE_IS_SET; | 2500 | if (!adm_ctx.reply_skb) |
2501 | return retcode; | ||
2502 | if (retcode != NO_ERROR) | ||
2503 | goto out; | ||
2063 | 2504 | ||
2064 | reply->ret_code = retcode; | 2505 | if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) |
2506 | retcode = ERR_PAUSE_IS_SET; | ||
2507 | out: | ||
2508 | drbd_adm_finish(info, retcode); | ||
2065 | return 0; | 2509 | return 0; |
2066 | } | 2510 | } |
2067 | 2511 | ||
2068 | static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2512 | int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) |
2069 | struct drbd_nl_cfg_reply *reply) | ||
2070 | { | 2513 | { |
2071 | int retcode = NO_ERROR; | 2514 | union drbd_dev_state s; |
2072 | union drbd_state s; | 2515 | enum drbd_ret_code retcode; |
2516 | |||
2517 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2518 | if (!adm_ctx.reply_skb) | ||
2519 | return retcode; | ||
2520 | if (retcode != NO_ERROR) | ||
2521 | goto out; | ||
2073 | 2522 | ||
2074 | if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { | 2523 | if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { |
2075 | s = mdev->state; | 2524 | s = adm_ctx.mdev->state; |
2076 | if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { | 2525 | if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { |
2077 | retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : | 2526 | retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : |
2078 | s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; | 2527 | s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; |
@@ -2081,178 +2530,482 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n | |||
2081 | } | 2530 | } |
2082 | } | 2531 | } |
2083 | 2532 | ||
2084 | reply->ret_code = retcode; | 2533 | out: |
2534 | drbd_adm_finish(info, retcode); | ||
2085 | return 0; | 2535 | return 0; |
2086 | } | 2536 | } |
2087 | 2537 | ||
2088 | static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2538 | int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) |
2089 | struct drbd_nl_cfg_reply *reply) | ||
2090 | { | 2539 | { |
2091 | reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); | 2540 | return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); |
2092 | |||
2093 | return 0; | ||
2094 | } | 2541 | } |
2095 | 2542 | ||
2096 | static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2543 | int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) |
2097 | struct drbd_nl_cfg_reply *reply) | ||
2098 | { | 2544 | { |
2099 | if (drbd_test_flag(mdev, NEW_CUR_UUID)) { | 2545 | struct drbd_conf *mdev; |
2546 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ | ||
2547 | |||
2548 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2549 | if (!adm_ctx.reply_skb) | ||
2550 | return retcode; | ||
2551 | if (retcode != NO_ERROR) | ||
2552 | goto out; | ||
2553 | |||
2554 | mdev = adm_ctx.mdev; | ||
2555 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | ||
2100 | drbd_uuid_new_current(mdev); | 2556 | drbd_uuid_new_current(mdev); |
2101 | drbd_clear_flag(mdev, NEW_CUR_UUID); | 2557 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
2102 | } | 2558 | } |
2103 | drbd_suspend_io(mdev); | 2559 | drbd_suspend_io(mdev); |
2104 | reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); | 2560 | retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); |
2105 | if (reply->ret_code == SS_SUCCESS) { | 2561 | if (retcode == SS_SUCCESS) { |
2106 | if (mdev->state.conn < C_CONNECTED) | 2562 | if (mdev->state.conn < C_CONNECTED) |
2107 | tl_clear(mdev); | 2563 | tl_clear(mdev->tconn); |
2108 | if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) | 2564 | if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) |
2109 | tl_restart(mdev, fail_frozen_disk_io); | 2565 | tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO); |
2110 | } | 2566 | } |
2111 | drbd_resume_io(mdev); | 2567 | drbd_resume_io(mdev); |
2112 | 2568 | ||
2569 | out: | ||
2570 | drbd_adm_finish(info, retcode); | ||
2113 | return 0; | 2571 | return 0; |
2114 | } | 2572 | } |
2115 | 2573 | ||
2116 | static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2574 | int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) |
2117 | struct drbd_nl_cfg_reply *reply) | ||
2118 | { | 2575 | { |
2119 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); | 2576 | return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); |
2120 | return 0; | ||
2121 | } | 2577 | } |
2122 | 2578 | ||
2123 | static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2579 | int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) |
2124 | struct drbd_nl_cfg_reply *reply) | ||
2125 | { | 2580 | { |
2126 | unsigned short *tl; | 2581 | struct nlattr *nla; |
2582 | nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); | ||
2583 | if (!nla) | ||
2584 | goto nla_put_failure; | ||
2585 | if (vnr != VOLUME_UNSPECIFIED && | ||
2586 | nla_put_u32(skb, T_ctx_volume, vnr)) | ||
2587 | goto nla_put_failure; | ||
2588 | if (nla_put_string(skb, T_ctx_resource_name, tconn->name)) | ||
2589 | goto nla_put_failure; | ||
2590 | if (tconn->my_addr_len && | ||
2591 | nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr)) | ||
2592 | goto nla_put_failure; | ||
2593 | if (tconn->peer_addr_len && | ||
2594 | nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr)) | ||
2595 | goto nla_put_failure; | ||
2596 | nla_nest_end(skb, nla); | ||
2597 | return 0; | ||
2127 | 2598 | ||
2128 | tl = reply->tag_list; | 2599 | nla_put_failure: |
2600 | if (nla) | ||
2601 | nla_nest_cancel(skb, nla); | ||
2602 | return -EMSGSIZE; | ||
2603 | } | ||
2129 | 2604 | ||
2130 | if (get_ldev(mdev)) { | 2605 | int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, |
2131 | tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); | 2606 | const struct sib_info *sib) |
2132 | put_ldev(mdev); | 2607 | { |
2133 | } | 2608 | struct state_info *si = NULL; /* for sizeof(si->member); */ |
2609 | struct net_conf *nc; | ||
2610 | struct nlattr *nla; | ||
2611 | int got_ldev; | ||
2612 | int err = 0; | ||
2613 | int exclude_sensitive; | ||
2614 | |||
2615 | /* If sib != NULL, this is drbd_bcast_event, which anyone can listen | ||
2616 | * to. So we'd better exclude_sensitive information. | ||
2617 | * | ||
2618 | * If sib == NULL, this is drbd_adm_get_status, executed synchronously | ||
2619 | * in the context of the requesting user process. Exclude sensitive | ||
2620 | * information, unless current has superuser. | ||
2621 | * | ||
2622 | * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and | ||
2623 | * relies on the current implementation of netlink_dump(), which | ||
2624 | * executes the dump callback successively from netlink_recvmsg(), | ||
2625 | * always in the context of the receiving process */ | ||
2626 | exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); | ||
2627 | |||
2628 | got_ldev = get_ldev(mdev); | ||
2629 | |||
2630 | /* We need to add connection name and volume number information still. | ||
2631 | * Minor number is in drbd_genlmsghdr. */ | ||
2632 | if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) | ||
2633 | goto nla_put_failure; | ||
2634 | |||
2635 | if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) | ||
2636 | goto nla_put_failure; | ||
2637 | |||
2638 | rcu_read_lock(); | ||
2639 | if (got_ldev) | ||
2640 | if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) | ||
2641 | goto nla_put_failure; | ||
2642 | |||
2643 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2644 | if (nc) | ||
2645 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | ||
2646 | rcu_read_unlock(); | ||
2647 | if (err) | ||
2648 | goto nla_put_failure; | ||
2649 | |||
2650 | nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); | ||
2651 | if (!nla) | ||
2652 | goto nla_put_failure; | ||
2653 | if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || | ||
2654 | nla_put_u32(skb, T_current_state, mdev->state.i) || | ||
2655 | nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) || | ||
2656 | nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) || | ||
2657 | nla_put_u64(skb, T_send_cnt, mdev->send_cnt) || | ||
2658 | nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) || | ||
2659 | nla_put_u64(skb, T_read_cnt, mdev->read_cnt) || | ||
2660 | nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) || | ||
2661 | nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) || | ||
2662 | nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) || | ||
2663 | nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) || | ||
2664 | nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) || | ||
2665 | nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt))) | ||
2666 | goto nla_put_failure; | ||
2667 | |||
2668 | if (got_ldev) { | ||
2669 | int err; | ||
2134 | 2670 | ||
2135 | if (get_net_conf(mdev)) { | 2671 | spin_lock_irq(&mdev->ldev->md.uuid_lock); |
2136 | tl = net_conf_to_tags(mdev, mdev->net_conf, tl); | 2672 | err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); |
2137 | put_net_conf(mdev); | 2673 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); |
2674 | |||
2675 | if (err) | ||
2676 | goto nla_put_failure; | ||
2677 | |||
2678 | if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) || | ||
2679 | nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) || | ||
2680 | nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev))) | ||
2681 | goto nla_put_failure; | ||
2682 | if (C_SYNC_SOURCE <= mdev->state.conn && | ||
2683 | C_PAUSED_SYNC_T >= mdev->state.conn) { | ||
2684 | if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) || | ||
2685 | nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed)) | ||
2686 | goto nla_put_failure; | ||
2687 | } | ||
2138 | } | 2688 | } |
2139 | tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); | ||
2140 | 2689 | ||
2141 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2690 | if (sib) { |
2691 | switch(sib->sib_reason) { | ||
2692 | case SIB_SYNC_PROGRESS: | ||
2693 | case SIB_GET_STATUS_REPLY: | ||
2694 | break; | ||
2695 | case SIB_STATE_CHANGE: | ||
2696 | if (nla_put_u32(skb, T_prev_state, sib->os.i) || | ||
2697 | nla_put_u32(skb, T_new_state, sib->ns.i)) | ||
2698 | goto nla_put_failure; | ||
2699 | break; | ||
2700 | case SIB_HELPER_POST: | ||
2701 | if (nla_put_u32(skb, T_helper_exit_code, | ||
2702 | sib->helper_exit_code)) | ||
2703 | goto nla_put_failure; | ||
2704 | /* fall through */ | ||
2705 | case SIB_HELPER_PRE: | ||
2706 | if (nla_put_string(skb, T_helper, sib->helper_name)) | ||
2707 | goto nla_put_failure; | ||
2708 | break; | ||
2709 | } | ||
2710 | } | ||
2711 | nla_nest_end(skb, nla); | ||
2142 | 2712 | ||
2143 | return (int)((char *)tl - (char *)reply->tag_list); | 2713 | if (0) |
2714 | nla_put_failure: | ||
2715 | err = -EMSGSIZE; | ||
2716 | if (got_ldev) | ||
2717 | put_ldev(mdev); | ||
2718 | return err; | ||
2144 | } | 2719 | } |
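A side note on the "if (0) nla_put_failure:" construct in nla_put_status_info() above: it is a common kernel idiom that lets the success path fall through into the same cleanup tail that the error gotos jump to, without duplicating the put_ldev()/return sequence. A minimal generic sketch of the pattern (illustrative only; do_work() and cleanup() are placeholders, not DRBD functions):

	int example(void)
	{
		int err = 0;

		if (do_work() < 0)
			goto fail;
		/* ... more work, each failure does goto fail ... */

		if (0)
	fail:
			err = -EINVAL;	/* only reached via goto fail */
		cleanup();		/* shared by success and error paths */
		return err;
	}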
2145 | 2720 | ||
2146 | static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2721 | int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) |
2147 | struct drbd_nl_cfg_reply *reply) | ||
2148 | { | 2722 | { |
2149 | unsigned short *tl = reply->tag_list; | 2723 | enum drbd_ret_code retcode; |
2150 | union drbd_state s = mdev->state; | 2724 | int err; |
2151 | unsigned long rs_left; | ||
2152 | unsigned int res; | ||
2153 | 2725 | ||
2154 | tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); | 2726 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2727 | if (!adm_ctx.reply_skb) | ||
2728 | return retcode; | ||
2729 | if (retcode != NO_ERROR) | ||
2730 | goto out; | ||
2155 | 2731 | ||
2156 | /* no local ref, no bitmap, no syncer progress. */ | 2732 | err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); |
2157 | if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { | 2733 | if (err) { |
2158 | if (get_ldev(mdev)) { | 2734 | nlmsg_free(adm_ctx.reply_skb); |
2159 | drbd_get_syncer_progress(mdev, &rs_left, &res); | 2735 | return err; |
2160 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
2161 | put_ldev(mdev); | ||
2162 | } | ||
2163 | } | 2736 | } |
2164 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2737 | out: |
2165 | 2738 | drbd_adm_finish(info, retcode); | |
2166 | return (int)((char *)tl - (char *)reply->tag_list); | 2739 | return 0; |
2167 | } | 2740 | } |
2168 | 2741 | ||
2169 | static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2742 | int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) |
2170 | struct drbd_nl_cfg_reply *reply) | ||
2171 | { | 2743 | { |
2172 | unsigned short *tl; | 2744 | struct drbd_conf *mdev; |
2173 | 2745 | struct drbd_genlmsghdr *dh; | |
2174 | tl = reply->tag_list; | 2746 | struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0]; |
2747 | struct drbd_tconn *tconn = NULL; | ||
2748 | struct drbd_tconn *tmp; | ||
2749 | unsigned volume = cb->args[1]; | ||
2750 | |||
2751 | /* Open-coded, deferred iteration: | ||
2752 | * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { | ||
2753 | * idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
2754 | * ... | ||
2755 | * } | ||
2756 | * } | ||
2757 | * where tconn is cb->args[0]; | ||
2758 | * and i is cb->args[1]; | ||
2759 | * | ||
2760 | * cb->args[2] indicates if we shall loop over all resources, | ||
2761 | * or just dump all volumes of a single resource. | ||
2762 | * | ||
2763 | * This may miss entries inserted after this dump started, | ||
2764 | * or entries deleted before they are reached. | ||
2765 | * | ||
2766 | * We need to make sure the mdev won't disappear while | ||
2767 | * we are looking at it, and revalidate our iterators | ||
2768 | * on each iteration. | ||
2769 | */ | ||
2175 | 2770 | ||
2176 | if (get_ldev(mdev)) { | 2771 | /* synchronize with conn_create()/conn_destroy() */ |
2177 | unsigned long flags; | 2772 | rcu_read_lock(); |
2178 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | 2773 | /* revalidate iterator position */ |
2179 | tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); | 2774 | list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) { |
2180 | tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); | 2775 | if (pos == NULL) { |
2181 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | 2776 | /* first iteration */ |
2182 | put_ldev(mdev); | 2777 | pos = tmp; |
2778 | tconn = pos; | ||
2779 | break; | ||
2780 | } | ||
2781 | if (tmp == pos) { | ||
2782 | tconn = pos; | ||
2783 | break; | ||
2784 | } | ||
2183 | } | 2785 | } |
2184 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2786 | if (tconn) { |
2787 | next_tconn: | ||
2788 | mdev = idr_get_next(&tconn->volumes, &volume); | ||
2789 | if (!mdev) { | ||
2790 | /* No more volumes to dump on this tconn. | ||
2791 | * Advance tconn iterator. */ | ||
2792 | pos = list_entry_rcu(tconn->all_tconn.next, | ||
2793 | struct drbd_tconn, all_tconn); | ||
2794 | /* Did we dump any volume on this tconn yet? */ | ||
2795 | if (volume != 0) { | ||
2796 | /* If we reached the end of the list, | ||
2797 | * or only a single resource dump was requested, | ||
2798 | * we are done. */ | ||
2799 | if (&pos->all_tconn == &drbd_tconns || cb->args[2]) | ||
2800 | goto out; | ||
2801 | volume = 0; | ||
2802 | tconn = pos; | ||
2803 | goto next_tconn; | ||
2804 | } | ||
2805 | } | ||
2185 | 2806 | ||
2186 | return (int)((char *)tl - (char *)reply->tag_list); | 2807 | dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, |
2808 | cb->nlh->nlmsg_seq, &drbd_genl_family, | ||
2809 | NLM_F_MULTI, DRBD_ADM_GET_STATUS); | ||
2810 | if (!dh) | ||
2811 | goto out; | ||
2812 | |||
2813 | if (!mdev) { | ||
2814 | /* This is a tconn without a single volume. | ||
2815 | * Suprisingly enough, it may have a network | ||
2816 | * configuration. */ | ||
2817 | struct net_conf *nc; | ||
2818 | dh->minor = -1U; | ||
2819 | dh->ret_code = NO_ERROR; | ||
2820 | if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED)) | ||
2821 | goto cancel; | ||
2822 | nc = rcu_dereference(tconn->net_conf); | ||
2823 | if (nc && net_conf_to_skb(skb, nc, 1) != 0) | ||
2824 | goto cancel; | ||
2825 | goto done; | ||
2826 | } | ||
2827 | |||
2828 | D_ASSERT(mdev->vnr == volume); | ||
2829 | D_ASSERT(mdev->tconn == tconn); | ||
2830 | |||
2831 | dh->minor = mdev_to_minor(mdev); | ||
2832 | dh->ret_code = NO_ERROR; | ||
2833 | |||
2834 | if (nla_put_status_info(skb, mdev, NULL)) { | ||
2835 | cancel: | ||
2836 | genlmsg_cancel(skb, dh); | ||
2837 | goto out; | ||
2838 | } | ||
2839 | done: | ||
2840 | genlmsg_end(skb, dh); | ||
2841 | } | ||
2842 | |||
2843 | out: | ||
2844 | rcu_read_unlock(); | ||
2845 | /* where to start the next iteration */ | ||
2846 | cb->args[0] = (long)pos; | ||
2847 | cb->args[1] = (pos == tconn) ? volume + 1 : 0; | ||
2848 | |||
2849 | /* No more tconns/volumes/minors found results in an empty skb. | ||
2850 | * Which will terminate the dump. */ | ||
2851 | return skb->len; | ||
2187 | } | 2852 | } |
2188 | 2853 | ||
2189 | /** | 2854 | /* |
2190 | * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use | 2855 | * Request status of all resources, or of all volumes within a single resource. |
2191 | * @mdev: DRBD device. | 2856 | * |
2192 | * @nlp: Netlink/connector packet from drbdsetup | 2857 | * This is a dump, as the answer may not fit in a single reply skb otherwise. |
2193 | * @reply: Reply packet for drbdsetup | 2858 | * Which means we cannot use the family->attrbuf or other such members, because |
2859 | * dump is NOT protected by the genl_lock(). During dump, we only have access | ||
2860 | * to the incoming skb, and need to opencode "parsing" of the nlattr payload. | ||
2861 | * | ||
2862 | * Once things are setup properly, we call into get_one_status(). | ||
2194 | */ | 2863 | */ |
2195 | static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2864 | int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) |
2196 | struct drbd_nl_cfg_reply *reply) | ||
2197 | { | 2865 | { |
2198 | unsigned short *tl; | 2866 | const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; |
2199 | char rv; | 2867 | struct nlattr *nla; |
2868 | const char *resource_name; | ||
2869 | struct drbd_tconn *tconn; | ||
2870 | int maxtype; | ||
2871 | |||
2872 | /* Is this a followup call? */ | ||
2873 | if (cb->args[0]) { | ||
2874 | /* ... of a single resource dump, | ||
2875 | * and the resource iterator has been advanced already? */ | ||
2876 | if (cb->args[2] && cb->args[2] != cb->args[0]) | ||
2877 | return 0; /* DONE. */ | ||
2878 | goto dump; | ||
2879 | } | ||
2880 | |||
2881 | /* First call (from netlink_dump_start). We need to figure out | ||
2882 | * which resource(s) the user wants us to dump. */ | ||
2883 | nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), | ||
2884 | nlmsg_attrlen(cb->nlh, hdrlen), | ||
2885 | DRBD_NLA_CFG_CONTEXT); | ||
2886 | |||
2887 | /* No explicit context given. Dump all. */ | ||
2888 | if (!nla) | ||
2889 | goto dump; | ||
2890 | maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; | ||
2891 | nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); | ||
2892 | if (IS_ERR(nla)) | ||
2893 | return PTR_ERR(nla); | ||
2894 | /* context given, but no name present? */ | ||
2895 | if (!nla) | ||
2896 | return -EINVAL; | ||
2897 | resource_name = nla_data(nla); | ||
2898 | tconn = conn_get_by_name(resource_name); | ||
2899 | |||
2900 | if (!tconn) | ||
2901 | return -ENODEV; | ||
2902 | |||
2903 | kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ | ||
2904 | |||
2905 | /* prime iterators, and set "filter" mode mark: | ||
2906 | * only dump this tconn. */ | ||
2907 | cb->args[0] = (long)tconn; | ||
2908 | /* cb->args[1] = 0; passed in this way. */ | ||
2909 | cb->args[2] = (long)tconn; | ||
2910 | |||
2911 | dump: | ||
2912 | return get_one_status(skb, cb); | ||
2913 | } | ||
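For readers new to netlink dump callbacks: the iterator state shared by get_one_status() and drbd_adm_get_status_all() lives in the netlink_callback scratch array. Purely as an illustration (the driver itself keeps using the raw cb->args[] longs), the three slots can be read as:

	/* Illustrative naming of the cb->args[] slots used above. */
	struct drbd_status_dump_pos {
		long tconn;	/* cb->args[0]: tconn to resume from, a pointer cast to long */
		long volume;	/* cb->args[1]: next volume id to dump within that tconn */
		long filter;	/* cb->args[2]: non-zero means dump only this one tconn */
	};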
2200 | 2914 | ||
2201 | tl = reply->tag_list; | 2915 | int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) |
2916 | { | ||
2917 | enum drbd_ret_code retcode; | ||
2918 | struct timeout_parms tp; | ||
2919 | int err; | ||
2202 | 2920 | ||
2203 | rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : | 2921 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2204 | drbd_test_flag(mdev, USE_DEGR_WFC_T) ? UT_DEGRADED : UT_DEFAULT; | 2922 | if (!adm_ctx.reply_skb) |
2923 | return retcode; | ||
2924 | if (retcode != NO_ERROR) | ||
2925 | goto out; | ||
2205 | 2926 | ||
2206 | tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); | 2927 | tp.timeout_type = |
2207 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2928 | adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : |
2929 | test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED : | ||
2930 | UT_DEFAULT; | ||
2208 | 2931 | ||
2209 | return (int)((char *)tl - (char *)reply->tag_list); | 2932 | err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); |
2933 | if (err) { | ||
2934 | nlmsg_free(adm_ctx.reply_skb); | ||
2935 | return err; | ||
2936 | } | ||
2937 | out: | ||
2938 | drbd_adm_finish(info, retcode); | ||
2939 | return 0; | ||
2210 | } | 2940 | } |
2211 | 2941 | ||
2212 | static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2942 | int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) |
2213 | struct drbd_nl_cfg_reply *reply) | ||
2214 | { | 2943 | { |
2215 | /* default to resume from last known position, if possible */ | 2944 | struct drbd_conf *mdev; |
2216 | struct start_ov args = { | 2945 | enum drbd_ret_code retcode; |
2217 | .start_sector = mdev->ov_start_sector, | 2946 | struct start_ov_parms parms; |
2218 | .stop_sector = ULLONG_MAX, | ||
2219 | }; | ||
2220 | 2947 | ||
2221 | if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { | 2948 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2222 | reply->ret_code = ERR_MANDATORY_TAG; | 2949 | if (!adm_ctx.reply_skb) |
2223 | return 0; | 2950 | return retcode; |
2951 | if (retcode != NO_ERROR) | ||
2952 | goto out; | ||
2953 | |||
2954 | mdev = adm_ctx.mdev; | ||
2955 | |||
2956 | /* resume from last known position, if possible */ | ||
2957 | parms.ov_start_sector = mdev->ov_start_sector; | ||
2958 | parms.ov_stop_sector = ULLONG_MAX; | ||
2959 | if (info->attrs[DRBD_NLA_START_OV_PARMS]) { | ||
2960 | int err = start_ov_parms_from_attrs(&parms, info); | ||
2961 | if (err) { | ||
2962 | retcode = ERR_MANDATORY_TAG; | ||
2963 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2964 | goto out; | ||
2965 | } | ||
2224 | } | 2966 | } |
2967 | /* w_make_ov_request expects position to be aligned */ | ||
2968 | mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); | ||
2969 | mdev->ov_stop_sector = parms.ov_stop_sector; | ||
2225 | 2970 | ||
2226 | /* If there is still bitmap IO pending, e.g. previous resync or verify | 2971 | /* If there is still bitmap IO pending, e.g. previous resync or verify |
2227 | * just being finished, wait for it before requesting a new resync. */ | 2972 | * just being finished, wait for it before requesting a new resync. */ |
2228 | drbd_suspend_io(mdev); | 2973 | drbd_suspend_io(mdev); |
2229 | wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO)); | 2974 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2230 | 2975 | retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | |
2231 | /* w_make_ov_request expects start position to be aligned */ | ||
2232 | mdev->ov_start_sector = args.start_sector & ~(BM_SECT_PER_BIT-1); | ||
2233 | mdev->ov_stop_sector = args.stop_sector; | ||
2234 | reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | ||
2235 | drbd_resume_io(mdev); | 2976 | drbd_resume_io(mdev); |
2977 | out: | ||
2978 | drbd_adm_finish(info, retcode); | ||
2236 | return 0; | 2979 | return 0; |
2237 | } | 2980 | } |
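The start-sector alignment in drbd_adm_start_ov() rounds the user-supplied position down to the resync bitmap granularity, since w_make_ov_request works in whole bitmap bits. A worked example, assuming the usual 4 KiB per bitmap bit on 512-byte sectors (i.e. BM_SECT_PER_BIT == 8):

	sector_t start = 1234;				/* requested by the user */
	sector_t aligned = start & ~(sector_t)(8 - 1);	/* 1234 & ~7 == 1232 */
	/* online verify then starts at sector 1232, the 4 KiB block containing 1234 */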
2238 | 2981 | ||
2239 | 2982 | ||
2240 | static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2983 | int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) |
2241 | struct drbd_nl_cfg_reply *reply) | ||
2242 | { | 2984 | { |
2243 | int retcode = NO_ERROR; | 2985 | struct drbd_conf *mdev; |
2986 | enum drbd_ret_code retcode; | ||
2244 | int skip_initial_sync = 0; | 2987 | int skip_initial_sync = 0; |
2245 | int err; | 2988 | int err; |
2989 | struct new_c_uuid_parms args; | ||
2246 | 2990 | ||
2247 | struct new_c_uuid args; | 2991 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2992 | if (!adm_ctx.reply_skb) | ||
2993 | return retcode; | ||
2994 | if (retcode != NO_ERROR) | ||
2995 | goto out_nolock; | ||
2248 | 2996 | ||
2249 | memset(&args, 0, sizeof(struct new_c_uuid)); | 2997 | mdev = adm_ctx.mdev; |
2250 | if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { | 2998 | memset(&args, 0, sizeof(args)); |
2251 | reply->ret_code = ERR_MANDATORY_TAG; | 2999 | if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { |
2252 | return 0; | 3000 | err = new_c_uuid_parms_from_attrs(&args, info); |
3001 | if (err) { | ||
3002 | retcode = ERR_MANDATORY_TAG; | ||
3003 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
3004 | goto out_nolock; | ||
3005 | } | ||
2253 | } | 3006 | } |
2254 | 3007 | ||
2255 | mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ | 3008 | mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */ |
2256 | 3009 | ||
2257 | if (!get_ldev(mdev)) { | 3010 | if (!get_ldev(mdev)) { |
2258 | retcode = ERR_NO_DISK; | 3011 | retcode = ERR_NO_DISK; |
@@ -2260,7 +3013,7 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2260 | } | 3013 | } |
2261 | 3014 | ||
2262 | /* this is "skip initial sync", assume to be clean */ | 3015 | /* this is "skip initial sync", assume to be clean */ |
2263 | if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && | 3016 | if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 && |
2264 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { | 3017 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { |
2265 | dev_info(DEV, "Preparing to skip initial sync\n"); | 3018 | dev_info(DEV, "Preparing to skip initial sync\n"); |
2266 | skip_initial_sync = 1; | 3019 | skip_initial_sync = 1; |
@@ -2283,10 +3036,10 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2283 | drbd_send_uuids_skip_initial_sync(mdev); | 3036 | drbd_send_uuids_skip_initial_sync(mdev); |
2284 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | 3037 | _drbd_uuid_set(mdev, UI_BITMAP, 0); |
2285 | drbd_print_uuids(mdev, "cleared bitmap UUID"); | 3038 | drbd_print_uuids(mdev, "cleared bitmap UUID"); |
2286 | spin_lock_irq(&mdev->req_lock); | 3039 | spin_lock_irq(&mdev->tconn->req_lock); |
2287 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | 3040 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), |
2288 | CS_VERBOSE, NULL); | 3041 | CS_VERBOSE, NULL); |
2289 | spin_unlock_irq(&mdev->req_lock); | 3042 | spin_unlock_irq(&mdev->tconn->req_lock); |
2290 | } | 3043 | } |
2291 | } | 3044 | } |
2292 | 3045 | ||
@@ -2294,416 +3047,283 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2294 | out_dec: | 3047 | out_dec: |
2295 | put_ldev(mdev); | 3048 | put_ldev(mdev); |
2296 | out: | 3049 | out: |
2297 | mutex_unlock(&mdev->state_mutex); | 3050 | mutex_unlock(mdev->state_mutex); |
2298 | 3051 | out_nolock: | |
2299 | reply->ret_code = retcode; | 3052 | drbd_adm_finish(info, retcode); |
2300 | return 0; | 3053 | return 0; |
2301 | } | 3054 | } |
2302 | 3055 | ||
2303 | struct cn_handler_struct { | 3056 | static enum drbd_ret_code |
2304 | int (*function)(struct drbd_conf *, | 3057 | drbd_check_resource_name(const char *name) |
2305 | struct drbd_nl_cfg_req *, | ||
2306 | struct drbd_nl_cfg_reply *); | ||
2307 | int reply_body_size; | ||
2308 | }; | ||
2309 | |||
2310 | static struct cn_handler_struct cnd_table[] = { | ||
2311 | [ P_primary ] = { &drbd_nl_primary, 0 }, | ||
2312 | [ P_secondary ] = { &drbd_nl_secondary, 0 }, | ||
2313 | [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, | ||
2314 | [ P_detach ] = { &drbd_nl_detach, 0 }, | ||
2315 | [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, | ||
2316 | [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, | ||
2317 | [ P_resize ] = { &drbd_nl_resize, 0 }, | ||
2318 | [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, | ||
2319 | [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, | ||
2320 | [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, | ||
2321 | [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, | ||
2322 | [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, | ||
2323 | [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, | ||
2324 | [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, | ||
2325 | [ P_outdate ] = { &drbd_nl_outdate, 0 }, | ||
2326 | [ P_get_config ] = { &drbd_nl_get_config, | ||
2327 | sizeof(struct syncer_conf_tag_len_struct) + | ||
2328 | sizeof(struct disk_conf_tag_len_struct) + | ||
2329 | sizeof(struct net_conf_tag_len_struct) }, | ||
2330 | [ P_get_state ] = { &drbd_nl_get_state, | ||
2331 | sizeof(struct get_state_tag_len_struct) + | ||
2332 | sizeof(struct sync_progress_tag_len_struct) }, | ||
2333 | [ P_get_uuids ] = { &drbd_nl_get_uuids, | ||
2334 | sizeof(struct get_uuids_tag_len_struct) }, | ||
2335 | [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, | ||
2336 | sizeof(struct get_timeout_flag_tag_len_struct)}, | ||
2337 | [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, | ||
2338 | [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, | ||
2339 | }; | ||
2340 | |||
2341 | static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) | ||
2342 | { | 3058 | { |
2343 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; | 3059 | if (!name || !name[0]) { |
2344 | struct cn_handler_struct *cm; | 3060 | drbd_msg_put_info("resource name missing"); |
2345 | struct cn_msg *cn_reply; | 3061 | return ERR_MANDATORY_TAG; |
2346 | struct drbd_nl_cfg_reply *reply; | ||
2347 | struct drbd_conf *mdev; | ||
2348 | int retcode, rr; | ||
2349 | int reply_size = sizeof(struct cn_msg) | ||
2350 | + sizeof(struct drbd_nl_cfg_reply) | ||
2351 | + sizeof(short int); | ||
2352 | |||
2353 | if (!try_module_get(THIS_MODULE)) { | ||
2354 | printk(KERN_ERR "drbd: try_module_get() failed!\n"); | ||
2355 | return; | ||
2356 | } | ||
2357 | |||
2358 | if (!capable(CAP_SYS_ADMIN)) { | ||
2359 | retcode = ERR_PERM; | ||
2360 | goto fail; | ||
2361 | } | 3062 | } |
2362 | 3063 | /* if we want to use these in sysfs/configfs/debugfs some day, | |
2363 | mdev = ensure_mdev(nlp->drbd_minor, | 3064 | * we must not allow slashes */ |
2364 | (nlp->flags & DRBD_NL_CREATE_DEVICE)); | 3065 | if (strchr(name, '/')) { |
2365 | if (!mdev) { | 3066 | drbd_msg_put_info("invalid resource name"); |
2366 | retcode = ERR_MINOR_INVALID; | 3067 | return ERR_INVALID_REQUEST; |
2367 | goto fail; | ||
2368 | } | 3068 | } |
3069 | return NO_ERROR; | ||
3070 | } | ||
2369 | 3071 | ||
2370 | if (nlp->packet_type >= P_nl_after_last_packet || | 3072 | int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) |
2371 | nlp->packet_type == P_return_code_only) { | 3073 | { |
2372 | retcode = ERR_PACKET_NR; | 3074 | enum drbd_ret_code retcode; |
2373 | goto fail; | 3075 | struct res_opts res_opts; |
2374 | } | 3076 | int err; |
2375 | 3077 | ||
2376 | cm = cnd_table + nlp->packet_type; | 3078 | retcode = drbd_adm_prepare(skb, info, 0); |
3079 | if (!adm_ctx.reply_skb) | ||
3080 | return retcode; | ||
3081 | if (retcode != NO_ERROR) | ||
3082 | goto out; | ||
2377 | 3083 | ||
2378 | /* This may happen if packet number is 0: */ | 3084 | set_res_opts_defaults(&res_opts); |
2379 | if (cm->function == NULL) { | 3085 | err = res_opts_from_attrs(&res_opts, info); |
2380 | retcode = ERR_PACKET_NR; | 3086 | if (err && err != -ENOMSG) { |
2381 | goto fail; | 3087 | retcode = ERR_MANDATORY_TAG; |
3088 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
3089 | goto out; | ||
2382 | } | 3090 | } |
2383 | 3091 | ||
2384 | reply_size += cm->reply_body_size; | 3092 | retcode = drbd_check_resource_name(adm_ctx.resource_name); |
3093 | if (retcode != NO_ERROR) | ||
3094 | goto out; | ||
2385 | 3095 | ||
2386 | /* allocation not in the IO path, cqueue thread context */ | 3096 | if (adm_ctx.tconn) { |
2387 | cn_reply = kzalloc(reply_size, GFP_KERNEL); | 3097 | if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { |
2388 | if (!cn_reply) { | 3098 | retcode = ERR_INVALID_REQUEST; |
2389 | retcode = ERR_NOMEM; | 3099 | drbd_msg_put_info("resource exists"); |
2390 | goto fail; | 3100 | } |
3101 | /* else: still NO_ERROR */ | ||
3102 | goto out; | ||
2391 | } | 3103 | } |
2392 | reply = (struct drbd_nl_cfg_reply *) cn_reply->data; | ||
2393 | |||
2394 | reply->packet_type = | ||
2395 | cm->reply_body_size ? nlp->packet_type : P_return_code_only; | ||
2396 | reply->minor = nlp->drbd_minor; | ||
2397 | reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */ | ||
2398 | /* reply->tag_list; might be modified by cm->function. */ | ||
2399 | |||
2400 | rr = cm->function(mdev, nlp, reply); | ||
2401 | |||
2402 | cn_reply->id = req->id; | ||
2403 | cn_reply->seq = req->seq; | ||
2404 | cn_reply->ack = req->ack + 1; | ||
2405 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; | ||
2406 | cn_reply->flags = 0; | ||
2407 | 3104 | ||
2408 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); | 3105 | if (!conn_create(adm_ctx.resource_name, &res_opts)) |
2409 | if (rr && rr != -ESRCH) | 3106 | retcode = ERR_NOMEM; |
2410 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | 3107 | out: |
2411 | 3108 | drbd_adm_finish(info, retcode); | |
2412 | kfree(cn_reply); | 3109 | return 0; |
2413 | module_put(THIS_MODULE); | ||
2414 | return; | ||
2415 | fail: | ||
2416 | drbd_nl_send_reply(req, retcode); | ||
2417 | module_put(THIS_MODULE); | ||
2418 | } | 3110 | } |
2419 | 3111 | ||
2420 | static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ | 3112 | int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info) |
2421 | |||
2422 | static unsigned short * | ||
2423 | __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, | ||
2424 | unsigned short len, int nul_terminated) | ||
2425 | { | 3113 | { |
2426 | unsigned short l = tag_descriptions[tag_number(tag)].max_len; | 3114 | struct drbd_genlmsghdr *dh = info->userhdr; |
2427 | len = (len < l) ? len : l; | 3115 | enum drbd_ret_code retcode; |
2428 | put_unaligned(tag, tl++); | ||
2429 | put_unaligned(len, tl++); | ||
2430 | memcpy(tl, data, len); | ||
2431 | tl = (unsigned short*)((char*)tl + len); | ||
2432 | if (nul_terminated) | ||
2433 | *((char*)tl - 1) = 0; | ||
2434 | return tl; | ||
2435 | } | ||
2436 | 3116 | ||
2437 | static unsigned short * | 3117 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
2438 | tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) | 3118 | if (!adm_ctx.reply_skb) |
2439 | { | 3119 | return retcode; |
2440 | return __tl_add_blob(tl, tag, data, len, 0); | 3120 | if (retcode != NO_ERROR) |
2441 | } | 3121 | goto out; |
2442 | 3122 | ||
2443 | static unsigned short * | 3123 | if (dh->minor > MINORMASK) { |
2444 | tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) | 3124 | drbd_msg_put_info("requested minor out of range"); |
2445 | { | 3125 | retcode = ERR_INVALID_REQUEST; |
2446 | return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); | 3126 | goto out; |
2447 | } | 3127 | } |
3128 | if (adm_ctx.volume > DRBD_VOLUME_MAX) { | ||
3129 | drbd_msg_put_info("requested volume id out of range"); | ||
3130 | retcode = ERR_INVALID_REQUEST; | ||
3131 | goto out; | ||
3132 | } | ||
2448 | 3133 | ||
2449 | static unsigned short * | 3134 | /* drbd_adm_prepare made sure already |
2450 | tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) | 3135 | * that mdev->tconn and mdev->vnr match the request. */ |
2451 | { | 3136 | if (adm_ctx.mdev) { |
2452 | put_unaligned(tag, tl++); | 3137 | if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) |
2453 | switch(tag_type(tag)) { | 3138 | retcode = ERR_MINOR_EXISTS; |
2454 | case TT_INTEGER: | 3139 | /* else: still NO_ERROR */ |
2455 | put_unaligned(sizeof(int), tl++); | 3140 | goto out; |
2456 | put_unaligned(*(int *)val, (int *)tl); | ||
2457 | tl = (unsigned short*)((char*)tl+sizeof(int)); | ||
2458 | break; | ||
2459 | case TT_INT64: | ||
2460 | put_unaligned(sizeof(u64), tl++); | ||
2461 | put_unaligned(*(u64 *)val, (u64 *)tl); | ||
2462 | tl = (unsigned short*)((char*)tl+sizeof(u64)); | ||
2463 | break; | ||
2464 | default: | ||
2465 | /* someone did something stupid. */ | ||
2466 | ; | ||
2467 | } | 3141 | } |
2468 | return tl; | 3142 | |
3143 | retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume); | ||
3144 | out: | ||
3145 | drbd_adm_finish(info, retcode); | ||
3146 | return 0; | ||
2469 | } | 3147 | } |
2470 | 3148 | ||
2471 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) | 3149 | static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) |
2472 | { | 3150 | { |
2473 | char buffer[sizeof(struct cn_msg)+ | 3151 | if (mdev->state.disk == D_DISKLESS && |
2474 | sizeof(struct drbd_nl_cfg_reply)+ | 3152 | /* no need to be mdev->state.conn == C_STANDALONE && |
2475 | sizeof(struct get_state_tag_len_struct)+ | 3153 | * we may want to delete a minor from a live replication group. |
2476 | sizeof(short int)]; | 3154 | */ |
2477 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | 3155 | mdev->state.role == R_SECONDARY) { |
2478 | struct drbd_nl_cfg_reply *reply = | 3156 | _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS), |
2479 | (struct drbd_nl_cfg_reply *)cn_reply->data; | 3157 | CS_VERBOSE + CS_WAIT_COMPLETE); |
2480 | unsigned short *tl = reply->tag_list; | 3158 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
2481 | 3159 | idr_remove(&minors, mdev_to_minor(mdev)); | |
2482 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | 3160 | del_gendisk(mdev->vdisk); |
2483 | 3161 | synchronize_rcu(); | |
2484 | tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); | 3162 | kref_put(&mdev->kref, &drbd_minor_destroy); |
2485 | 3163 | return NO_ERROR; | |
2486 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 3164 | } else |
2487 | 3165 | return ERR_MINOR_CONFIGURED; | |
2488 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2489 | cn_reply->id.val = CN_VAL_DRBD; | ||
2490 | |||
2491 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2492 | cn_reply->ack = 0; /* not used here. */ | ||
2493 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2494 | (int)((char *)tl - (char *)reply->tag_list); | ||
2495 | cn_reply->flags = 0; | ||
2496 | |||
2497 | reply->packet_type = P_get_state; | ||
2498 | reply->minor = mdev_to_minor(mdev); | ||
2499 | reply->ret_code = NO_ERROR; | ||
2500 | |||
2501 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2502 | } | 3166 | } |
2503 | 3167 | ||
2504 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) | 3168 | int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info) |
2505 | { | 3169 | { |
2506 | char buffer[sizeof(struct cn_msg)+ | 3170 | enum drbd_ret_code retcode; |
2507 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2508 | sizeof(struct call_helper_tag_len_struct)+ | ||
2509 | sizeof(short int)]; | ||
2510 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2511 | struct drbd_nl_cfg_reply *reply = | ||
2512 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2513 | unsigned short *tl = reply->tag_list; | ||
2514 | |||
2515 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2516 | |||
2517 | tl = tl_add_str(tl, T_helper, helper_name); | ||
2518 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2519 | |||
2520 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2521 | cn_reply->id.val = CN_VAL_DRBD; | ||
2522 | |||
2523 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2524 | cn_reply->ack = 0; /* not used here. */ | ||
2525 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2526 | (int)((char *)tl - (char *)reply->tag_list); | ||
2527 | cn_reply->flags = 0; | ||
2528 | 3171 | ||
2529 | reply->packet_type = P_call_helper; | 3172 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2530 | reply->minor = mdev_to_minor(mdev); | 3173 | if (!adm_ctx.reply_skb) |
2531 | reply->ret_code = NO_ERROR; | 3174 | return retcode; |
3175 | if (retcode != NO_ERROR) | ||
3176 | goto out; | ||
2532 | 3177 | ||
2533 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3178 | retcode = adm_delete_minor(adm_ctx.mdev); |
3179 | out: | ||
3180 | drbd_adm_finish(info, retcode); | ||
3181 | return 0; | ||
2534 | } | 3182 | } |
2535 | 3183 | ||
2536 | void drbd_bcast_ee(struct drbd_conf *mdev, | 3184 | int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) |
2537 | const char *reason, const int dgs, | ||
2538 | const char* seen_hash, const char* calc_hash, | ||
2539 | const struct drbd_epoch_entry* e) | ||
2540 | { | 3185 | { |
2541 | struct cn_msg *cn_reply; | 3186 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ |
2542 | struct drbd_nl_cfg_reply *reply; | 3187 | struct drbd_conf *mdev; |
2543 | unsigned short *tl; | 3188 | unsigned i; |
2544 | struct page *page; | ||
2545 | unsigned len; | ||
2546 | 3189 | ||
2547 | if (!e) | 3190 | retcode = drbd_adm_prepare(skb, info, 0); |
2548 | return; | 3191 | if (!adm_ctx.reply_skb) |
2549 | if (!reason || !reason[0]) | 3192 | return retcode; |
2550 | return; | 3193 | if (retcode != NO_ERROR) |
3194 | goto out; | ||
2551 | 3195 | ||
2552 | /* apparently we have to memcpy twice, first to prepare the data for the | 3196 | if (!adm_ctx.tconn) { |
2553 | * struct cn_msg, then within cn_netlink_send from the cn_msg to the | 3197 | retcode = ERR_RES_NOT_KNOWN; |
2554 | * netlink skb. */ | 3198 | goto out; |
2555 | /* receiver thread context, which is not in the writeout path (of this node), | ||
2556 | * but may be in the writeout path of the _other_ node. | ||
2557 | * GFP_NOIO to avoid potential "distributed deadlock". */ | ||
2558 | cn_reply = kzalloc( | ||
2559 | sizeof(struct cn_msg)+ | ||
2560 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2561 | sizeof(struct dump_ee_tag_len_struct)+ | ||
2562 | sizeof(short int), | ||
2563 | GFP_NOIO); | ||
2564 | |||
2565 | if (!cn_reply) { | ||
2566 | dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", | ||
2567 | (unsigned long long)e->sector, e->size); | ||
2568 | return; | ||
2569 | } | 3199 | } |
2570 | 3200 | ||
2571 | reply = (struct drbd_nl_cfg_reply*)cn_reply->data; | 3201 | /* demote */ |
2572 | tl = reply->tag_list; | 3202 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2573 | 3203 | retcode = drbd_set_role(mdev, R_SECONDARY, 0); | |
2574 | tl = tl_add_str(tl, T_dump_ee_reason, reason); | 3204 | if (retcode < SS_SUCCESS) { |
2575 | tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); | 3205 | drbd_msg_put_info("failed to demote"); |
2576 | tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); | 3206 | goto out; |
2577 | tl = tl_add_int(tl, T_ee_sector, &e->sector); | 3207 | } |
2578 | tl = tl_add_int(tl, T_ee_block_id, &e->block_id); | ||
2579 | |||
2580 | /* dump the first 32k */ | ||
2581 | len = min_t(unsigned, e->size, 32 << 10); | ||
2582 | put_unaligned(T_ee_data, tl++); | ||
2583 | put_unaligned(len, tl++); | ||
2584 | |||
2585 | page = e->pages; | ||
2586 | page_chain_for_each(page) { | ||
2587 | void *d = kmap_atomic(page); | ||
2588 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | ||
2589 | memcpy(tl, d, l); | ||
2590 | kunmap_atomic(d); | ||
2591 | tl = (unsigned short*)((char*)tl + l); | ||
2592 | len -= l; | ||
2593 | if (len == 0) | ||
2594 | break; | ||
2595 | } | 3208 | } |
2596 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2597 | |||
2598 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2599 | cn_reply->id.val = CN_VAL_DRBD; | ||
2600 | 3209 | ||
2601 | cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); | 3210 | retcode = conn_try_disconnect(adm_ctx.tconn, 0); |
2602 | cn_reply->ack = 0; // not used here. | 3211 | if (retcode < SS_SUCCESS) { |
2603 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | 3212 | drbd_msg_put_info("failed to disconnect"); |
2604 | (int)((char*)tl - (char*)reply->tag_list); | 3213 | goto out; |
2605 | cn_reply->flags = 0; | 3214 | } |
2606 | |||
2607 | reply->packet_type = P_dump_ee; | ||
2608 | reply->minor = mdev_to_minor(mdev); | ||
2609 | reply->ret_code = NO_ERROR; | ||
2610 | |||
2611 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2612 | kfree(cn_reply); | ||
2613 | } | ||
2614 | |||
2615 | void drbd_bcast_sync_progress(struct drbd_conf *mdev) | ||
2616 | { | ||
2617 | char buffer[sizeof(struct cn_msg)+ | ||
2618 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2619 | sizeof(struct sync_progress_tag_len_struct)+ | ||
2620 | sizeof(short int)]; | ||
2621 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2622 | struct drbd_nl_cfg_reply *reply = | ||
2623 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2624 | unsigned short *tl = reply->tag_list; | ||
2625 | unsigned long rs_left; | ||
2626 | unsigned int res; | ||
2627 | 3215 | ||
2628 | /* no local ref, no bitmap, no syncer progress, no broadcast. */ | 3216 | /* detach */ |
2629 | if (!get_ldev(mdev)) | 3217 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2630 | return; | 3218 | retcode = adm_detach(mdev, 0); |
2631 | drbd_get_syncer_progress(mdev, &rs_left, &res); | 3219 | if (retcode < SS_SUCCESS || retcode > NO_ERROR) { |
2632 | put_ldev(mdev); | 3220 | drbd_msg_put_info("failed to detach"); |
3221 | goto out; | ||
3222 | } | ||
3223 | } | ||
2633 | 3224 | ||
2634 | tl = tl_add_int(tl, T_sync_progress, &res); | 3225 | /* If we reach this, all volumes (of this tconn) are Secondary, |
2635 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 3226 | * Disconnected, Diskless, aka Unconfigured. Make sure all threads have |
3227 | * actually stopped, state handling only does drbd_thread_stop_nowait(). */ | ||
3228 | drbd_thread_stop(&adm_ctx.tconn->worker); | ||
2636 | 3229 | ||
2637 | cn_reply->id.idx = CN_IDX_DRBD; | 3230 | /* Now, nothing can fail anymore */ |
2638 | cn_reply->id.val = CN_VAL_DRBD; | ||
2639 | 3231 | ||
2640 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | 3232 | /* delete volumes */ |
2641 | cn_reply->ack = 0; /* not used here. */ | 3233 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2642 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | 3234 | retcode = adm_delete_minor(mdev); |
2643 | (int)((char *)tl - (char *)reply->tag_list); | 3235 | if (retcode != NO_ERROR) { |
2644 | cn_reply->flags = 0; | 3236 | /* "can not happen" */ |
3237 | drbd_msg_put_info("failed to delete volume"); | ||
3238 | goto out; | ||
3239 | } | ||
3240 | } | ||
2645 | 3241 | ||
2646 | reply->packet_type = P_sync_progress; | 3242 | /* delete connection */ |
2647 | reply->minor = mdev_to_minor(mdev); | 3243 | if (conn_lowest_minor(adm_ctx.tconn) < 0) { |
2648 | reply->ret_code = NO_ERROR; | 3244 | list_del_rcu(&adm_ctx.tconn->all_tconn); |
3245 | synchronize_rcu(); | ||
3246 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
2649 | 3247 | ||
2650 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3248 | retcode = NO_ERROR; |
3249 | } else { | ||
3250 | /* "can not happen" */ | ||
3251 | retcode = ERR_RES_IN_USE; | ||
3252 | drbd_msg_put_info("failed to delete connection"); | ||
3253 | } | ||
3254 | goto out; | ||
3255 | out: | ||
3256 | drbd_adm_finish(info, retcode); | ||
3257 | return 0; | ||
2651 | } | 3258 | } |
2652 | 3259 | ||
2653 | int __init drbd_nl_init(void) | 3260 | int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) |
2654 | { | 3261 | { |
2655 | static struct cb_id cn_id_drbd; | 3262 | enum drbd_ret_code retcode; |
2656 | int err, try=10; | ||
2657 | 3263 | ||
2658 | cn_id_drbd.val = CN_VAL_DRBD; | 3264 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
2659 | do { | 3265 | if (!adm_ctx.reply_skb) |
2660 | cn_id_drbd.idx = cn_idx; | 3266 | return retcode; |
2661 | err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); | 3267 | if (retcode != NO_ERROR) |
2662 | if (!err) | 3268 | goto out; |
2663 | break; | ||
2664 | cn_idx = (cn_idx + CN_IDX_STEP); | ||
2665 | } while (try--); | ||
2666 | 3269 | ||
2667 | if (err) { | 3270 | if (conn_lowest_minor(adm_ctx.tconn) < 0) { |
2668 | printk(KERN_ERR "drbd: cn_drbd failed to register\n"); | 3271 | list_del_rcu(&adm_ctx.tconn->all_tconn); |
2669 | return err; | 3272 | synchronize_rcu(); |
3273 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
3274 | |||
3275 | retcode = NO_ERROR; | ||
3276 | } else { | ||
3277 | retcode = ERR_RES_IN_USE; | ||
2670 | } | 3278 | } |
2671 | 3279 | ||
3280 | if (retcode == NO_ERROR) | ||
3281 | drbd_thread_stop(&adm_ctx.tconn->worker); | ||
3282 | out: | ||
3283 | drbd_adm_finish(info, retcode); | ||
2672 | return 0; | 3284 | return 0; |
2673 | } | 3285 | } |
2674 | 3286 | ||
2675 | void drbd_nl_cleanup(void) | 3287 | void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib) |
2676 | { | 3288 | { |
2677 | static struct cb_id cn_id_drbd; | 3289 | static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ |
2678 | 3290 | struct sk_buff *msg; | |
2679 | cn_id_drbd.idx = cn_idx; | 3291 | struct drbd_genlmsghdr *d_out; |
2680 | cn_id_drbd.val = CN_VAL_DRBD; | 3292 | unsigned seq; |
2681 | 3293 | int err = -ENOMEM; | |
2682 | cn_del_callback(&cn_id_drbd); | 3294 | |
2683 | } | 3295 | if (sib->sib_reason == SIB_SYNC_PROGRESS && |
2684 | 3296 | time_after(jiffies, mdev->rs_last_bcast + HZ)) | |
2685 | void drbd_nl_send_reply(struct cn_msg *req, int ret_code) | 3297 | mdev->rs_last_bcast = jiffies; |
2686 | { | 3298 | else |
2687 | char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; | 3299 | return; |
2688 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2689 | struct drbd_nl_cfg_reply *reply = | ||
2690 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2691 | int rr; | ||
2692 | |||
2693 | memset(buffer, 0, sizeof(buffer)); | ||
2694 | cn_reply->id = req->id; | ||
2695 | 3300 | ||
2696 | cn_reply->seq = req->seq; | 3301 | seq = atomic_inc_return(&drbd_genl_seq); |
2697 | cn_reply->ack = req->ack + 1; | 3302 | msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); |
2698 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply); | 3303 | if (!msg) |
2699 | cn_reply->flags = 0; | 3304 | goto failed; |
3305 | |||
3306 | err = -EMSGSIZE; | ||
3307 | d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); | ||
3308 | if (!d_out) /* cannot happen, but anyways. */ | ||
3309 | goto nla_put_failure; | ||
3310 | d_out->minor = mdev_to_minor(mdev); | ||
3311 | d_out->ret_code = NO_ERROR; | ||
3312 | |||
3313 | if (nla_put_status_info(msg, mdev, sib)) | ||
3314 | goto nla_put_failure; | ||
3315 | genlmsg_end(msg, d_out); | ||
3316 | err = drbd_genl_multicast_events(msg, 0); | ||
3317 | /* msg has been consumed or freed in netlink_broadcast() */ | ||
3318 | if (err && err != -ESRCH) | ||
3319 | goto failed; | ||
2700 | 3320 | ||
2701 | reply->packet_type = P_return_code_only; | 3321 | return; |
2702 | reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; | ||
2703 | reply->ret_code = ret_code; | ||
2704 | 3322 | ||
2705 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3323 | nla_put_failure: |
2706 | if (rr && rr != -ESRCH) | 3324 | nlmsg_free(msg); |
2707 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | 3325 | failed: |
3326 | dev_err(DEV, "Error %d while broadcasting event. " | ||
3327 | "Event seq:%u sib_reason:%u\n", | ||
3328 | err, seq, sib->sib_reason); | ||
2708 | } | 3329 | } |
2709 | |||
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c new file mode 100644 index 000000000000..fa672b6df8d6 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.c | |||
@@ -0,0 +1,55 @@ | |||
1 | #include "drbd_wrappers.h" | ||
2 | #include <linux/kernel.h> | ||
3 | #include <net/netlink.h> | ||
4 | #include <linux/drbd_genl_api.h> | ||
5 | #include "drbd_nla.h" | ||
6 | |||
7 | static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) | ||
8 | { | ||
9 | struct nlattr *head = nla_data(nla); | ||
10 | int len = nla_len(nla); | ||
11 | int rem; | ||
12 | |||
13 | /* | ||
14 | * validate_nla (called from nla_parse_nested) ignores attributes | ||
15 | * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. | ||
16 | * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY | ||
17 | * flag set also, check and remove that flag before calling | ||
18 | * nla_parse_nested. | ||
19 | */ | ||
20 | |||
21 | nla_for_each_attr(nla, head, len, rem) { | ||
22 | if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { | ||
23 | nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; | ||
24 | if (nla_type(nla) > maxtype) | ||
25 | return -EOPNOTSUPP; | ||
26 | } | ||
27 | } | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, | ||
32 | const struct nla_policy *policy) | ||
33 | { | ||
34 | int err; | ||
35 | |||
36 | err = drbd_nla_check_mandatory(maxtype, nla); | ||
37 | if (!err) | ||
38 | err = nla_parse_nested(tb, maxtype, nla, policy); | ||
39 | |||
40 | return err; | ||
41 | } | ||
42 | |||
43 | struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) | ||
44 | { | ||
45 | int err; | ||
46 | /* | ||
47 | * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and | ||
48 | * we don't know about that attribute, reject all the nested | ||
49 | * attributes. | ||
50 | */ | ||
51 | err = drbd_nla_check_mandatory(maxtype, nla); | ||
52 | if (err) | ||
53 | return ERR_PTR(err); | ||
54 | return nla_find_nested(nla, attrtype); | ||
55 | } | ||
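A usage sketch for the two helpers above (not part of the patch; the attribute index, maxtype and policy names are placeholders): a generic netlink handler routes nested attributes through drbd_nla_parse_nested() so that attributes carrying DRBD_GENLA_F_MANDATORY are validated, and unknown mandatory attributes rejected, before nla_parse_nested() runs.

	static int example_handler(struct sk_buff *skb, struct genl_info *info)
	{
		struct nlattr *tb[EXAMPLE_ATTR_MAX + 1];
		struct nlattr *nested = info->attrs[EXAMPLE_NLA_NESTED];
		int err;

		if (!nested)
			return -EINVAL;

		err = drbd_nla_parse_nested(tb, EXAMPLE_ATTR_MAX, nested,
					    example_nl_policy);
		if (err)
			return err;	/* -EOPNOTSUPP for an unknown mandatory attribute */

		/* ... use tb[EXAMPLE_T_SOMETHING] ... */
		return 0;
	}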
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h new file mode 100644 index 000000000000..679c2d5b4535 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.h | |||
@@ -0,0 +1,8 @@ | |||
1 | #ifndef __DRBD_NLA_H | ||
2 | #define __DRBD_NLA_H | ||
3 | |||
4 | extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, | ||
5 | const struct nla_policy *policy); | ||
6 | extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); | ||
7 | |||
8 | #endif /* __DRBD_NLA_H */ | ||
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 662bc8ef830a..56672a61eb94 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -171,7 +171,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | |||
171 | if (mdev->state.conn == C_VERIFY_S || | 171 | if (mdev->state.conn == C_VERIFY_S || |
172 | mdev->state.conn == C_VERIFY_T) { | 172 | mdev->state.conn == C_VERIFY_T) { |
173 | bit_pos = bm_bits - mdev->ov_left; | 173 | bit_pos = bm_bits - mdev->ov_left; |
174 | if (mdev->agreed_pro_version >= 97) | 174 | if (verify_can_do_stop_sector(mdev)) |
175 | stop_sector = mdev->ov_stop_sector; | 175 | stop_sector = mdev->ov_stop_sector; |
176 | } else | 176 | } else |
177 | bit_pos = mdev->bm_resync_fo; | 177 | bit_pos = mdev->bm_resync_fo; |
@@ -200,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) | |||
200 | 200 | ||
201 | static int drbd_seq_show(struct seq_file *seq, void *v) | 201 | static int drbd_seq_show(struct seq_file *seq, void *v) |
202 | { | 202 | { |
203 | int i, hole = 0; | 203 | int i, prev_i = -1; |
204 | const char *sn; | 204 | const char *sn; |
205 | struct drbd_conf *mdev; | 205 | struct drbd_conf *mdev; |
206 | struct net_conf *nc; | ||
207 | char wp; | ||
206 | 208 | ||
207 | static char write_ordering_chars[] = { | 209 | static char write_ordering_chars[] = { |
208 | [WO_none] = 'n', | 210 | [WO_none] = 'n', |
@@ -233,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
233 | oos .. known out-of-sync kB | 235 | oos .. known out-of-sync kB |
234 | */ | 236 | */ |
235 | 237 | ||
236 | for (i = 0; i < minor_count; i++) { | 238 | rcu_read_lock(); |
237 | mdev = minor_to_mdev(i); | 239 | idr_for_each_entry(&minors, mdev, i) { |
238 | if (!mdev) { | 240 | if (prev_i != i - 1) |
239 | hole = 1; | ||
240 | continue; | ||
241 | } | ||
242 | if (hole) { | ||
243 | hole = 0; | ||
244 | seq_printf(seq, "\n"); | 241 | seq_printf(seq, "\n"); |
245 | } | 242 | prev_i = i; |
246 | 243 | ||
247 | sn = drbd_conn_str(mdev->state.conn); | 244 | sn = drbd_conn_str(mdev->state.conn); |
248 | 245 | ||
@@ -254,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
254 | /* reset mdev->congestion_reason */ | 251 | /* reset mdev->congestion_reason */ |
255 | bdi_rw_congested(&mdev->rq_queue->backing_dev_info); | 252 | bdi_rw_congested(&mdev->rq_queue->backing_dev_info); |
256 | 253 | ||
254 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
255 | wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; | ||
257 | seq_printf(seq, | 256 | seq_printf(seq, |
258 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" | 257 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" |
259 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " | 258 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " |
@@ -263,14 +262,13 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
263 | drbd_role_str(mdev->state.peer), | 262 | drbd_role_str(mdev->state.peer), |
264 | drbd_disk_str(mdev->state.disk), | 263 | drbd_disk_str(mdev->state.disk), |
265 | drbd_disk_str(mdev->state.pdsk), | 264 | drbd_disk_str(mdev->state.pdsk), |
266 | (mdev->net_conf == NULL ? ' ' : | 265 | wp, |
267 | (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), | 266 | drbd_suspended(mdev) ? 's' : 'r', |
268 | is_susp(mdev->state) ? 's' : 'r', | ||
269 | mdev->state.aftr_isp ? 'a' : '-', | 267 | mdev->state.aftr_isp ? 'a' : '-', |
270 | mdev->state.peer_isp ? 'p' : '-', | 268 | mdev->state.peer_isp ? 'p' : '-', |
271 | mdev->state.user_isp ? 'u' : '-', | 269 | mdev->state.user_isp ? 'u' : '-', |
272 | mdev->congestion_reason ?: '-', | 270 | mdev->congestion_reason ?: '-', |
273 | drbd_test_flag(mdev, AL_SUSPENDED) ? 's' : '-', | 271 | test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-', |
274 | mdev->send_cnt/2, | 272 | mdev->send_cnt/2, |
275 | mdev->recv_cnt/2, | 273 | mdev->recv_cnt/2, |
276 | mdev->writ_cnt/2, | 274 | mdev->writ_cnt/2, |
@@ -282,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
282 | atomic_read(&mdev->rs_pending_cnt), | 280 | atomic_read(&mdev->rs_pending_cnt), |
283 | atomic_read(&mdev->unacked_cnt), | 281 | atomic_read(&mdev->unacked_cnt), |
284 | atomic_read(&mdev->ap_bio_cnt), | 282 | atomic_read(&mdev->ap_bio_cnt), |
285 | mdev->epochs, | 283 | mdev->tconn->epochs, |
286 | write_ordering_chars[mdev->write_ordering] | 284 | write_ordering_chars[mdev->tconn->write_ordering] |
287 | ); | 285 | ); |
288 | seq_printf(seq, " oos:%llu\n", | 286 | seq_printf(seq, " oos:%llu\n", |
289 | Bit2KB((unsigned long long) | 287 | Bit2KB((unsigned long long) |
@@ -308,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
308 | } | 306 | } |
309 | } | 307 | } |
310 | } | 308 | } |
309 | rcu_read_unlock(); | ||
311 | 310 | ||
312 | return 0; | 311 | return 0; |
313 | } | 312 | } |
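
In drbd_seq_show() the removed hole flag and the new prev_i serve the same purpose: when /proc/drbd is rendered from a sparse set of minors, a blank line marks every gap in the numbering. A minimal stand-alone sketch of the prev_i idiom, with a made-up minor table:

#include <stdio.h>

int main(void)
{
        /* indices of configured minors, already in ascending order */
        int minors[] = { 0, 1, 4, 5, 9 };
        int n = sizeof(minors) / sizeof(minors[0]);
        int prev_i = -1;
        int k;

        for (k = 0; k < n; k++) {
                int i = minors[k];

                if (prev_i != i - 1)     /* at least one unconfigured minor skipped */
                        printf("\n");
                prev_i = i;

                printf("%2d: cs:Connected ...\n", i);
        }
        return 0;
}
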
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index eb0cafea1423..0331ad0b61e1 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -48,17 +48,25 @@ | |||
48 | 48 | ||
49 | #include "drbd_vli.h" | 49 | #include "drbd_vli.h" |
50 | 50 | ||
51 | struct packet_info { | ||
52 | enum drbd_packet cmd; | ||
53 | unsigned int size; | ||
54 | unsigned int vnr; | ||
55 | void *data; | ||
56 | }; | ||
57 | |||
51 | enum finish_epoch { | 58 | enum finish_epoch { |
52 | FE_STILL_LIVE, | 59 | FE_STILL_LIVE, |
53 | FE_DESTROYED, | 60 | FE_DESTROYED, |
54 | FE_RECYCLED, | 61 | FE_RECYCLED, |
55 | }; | 62 | }; |
56 | 63 | ||
57 | static int drbd_do_handshake(struct drbd_conf *mdev); | 64 | static int drbd_do_features(struct drbd_tconn *tconn); |
58 | static int drbd_do_auth(struct drbd_conf *mdev); | 65 | static int drbd_do_auth(struct drbd_tconn *tconn); |
66 | static int drbd_disconnected(struct drbd_conf *mdev); | ||
59 | 67 | ||
60 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); | 68 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event); |
61 | static int e_end_block(struct drbd_conf *, struct drbd_work *, int); | 69 | static int e_end_block(struct drbd_work *, int); |
62 | 70 | ||
63 | 71 | ||
64 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 72 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
@@ -142,11 +150,12 @@ static void page_chain_add(struct page **head, | |||
142 | *head = chain_first; | 150 | *head = chain_first; |
143 | } | 151 | } |
144 | 152 | ||
145 | static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) | 153 | static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, |
154 | unsigned int number) | ||
146 | { | 155 | { |
147 | struct page *page = NULL; | 156 | struct page *page = NULL; |
148 | struct page *tmp = NULL; | 157 | struct page *tmp = NULL; |
149 | int i = 0; | 158 | unsigned int i = 0; |
150 | 159 | ||
151 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | 160 | /* Yes, testing drbd_pp_vacant outside the lock is racy. |
152 | * So what. It saves a spin_lock. */ | 161 | * So what. It saves a spin_lock. */ |
@@ -175,7 +184,7 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int | |||
175 | return page; | 184 | return page; |
176 | 185 | ||
177 | /* Not enough pages immediately available this time. | 186 | /* Not enough pages immediately available this time. |
178 | * No need to jump around here, drbd_pp_alloc will retry this | 187 | * No need to jump around here, drbd_alloc_pages will retry this |
179 | * function "soon". */ | 188 | * function "soon". */ |
180 | if (page) { | 189 | if (page) { |
181 | tmp = page_chain_tail(page, NULL); | 190 | tmp = page_chain_tail(page, NULL); |
@@ -187,9 +196,10 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int | |||
187 | return NULL; | 196 | return NULL; |
188 | } | 197 | } |
189 | 198 | ||
190 | static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) | 199 | static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, |
200 | struct list_head *to_be_freed) | ||
191 | { | 201 | { |
192 | struct drbd_epoch_entry *e; | 202 | struct drbd_peer_request *peer_req; |
193 | struct list_head *le, *tle; | 203 | struct list_head *le, *tle; |
194 | 204 | ||
195 | /* The EEs are always appended to the end of the list. Since | 205 | /* The EEs are always appended to the end of the list. Since |
@@ -198,8 +208,8 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
198 | stop to examine the list... */ | 208 | stop to examine the list... */ |
199 | 209 | ||
200 | list_for_each_safe(le, tle, &mdev->net_ee) { | 210 | list_for_each_safe(le, tle, &mdev->net_ee) { |
201 | e = list_entry(le, struct drbd_epoch_entry, w.list); | 211 | peer_req = list_entry(le, struct drbd_peer_request, w.list); |
202 | if (drbd_ee_has_active_page(e)) | 212 | if (drbd_peer_req_has_active_page(peer_req)) |
203 | break; | 213 | break; |
204 | list_move(le, to_be_freed); | 214 | list_move(le, to_be_freed); |
205 | } | 215 | } |
@@ -208,18 +218,18 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
208 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | 218 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) |
209 | { | 219 | { |
210 | LIST_HEAD(reclaimed); | 220 | LIST_HEAD(reclaimed); |
211 | struct drbd_epoch_entry *e, *t; | 221 | struct drbd_peer_request *peer_req, *t; |
212 | 222 | ||
213 | spin_lock_irq(&mdev->req_lock); | 223 | spin_lock_irq(&mdev->tconn->req_lock); |
214 | reclaim_net_ee(mdev, &reclaimed); | 224 | reclaim_finished_net_peer_reqs(mdev, &reclaimed); |
215 | spin_unlock_irq(&mdev->req_lock); | 225 | spin_unlock_irq(&mdev->tconn->req_lock); |
216 | 226 | ||
217 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | 227 | list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) |
218 | drbd_free_net_ee(mdev, e); | 228 | drbd_free_net_peer_req(mdev, peer_req); |
219 | } | 229 | } |
220 | 230 | ||
221 | /** | 231 | /** |
222 | * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) | 232 | * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) |
223 | * @mdev: DRBD device. | 233 | * @mdev: DRBD device. |
224 | * @number: number of pages requested | 234 | * @number: number of pages requested |
225 | * @retry: whether to retry, if not enough pages are available right now | 235 | * @retry: whether to retry, if not enough pages are available right now |
@@ -230,23 +240,31 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | |||
230 | * | 240 | * |
231 | * Returns a page chain linked via page->private. | 241 | * Returns a page chain linked via page->private. |
232 | */ | 242 | */ |
233 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) | 243 | struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, |
244 | bool retry) | ||
234 | { | 245 | { |
235 | struct page *page = NULL; | 246 | struct page *page = NULL; |
247 | struct net_conf *nc; | ||
236 | DEFINE_WAIT(wait); | 248 | DEFINE_WAIT(wait); |
249 | int mxb; | ||
237 | 250 | ||
238 | /* Yes, we may run up to @number over max_buffers. If we | 251 | /* Yes, we may run up to @number over max_buffers. If we |
239 | * follow it strictly, the admin will get it wrong anyways. */ | 252 | * follow it strictly, the admin will get it wrong anyways. */ |
240 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) | 253 | rcu_read_lock(); |
241 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); | 254 | nc = rcu_dereference(mdev->tconn->net_conf); |
255 | mxb = nc ? nc->max_buffers : 1000000; | ||
256 | rcu_read_unlock(); | ||
257 | |||
258 | if (atomic_read(&mdev->pp_in_use) < mxb) | ||
259 | page = __drbd_alloc_pages(mdev, number); | ||
242 | 260 | ||
243 | while (page == NULL) { | 261 | while (page == NULL) { |
244 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | 262 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); |
245 | 263 | ||
246 | drbd_kick_lo_and_reclaim_net(mdev); | 264 | drbd_kick_lo_and_reclaim_net(mdev); |
247 | 265 | ||
248 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | 266 | if (atomic_read(&mdev->pp_in_use) < mxb) { |
249 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); | 267 | page = __drbd_alloc_pages(mdev, number); |
250 | if (page) | 268 | if (page) |
251 | break; | 269 | break; |
252 | } | 270 | } |
@@ -255,7 +273,7 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool | |||
255 | break; | 273 | break; |
256 | 274 | ||
257 | if (signal_pending(current)) { | 275 | if (signal_pending(current)) { |
258 | dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); | 276 | dev_warn(DEV, "drbd_alloc_pages interrupted!\n"); |
259 | break; | 277 | break; |
260 | } | 278 | } |
261 | 279 | ||
@@ -268,11 +286,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool | |||
268 | return page; | 286 | return page; |
269 | } | 287 | } |
270 | 288 | ||
271 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | 289 | /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. |
272 | * Is also used from inside another spin_lock_irq(&mdev->req_lock); | 290 | * Is also used from inside another spin_lock_irq(&mdev->tconn->req_lock); |
273 | * Either links the page chain back to the global pool, | 291 | * Either links the page chain back to the global pool, |
274 | * or returns all pages to the system. */ | 292 | * or returns all pages to the system. */ |
275 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) | 293 | static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net) |
276 | { | 294 | { |
277 | atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; | 295 | atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; |
278 | int i; | 296 | int i; |
@@ -280,7 +298,7 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) | |||
280 | if (page == NULL) | 298 | if (page == NULL) |
281 | return; | 299 | return; |
282 | 300 | ||
283 | if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) | 301 | if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) |
284 | i = page_chain_free(page); | 302 | i = page_chain_free(page); |
285 | else { | 303 | else { |
286 | struct page *tmp; | 304 | struct page *tmp; |
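
__drbd_alloc_pages() and drbd_free_pages() manage chains of pages linked through page->private (see the page_chain_add()/page_chain_tail()/page_chain_free() helpers referenced above). The user-space sketch below assumes nothing about struct page; struct buf and all of its helpers are invented, just to illustrate the intrusive single-link chain with a tail lookup (for splicing) and a counting free:

#include <stdio.h>
#include <stdlib.h>

struct buf {
        struct buf *next;                /* plays the role of page->private */
        char data[4096];
};

static struct buf *chain_alloc(unsigned int number)
{
        struct buf *head = NULL;
        unsigned int i;

        for (i = 0; i < number; i++) {
                struct buf *b = calloc(1, sizeof(*b));
                if (!b)
                        break;           /* a partial chain is still usable */
                b->next = head;
                head = b;
        }
        return head;
}

static struct buf *chain_tail(struct buf *b, unsigned int *len)
{
        unsigned int i = 1;

        while (b->next) {
                b = b->next;
                i++;
        }
        if (len)
                *len = i;
        return b;
}

static unsigned int chain_free(struct buf *b)
{
        unsigned int i = 0;

        while (b) {
                struct buf *next = b->next;
                free(b);
                b = next;
                i++;
        }
        return i;
}

int main(void)
{
        unsigned int len;
        struct buf *chain = chain_alloc(8);

        if (!chain)
                return 1;
        chain_tail(chain, &len);
        printf("allocated %u buffers, freed %u\n", len, chain_free(chain));
        return 0;
}
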
@@ -302,127 +320,130 @@ You need to hold the req_lock: | |||
302 | _drbd_wait_ee_list_empty() | 320 | _drbd_wait_ee_list_empty() |
303 | 321 | ||
304 | You must not have the req_lock: | 322 | You must not have the req_lock: |
305 | drbd_free_ee() | 323 | drbd_free_peer_req() |
306 | drbd_alloc_ee() | 324 | drbd_alloc_peer_req() |
307 | drbd_init_ee() | 325 | drbd_free_peer_reqs() |
308 | drbd_release_ee() | ||
309 | drbd_ee_fix_bhs() | 326 | drbd_ee_fix_bhs() |
310 | drbd_process_done_ee() | 327 | drbd_finish_peer_reqs() |
311 | drbd_clear_done_ee() | 328 | drbd_clear_done_ee() |
312 | drbd_wait_ee_list_empty() | 329 | drbd_wait_ee_list_empty() |
313 | */ | 330 | */ |
314 | 331 | ||
315 | struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 332 | struct drbd_peer_request * |
316 | u64 id, | 333 | drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector, |
317 | sector_t sector, | 334 | unsigned int data_size, gfp_t gfp_mask) __must_hold(local) |
318 | unsigned int data_size, | ||
319 | gfp_t gfp_mask) __must_hold(local) | ||
320 | { | 335 | { |
321 | struct drbd_epoch_entry *e; | 336 | struct drbd_peer_request *peer_req; |
322 | struct page *page = NULL; | 337 | struct page *page = NULL; |
323 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; | 338 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; |
324 | 339 | ||
325 | if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) | 340 | if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) |
326 | return NULL; | 341 | return NULL; |
327 | 342 | ||
328 | e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); | 343 | peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); |
329 | if (!e) { | 344 | if (!peer_req) { |
330 | if (!(gfp_mask & __GFP_NOWARN)) | 345 | if (!(gfp_mask & __GFP_NOWARN)) |
331 | dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); | 346 | dev_err(DEV, "%s: allocation failed\n", __func__); |
332 | return NULL; | 347 | return NULL; |
333 | } | 348 | } |
334 | 349 | ||
335 | if (data_size) { | 350 | if (data_size) { |
336 | page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); | 351 | page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); |
337 | if (!page) | 352 | if (!page) |
338 | goto fail; | 353 | goto fail; |
339 | } | 354 | } |
340 | 355 | ||
341 | INIT_HLIST_NODE(&e->collision); | 356 | drbd_clear_interval(&peer_req->i); |
342 | e->epoch = NULL; | 357 | peer_req->i.size = data_size; |
343 | e->mdev = mdev; | 358 | peer_req->i.sector = sector; |
344 | e->pages = page; | 359 | peer_req->i.local = false; |
345 | atomic_set(&e->pending_bios, 0); | 360 | peer_req->i.waiting = false; |
346 | e->size = data_size; | 361 | |
347 | e->flags = 0; | 362 | peer_req->epoch = NULL; |
348 | e->sector = sector; | 363 | peer_req->w.mdev = mdev; |
349 | e->block_id = id; | 364 | peer_req->pages = page; |
365 | atomic_set(&peer_req->pending_bios, 0); | ||
366 | peer_req->flags = 0; | ||
367 | /* | ||
368 | * The block_id is opaque to the receiver. It is not endianness | ||
369 | * converted, and sent back to the sender unchanged. | ||
370 | */ | ||
371 | peer_req->block_id = id; | ||
350 | 372 | ||
351 | return e; | 373 | return peer_req; |
352 | 374 | ||
353 | fail: | 375 | fail: |
354 | mempool_free(e, drbd_ee_mempool); | 376 | mempool_free(peer_req, drbd_ee_mempool); |
355 | return NULL; | 377 | return NULL; |
356 | } | 378 | } |
357 | 379 | ||
358 | void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net) | 380 | void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req, |
381 | int is_net) | ||
359 | { | 382 | { |
360 | if (e->flags & EE_HAS_DIGEST) | 383 | if (peer_req->flags & EE_HAS_DIGEST) |
361 | kfree(e->digest); | 384 | kfree(peer_req->digest); |
362 | drbd_pp_free(mdev, e->pages, is_net); | 385 | drbd_free_pages(mdev, peer_req->pages, is_net); |
363 | D_ASSERT(atomic_read(&e->pending_bios) == 0); | 386 | D_ASSERT(atomic_read(&peer_req->pending_bios) == 0); |
364 | D_ASSERT(hlist_unhashed(&e->collision)); | 387 | D_ASSERT(drbd_interval_empty(&peer_req->i)); |
365 | mempool_free(e, drbd_ee_mempool); | 388 | mempool_free(peer_req, drbd_ee_mempool); |
366 | } | 389 | } |
367 | 390 | ||
368 | int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) | 391 | int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list) |
369 | { | 392 | { |
370 | LIST_HEAD(work_list); | 393 | LIST_HEAD(work_list); |
371 | struct drbd_epoch_entry *e, *t; | 394 | struct drbd_peer_request *peer_req, *t; |
372 | int count = 0; | 395 | int count = 0; |
373 | int is_net = list == &mdev->net_ee; | 396 | int is_net = list == &mdev->net_ee; |
374 | 397 | ||
375 | spin_lock_irq(&mdev->req_lock); | 398 | spin_lock_irq(&mdev->tconn->req_lock); |
376 | list_splice_init(list, &work_list); | 399 | list_splice_init(list, &work_list); |
377 | spin_unlock_irq(&mdev->req_lock); | 400 | spin_unlock_irq(&mdev->tconn->req_lock); |
378 | 401 | ||
379 | list_for_each_entry_safe(e, t, &work_list, w.list) { | 402 | list_for_each_entry_safe(peer_req, t, &work_list, w.list) { |
380 | drbd_free_some_ee(mdev, e, is_net); | 403 | __drbd_free_peer_req(mdev, peer_req, is_net); |
381 | count++; | 404 | count++; |
382 | } | 405 | } |
383 | return count; | 406 | return count; |
384 | } | 407 | } |
385 | 408 | ||
386 | |||
387 | /* | 409 | /* |
388 | * This function is called from _asender only_ | 410 | * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. |
389 | * but see also comments in _req_mod(,barrier_acked) | ||
390 | * and receive_Barrier. | ||
391 | * | ||
392 | * Move entries from net_ee to done_ee, if ready. | ||
393 | * Grab done_ee, call all callbacks, free the entries. | ||
394 | * The callbacks typically send out ACKs. | ||
395 | */ | 411 | */ |
396 | static int drbd_process_done_ee(struct drbd_conf *mdev) | 412 | static int drbd_finish_peer_reqs(struct drbd_conf *mdev) |
397 | { | 413 | { |
398 | LIST_HEAD(work_list); | 414 | LIST_HEAD(work_list); |
399 | LIST_HEAD(reclaimed); | 415 | LIST_HEAD(reclaimed); |
400 | struct drbd_epoch_entry *e, *t; | 416 | struct drbd_peer_request *peer_req, *t; |
401 | int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); | 417 | int err = 0; |
402 | 418 | ||
403 | spin_lock_irq(&mdev->req_lock); | 419 | spin_lock_irq(&mdev->tconn->req_lock); |
404 | reclaim_net_ee(mdev, &reclaimed); | 420 | reclaim_finished_net_peer_reqs(mdev, &reclaimed); |
405 | list_splice_init(&mdev->done_ee, &work_list); | 421 | list_splice_init(&mdev->done_ee, &work_list); |
406 | spin_unlock_irq(&mdev->req_lock); | 422 | spin_unlock_irq(&mdev->tconn->req_lock); |
407 | 423 | ||
408 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | 424 | list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) |
409 | drbd_free_net_ee(mdev, e); | 425 | drbd_free_net_peer_req(mdev, peer_req); |
410 | 426 | ||
411 | /* possible callbacks here: | 427 | /* possible callbacks here: |
412 | * e_end_block, and e_end_resync_block, e_send_discard_ack. | 428 | * e_end_block, and e_end_resync_block, e_send_superseded. |
413 | * all ignore the last argument. | 429 | * all ignore the last argument. |
414 | */ | 430 | */ |
415 | list_for_each_entry_safe(e, t, &work_list, w.list) { | 431 | list_for_each_entry_safe(peer_req, t, &work_list, w.list) { |
432 | int err2; | ||
433 | |||
416 | /* list_del not necessary, next/prev members not touched */ | 434 | /* list_del not necessary, next/prev members not touched */ |
417 | ok = e->w.cb(mdev, &e->w, !ok) && ok; | 435 | err2 = peer_req->w.cb(&peer_req->w, !!err); |
418 | drbd_free_ee(mdev, e); | 436 | if (!err) |
437 | err = err2; | ||
438 | drbd_free_peer_req(mdev, peer_req); | ||
419 | } | 439 | } |
420 | wake_up(&mdev->ee_wait); | 440 | wake_up(&mdev->ee_wait); |
421 | 441 | ||
422 | return ok; | 442 | return err; |
423 | } | 443 | } |
424 | 444 | ||
425 | void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | 445 | static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, |
446 | struct list_head *head) | ||
426 | { | 447 | { |
427 | DEFINE_WAIT(wait); | 448 | DEFINE_WAIT(wait); |
428 | 449 | ||
@@ -430,55 +451,22 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | |||
430 | * and calling prepare_to_wait in the fast path */ | 451 | * and calling prepare_to_wait in the fast path */ |
431 | while (!list_empty(head)) { | 452 | while (!list_empty(head)) { |
432 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); | 453 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); |
433 | spin_unlock_irq(&mdev->req_lock); | 454 | spin_unlock_irq(&mdev->tconn->req_lock); |
434 | io_schedule(); | 455 | io_schedule(); |
435 | finish_wait(&mdev->ee_wait, &wait); | 456 | finish_wait(&mdev->ee_wait, &wait); |
436 | spin_lock_irq(&mdev->req_lock); | 457 | spin_lock_irq(&mdev->tconn->req_lock); |
437 | } | 458 | } |
438 | } | 459 | } |
439 | 460 | ||
440 | void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | 461 | static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, |
462 | struct list_head *head) | ||
441 | { | 463 | { |
442 | spin_lock_irq(&mdev->req_lock); | 464 | spin_lock_irq(&mdev->tconn->req_lock); |
443 | _drbd_wait_ee_list_empty(mdev, head); | 465 | _drbd_wait_ee_list_empty(mdev, head); |
444 | spin_unlock_irq(&mdev->req_lock); | 466 | spin_unlock_irq(&mdev->tconn->req_lock); |
445 | } | ||
446 | |||
447 | /* see also kernel_accept; which is only present since 2.6.18. | ||
448 | * also we want to log which part of it failed, exactly */ | ||
449 | static int drbd_accept(struct drbd_conf *mdev, const char **what, | ||
450 | struct socket *sock, struct socket **newsock) | ||
451 | { | ||
452 | struct sock *sk = sock->sk; | ||
453 | int err = 0; | ||
454 | |||
455 | *what = "listen"; | ||
456 | err = sock->ops->listen(sock, 5); | ||
457 | if (err < 0) | ||
458 | goto out; | ||
459 | |||
460 | *what = "sock_create_lite"; | ||
461 | err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, | ||
462 | newsock); | ||
463 | if (err < 0) | ||
464 | goto out; | ||
465 | |||
466 | *what = "accept"; | ||
467 | err = sock->ops->accept(sock, *newsock, 0); | ||
468 | if (err < 0) { | ||
469 | sock_release(*newsock); | ||
470 | *newsock = NULL; | ||
471 | goto out; | ||
472 | } | ||
473 | (*newsock)->ops = sock->ops; | ||
474 | __module_get((*newsock)->ops->owner); | ||
475 | |||
476 | out: | ||
477 | return err; | ||
478 | } | 467 | } |
479 | 468 | ||
480 | static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | 469 | static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) |
481 | void *buf, size_t size, int flags) | ||
482 | { | 470 | { |
483 | mm_segment_t oldfs; | 471 | mm_segment_t oldfs; |
484 | struct kvec iov = { | 472 | struct kvec iov = { |
@@ -500,48 +488,62 @@ static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | |||
500 | return rv; | 488 | return rv; |
501 | } | 489 | } |
502 | 490 | ||
503 | static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) | 491 | static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) |
504 | { | 492 | { |
505 | mm_segment_t oldfs; | ||
506 | struct kvec iov = { | ||
507 | .iov_base = buf, | ||
508 | .iov_len = size, | ||
509 | }; | ||
510 | struct msghdr msg = { | ||
511 | .msg_iovlen = 1, | ||
512 | .msg_iov = (struct iovec *)&iov, | ||
513 | .msg_flags = MSG_WAITALL | MSG_NOSIGNAL | ||
514 | }; | ||
515 | int rv; | 493 | int rv; |
516 | 494 | ||
517 | oldfs = get_fs(); | 495 | rv = drbd_recv_short(tconn->data.socket, buf, size, 0); |
518 | set_fs(KERNEL_DS); | ||
519 | rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); | ||
520 | set_fs(oldfs); | ||
521 | 496 | ||
522 | if (rv < 0) { | 497 | if (rv < 0) { |
523 | if (rv == -ECONNRESET) | 498 | if (rv == -ECONNRESET) |
524 | dev_info(DEV, "sock was reset by peer\n"); | 499 | conn_info(tconn, "sock was reset by peer\n"); |
525 | else if (rv != -ERESTARTSYS) | 500 | else if (rv != -ERESTARTSYS) |
526 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | 501 | conn_err(tconn, "sock_recvmsg returned %d\n", rv); |
527 | } else if (rv == 0) { | 502 | } else if (rv == 0) { |
528 | if (drbd_test_flag(mdev, DISCONNECT_SENT)) { | 503 | if (test_bit(DISCONNECT_SENT, &tconn->flags)) { |
529 | long t; /* time_left */ | 504 | long t; |
530 | t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED, | 505 | rcu_read_lock(); |
531 | mdev->net_conf->ping_timeo * HZ/10); | 506 | t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; |
507 | rcu_read_unlock(); | ||
508 | |||
509 | t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t); | ||
510 | |||
532 | if (t) | 511 | if (t) |
533 | goto out; | 512 | goto out; |
534 | } | 513 | } |
535 | dev_info(DEV, "sock was shut down by peer\n"); | 514 | conn_info(tconn, "sock was shut down by peer\n"); |
536 | } | 515 | } |
537 | 516 | ||
538 | if (rv != size) | 517 | if (rv != size) |
539 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | 518 | conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); |
540 | 519 | ||
541 | out: | 520 | out: |
542 | return rv; | 521 | return rv; |
543 | } | 522 | } |
544 | 523 | ||
524 | static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size) | ||
525 | { | ||
526 | int err; | ||
527 | |||
528 | err = drbd_recv(tconn, buf, size); | ||
529 | if (err != size) { | ||
530 | if (err >= 0) | ||
531 | err = -EIO; | ||
532 | } else | ||
533 | err = 0; | ||
534 | return err; | ||
535 | } | ||
536 | |||
537 | static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size) | ||
538 | { | ||
539 | int err; | ||
540 | |||
541 | err = drbd_recv_all(tconn, buf, size); | ||
542 | if (err && !signal_pending(current)) | ||
543 | conn_warn(tconn, "short read (expected size %d)\n", (int)size); | ||
544 | return err; | ||
545 | } | ||
546 | |||
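
drbd_recv_all() deliberately turns a short read into an error code instead of reporting a byte count, so callers only have to test for non-zero. A rough POSIX sketch of that contract, using an invented helper name recv_all() and an AF_UNIX socketpair for the demo:

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/* Return 0 when exactly 'size' bytes arrived, a negative errno otherwise.
 * MSG_WAITALL already asks for the full amount, but a signal or an orderly
 * shutdown by the peer can still leave us with a short read. */
static int recv_all(int fd, void *buf, size_t size)
{
        ssize_t rv = recv(fd, buf, size, MSG_WAITALL);

        if (rv < 0)
                return -errno;           /* real socket error */
        if ((size_t)rv != size)
                return -EIO;             /* peer closed, or short read */
        return 0;
}

int main(void)
{
        int sv[2];
        char buf[5];

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
                return 1;

        (void)write(sv[0], "hello", 5);
        printf("full read:  %d\n", recv_all(sv[1], buf, sizeof(buf)));  /* 0 */

        close(sv[0]);                    /* peer goes away */
        printf("short read: %d\n", recv_all(sv[1], buf, sizeof(buf)));  /* -EIO */
        return 0;
}
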
545 | /* quoting tcp(7): | 547 | /* quoting tcp(7): |
546 | * On individual connections, the socket buffer size must be set prior to the | 548 | * On individual connections, the socket buffer size must be set prior to the |
547 | * listen(2) or connect(2) calls in order to have it take effect. | 549 | * listen(2) or connect(2) calls in order to have it take effect. |
@@ -561,29 +563,50 @@ static void drbd_setbufsize(struct socket *sock, unsigned int snd, | |||
561 | } | 563 | } |
562 | } | 564 | } |
563 | 565 | ||
564 | static struct socket *drbd_try_connect(struct drbd_conf *mdev) | 566 | static struct socket *drbd_try_connect(struct drbd_tconn *tconn) |
565 | { | 567 | { |
566 | const char *what; | 568 | const char *what; |
567 | struct socket *sock; | 569 | struct socket *sock; |
568 | struct sockaddr_in6 src_in6; | 570 | struct sockaddr_in6 src_in6; |
569 | int err; | 571 | struct sockaddr_in6 peer_in6; |
572 | struct net_conf *nc; | ||
573 | int err, peer_addr_len, my_addr_len; | ||
574 | int sndbuf_size, rcvbuf_size, connect_int; | ||
570 | int disconnect_on_error = 1; | 575 | int disconnect_on_error = 1; |
571 | 576 | ||
572 | if (!get_net_conf(mdev)) | 577 | rcu_read_lock(); |
578 | nc = rcu_dereference(tconn->net_conf); | ||
579 | if (!nc) { | ||
580 | rcu_read_unlock(); | ||
573 | return NULL; | 581 | return NULL; |
582 | } | ||
583 | sndbuf_size = nc->sndbuf_size; | ||
584 | rcvbuf_size = nc->rcvbuf_size; | ||
585 | connect_int = nc->connect_int; | ||
586 | rcu_read_unlock(); | ||
587 | |||
588 | my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6)); | ||
589 | memcpy(&src_in6, &tconn->my_addr, my_addr_len); | ||
590 | |||
591 | if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6) | ||
592 | src_in6.sin6_port = 0; | ||
593 | else | ||
594 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
595 | |||
596 | peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6)); | ||
597 | memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len); | ||
574 | 598 | ||
575 | what = "sock_create_kern"; | 599 | what = "sock_create_kern"; |
576 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | 600 | err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, |
577 | SOCK_STREAM, IPPROTO_TCP, &sock); | 601 | SOCK_STREAM, IPPROTO_TCP, &sock); |
578 | if (err < 0) { | 602 | if (err < 0) { |
579 | sock = NULL; | 603 | sock = NULL; |
580 | goto out; | 604 | goto out; |
581 | } | 605 | } |
582 | 606 | ||
583 | sock->sk->sk_rcvtimeo = | 607 | sock->sk->sk_rcvtimeo = |
584 | sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; | 608 | sock->sk->sk_sndtimeo = connect_int * HZ; |
585 | drbd_setbufsize(sock, mdev->net_conf->sndbuf_size, | 609 | drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); |
586 | mdev->net_conf->rcvbuf_size); | ||
587 | 610 | ||
588 | /* explicitly bind to the configured IP as source IP | 611 | /* explicitly bind to the configured IP as source IP |
589 | * for the outgoing connections. | 612 | * for the outgoing connections. |
@@ -592,17 +615,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) | |||
592 | * Make sure to use 0 as port number, so linux selects | 615 | * Make sure to use 0 as port number, so linux selects |
593 | * a free one dynamically. | 616 | * a free one dynamically. |
594 | */ | 617 | */ |
595 | memcpy(&src_in6, mdev->net_conf->my_addr, | ||
596 | min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); | ||
597 | if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) | ||
598 | src_in6.sin6_port = 0; | ||
599 | else | ||
600 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
601 | |||
602 | what = "bind before connect"; | 618 | what = "bind before connect"; |
603 | err = sock->ops->bind(sock, | 619 | err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); |
604 | (struct sockaddr *) &src_in6, | ||
605 | mdev->net_conf->my_addr_len); | ||
606 | if (err < 0) | 620 | if (err < 0) |
607 | goto out; | 621 | goto out; |
608 | 622 | ||
@@ -610,9 +624,7 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) | |||
610 | * stay C_WF_CONNECTION, don't go Disconnecting! */ | 624 | * stay C_WF_CONNECTION, don't go Disconnecting! */ |
611 | disconnect_on_error = 0; | 625 | disconnect_on_error = 0; |
612 | what = "connect"; | 626 | what = "connect"; |
613 | err = sock->ops->connect(sock, | 627 | err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); |
614 | (struct sockaddr *)mdev->net_conf->peer_addr, | ||
615 | mdev->net_conf->peer_addr_len, 0); | ||
616 | 628 | ||
617 | out: | 629 | out: |
618 | if (err < 0) { | 630 | if (err < 0) { |
@@ -630,91 +642,174 @@ out: | |||
630 | disconnect_on_error = 0; | 642 | disconnect_on_error = 0; |
631 | break; | 643 | break; |
632 | default: | 644 | default: |
633 | dev_err(DEV, "%s failed, err = %d\n", what, err); | 645 | conn_err(tconn, "%s failed, err = %d\n", what, err); |
634 | } | 646 | } |
635 | if (disconnect_on_error) | 647 | if (disconnect_on_error) |
636 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 648 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
637 | } | 649 | } |
638 | put_net_conf(mdev); | 650 | |
639 | return sock; | 651 | return sock; |
640 | } | 652 | } |
641 | 653 | ||
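
drbd_try_connect() binds the outgoing socket to the locally configured address with port 0 before calling connect(), so the peer always sees the configured source IP while the kernel still picks a free source port. An IPv4-only user-space sketch of that order of operations; both addresses are placeholders, not anything taken from a DRBD configuration. On a host that does not own 192.0.2.1 the bind() fails with EADDRNOTAVAIL, which corresponds to the "bind before connect" error path logged above.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in src = { 0 }, peer = { 0 };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;

        src.sin_family = AF_INET;
        src.sin_port   = 0;                               /* let the kernel pick a port */
        inet_pton(AF_INET, "192.0.2.1", &src.sin_addr);   /* configured source address */

        peer.sin_family = AF_INET;
        peer.sin_port   = htons(7789);                    /* peer's replication port */
        inet_pton(AF_INET, "192.0.2.2", &peer.sin_addr);

        /* bind first, so outgoing packets carry the configured source IP */
        if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0)
                perror("bind before connect");
        else if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
                perror("connect");
        else
                printf("connected\n");

        close(fd);
        return 0;
}
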
642 | static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) | 654 | struct accept_wait_data { |
655 | struct drbd_tconn *tconn; | ||
656 | struct socket *s_listen; | ||
657 | struct completion door_bell; | ||
658 | void (*original_sk_state_change)(struct sock *sk); | ||
659 | |||
660 | }; | ||
661 | |||
662 | static void drbd_incoming_connection(struct sock *sk) | ||
663 | { | ||
664 | struct accept_wait_data *ad = sk->sk_user_data; | ||
665 | void (*state_change)(struct sock *sk); | ||
666 | |||
667 | state_change = ad->original_sk_state_change; | ||
668 | if (sk->sk_state == TCP_ESTABLISHED) | ||
669 | complete(&ad->door_bell); | ||
670 | state_change(sk); | ||
671 | } | ||
672 | |||
673 | static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad) | ||
643 | { | 674 | { |
644 | int timeo, err; | 675 | int err, sndbuf_size, rcvbuf_size, my_addr_len; |
645 | struct socket *s_estab = NULL, *s_listen; | 676 | struct sockaddr_in6 my_addr; |
677 | struct socket *s_listen; | ||
678 | struct net_conf *nc; | ||
646 | const char *what; | 679 | const char *what; |
647 | 680 | ||
648 | if (!get_net_conf(mdev)) | 681 | rcu_read_lock(); |
649 | return NULL; | 682 | nc = rcu_dereference(tconn->net_conf); |
683 | if (!nc) { | ||
684 | rcu_read_unlock(); | ||
685 | return -EIO; | ||
686 | } | ||
687 | sndbuf_size = nc->sndbuf_size; | ||
688 | rcvbuf_size = nc->rcvbuf_size; | ||
689 | rcu_read_unlock(); | ||
690 | |||
691 | my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6)); | ||
692 | memcpy(&my_addr, &tconn->my_addr, my_addr_len); | ||
650 | 693 | ||
651 | what = "sock_create_kern"; | 694 | what = "sock_create_kern"; |
652 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | 695 | err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, |
653 | SOCK_STREAM, IPPROTO_TCP, &s_listen); | 696 | SOCK_STREAM, IPPROTO_TCP, &s_listen); |
654 | if (err) { | 697 | if (err) { |
655 | s_listen = NULL; | 698 | s_listen = NULL; |
656 | goto out; | 699 | goto out; |
657 | } | 700 | } |
658 | 701 | ||
659 | timeo = mdev->net_conf->try_connect_int * HZ; | 702 | s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
660 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | 703 | drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); |
661 | |||
662 | s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | ||
663 | s_listen->sk->sk_rcvtimeo = timeo; | ||
664 | s_listen->sk->sk_sndtimeo = timeo; | ||
665 | drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, | ||
666 | mdev->net_conf->rcvbuf_size); | ||
667 | 704 | ||
668 | what = "bind before listen"; | 705 | what = "bind before listen"; |
669 | err = s_listen->ops->bind(s_listen, | 706 | err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); |
670 | (struct sockaddr *) mdev->net_conf->my_addr, | ||
671 | mdev->net_conf->my_addr_len); | ||
672 | if (err < 0) | 707 | if (err < 0) |
673 | goto out; | 708 | goto out; |
674 | 709 | ||
675 | err = drbd_accept(mdev, &what, s_listen, &s_estab); | 710 | ad->s_listen = s_listen; |
711 | write_lock_bh(&s_listen->sk->sk_callback_lock); | ||
712 | ad->original_sk_state_change = s_listen->sk->sk_state_change; | ||
713 | s_listen->sk->sk_state_change = drbd_incoming_connection; | ||
714 | s_listen->sk->sk_user_data = ad; | ||
715 | write_unlock_bh(&s_listen->sk->sk_callback_lock); | ||
716 | |||
717 | what = "listen"; | ||
718 | err = s_listen->ops->listen(s_listen, 5); | ||
719 | if (err < 0) | ||
720 | goto out; | ||
676 | 721 | ||
722 | return 0; | ||
677 | out: | 723 | out: |
678 | if (s_listen) | 724 | if (s_listen) |
679 | sock_release(s_listen); | 725 | sock_release(s_listen); |
680 | if (err < 0) { | 726 | if (err < 0) { |
681 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | 727 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { |
682 | dev_err(DEV, "%s failed, err = %d\n", what, err); | 728 | conn_err(tconn, "%s failed, err = %d\n", what, err); |
683 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 729 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
684 | } | 730 | } |
685 | } | 731 | } |
686 | put_net_conf(mdev); | ||
687 | 732 | ||
688 | return s_estab; | 733 | return -EIO; |
689 | } | 734 | } |
690 | 735 | ||
691 | static int drbd_send_fp(struct drbd_conf *mdev, | 736 | static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) |
692 | struct socket *sock, enum drbd_packets cmd) | ||
693 | { | 737 | { |
694 | struct p_header80 *h = &mdev->data.sbuf.header.h80; | 738 | write_lock_bh(&sk->sk_callback_lock); |
695 | 739 | sk->sk_state_change = ad->original_sk_state_change; | |
696 | return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); | 740 | sk->sk_user_data = NULL; |
741 | write_unlock_bh(&sk->sk_callback_lock); | ||
697 | } | 742 | } |
698 | 743 | ||
699 | static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) | 744 | static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad) |
700 | { | 745 | { |
701 | struct p_header80 *h = &mdev->data.rbuf.header.h80; | 746 | int timeo, connect_int, err = 0; |
702 | int rr; | 747 | struct socket *s_estab = NULL; |
748 | struct net_conf *nc; | ||
749 | |||
750 | rcu_read_lock(); | ||
751 | nc = rcu_dereference(tconn->net_conf); | ||
752 | if (!nc) { | ||
753 | rcu_read_unlock(); | ||
754 | return NULL; | ||
755 | } | ||
756 | connect_int = nc->connect_int; | ||
757 | rcu_read_unlock(); | ||
758 | |||
759 | timeo = connect_int * HZ; | ||
760 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | ||
703 | 761 | ||
704 | rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); | 762 | err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); |
763 | if (err <= 0) | ||
764 | return NULL; | ||
705 | 765 | ||
706 | if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) | 766 | err = kernel_accept(ad->s_listen, &s_estab, 0); |
707 | return be16_to_cpu(h->command); | 767 | if (err < 0) { |
768 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | ||
769 | conn_err(tconn, "accept failed, err = %d\n", err); | ||
770 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
771 | } | ||
772 | } | ||
708 | 773 | ||
709 | return 0xffff; | 774 | if (s_estab) |
775 | unregister_state_change(s_estab->sk, ad); | ||
776 | |||
777 | return s_estab; | ||
778 | } | ||
779 | |||
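
drbd_wait_for_connect() no longer blocks in accept() with a receive timeout: prepare_listen_socket() hooks sk_state_change so that an established connection completes ad->door_bell, and the wait happens on that completion with the connect_int timeout (plus the 28.5% random jitter) before kernel_accept() picks the queued connection up. The closest plain POSIX analogue is a timed poll() on the listening descriptor followed by accept(); wait_for_peer() below is an invented helper, not the driver function:

#include <poll.h>
#include <sys/socket.h>

/* Return the accepted fd, 0 on timeout, or -1 on error. */
static int wait_for_peer(int listen_fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = listen_fd, .events = POLLIN };
        int rv = poll(&pfd, 1, timeout_ms);

        if (rv <= 0)
                return rv;                        /* 0: nobody rang the door bell */
        return accept(listen_fd, NULL, NULL);     /* connection is already queued */
}
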
780 | static int decode_header(struct drbd_tconn *, void *, struct packet_info *); | ||
781 | |||
782 | static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock, | ||
783 | enum drbd_packet cmd) | ||
784 | { | ||
785 | if (!conn_prepare_command(tconn, sock)) | ||
786 | return -EIO; | ||
787 | return conn_send_command(tconn, sock, cmd, 0, NULL, 0); | ||
788 | } | ||
789 | |||
790 | static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock) | ||
791 | { | ||
792 | unsigned int header_size = drbd_header_size(tconn); | ||
793 | struct packet_info pi; | ||
794 | int err; | ||
795 | |||
796 | err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0); | ||
797 | if (err != header_size) { | ||
798 | if (err >= 0) | ||
799 | err = -EIO; | ||
800 | return err; | ||
801 | } | ||
802 | err = decode_header(tconn, tconn->data.rbuf, &pi); | ||
803 | if (err) | ||
804 | return err; | ||
805 | return pi.cmd; | ||
710 | } | 806 | } |
711 | 807 | ||
712 | /** | 808 | /** |
713 | * drbd_socket_okay() - Free the socket if its connection is not okay | 809 | * drbd_socket_okay() - Free the socket if its connection is not okay |
714 | * @mdev: DRBD device. | ||
715 | * @sock: pointer to the pointer to the socket. | 810 | * @sock: pointer to the pointer to the socket. |
716 | */ | 811 | */ |
717 | static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | 812 | static int drbd_socket_okay(struct socket **sock) |
718 | { | 813 | { |
719 | int rr; | 814 | int rr; |
720 | char tb[4]; | 815 | char tb[4]; |
@@ -722,7 +817,7 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
722 | if (!*sock) | 817 | if (!*sock) |
723 | return false; | 818 | return false; |
724 | 819 | ||
725 | rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); | 820 | rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); |
726 | 821 | ||
727 | if (rr > 0 || rr == -EAGAIN) { | 822 | if (rr > 0 || rr == -EAGAIN) { |
728 | return true; | 823 | return true; |
@@ -732,6 +827,31 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
732 | return false; | 827 | return false; |
733 | } | 828 | } |
734 | } | 829 | } |
830 | /* Gets called if a connection is established, or if a new minor gets created | ||
831 | in a connection */ | ||
832 | int drbd_connected(struct drbd_conf *mdev) | ||
833 | { | ||
834 | int err; | ||
835 | |||
836 | atomic_set(&mdev->packet_seq, 0); | ||
837 | mdev->peer_seq = 0; | ||
838 | |||
839 | mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ? | ||
840 | &mdev->tconn->cstate_mutex : | ||
841 | &mdev->own_state_mutex; | ||
842 | |||
843 | err = drbd_send_sync_param(mdev); | ||
844 | if (!err) | ||
845 | err = drbd_send_sizes(mdev, 0, 0); | ||
846 | if (!err) | ||
847 | err = drbd_send_uuids(mdev); | ||
848 | if (!err) | ||
849 | err = drbd_send_current_state(mdev); | ||
850 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
851 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
852 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | ||
853 | return err; | ||
854 | } | ||
735 | 855 | ||
736 | /* | 856 | /* |
737 | * return values: | 857 | * return values: |
@@ -741,232 +861,305 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
741 | * no point in trying again, please go standalone. | 861 | * no point in trying again, please go standalone. |
742 | * -2 We do not have a network config... | 862 | * -2 We do not have a network config... |
743 | */ | 863 | */ |
744 | static int drbd_connect(struct drbd_conf *mdev) | 864 | static int conn_connect(struct drbd_tconn *tconn) |
745 | { | 865 | { |
746 | struct socket *s, *sock, *msock; | 866 | struct drbd_socket sock, msock; |
747 | int try, h, ok; | 867 | struct drbd_conf *mdev; |
868 | struct net_conf *nc; | ||
869 | int vnr, timeout, h, ok; | ||
870 | bool discard_my_data; | ||
748 | enum drbd_state_rv rv; | 871 | enum drbd_state_rv rv; |
872 | struct accept_wait_data ad = { | ||
873 | .tconn = tconn, | ||
874 | .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), | ||
875 | }; | ||
749 | 876 | ||
750 | D_ASSERT(!mdev->data.socket); | 877 | clear_bit(DISCONNECT_SENT, &tconn->flags); |
751 | 878 | if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) | |
752 | drbd_clear_flag(mdev, DISCONNECT_SENT); | ||
753 | if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) | ||
754 | return -2; | 879 | return -2; |
755 | 880 | ||
756 | sock = NULL; | 881 | mutex_init(&sock.mutex); |
757 | msock = NULL; | 882 | sock.sbuf = tconn->data.sbuf; |
883 | sock.rbuf = tconn->data.rbuf; | ||
884 | sock.socket = NULL; | ||
885 | mutex_init(&msock.mutex); | ||
886 | msock.sbuf = tconn->meta.sbuf; | ||
887 | msock.rbuf = tconn->meta.rbuf; | ||
888 | msock.socket = NULL; | ||
889 | |||
890 | /* Assume that the peer only understands protocol 80 until we know better. */ | ||
891 | tconn->agreed_pro_version = 80; | ||
892 | |||
893 | if (prepare_listen_socket(tconn, &ad)) | ||
894 | return 0; | ||
758 | 895 | ||
759 | do { | 896 | do { |
760 | for (try = 0;;) { | 897 | struct socket *s; |
761 | /* 3 tries, this should take less than a second! */ | ||
762 | s = drbd_try_connect(mdev); | ||
763 | if (s || ++try >= 3) | ||
764 | break; | ||
765 | /* give the other side time to call bind() & listen() */ | ||
766 | schedule_timeout_interruptible(HZ / 10); | ||
767 | } | ||
768 | 898 | ||
899 | s = drbd_try_connect(tconn); | ||
769 | if (s) { | 900 | if (s) { |
770 | if (!sock) { | 901 | if (!sock.socket) { |
771 | drbd_send_fp(mdev, s, P_HAND_SHAKE_S); | 902 | sock.socket = s; |
772 | sock = s; | 903 | send_first_packet(tconn, &sock, P_INITIAL_DATA); |
773 | s = NULL; | 904 | } else if (!msock.socket) { |
774 | } else if (!msock) { | 905 | clear_bit(RESOLVE_CONFLICTS, &tconn->flags); |
775 | drbd_clear_flag(mdev, DISCARD_CONCURRENT); | 906 | msock.socket = s; |
776 | drbd_send_fp(mdev, s, P_HAND_SHAKE_M); | 907 | send_first_packet(tconn, &msock, P_INITIAL_META); |
777 | msock = s; | ||
778 | s = NULL; | ||
779 | } else { | 908 | } else { |
780 | dev_err(DEV, "Logic error in drbd_connect()\n"); | 909 | conn_err(tconn, "Logic error in conn_connect()\n"); |
781 | goto out_release_sockets; | 910 | goto out_release_sockets; |
782 | } | 911 | } |
783 | } | 912 | } |
784 | 913 | ||
785 | if (sock && msock) { | 914 | if (sock.socket && msock.socket) { |
786 | schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10); | 915 | rcu_read_lock(); |
787 | ok = drbd_socket_okay(mdev, &sock); | 916 | nc = rcu_dereference(tconn->net_conf); |
788 | ok = drbd_socket_okay(mdev, &msock) && ok; | 917 | timeout = nc->ping_timeo * HZ / 10; |
918 | rcu_read_unlock(); | ||
919 | schedule_timeout_interruptible(timeout); | ||
920 | ok = drbd_socket_okay(&sock.socket); | ||
921 | ok = drbd_socket_okay(&msock.socket) && ok; | ||
789 | if (ok) | 922 | if (ok) |
790 | break; | 923 | break; |
791 | } | 924 | } |
792 | 925 | ||
793 | retry: | 926 | retry: |
794 | s = drbd_wait_for_connect(mdev); | 927 | s = drbd_wait_for_connect(tconn, &ad); |
795 | if (s) { | 928 | if (s) { |
796 | try = drbd_recv_fp(mdev, s); | 929 | int fp = receive_first_packet(tconn, s); |
797 | drbd_socket_okay(mdev, &sock); | 930 | drbd_socket_okay(&sock.socket); |
798 | drbd_socket_okay(mdev, &msock); | 931 | drbd_socket_okay(&msock.socket); |
799 | switch (try) { | 932 | switch (fp) { |
800 | case P_HAND_SHAKE_S: | 933 | case P_INITIAL_DATA: |
801 | if (sock) { | 934 | if (sock.socket) { |
802 | dev_warn(DEV, "initial packet S crossed\n"); | 935 | conn_warn(tconn, "initial packet S crossed\n"); |
803 | sock_release(sock); | 936 | sock_release(sock.socket); |
937 | sock.socket = s; | ||
938 | goto randomize; | ||
804 | } | 939 | } |
805 | sock = s; | 940 | sock.socket = s; |
806 | break; | 941 | break; |
807 | case P_HAND_SHAKE_M: | 942 | case P_INITIAL_META: |
808 | if (msock) { | 943 | set_bit(RESOLVE_CONFLICTS, &tconn->flags); |
809 | dev_warn(DEV, "initial packet M crossed\n"); | 944 | if (msock.socket) { |
810 | sock_release(msock); | 945 | conn_warn(tconn, "initial packet M crossed\n"); |
946 | sock_release(msock.socket); | ||
947 | msock.socket = s; | ||
948 | goto randomize; | ||
811 | } | 949 | } |
812 | msock = s; | 950 | msock.socket = s; |
813 | drbd_set_flag(mdev, DISCARD_CONCURRENT); | ||
814 | break; | 951 | break; |
815 | default: | 952 | default: |
816 | dev_warn(DEV, "Error receiving initial packet\n"); | 953 | conn_warn(tconn, "Error receiving initial packet\n"); |
817 | sock_release(s); | 954 | sock_release(s); |
955 | randomize: | ||
818 | if (random32() & 1) | 956 | if (random32() & 1) |
819 | goto retry; | 957 | goto retry; |
820 | } | 958 | } |
821 | } | 959 | } |
822 | 960 | ||
823 | if (mdev->state.conn <= C_DISCONNECTING) | 961 | if (tconn->cstate <= C_DISCONNECTING) |
824 | goto out_release_sockets; | 962 | goto out_release_sockets; |
825 | if (signal_pending(current)) { | 963 | if (signal_pending(current)) { |
826 | flush_signals(current); | 964 | flush_signals(current); |
827 | smp_rmb(); | 965 | smp_rmb(); |
828 | if (get_t_state(&mdev->receiver) == Exiting) | 966 | if (get_t_state(&tconn->receiver) == EXITING) |
829 | goto out_release_sockets; | 967 | goto out_release_sockets; |
830 | } | 968 | } |
831 | 969 | ||
832 | if (sock && msock) { | 970 | ok = drbd_socket_okay(&sock.socket); |
833 | ok = drbd_socket_okay(mdev, &sock); | 971 | ok = drbd_socket_okay(&msock.socket) && ok; |
834 | ok = drbd_socket_okay(mdev, &msock) && ok; | 972 | } while (!ok); |
835 | if (ok) | ||
836 | break; | ||
837 | } | ||
838 | } while (1); | ||
839 | 973 | ||
840 | msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | 974 | if (ad.s_listen) |
841 | sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | 975 | sock_release(ad.s_listen); |
842 | 976 | ||
843 | sock->sk->sk_allocation = GFP_NOIO; | 977 | sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
844 | msock->sk->sk_allocation = GFP_NOIO; | 978 | msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
845 | 979 | ||
846 | sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | 980 | sock.socket->sk->sk_allocation = GFP_NOIO; |
847 | msock->sk->sk_priority = TC_PRIO_INTERACTIVE; | 981 | msock.socket->sk->sk_allocation = GFP_NOIO; |
982 | |||
983 | sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | ||
984 | msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; | ||
848 | 985 | ||
849 | /* NOT YET ... | 986 | /* NOT YET ... |
850 | * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 987 | * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10; |
851 | * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | 988 | * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
852 | * first set it to the P_HAND_SHAKE timeout, | 989 | * first set it to the P_CONNECTION_FEATURES timeout, |
853 | * which we set to 4x the configured ping_timeout. */ | 990 | * which we set to 4x the configured ping_timeout. */ |
854 | sock->sk->sk_sndtimeo = | 991 | rcu_read_lock(); |
855 | sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; | 992 | nc = rcu_dereference(tconn->net_conf); |
993 | |||
994 | sock.socket->sk->sk_sndtimeo = | ||
995 | sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; | ||
996 | |||
997 | msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; | ||
998 | timeout = nc->timeout * HZ / 10; | ||
999 | discard_my_data = nc->discard_my_data; | ||
1000 | rcu_read_unlock(); | ||
856 | 1001 | ||
857 | msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 1002 | msock.socket->sk->sk_sndtimeo = timeout; |
858 | msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
859 | 1003 | ||
860 | /* we don't want delays. | 1004 | /* we don't want delays. |
861 | * we use TCP_CORK where appropriate, though */ | 1005 | * we use TCP_CORK where appropriate, though */ |
862 | drbd_tcp_nodelay(sock); | 1006 | drbd_tcp_nodelay(sock.socket); |
863 | drbd_tcp_nodelay(msock); | 1007 | drbd_tcp_nodelay(msock.socket); |
864 | 1008 | ||
865 | mdev->data.socket = sock; | 1009 | tconn->data.socket = sock.socket; |
866 | mdev->meta.socket = msock; | 1010 | tconn->meta.socket = msock.socket; |
867 | mdev->last_received = jiffies; | 1011 | tconn->last_received = jiffies; |
868 | 1012 | ||
869 | D_ASSERT(mdev->asender.task == NULL); | 1013 | h = drbd_do_features(tconn); |
870 | |||
871 | h = drbd_do_handshake(mdev); | ||
872 | if (h <= 0) | 1014 | if (h <= 0) |
873 | return h; | 1015 | return h; |
874 | 1016 | ||
875 | if (mdev->cram_hmac_tfm) { | 1017 | if (tconn->cram_hmac_tfm) { |
876 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ | 1018 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ |
877 | switch (drbd_do_auth(mdev)) { | 1019 | switch (drbd_do_auth(tconn)) { |
878 | case -1: | 1020 | case -1: |
879 | dev_err(DEV, "Authentication of peer failed\n"); | 1021 | conn_err(tconn, "Authentication of peer failed\n"); |
880 | return -1; | 1022 | return -1; |
881 | case 0: | 1023 | case 0: |
882 | dev_err(DEV, "Authentication of peer failed, trying again.\n"); | 1024 | conn_err(tconn, "Authentication of peer failed, trying again.\n"); |
883 | return 0; | 1025 | return 0; |
884 | } | 1026 | } |
885 | } | 1027 | } |
886 | 1028 | ||
887 | sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 1029 | tconn->data.socket->sk->sk_sndtimeo = timeout; |
888 | sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | 1030 | tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
889 | |||
890 | atomic_set(&mdev->packet_seq, 0); | ||
891 | mdev->peer_seq = 0; | ||
892 | 1031 | ||
893 | if (drbd_send_protocol(mdev) == -1) | 1032 | if (drbd_send_protocol(tconn) == -EOPNOTSUPP) |
894 | return -1; | 1033 | return -1; |
895 | drbd_set_flag(mdev, STATE_SENT); | 1034 | |
896 | drbd_send_sync_param(mdev, &mdev->sync_conf); | 1035 | set_bit(STATE_SENT, &tconn->flags); |
897 | drbd_send_sizes(mdev, 0, 0); | 1036 | |
898 | drbd_send_uuids(mdev); | 1037 | rcu_read_lock(); |
899 | drbd_send_current_state(mdev); | 1038 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
900 | drbd_clear_flag(mdev, USE_DEGR_WFC_T); | 1039 | kref_get(&mdev->kref); |
901 | drbd_clear_flag(mdev, RESIZE_PENDING); | 1040 | rcu_read_unlock(); |
902 | 1041 | ||
903 | spin_lock_irq(&mdev->req_lock); | 1042 | if (discard_my_data) |
904 | rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); | 1043 | set_bit(DISCARD_MY_DATA, &mdev->flags); |
905 | if (mdev->state.conn != C_WF_REPORT_PARAMS) | 1044 | else |
906 | drbd_clear_flag(mdev, STATE_SENT); | 1045 | clear_bit(DISCARD_MY_DATA, &mdev->flags); |
907 | spin_unlock_irq(&mdev->req_lock); | 1046 | |
908 | 1047 | drbd_connected(mdev); | |
909 | if (rv < SS_SUCCESS) | 1048 | kref_put(&mdev->kref, &drbd_minor_destroy); |
1049 | rcu_read_lock(); | ||
1050 | } | ||
1051 | rcu_read_unlock(); | ||
1052 | |||
1053 | rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); | ||
1054 | if (rv < SS_SUCCESS) { | ||
1055 | clear_bit(STATE_SENT, &tconn->flags); | ||
910 | return 0; | 1056 | return 0; |
1057 | } | ||
911 | 1058 | ||
912 | drbd_thread_start(&mdev->asender); | 1059 | drbd_thread_start(&tconn->asender); |
913 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | ||
914 | 1060 | ||
915 | return 1; | 1061 | mutex_lock(&tconn->conf_update); |
1062 | /* The discard_my_data flag is a single-shot modifier to the next | ||
1063 | * connection attempt, the handshake of which is now well underway. | ||
1064 | * No need for rcu style copying of the whole struct | ||
1065 | * just to clear a single value. */ | ||
1066 | tconn->net_conf->discard_my_data = 0; | ||
1067 | mutex_unlock(&tconn->conf_update); | ||
1068 | |||
1069 | return h; | ||
916 | 1070 | ||
917 | out_release_sockets: | 1071 | out_release_sockets: |
918 | if (sock) | 1072 | if (ad.s_listen) |
919 | sock_release(sock); | 1073 | sock_release(ad.s_listen); |
920 | if (msock) | 1074 | if (sock.socket) |
921 | sock_release(msock); | 1075 | sock_release(sock.socket); |
1076 | if (msock.socket) | ||
1077 | sock_release(msock.socket); | ||
922 | return -1; | 1078 | return -1; |
923 | } | 1079 | } |
924 | 1080 | ||
925 | static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size) | 1081 | static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi) |
926 | { | 1082 | { |
927 | union p_header *h = &mdev->data.rbuf.header; | 1083 | unsigned int header_size = drbd_header_size(tconn); |
928 | int r; | 1084 | |
929 | 1085 | if (header_size == sizeof(struct p_header100) && | |
930 | r = drbd_recv(mdev, h, sizeof(*h)); | 1086 | *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { |
931 | if (unlikely(r != sizeof(*h))) { | 1087 | struct p_header100 *h = header; |
932 | if (!signal_pending(current)) | 1088 | if (h->pad != 0) { |
933 | dev_warn(DEV, "short read expecting header on sock: r=%d\n", r); | 1089 | conn_err(tconn, "Header padding is not zero\n"); |
934 | return false; | 1090 | return -EINVAL; |
935 | } | 1091 | } |
936 | 1092 | pi->vnr = be16_to_cpu(h->volume); | |
937 | if (likely(h->h80.magic == BE_DRBD_MAGIC)) { | 1093 | pi->cmd = be16_to_cpu(h->command); |
938 | *cmd = be16_to_cpu(h->h80.command); | 1094 | pi->size = be32_to_cpu(h->length); |
939 | *packet_size = be16_to_cpu(h->h80.length); | 1095 | } else if (header_size == sizeof(struct p_header95) && |
940 | } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) { | 1096 | *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { |
941 | *cmd = be16_to_cpu(h->h95.command); | 1097 | struct p_header95 *h = header; |
942 | *packet_size = be32_to_cpu(h->h95.length); | 1098 | pi->cmd = be16_to_cpu(h->command); |
1099 | pi->size = be32_to_cpu(h->length); | ||
1100 | pi->vnr = 0; | ||
1101 | } else if (header_size == sizeof(struct p_header80) && | ||
1102 | *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { | ||
1103 | struct p_header80 *h = header; | ||
1104 | pi->cmd = be16_to_cpu(h->command); | ||
1105 | pi->size = be16_to_cpu(h->length); | ||
1106 | pi->vnr = 0; | ||
943 | } else { | 1107 | } else { |
944 | dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n", | 1108 | conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n", |
945 | be32_to_cpu(h->h80.magic), | 1109 | be32_to_cpu(*(__be32 *)header), |
946 | be16_to_cpu(h->h80.command), | 1110 | tconn->agreed_pro_version); |
947 | be16_to_cpu(h->h80.length)); | 1111 | return -EINVAL; |
948 | return false; | ||
949 | } | 1112 | } |
950 | mdev->last_received = jiffies; | 1113 | pi->data = header + header_size; |
1114 | return 0; | ||
1115 | } | ||
1116 | |||
1117 | static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi) | ||
1118 | { | ||
1119 | void *buffer = tconn->data.rbuf; | ||
1120 | int err; | ||
1121 | |||
1122 | err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn)); | ||
1123 | if (err) | ||
1124 | return err; | ||
1125 | |||
1126 | err = decode_header(tconn, buffer, pi); | ||
1127 | tconn->last_received = jiffies; | ||
951 | 1128 | ||
952 | return true; | 1129 | return err; |
953 | } | 1130 | } |
954 | 1131 | ||
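decode_header() above picks the header layout by comparing the first bytes of the receive buffer, in big-endian wire order, against the magic value of each supported protocol generation, and only then interprets the command, length and volume fields. A small stand-alone sketch of that dispatch technique follows; the magic constant, field offsets and struct are invented for illustration and are not the real DRBD definitions.

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* htonl, htons, ntohl, ntohs */

#define SKETCH_MAGIC 0xdeadbeefu	/* placeholder; not a real DRBD magic value */

struct packet_info_sketch {
	uint16_t vnr;	/* volume number */
	uint16_t cmd;	/* command code */
	uint32_t size;	/* payload length */
};

/* Assumed wire layout: __be32 magic, __be16 volume, __be16 command, __be32 length. */
static int decode_header_sketch(const void *header, struct packet_info_sketch *pi)
{
	const uint8_t *p = header;
	uint32_t magic;

	memcpy(&magic, p, sizeof(magic));
	if (magic != htonl(SKETCH_MAGIC))
		return -1;	/* unknown format; the kernel code returns -EINVAL here */

	/* older header formats would be matched the same way, each with its own magic */
	memcpy(&pi->vnr, p + 4, sizeof(pi->vnr));
	memcpy(&pi->cmd, p + 6, sizeof(pi->cmd));
	memcpy(&pi->size, p + 8, sizeof(pi->size));
	pi->vnr = ntohs(pi->vnr);
	pi->cmd = ntohs(pi->cmd);
	pi->size = ntohl(pi->size);
	return 0;
}

int main(void)
{
	uint8_t buf[12];
	struct packet_info_sketch pi;
	uint32_t magic = htonl(SKETCH_MAGIC), length = htonl(4096);
	uint16_t volume = htons(0), command = htons(7);

	memcpy(buf + 0, &magic, 4);
	memcpy(buf + 4, &volume, 2);
	memcpy(buf + 6, &command, 2);
	memcpy(buf + 8, &length, 4);
	return decode_header_sketch(buf, &pi) == 0 && pi.size == 4096 ? 0 : 1;
}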
955 | static void drbd_flush(struct drbd_conf *mdev) | 1132 | static void drbd_flush(struct drbd_tconn *tconn) |
956 | { | 1133 | { |
957 | int rv; | 1134 | int rv; |
1135 | struct drbd_conf *mdev; | ||
1136 | int vnr; | ||
958 | 1137 | ||
959 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | 1138 | if (tconn->write_ordering >= WO_bdev_flush) { |
960 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_NOIO, | 1139 | rcu_read_lock(); |
961 | NULL); | 1140 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
962 | if (rv) { | 1141 | if (!get_ldev(mdev)) |
963 | dev_info(DEV, "local disk flush failed with status %d\n", rv); | 1142 | continue; |
964 | /* would rather check on EOPNOTSUPP, but that is not reliable. | 1143 | kref_get(&mdev->kref); |
965 | * don't try again for ANY return value != 0 | 1144 | rcu_read_unlock(); |
966 | * if (rv == -EOPNOTSUPP) */ | 1145 | |
967 | drbd_bump_write_ordering(mdev, WO_drain_io); | 1146 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, |
1147 | GFP_NOIO, NULL); | ||
1148 | if (rv) { | ||
1149 | dev_info(DEV, "local disk flush failed with status %d\n", rv); | ||
1150 | /* would rather check on EOPNOTSUPP, but that is not reliable. | ||
1151 | * don't try again for ANY return value != 0 | ||
1152 | * if (rv == -EOPNOTSUPP) */ | ||
1153 | drbd_bump_write_ordering(tconn, WO_drain_io); | ||
1154 | } | ||
1155 | put_ldev(mdev); | ||
1156 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1157 | |||
1158 | rcu_read_lock(); | ||
1159 | if (rv) | ||
1160 | break; | ||
968 | } | 1161 | } |
969 | put_ldev(mdev); | 1162 | rcu_read_unlock(); |
970 | } | 1163 | } |
971 | } | 1164 | } |
972 | 1165 | ||
@@ -976,7 +1169,7 @@ static void drbd_flush(struct drbd_conf *mdev) | |||
976 | * @epoch: Epoch object. | 1169 | * @epoch: Epoch object. |
977 | * @ev: Epoch event. | 1170 | * @ev: Epoch event. |
978 | */ | 1171 | */ |
979 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | 1172 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, |
980 | struct drbd_epoch *epoch, | 1173 | struct drbd_epoch *epoch, |
981 | enum epoch_event ev) | 1174 | enum epoch_event ev) |
982 | { | 1175 | { |
@@ -984,7 +1177,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
984 | struct drbd_epoch *next_epoch; | 1177 | struct drbd_epoch *next_epoch; |
985 | enum finish_epoch rv = FE_STILL_LIVE; | 1178 | enum finish_epoch rv = FE_STILL_LIVE; |
986 | 1179 | ||
987 | spin_lock(&mdev->epoch_lock); | 1180 | spin_lock(&tconn->epoch_lock); |
988 | do { | 1181 | do { |
989 | next_epoch = NULL; | 1182 | next_epoch = NULL; |
990 | 1183 | ||
@@ -1006,18 +1199,22 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1006 | atomic_read(&epoch->active) == 0 && | 1199 | atomic_read(&epoch->active) == 0 && |
1007 | (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { | 1200 | (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { |
1008 | if (!(ev & EV_CLEANUP)) { | 1201 | if (!(ev & EV_CLEANUP)) { |
1009 | spin_unlock(&mdev->epoch_lock); | 1202 | spin_unlock(&tconn->epoch_lock); |
1010 | drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); | 1203 | drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size); |
1011 | spin_lock(&mdev->epoch_lock); | 1204 | spin_lock(&tconn->epoch_lock); |
1012 | } | 1205 | } |
1206 | #if 0 | ||
1207 | /* FIXME: dec unacked on connection, once we have | ||
1208 | * something to count pending connection packets in. */ | ||
1013 | if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) | 1209 | if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) |
1014 | dec_unacked(mdev); | 1210 | dec_unacked(epoch->tconn); |
1211 | #endif | ||
1015 | 1212 | ||
1016 | if (mdev->current_epoch != epoch) { | 1213 | if (tconn->current_epoch != epoch) { |
1017 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); | 1214 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); |
1018 | list_del(&epoch->list); | 1215 | list_del(&epoch->list); |
1019 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); | 1216 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); |
1020 | mdev->epochs--; | 1217 | tconn->epochs--; |
1021 | kfree(epoch); | 1218 | kfree(epoch); |
1022 | 1219 | ||
1023 | if (rv == FE_STILL_LIVE) | 1220 | if (rv == FE_STILL_LIVE) |
@@ -1028,7 +1225,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1028 | /* atomic_set(&epoch->active, 0); is already zero */ | 1225 | /* atomic_set(&epoch->active, 0); is already zero */ |
1029 | if (rv == FE_STILL_LIVE) | 1226 | if (rv == FE_STILL_LIVE) |
1030 | rv = FE_RECYCLED; | 1227 | rv = FE_RECYCLED; |
1031 | wake_up(&mdev->ee_wait); | ||
1032 | } | 1228 | } |
1033 | } | 1229 | } |
1034 | 1230 | ||
@@ -1038,40 +1234,52 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1038 | epoch = next_epoch; | 1234 | epoch = next_epoch; |
1039 | } while (1); | 1235 | } while (1); |
1040 | 1236 | ||
1041 | spin_unlock(&mdev->epoch_lock); | 1237 | spin_unlock(&tconn->epoch_lock); |
1042 | 1238 | ||
1043 | return rv; | 1239 | return rv; |
1044 | } | 1240 | } |
1045 | 1241 | ||
1046 | /** | 1242 | /** |
1047 | * drbd_bump_write_ordering() - Fall back to another write ordering method | 1243 | * drbd_bump_write_ordering() - Fall back to another write ordering method |
1048 | * @mdev: DRBD device. | 1244 | * @tconn: DRBD connection. |
1049 | * @wo: Write ordering method to try. | 1245 | * @wo: Write ordering method to try. |
1050 | */ | 1246 | */ |
1051 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) | 1247 | void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo) |
1052 | { | 1248 | { |
1249 | struct disk_conf *dc; | ||
1250 | struct drbd_conf *mdev; | ||
1053 | enum write_ordering_e pwo; | 1251 | enum write_ordering_e pwo; |
1252 | int vnr; | ||
1054 | static char *write_ordering_str[] = { | 1253 | static char *write_ordering_str[] = { |
1055 | [WO_none] = "none", | 1254 | [WO_none] = "none", |
1056 | [WO_drain_io] = "drain", | 1255 | [WO_drain_io] = "drain", |
1057 | [WO_bdev_flush] = "flush", | 1256 | [WO_bdev_flush] = "flush", |
1058 | }; | 1257 | }; |
1059 | 1258 | ||
1060 | pwo = mdev->write_ordering; | 1259 | pwo = tconn->write_ordering; |
1061 | wo = min(pwo, wo); | 1260 | wo = min(pwo, wo); |
1062 | if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) | 1261 | rcu_read_lock(); |
1063 | wo = WO_drain_io; | 1262 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
1064 | if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) | 1263 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
1065 | wo = WO_none; | 1264 | continue; |
1066 | mdev->write_ordering = wo; | 1265 | dc = rcu_dereference(mdev->ldev->disk_conf); |
1067 | if (pwo != mdev->write_ordering || wo == WO_bdev_flush) | 1266 | |
1068 | dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); | 1267 | if (wo == WO_bdev_flush && !dc->disk_flushes) |
1268 | wo = WO_drain_io; | ||
1269 | if (wo == WO_drain_io && !dc->disk_drain) | ||
1270 | wo = WO_none; | ||
1271 | put_ldev(mdev); | ||
1272 | } | ||
1273 | rcu_read_unlock(); | ||
1274 | tconn->write_ordering = wo; | ||
1275 | if (pwo != tconn->write_ordering || wo == WO_bdev_flush) | ||
1276 | conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]); | ||
1069 | } | 1277 | } |
1070 | 1278 | ||
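The fallback rule drbd_bump_write_ordering() applies above is: never upgrade beyond the method currently in use, and downgrade further whenever any attached volume has the corresponding capability disabled. A user-space sketch of that rule, with an invented volume_caps struct standing in for each volume's disk_conf:

#include <stdio.h>

enum write_ordering_e { WO_none, WO_drain_io, WO_bdev_flush };

struct volume_caps {		/* stands in for the disk_conf of one attached volume */
	int disk_flushes;
	int disk_drain;
};

static enum write_ordering_e
bump_write_ordering(enum write_ordering_e current_wo,
		    enum write_ordering_e wanted,
		    const struct volume_caps *vols, int n)
{
	/* never pick something stronger than what is already in use */
	enum write_ordering_e wo = wanted < current_wo ? wanted : current_wo;
	int i;

	for (i = 0; i < n; i++) {
		if (wo == WO_bdev_flush && !vols[i].disk_flushes)
			wo = WO_drain_io;
		if (wo == WO_drain_io && !vols[i].disk_drain)
			wo = WO_none;
	}
	return wo;
}

int main(void)
{
	struct volume_caps vols[] = { { 1, 1 }, { 0, 1 } };	/* second volume forbids flushes */
	enum write_ordering_e wo = bump_write_ordering(WO_bdev_flush, WO_bdev_flush, vols, 2);

	printf("resulting method: %d (0=none 1=drain 2=flush)\n", wo);	/* prints 1 */
	return 0;
}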
1071 | /** | 1279 | /** |
1072 | * drbd_submit_ee() | 1280 | * drbd_submit_peer_request() |
1073 | * @mdev: DRBD device. | 1281 | * @mdev: DRBD device. |
1074 | * @e: epoch entry | 1282 | * @peer_req: peer request |
1075 | * @rw: flag field, see bio->bi_rw | 1283 | * @rw: flag field, see bio->bi_rw |
1076 | * | 1284 | * |
1077 | * May spread the pages to multiple bios, | 1285 | * May spread the pages to multiple bios, |
@@ -1085,14 +1293,15 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) | |||
1085 | * on certain Xen deployments. | 1293 | * on certain Xen deployments. |
1086 | */ | 1294 | */ |
1087 | /* TODO allocate from our own bio_set. */ | 1295 | /* TODO allocate from our own bio_set. */ |
1088 | int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1296 | int drbd_submit_peer_request(struct drbd_conf *mdev, |
1089 | const unsigned rw, const int fault_type) | 1297 | struct drbd_peer_request *peer_req, |
1298 | const unsigned rw, const int fault_type) | ||
1090 | { | 1299 | { |
1091 | struct bio *bios = NULL; | 1300 | struct bio *bios = NULL; |
1092 | struct bio *bio; | 1301 | struct bio *bio; |
1093 | struct page *page = e->pages; | 1302 | struct page *page = peer_req->pages; |
1094 | sector_t sector = e->sector; | 1303 | sector_t sector = peer_req->i.sector; |
1095 | unsigned ds = e->size; | 1304 | unsigned ds = peer_req->i.size; |
1096 | unsigned n_bios = 0; | 1305 | unsigned n_bios = 0; |
1097 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; | 1306 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; |
1098 | int err = -ENOMEM; | 1307 | int err = -ENOMEM; |
@@ -1111,12 +1320,12 @@ next_bio: | |||
1111 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); | 1320 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); |
1112 | goto fail; | 1321 | goto fail; |
1113 | } | 1322 | } |
1114 | /* > e->sector, unless this is the first bio */ | 1323 | /* > peer_req->i.sector, unless this is the first bio */ |
1115 | bio->bi_sector = sector; | 1324 | bio->bi_sector = sector; |
1116 | bio->bi_bdev = mdev->ldev->backing_bdev; | 1325 | bio->bi_bdev = mdev->ldev->backing_bdev; |
1117 | bio->bi_rw = rw; | 1326 | bio->bi_rw = rw; |
1118 | bio->bi_private = e; | 1327 | bio->bi_private = peer_req; |
1119 | bio->bi_end_io = drbd_endio_sec; | 1328 | bio->bi_end_io = drbd_peer_request_endio; |
1120 | 1329 | ||
1121 | bio->bi_next = bios; | 1330 | bio->bi_next = bios; |
1122 | bios = bio; | 1331 | bios = bio; |
@@ -1145,7 +1354,7 @@ next_bio: | |||
1145 | D_ASSERT(page == NULL); | 1354 | D_ASSERT(page == NULL); |
1146 | D_ASSERT(ds == 0); | 1355 | D_ASSERT(ds == 0); |
1147 | 1356 | ||
1148 | atomic_set(&e->pending_bios, n_bios); | 1357 | atomic_set(&peer_req->pending_bios, n_bios); |
1149 | do { | 1358 | do { |
1150 | bio = bios; | 1359 | bio = bios; |
1151 | bios = bios->bi_next; | 1360 | bios = bios->bi_next; |
@@ -1164,26 +1373,57 @@ fail: | |||
1164 | return err; | 1373 | return err; |
1165 | } | 1374 | } |
1166 | 1375 | ||
1167 | static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1376 | static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev, |
1377 | struct drbd_peer_request *peer_req) | ||
1378 | { | ||
1379 | struct drbd_interval *i = &peer_req->i; | ||
1380 | |||
1381 | drbd_remove_interval(&mdev->write_requests, i); | ||
1382 | drbd_clear_interval(i); | ||
1383 | |||
1384 | /* Wake up any processes waiting for this peer request to complete. */ | ||
1385 | if (i->waiting) | ||
1386 | wake_up(&mdev->misc_wait); | ||
1387 | } | ||
1388 | |||
1389 | void conn_wait_active_ee_empty(struct drbd_tconn *tconn) | ||
1390 | { | ||
1391 | struct drbd_conf *mdev; | ||
1392 | int vnr; | ||
1393 | |||
1394 | rcu_read_lock(); | ||
1395 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1396 | kref_get(&mdev->kref); | ||
1397 | rcu_read_unlock(); | ||
1398 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1399 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1400 | rcu_read_lock(); | ||
1401 | } | ||
1402 | rcu_read_unlock(); | ||
1403 | } | ||
1404 | |||
1405 | static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi) | ||
1168 | { | 1406 | { |
1169 | int rv; | 1407 | int rv; |
1170 | struct p_barrier *p = &mdev->data.rbuf.barrier; | 1408 | struct p_barrier *p = pi->data; |
1171 | struct drbd_epoch *epoch; | 1409 | struct drbd_epoch *epoch; |
1172 | 1410 | ||
1173 | inc_unacked(mdev); | 1411 | /* FIXME these are unacked on connection, |
1174 | 1412 | * not a specific (peer)device. | |
1175 | mdev->current_epoch->barrier_nr = p->barrier; | 1413 | */ |
1176 | rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); | 1414 | tconn->current_epoch->barrier_nr = p->barrier; |
1415 | tconn->current_epoch->tconn = tconn; | ||
1416 | rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR); | ||
1177 | 1417 | ||
1178 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from | 1418 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from |
1179 | * the activity log, which means it would not be resynced in case the | 1419 | * the activity log, which means it would not be resynced in case the |
1180 | * R_PRIMARY crashes now. | 1420 | * R_PRIMARY crashes now. |
1181 | * Therefore we must send the barrier_ack after the barrier request was | 1421 | * Therefore we must send the barrier_ack after the barrier request was |
1182 | * completed. */ | 1422 | * completed. */ |
1183 | switch (mdev->write_ordering) { | 1423 | switch (tconn->write_ordering) { |
1184 | case WO_none: | 1424 | case WO_none: |
1185 | if (rv == FE_RECYCLED) | 1425 | if (rv == FE_RECYCLED) |
1186 | return true; | 1426 | return 0; |
1187 | 1427 | ||
1188 | /* receiver context, in the writeout path of the other node. | 1428 | /* receiver context, in the writeout path of the other node. |
1189 | * avoid potential distributed deadlock */ | 1429 | * avoid potential distributed deadlock */ |
@@ -1191,81 +1431,75 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign | |||
1191 | if (epoch) | 1431 | if (epoch) |
1192 | break; | 1432 | break; |
1193 | else | 1433 | else |
1194 | dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); | 1434 | conn_warn(tconn, "Allocation of an epoch failed, slowing down\n"); |
1195 | /* Fall through */ | 1435 | /* Fall through */ |
1196 | 1436 | ||
1197 | case WO_bdev_flush: | 1437 | case WO_bdev_flush: |
1198 | case WO_drain_io: | 1438 | case WO_drain_io: |
1199 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | 1439 | conn_wait_active_ee_empty(tconn); |
1200 | drbd_flush(mdev); | 1440 | drbd_flush(tconn); |
1201 | 1441 | ||
1202 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | 1442 | if (atomic_read(&tconn->current_epoch->epoch_size)) { |
1203 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); | 1443 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); |
1204 | if (epoch) | 1444 | if (epoch) |
1205 | break; | 1445 | break; |
1206 | } | 1446 | } |
1207 | 1447 | ||
1208 | epoch = mdev->current_epoch; | 1448 | return 0; |
1209 | wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0); | ||
1210 | |||
1211 | D_ASSERT(atomic_read(&epoch->active) == 0); | ||
1212 | D_ASSERT(epoch->flags == 0); | ||
1213 | |||
1214 | return true; | ||
1215 | default: | 1449 | default: |
1216 | dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering); | 1450 | conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering); |
1217 | return false; | 1451 | return -EIO; |
1218 | } | 1452 | } |
1219 | 1453 | ||
1220 | epoch->flags = 0; | 1454 | epoch->flags = 0; |
1221 | atomic_set(&epoch->epoch_size, 0); | 1455 | atomic_set(&epoch->epoch_size, 0); |
1222 | atomic_set(&epoch->active, 0); | 1456 | atomic_set(&epoch->active, 0); |
1223 | 1457 | ||
1224 | spin_lock(&mdev->epoch_lock); | 1458 | spin_lock(&tconn->epoch_lock); |
1225 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | 1459 | if (atomic_read(&tconn->current_epoch->epoch_size)) { |
1226 | list_add(&epoch->list, &mdev->current_epoch->list); | 1460 | list_add(&epoch->list, &tconn->current_epoch->list); |
1227 | mdev->current_epoch = epoch; | 1461 | tconn->current_epoch = epoch; |
1228 | mdev->epochs++; | 1462 | tconn->epochs++; |
1229 | } else { | 1463 | } else { |
1230 | /* The current_epoch got recycled while we allocated this one... */ | 1464 | /* The current_epoch got recycled while we allocated this one... */ |
1231 | kfree(epoch); | 1465 | kfree(epoch); |
1232 | } | 1466 | } |
1233 | spin_unlock(&mdev->epoch_lock); | 1467 | spin_unlock(&tconn->epoch_lock); |
1234 | 1468 | ||
1235 | return true; | 1469 | return 0; |
1236 | } | 1470 | } |
1237 | 1471 | ||
1238 | /* used from receive_RSDataReply (recv_resync_read) | 1472 | /* used from receive_RSDataReply (recv_resync_read) |
1239 | * and from receive_Data */ | 1473 | * and from receive_Data */ |
1240 | static struct drbd_epoch_entry * | 1474 | static struct drbd_peer_request * |
1241 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | 1475 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, |
1476 | int data_size) __must_hold(local) | ||
1242 | { | 1477 | { |
1243 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 1478 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
1244 | struct drbd_epoch_entry *e; | 1479 | struct drbd_peer_request *peer_req; |
1245 | struct page *page; | 1480 | struct page *page; |
1246 | int dgs, ds, rr; | 1481 | int dgs, ds, err; |
1247 | void *dig_in = mdev->int_dig_in; | 1482 | void *dig_in = mdev->tconn->int_dig_in; |
1248 | void *dig_vv = mdev->int_dig_vv; | 1483 | void *dig_vv = mdev->tconn->int_dig_vv; |
1249 | unsigned long *data; | 1484 | unsigned long *data; |
1250 | 1485 | ||
1251 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1486 | dgs = 0; |
1252 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1487 | if (mdev->tconn->peer_integrity_tfm) { |
1253 | 1488 | dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); | |
1254 | if (dgs) { | 1489 | /* |
1255 | rr = drbd_recv(mdev, dig_in, dgs); | 1490 | * FIXME: Receive the incoming digest into the receive buffer |
1256 | if (rr != dgs) { | 1491 | * here, together with its struct p_data? |
1257 | if (!signal_pending(current)) | 1492 | */ |
1258 | dev_warn(DEV, | 1493 | err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); |
1259 | "short read receiving data digest: read %d expected %d\n", | 1494 | if (err) |
1260 | rr, dgs); | ||
1261 | return NULL; | 1495 | return NULL; |
1262 | } | 1496 | data_size -= dgs; |
1263 | } | 1497 | } |
1264 | 1498 | ||
1265 | data_size -= dgs; | 1499 | if (!expect(IS_ALIGNED(data_size, 512))) |
1266 | 1500 | return NULL; | |
1267 | ERR_IF(data_size & 0x1ff) return NULL; | 1501 | if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) |
1268 | ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; | 1502 | return NULL; |
1269 | 1503 | ||
1270 | /* even though we trust our peer, | 1504 | /* even though we trust our peer, |
1271 | * we sometimes have to double check. */ | 1505 | * we sometimes have to double check. */ |
@@ -1280,47 +1514,42 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1280 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 1514 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
1281 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 1515 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
1282 | * which in turn might block on the other node at this very place. */ | 1516 | * which in turn might block on the other node at this very place. */ |
1283 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | 1517 | peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO); |
1284 | if (!e) | 1518 | if (!peer_req) |
1285 | return NULL; | 1519 | return NULL; |
1286 | 1520 | ||
1287 | if (!data_size) | 1521 | if (!data_size) |
1288 | return e; | 1522 | return peer_req; |
1289 | 1523 | ||
1290 | ds = data_size; | 1524 | ds = data_size; |
1291 | page = e->pages; | 1525 | page = peer_req->pages; |
1292 | page_chain_for_each(page) { | 1526 | page_chain_for_each(page) { |
1293 | unsigned len = min_t(int, ds, PAGE_SIZE); | 1527 | unsigned len = min_t(int, ds, PAGE_SIZE); |
1294 | data = kmap(page); | 1528 | data = kmap(page); |
1295 | rr = drbd_recv(mdev, data, len); | 1529 | err = drbd_recv_all_warn(mdev->tconn, data, len); |
1296 | if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { | 1530 | if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { |
1297 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); | 1531 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); |
1298 | data[0] = data[0] ^ (unsigned long)-1; | 1532 | data[0] = data[0] ^ (unsigned long)-1; |
1299 | } | 1533 | } |
1300 | kunmap(page); | 1534 | kunmap(page); |
1301 | if (rr != len) { | 1535 | if (err) { |
1302 | drbd_free_ee(mdev, e); | 1536 | drbd_free_peer_req(mdev, peer_req); |
1303 | if (!signal_pending(current)) | ||
1304 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1305 | rr, len); | ||
1306 | return NULL; | 1537 | return NULL; |
1307 | } | 1538 | } |
1308 | ds -= rr; | 1539 | ds -= len; |
1309 | } | 1540 | } |
1310 | 1541 | ||
1311 | if (dgs) { | 1542 | if (dgs) { |
1312 | drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); | 1543 | drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv); |
1313 | if (memcmp(dig_in, dig_vv, dgs)) { | 1544 | if (memcmp(dig_in, dig_vv, dgs)) { |
1314 | dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", | 1545 | dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", |
1315 | (unsigned long long)sector, data_size); | 1546 | (unsigned long long)sector, data_size); |
1316 | drbd_bcast_ee(mdev, "digest failed", | 1547 | drbd_free_peer_req(mdev, peer_req); |
1317 | dgs, dig_in, dig_vv, e); | ||
1318 | drbd_free_ee(mdev, e); | ||
1319 | return NULL; | 1548 | return NULL; |
1320 | } | 1549 | } |
1321 | } | 1550 | } |
1322 | mdev->recv_cnt += data_size>>9; | 1551 | mdev->recv_cnt += data_size>>9; |
1323 | return e; | 1552 | return peer_req; |
1324 | } | 1553 | } |
1325 | 1554 | ||
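read_in_block() above receives the peer's digest first, then the payload, recomputes the digest over the received pages and rejects the block on mismatch. A simplified user-space sketch of that check; a toy checksum stands in for the configured crypto_hash transform and is for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t toy_digest(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		sum = sum * 31 + data[i];
	return sum;
}

/* Returns 0 if the payload matches the digest received from the peer. */
static int verify_payload(const uint8_t *dig_in, size_t dgs,
			  const uint8_t *payload, size_t len)
{
	uint32_t dig_vv = toy_digest(payload, len);

	if (dgs != sizeof(dig_vv))
		return -1;
	return memcmp(dig_in, &dig_vv, dgs) ? -1 : 0;
}

int main(void)
{
	uint8_t payload[] = "replicated block";
	uint32_t dig = toy_digest(payload, sizeof(payload));

	printf("digest ok: %s\n",
	       verify_payload((uint8_t *)&dig, sizeof(dig),
			      payload, sizeof(payload)) == 0 ? "yes" : "no");
	return 0;
}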
1326 | /* drbd_drain_block() just takes a data block | 1555 | /* drbd_drain_block() just takes a data block |
@@ -1329,30 +1558,26 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1329 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | 1558 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) |
1330 | { | 1559 | { |
1331 | struct page *page; | 1560 | struct page *page; |
1332 | int rr, rv = 1; | 1561 | int err = 0; |
1333 | void *data; | 1562 | void *data; |
1334 | 1563 | ||
1335 | if (!data_size) | 1564 | if (!data_size) |
1336 | return true; | 1565 | return 0; |
1337 | 1566 | ||
1338 | page = drbd_pp_alloc(mdev, 1, 1); | 1567 | page = drbd_alloc_pages(mdev, 1, 1); |
1339 | 1568 | ||
1340 | data = kmap(page); | 1569 | data = kmap(page); |
1341 | while (data_size) { | 1570 | while (data_size) { |
1342 | rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); | 1571 | unsigned int len = min_t(int, data_size, PAGE_SIZE); |
1343 | if (rr != min_t(int, data_size, PAGE_SIZE)) { | 1572 | |
1344 | rv = 0; | 1573 | err = drbd_recv_all_warn(mdev->tconn, data, len); |
1345 | if (!signal_pending(current)) | 1574 | if (err) |
1346 | dev_warn(DEV, | ||
1347 | "short read receiving data: read %d expected %d\n", | ||
1348 | rr, min_t(int, data_size, PAGE_SIZE)); | ||
1349 | break; | 1575 | break; |
1350 | } | 1576 | data_size -= len; |
1351 | data_size -= rr; | ||
1352 | } | 1577 | } |
1353 | kunmap(page); | 1578 | kunmap(page); |
1354 | drbd_pp_free(mdev, page, 0); | 1579 | drbd_free_pages(mdev, page, 0); |
1355 | return rv; | 1580 | return err; |
1356 | } | 1581 | } |
1357 | 1582 | ||
1358 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | 1583 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, |
@@ -1360,26 +1585,19 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1360 | { | 1585 | { |
1361 | struct bio_vec *bvec; | 1586 | struct bio_vec *bvec; |
1362 | struct bio *bio; | 1587 | struct bio *bio; |
1363 | int dgs, rr, i, expect; | 1588 | int dgs, err, i, expect; |
1364 | void *dig_in = mdev->int_dig_in; | 1589 | void *dig_in = mdev->tconn->int_dig_in; |
1365 | void *dig_vv = mdev->int_dig_vv; | 1590 | void *dig_vv = mdev->tconn->int_dig_vv; |
1366 | |||
1367 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1368 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1369 | 1591 | ||
1370 | if (dgs) { | 1592 | dgs = 0; |
1371 | rr = drbd_recv(mdev, dig_in, dgs); | 1593 | if (mdev->tconn->peer_integrity_tfm) { |
1372 | if (rr != dgs) { | 1594 | dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); |
1373 | if (!signal_pending(current)) | 1595 | err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); |
1374 | dev_warn(DEV, | 1596 | if (err) |
1375 | "short read receiving data reply digest: read %d expected %d\n", | 1597 | return err; |
1376 | rr, dgs); | 1598 | data_size -= dgs; |
1377 | return 0; | ||
1378 | } | ||
1379 | } | 1599 | } |
1380 | 1600 | ||
1381 | data_size -= dgs; | ||
1382 | |||
1383 | /* optimistically update recv_cnt. if receiving fails below, | 1601 | /* optimistically update recv_cnt. if receiving fails below, |
1384 | * we disconnect anyways, and counters will be reset. */ | 1602 | * we disconnect anyways, and counters will be reset. */ |
1385 | mdev->recv_cnt += data_size>>9; | 1603 | mdev->recv_cnt += data_size>>9; |
@@ -1388,63 +1606,61 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1388 | D_ASSERT(sector == bio->bi_sector); | 1606 | D_ASSERT(sector == bio->bi_sector); |
1389 | 1607 | ||
1390 | bio_for_each_segment(bvec, bio, i) { | 1608 | bio_for_each_segment(bvec, bio, i) { |
1609 | void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; | ||
1391 | expect = min_t(int, data_size, bvec->bv_len); | 1610 | expect = min_t(int, data_size, bvec->bv_len); |
1392 | rr = drbd_recv(mdev, | 1611 | err = drbd_recv_all_warn(mdev->tconn, mapped, expect); |
1393 | kmap(bvec->bv_page)+bvec->bv_offset, | ||
1394 | expect); | ||
1395 | kunmap(bvec->bv_page); | 1612 | kunmap(bvec->bv_page); |
1396 | if (rr != expect) { | 1613 | if (err) |
1397 | if (!signal_pending(current)) | 1614 | return err; |
1398 | dev_warn(DEV, "short read receiving data reply: " | 1615 | data_size -= expect; |
1399 | "read %d expected %d\n", | ||
1400 | rr, expect); | ||
1401 | return 0; | ||
1402 | } | ||
1403 | data_size -= rr; | ||
1404 | } | 1616 | } |
1405 | 1617 | ||
1406 | if (dgs) { | 1618 | if (dgs) { |
1407 | drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); | 1619 | drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv); |
1408 | if (memcmp(dig_in, dig_vv, dgs)) { | 1620 | if (memcmp(dig_in, dig_vv, dgs)) { |
1409 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | 1621 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); |
1410 | return 0; | 1622 | return -EINVAL; |
1411 | } | 1623 | } |
1412 | } | 1624 | } |
1413 | 1625 | ||
1414 | D_ASSERT(data_size == 0); | 1626 | D_ASSERT(data_size == 0); |
1415 | return 1; | 1627 | return 0; |
1416 | } | 1628 | } |
1417 | 1629 | ||
1418 | /* e_end_resync_block() is called via | 1630 | /* |
1419 | * drbd_process_done_ee() by asender only */ | 1631 | * e_end_resync_block() is called in asender context via |
1420 | static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 1632 | * drbd_finish_peer_reqs(). |
1633 | */ | ||
1634 | static int e_end_resync_block(struct drbd_work *w, int unused) | ||
1421 | { | 1635 | { |
1422 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1636 | struct drbd_peer_request *peer_req = |
1423 | sector_t sector = e->sector; | 1637 | container_of(w, struct drbd_peer_request, w); |
1424 | int ok; | 1638 | struct drbd_conf *mdev = w->mdev; |
1639 | sector_t sector = peer_req->i.sector; | ||
1640 | int err; | ||
1425 | 1641 | ||
1426 | D_ASSERT(hlist_unhashed(&e->collision)); | 1642 | D_ASSERT(drbd_interval_empty(&peer_req->i)); |
1427 | 1643 | ||
1428 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1644 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1429 | drbd_set_in_sync(mdev, sector, e->size); | 1645 | drbd_set_in_sync(mdev, sector, peer_req->i.size); |
1430 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | 1646 | err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req); |
1431 | } else { | 1647 | } else { |
1432 | /* Record failure to sync */ | 1648 | /* Record failure to sync */ |
1433 | drbd_rs_failed_io(mdev, sector, e->size); | 1649 | drbd_rs_failed_io(mdev, sector, peer_req->i.size); |
1434 | 1650 | ||
1435 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | 1651 | err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); |
1436 | } | 1652 | } |
1437 | dec_unacked(mdev); | 1653 | dec_unacked(mdev); |
1438 | 1654 | ||
1439 | return ok; | 1655 | return err; |
1440 | } | 1656 | } |
1441 | 1657 | ||
1442 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) | 1658 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) |
1443 | { | 1659 | { |
1444 | struct drbd_epoch_entry *e; | 1660 | struct drbd_peer_request *peer_req; |
1445 | 1661 | ||
1446 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | 1662 | peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size); |
1447 | if (!e) | 1663 | if (!peer_req) |
1448 | goto fail; | 1664 | goto fail; |
1449 | 1665 | ||
1450 | dec_rs_pending(mdev); | 1666 | dec_rs_pending(mdev); |
@@ -1453,64 +1669,88 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si | |||
1453 | /* corresponding dec_unacked() in e_end_resync_block() | 1669 | /* corresponding dec_unacked() in e_end_resync_block() |
1454 | * respective _drbd_clear_done_ee */ | 1670 | * respective _drbd_clear_done_ee */ |
1455 | 1671 | ||
1456 | e->w.cb = e_end_resync_block; | 1672 | peer_req->w.cb = e_end_resync_block; |
1457 | 1673 | ||
1458 | spin_lock_irq(&mdev->req_lock); | 1674 | spin_lock_irq(&mdev->tconn->req_lock); |
1459 | list_add(&e->w.list, &mdev->sync_ee); | 1675 | list_add(&peer_req->w.list, &mdev->sync_ee); |
1460 | spin_unlock_irq(&mdev->req_lock); | 1676 | spin_unlock_irq(&mdev->tconn->req_lock); |
1461 | 1677 | ||
1462 | atomic_add(data_size >> 9, &mdev->rs_sect_ev); | 1678 | atomic_add(data_size >> 9, &mdev->rs_sect_ev); |
1463 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) | 1679 | if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) |
1464 | return true; | 1680 | return 0; |
1465 | 1681 | ||
1466 | /* don't care for the reason here */ | 1682 | /* don't care for the reason here */ |
1467 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 1683 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1468 | spin_lock_irq(&mdev->req_lock); | 1684 | spin_lock_irq(&mdev->tconn->req_lock); |
1469 | list_del(&e->w.list); | 1685 | list_del(&peer_req->w.list); |
1470 | spin_unlock_irq(&mdev->req_lock); | 1686 | spin_unlock_irq(&mdev->tconn->req_lock); |
1471 | 1687 | ||
1472 | drbd_free_ee(mdev, e); | 1688 | drbd_free_peer_req(mdev, peer_req); |
1473 | fail: | 1689 | fail: |
1474 | put_ldev(mdev); | 1690 | put_ldev(mdev); |
1475 | return false; | 1691 | return -EIO; |
1692 | } | ||
1693 | |||
1694 | static struct drbd_request * | ||
1695 | find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id, | ||
1696 | sector_t sector, bool missing_ok, const char *func) | ||
1697 | { | ||
1698 | struct drbd_request *req; | ||
1699 | |||
1700 | /* Request object according to our peer */ | ||
1701 | req = (struct drbd_request *)(unsigned long)id; | ||
1702 | if (drbd_contains_interval(root, sector, &req->i) && req->i.local) | ||
1703 | return req; | ||
1704 | if (!missing_ok) { | ||
1705 | dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func, | ||
1706 | (unsigned long)id, (unsigned long long)sector); | ||
1707 | } | ||
1708 | return NULL; | ||
1476 | } | 1709 | } |
1477 | 1710 | ||
1478 | static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1711 | static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi) |
1479 | { | 1712 | { |
1713 | struct drbd_conf *mdev; | ||
1480 | struct drbd_request *req; | 1714 | struct drbd_request *req; |
1481 | sector_t sector; | 1715 | sector_t sector; |
1482 | int ok; | 1716 | int err; |
1483 | struct p_data *p = &mdev->data.rbuf.data; | 1717 | struct p_data *p = pi->data; |
1718 | |||
1719 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
1720 | if (!mdev) | ||
1721 | return -EIO; | ||
1484 | 1722 | ||
1485 | sector = be64_to_cpu(p->sector); | 1723 | sector = be64_to_cpu(p->sector); |
1486 | 1724 | ||
1487 | spin_lock_irq(&mdev->req_lock); | 1725 | spin_lock_irq(&mdev->tconn->req_lock); |
1488 | req = _ar_id_to_req(mdev, p->block_id, sector); | 1726 | req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__); |
1489 | spin_unlock_irq(&mdev->req_lock); | 1727 | spin_unlock_irq(&mdev->tconn->req_lock); |
1490 | if (unlikely(!req)) { | 1728 | if (unlikely(!req)) |
1491 | dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); | 1729 | return -EIO; |
1492 | return false; | ||
1493 | } | ||
1494 | 1730 | ||
1495 | /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid | 1731 | /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid |
1496 | * special casing it there for the various failure cases. | 1732 | * special casing it there for the various failure cases. |
1497 | * still no race with drbd_fail_pending_reads */ | 1733 | * still no race with drbd_fail_pending_reads */ |
1498 | ok = recv_dless_read(mdev, req, sector, data_size); | 1734 | err = recv_dless_read(mdev, req, sector, pi->size); |
1499 | 1735 | if (!err) | |
1500 | if (ok) | 1736 | req_mod(req, DATA_RECEIVED); |
1501 | req_mod(req, data_received); | ||
1502 | /* else: nothing. handled from drbd_disconnect... | 1737 | /* else: nothing. handled from drbd_disconnect... |
1503 | * I don't think we may complete this just yet | 1738 | * I don't think we may complete this just yet |
1504 | * in case we are "on-disconnect: freeze" */ | 1739 | * in case we are "on-disconnect: freeze" */ |
1505 | 1740 | ||
1506 | return ok; | 1741 | return err; |
1507 | } | 1742 | } |
1508 | 1743 | ||
1509 | static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1744 | static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi) |
1510 | { | 1745 | { |
1746 | struct drbd_conf *mdev; | ||
1511 | sector_t sector; | 1747 | sector_t sector; |
1512 | int ok; | 1748 | int err; |
1513 | struct p_data *p = &mdev->data.rbuf.data; | 1749 | struct p_data *p = pi->data; |
1750 | |||
1751 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
1752 | if (!mdev) | ||
1753 | return -EIO; | ||
1514 | 1754 | ||
1515 | sector = be64_to_cpu(p->sector); | 1755 | sector = be64_to_cpu(p->sector); |
1516 | D_ASSERT(p->block_id == ID_SYNCER); | 1756 | D_ASSERT(p->block_id == ID_SYNCER); |
@@ -1518,42 +1758,63 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
1518 | if (get_ldev(mdev)) { | 1758 | if (get_ldev(mdev)) { |
1519 | /* data is submitted to disk within recv_resync_read. | 1759 | /* data is submitted to disk within recv_resync_read. |
1520 | * corresponding put_ldev done below on error, | 1760 | * corresponding put_ldev done below on error, |
1521 | * or in drbd_endio_write_sec. */ | 1761 | * or in drbd_peer_request_endio. */ |
1522 | ok = recv_resync_read(mdev, sector, data_size); | 1762 | err = recv_resync_read(mdev, sector, pi->size); |
1523 | } else { | 1763 | } else { |
1524 | if (__ratelimit(&drbd_ratelimit_state)) | 1764 | if (__ratelimit(&drbd_ratelimit_state)) |
1525 | dev_err(DEV, "Can not write resync data to local disk.\n"); | 1765 | dev_err(DEV, "Can not write resync data to local disk.\n"); |
1526 | 1766 | ||
1527 | ok = drbd_drain_block(mdev, data_size); | 1767 | err = drbd_drain_block(mdev, pi->size); |
1528 | 1768 | ||
1529 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); | 1769 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); |
1530 | } | 1770 | } |
1531 | 1771 | ||
1532 | atomic_add(data_size >> 9, &mdev->rs_sect_in); | 1772 | atomic_add(pi->size >> 9, &mdev->rs_sect_in); |
1533 | 1773 | ||
1534 | return ok; | 1774 | return err; |
1535 | } | 1775 | } |
1536 | 1776 | ||
1537 | /* e_end_block() is called via drbd_process_done_ee(). | 1777 | static void restart_conflicting_writes(struct drbd_conf *mdev, |
1538 | * this means this function only runs in the asender thread | 1778 | sector_t sector, int size) |
1539 | */ | ||
1540 | static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1541 | { | 1779 | { |
1542 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1780 | struct drbd_interval *i; |
1543 | sector_t sector = e->sector; | 1781 | struct drbd_request *req; |
1544 | int ok = 1, pcmd; | ||
1545 | 1782 | ||
1546 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | 1783 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { |
1547 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1784 | if (!i->local) |
1785 | continue; | ||
1786 | req = container_of(i, struct drbd_request, i); | ||
1787 | if (req->rq_state & RQ_LOCAL_PENDING || | ||
1788 | !(req->rq_state & RQ_POSTPONED)) | ||
1789 | continue; | ||
1790 | /* as it is RQ_POSTPONED, this will cause it to | ||
1791 | * be queued on the retry workqueue. */ | ||
1792 | __req_mod(req, CONFLICT_RESOLVED, NULL); | ||
1793 | } | ||
1794 | } | ||
1795 | |||
1796 | /* | ||
1797 | * e_end_block() is called in asender context via drbd_finish_peer_reqs(). | ||
1798 | */ | ||
1799 | static int e_end_block(struct drbd_work *w, int cancel) | ||
1800 | { | ||
1801 | struct drbd_peer_request *peer_req = | ||
1802 | container_of(w, struct drbd_peer_request, w); | ||
1803 | struct drbd_conf *mdev = w->mdev; | ||
1804 | sector_t sector = peer_req->i.sector; | ||
1805 | int err = 0, pcmd; | ||
1806 | |||
1807 | if (peer_req->flags & EE_SEND_WRITE_ACK) { | ||
1808 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { | ||
1548 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | 1809 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && |
1549 | mdev->state.conn <= C_PAUSED_SYNC_T && | 1810 | mdev->state.conn <= C_PAUSED_SYNC_T && |
1550 | e->flags & EE_MAY_SET_IN_SYNC) ? | 1811 | peer_req->flags & EE_MAY_SET_IN_SYNC) ? |
1551 | P_RS_WRITE_ACK : P_WRITE_ACK; | 1812 | P_RS_WRITE_ACK : P_WRITE_ACK; |
1552 | ok &= drbd_send_ack(mdev, pcmd, e); | 1813 | err = drbd_send_ack(mdev, pcmd, peer_req); |
1553 | if (pcmd == P_RS_WRITE_ACK) | 1814 | if (pcmd == P_RS_WRITE_ACK) |
1554 | drbd_set_in_sync(mdev, sector, e->size); | 1815 | drbd_set_in_sync(mdev, sector, peer_req->i.size); |
1555 | } else { | 1816 | } else { |
1556 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | 1817 | err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); |
1557 | /* we expect it to be marked out of sync anyways... | 1818 | /* we expect it to be marked out of sync anyways... |
1558 | * maybe assert this? */ | 1819 | * maybe assert this? */ |
1559 | } | 1820 | } |
@@ -1561,52 +1822,115 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1561 | } | 1822 | } |
1562 | /* we delete from the conflict detection hash _after_ we sent out the | 1823 | /* we delete from the conflict detection hash _after_ we sent out the |
1563 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ | 1824 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ |
1564 | if (mdev->net_conf->two_primaries) { | 1825 | if (peer_req->flags & EE_IN_INTERVAL_TREE) { |
1565 | spin_lock_irq(&mdev->req_lock); | 1826 | spin_lock_irq(&mdev->tconn->req_lock); |
1566 | D_ASSERT(!hlist_unhashed(&e->collision)); | 1827 | D_ASSERT(!drbd_interval_empty(&peer_req->i)); |
1567 | hlist_del_init(&e->collision); | 1828 | drbd_remove_epoch_entry_interval(mdev, peer_req); |
1568 | spin_unlock_irq(&mdev->req_lock); | 1829 | if (peer_req->flags & EE_RESTART_REQUESTS) |
1569 | } else { | 1830 | restart_conflicting_writes(mdev, sector, peer_req->i.size); |
1570 | D_ASSERT(hlist_unhashed(&e->collision)); | 1831 | spin_unlock_irq(&mdev->tconn->req_lock); |
1571 | } | 1832 | } else |
1833 | D_ASSERT(drbd_interval_empty(&peer_req->i)); | ||
1572 | 1834 | ||
1573 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); | 1835 | drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); |
1574 | 1836 | ||
1575 | return ok; | 1837 | return err; |
1576 | } | 1838 | } |
1577 | 1839 | ||
1578 | static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 1840 | static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) |
1579 | { | 1841 | { |
1580 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1842 | struct drbd_conf *mdev = w->mdev; |
1581 | int ok = 1; | 1843 | struct drbd_peer_request *peer_req = |
1844 | container_of(w, struct drbd_peer_request, w); | ||
1845 | int err; | ||
1582 | 1846 | ||
1583 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 1847 | err = drbd_send_ack(mdev, ack, peer_req); |
1584 | ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); | 1848 | dec_unacked(mdev); |
1585 | 1849 | ||
1586 | spin_lock_irq(&mdev->req_lock); | 1850 | return err; |
1587 | D_ASSERT(!hlist_unhashed(&e->collision)); | 1851 | } |
1588 | hlist_del_init(&e->collision); | ||
1589 | spin_unlock_irq(&mdev->req_lock); | ||
1590 | 1852 | ||
1591 | dec_unacked(mdev); | 1853 | static int e_send_superseded(struct drbd_work *w, int unused) |
1854 | { | ||
1855 | return e_send_ack(w, P_SUPERSEDED); | ||
1856 | } | ||
1857 | |||
1858 | static int e_send_retry_write(struct drbd_work *w, int unused) | ||
1859 | { | ||
1860 | struct drbd_tconn *tconn = w->mdev->tconn; | ||
1861 | |||
1862 | return e_send_ack(w, tconn->agreed_pro_version >= 100 ? | ||
1863 | P_RETRY_WRITE : P_SUPERSEDED); | ||
1864 | } | ||
1865 | |||
1866 | static bool seq_greater(u32 a, u32 b) | ||
1867 | { | ||
1868 | /* | ||
1869 | * We assume 32-bit wrap-around here. | ||
1870 | * For 24-bit wrap-around, we would have to shift: | ||
1871 | * a <<= 8; b <<= 8; | ||
1872 | */ | ||
1873 | return (s32)a - (s32)b > 0; | ||
1874 | } | ||
1875 | |||
1876 | static u32 seq_max(u32 a, u32 b) | ||
1877 | { | ||
1878 | return seq_greater(a, b) ? a : b; | ||
1879 | } | ||
1880 | |||
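The signed cast in seq_greater() is what lets "newer than" survive 32-bit wrap-around of the sequence counter. A self-contained demonstration; the equivalent (int32_t)(a - b) form is used here to keep the arithmetic well defined in plain C.

#include <stdint.h>
#include <stdio.h>

static int seq_greater(uint32_t a, uint32_t b)
{
	/* interpret the unsigned difference as signed: small positive means "a is newer" */
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	/* 2 is "newer" than 0xfffffffe even though it is numerically smaller */
	printf("%d\n", seq_greater(2u, 0xfffffffeu));	/* 1 */
	printf("%d\n", seq_greater(0xfffffffeu, 2u));	/* 0 */
	printf("%d\n", seq_greater(5u, 5u));		/* 0 */
	return 0;
}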
1881 | static bool need_peer_seq(struct drbd_conf *mdev) | ||
1882 | { | ||
1883 | struct drbd_tconn *tconn = mdev->tconn; | ||
1884 | int tp; | ||
1885 | |||
1886 | /* | ||
1887 | * We only need to keep track of the last packet_seq number of our peer | ||
1888 | * if we are in dual-primary mode and we have the resolve-conflicts flag set; see | ||
1889 | * handle_write_conflicts(). | ||
1890 | */ | ||
1891 | |||
1892 | rcu_read_lock(); | ||
1893 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; | ||
1894 | rcu_read_unlock(); | ||
1895 | |||
1896 | return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
1897 | } | ||
1898 | |||
1899 | static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) | ||
1900 | { | ||
1901 | unsigned int newest_peer_seq; | ||
1592 | 1902 | ||
1593 | return ok; | 1903 | if (need_peer_seq(mdev)) { |
1904 | spin_lock(&mdev->peer_seq_lock); | ||
1905 | newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); | ||
1906 | mdev->peer_seq = newest_peer_seq; | ||
1907 | spin_unlock(&mdev->peer_seq_lock); | ||
1908 | /* wake up only if we actually changed mdev->peer_seq */ | ||
1909 | if (peer_seq == newest_peer_seq) | ||
1910 | wake_up(&mdev->seq_wait); | ||
1911 | } | ||
1594 | } | 1912 | } |
1595 | 1913 | ||
1596 | static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) | 1914 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) |
1597 | { | 1915 | { |
1916 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
1917 | } | ||
1598 | 1918 | ||
1599 | struct drbd_epoch_entry *rs_e; | 1919 | /* maybe change sync_ee into interval trees as well? */ |
1920 | static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) | ||
1921 | { | ||
1922 | struct drbd_peer_request *rs_req; | ||
1600 | bool rv = 0; | 1923 | bool rv = 0; |
1601 | 1924 | ||
1602 | spin_lock_irq(&mdev->req_lock); | 1925 | spin_lock_irq(&mdev->tconn->req_lock); |
1603 | list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { | 1926 | list_for_each_entry(rs_req, &mdev->sync_ee, w.list) { |
1604 | if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { | 1927 | if (overlaps(peer_req->i.sector, peer_req->i.size, |
1928 | rs_req->i.sector, rs_req->i.size)) { | ||
1605 | rv = 1; | 1929 | rv = 1; |
1606 | break; | 1930 | break; |
1607 | } | 1931 | } |
1608 | } | 1932 | } |
1609 | spin_unlock_irq(&mdev->req_lock); | 1933 | spin_unlock_irq(&mdev->tconn->req_lock); |
1610 | 1934 | ||
1611 | return rv; | 1935 | return rv; |
1612 | } | 1936 | } |
@@ -1632,35 +1956,41 @@ static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_e | |||
1632 | * | 1956 | * |
1633 | * returns 0 if we may process the packet, | 1957 | * returns 0 if we may process the packet, |
1634 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ | 1958 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ |
1635 | static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) | 1959 | static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq) |
1636 | { | 1960 | { |
1637 | DEFINE_WAIT(wait); | 1961 | DEFINE_WAIT(wait); |
1638 | unsigned int p_seq; | ||
1639 | long timeout; | 1962 | long timeout; |
1640 | int ret = 0; | 1963 | int ret; |
1964 | |||
1965 | if (!need_peer_seq(mdev)) | ||
1966 | return 0; | ||
1967 | |||
1641 | spin_lock(&mdev->peer_seq_lock); | 1968 | spin_lock(&mdev->peer_seq_lock); |
1642 | for (;;) { | 1969 | for (;;) { |
1643 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | 1970 | if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { |
1644 | if (seq_le(packet_seq, mdev->peer_seq+1)) | 1971 | mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); |
1972 | ret = 0; | ||
1645 | break; | 1973 | break; |
1974 | } | ||
1646 | if (signal_pending(current)) { | 1975 | if (signal_pending(current)) { |
1647 | ret = -ERESTARTSYS; | 1976 | ret = -ERESTARTSYS; |
1648 | break; | 1977 | break; |
1649 | } | 1978 | } |
1650 | p_seq = mdev->peer_seq; | 1979 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); |
1651 | spin_unlock(&mdev->peer_seq_lock); | 1980 | spin_unlock(&mdev->peer_seq_lock); |
1652 | timeout = schedule_timeout(30*HZ); | 1981 | rcu_read_lock(); |
1982 | timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10; | ||
1983 | rcu_read_unlock(); | ||
1984 | timeout = schedule_timeout(timeout); | ||
1653 | spin_lock(&mdev->peer_seq_lock); | 1985 | spin_lock(&mdev->peer_seq_lock); |
1654 | if (timeout == 0 && p_seq == mdev->peer_seq) { | 1986 | if (!timeout) { |
1655 | ret = -ETIMEDOUT; | 1987 | ret = -ETIMEDOUT; |
1656 | dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); | 1988 | dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n"); |
1657 | break; | 1989 | break; |
1658 | } | 1990 | } |
1659 | } | 1991 | } |
1660 | finish_wait(&mdev->seq_wait, &wait); | ||
1661 | if (mdev->peer_seq+1 == packet_seq) | ||
1662 | mdev->peer_seq++; | ||
1663 | spin_unlock(&mdev->peer_seq_lock); | 1992 | spin_unlock(&mdev->peer_seq_lock); |
1993 | finish_wait(&mdev->seq_wait, &wait); | ||
1664 | return ret; | 1994 | return ret; |
1665 | } | 1995 | } |
1666 | 1996 | ||
@@ -1675,233 +2005,277 @@ static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf) | |||
1675 | (dpf & DP_DISCARD ? REQ_DISCARD : 0); | 2005 | (dpf & DP_DISCARD ? REQ_DISCARD : 0); |
1676 | } | 2006 | } |
1677 | 2007 | ||
2008 | static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector, | ||
2009 | unsigned int size) | ||
2010 | { | ||
2011 | struct drbd_interval *i; | ||
2012 | |||
2013 | repeat: | ||
2014 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { | ||
2015 | struct drbd_request *req; | ||
2016 | struct bio_and_error m; | ||
2017 | |||
2018 | if (!i->local) | ||
2019 | continue; | ||
2020 | req = container_of(i, struct drbd_request, i); | ||
2021 | if (!(req->rq_state & RQ_POSTPONED)) | ||
2022 | continue; | ||
2023 | req->rq_state &= ~RQ_POSTPONED; | ||
2024 | __req_mod(req, NEG_ACKED, &m); | ||
2025 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
2026 | if (m.bio) | ||
2027 | complete_master_bio(mdev, &m); | ||
2028 | spin_lock_irq(&mdev->tconn->req_lock); | ||
2029 | goto repeat; | ||
2030 | } | ||
2031 | } | ||
2032 | |||
2033 | static int handle_write_conflicts(struct drbd_conf *mdev, | ||
2034 | struct drbd_peer_request *peer_req) | ||
2035 | { | ||
2036 | struct drbd_tconn *tconn = mdev->tconn; | ||
2037 | bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
2038 | sector_t sector = peer_req->i.sector; | ||
2039 | const unsigned int size = peer_req->i.size; | ||
2040 | struct drbd_interval *i; | ||
2041 | bool equal; | ||
2042 | int err; | ||
2043 | |||
2044 | /* | ||
2045 | * Inserting the peer request into the write_requests tree will prevent | ||
2046 | * new conflicting local requests from being added. | ||
2047 | */ | ||
2048 | drbd_insert_interval(&mdev->write_requests, &peer_req->i); | ||
2049 | |||
2050 | repeat: | ||
2051 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { | ||
2052 | if (i == &peer_req->i) | ||
2053 | continue; | ||
2054 | |||
2055 | if (!i->local) { | ||
2056 | /* | ||
2057 | * Our peer has sent a conflicting remote request; this | ||
2058 | * should not happen in a two-node setup. Wait for the | ||
2059 | * earlier peer request to complete. | ||
2060 | */ | ||
2061 | err = drbd_wait_misc(mdev, i); | ||
2062 | if (err) | ||
2063 | goto out; | ||
2064 | goto repeat; | ||
2065 | } | ||
2066 | |||
2067 | equal = i->sector == sector && i->size == size; | ||
2068 | if (resolve_conflicts) { | ||
2069 | /* | ||
2070 | * If the peer request is fully contained within the | ||
2071 | * overlapping request, it can be considered overwritten | ||
2072 | * and thus superseded; otherwise, it will be retried | ||
2073 | * once all overlapping requests have completed. | ||
2074 | */ | ||
2075 | bool superseded = i->sector <= sector && i->sector + | ||
2076 | (i->size >> 9) >= sector + (size >> 9); | ||
2077 | |||
2078 | if (!equal) | ||
2079 | dev_alert(DEV, "Concurrent writes detected: " | ||
2080 | "local=%llus +%u, remote=%llus +%u, " | ||
2081 | "assuming %s came first\n", | ||
2082 | (unsigned long long)i->sector, i->size, | ||
2083 | (unsigned long long)sector, size, | ||
2084 | superseded ? "local" : "remote"); | ||
2085 | |||
2086 | inc_unacked(mdev); | ||
2087 | peer_req->w.cb = superseded ? e_send_superseded : | ||
2088 | e_send_retry_write; | ||
2089 | list_add_tail(&peer_req->w.list, &mdev->done_ee); | ||
2090 | wake_asender(mdev->tconn); | ||
2091 | |||
2092 | err = -ENOENT; | ||
2093 | goto out; | ||
2094 | } else { | ||
2095 | struct drbd_request *req = | ||
2096 | container_of(i, struct drbd_request, i); | ||
2097 | |||
2098 | if (!equal) | ||
2099 | dev_alert(DEV, "Concurrent writes detected: " | ||
2100 | "local=%llus +%u, remote=%llus +%u\n", | ||
2101 | (unsigned long long)i->sector, i->size, | ||
2102 | (unsigned long long)sector, size); | ||
2103 | |||
2104 | if (req->rq_state & RQ_LOCAL_PENDING || | ||
2105 | !(req->rq_state & RQ_POSTPONED)) { | ||
2106 | /* | ||
2107 | * Wait for the node with the discard flag to | ||
2108 | * decide if this request has been superseded | ||
2109 | * or needs to be retried. | ||
2110 | * Requests that have been superseded will | ||
2111 | * disappear from the write_requests tree. | ||
2112 | * | ||
2113 | * In addition, wait for the conflicting | ||
2114 | * request to finish locally before submitting | ||
2115 | * the conflicting peer request. | ||
2116 | */ | ||
2117 | err = drbd_wait_misc(mdev, &req->i); | ||
2118 | if (err) { | ||
2119 | _conn_request_state(mdev->tconn, | ||
2120 | NS(conn, C_TIMEOUT), | ||
2121 | CS_HARD); | ||
2122 | fail_postponed_requests(mdev, sector, size); | ||
2123 | goto out; | ||
2124 | } | ||
2125 | goto repeat; | ||
2126 | } | ||
2127 | /* | ||
2128 | * Remember to restart the conflicting requests after | ||
2129 | * the new peer request has completed. | ||
2130 | */ | ||
2131 | peer_req->flags |= EE_RESTART_REQUESTS; | ||
2132 | } | ||
2133 | } | ||
2134 | err = 0; | ||
2135 | |||
2136 | out: | ||
2137 | if (err) | ||
2138 | drbd_remove_epoch_entry_interval(mdev, peer_req); | ||
2139 | return err; | ||
2140 | } | ||
2141 | |||
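handle_write_conflicts() above treats a peer write as superseded only when a conflicting local request fully covers it; partial overlaps are retried instead. A stand-alone illustration of the overlaps() helper and of that containment check, with sector numbers in 512-byte units and sizes in bytes as in the patch:

#include <stdio.h>

typedef unsigned long long sector_t;	/* userspace stand-in for the kernel type */

static int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
	return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
}

/* Peer write [sector, size) is superseded if the local interval i fully covers it. */
static int superseded_by(sector_t i_sector, int i_size, sector_t sector, int size)
{
	return i_sector <= sector &&
	       i_sector + (i_size >> 9) >= sector + (size >> 9);
}

int main(void)
{
	/* local write: 8 sectors starting at 16; peer write: 4 sectors starting at 18 */
	printf("overlap: %d\n",    overlaps(16, 8 << 9, 18, 4 << 9));		/* 1 */
	printf("superseded: %d\n", superseded_by(16, 8 << 9, 18, 4 << 9));	/* 1 */
	/* a peer write sticking out past the local one is retried, not superseded */
	printf("superseded: %d\n", superseded_by(16, 8 << 9, 22, 4 << 9));	/* 0 */
	return 0;
}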
1678 | /* mirrored write */ | 2142 | /* mirrored write */ |
1679 | static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 2143 | static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) |
1680 | { | 2144 | { |
2145 | struct drbd_conf *mdev; | ||
1681 | sector_t sector; | 2146 | sector_t sector; |
1682 | struct drbd_epoch_entry *e; | 2147 | struct drbd_peer_request *peer_req; |
1683 | struct p_data *p = &mdev->data.rbuf.data; | 2148 | struct p_data *p = pi->data; |
2149 | u32 peer_seq = be32_to_cpu(p->seq_num); | ||
1684 | int rw = WRITE; | 2150 | int rw = WRITE; |
1685 | u32 dp_flags; | 2151 | u32 dp_flags; |
2152 | int err, tp; | ||
1686 | 2153 | ||
1687 | if (!get_ldev(mdev)) { | 2154 | mdev = vnr_to_mdev(tconn, pi->vnr); |
1688 | spin_lock(&mdev->peer_seq_lock); | 2155 | if (!mdev) |
1689 | if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) | 2156 | return -EIO; |
1690 | mdev->peer_seq++; | ||
1691 | spin_unlock(&mdev->peer_seq_lock); | ||
1692 | 2157 | ||
1693 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); | 2158 | if (!get_ldev(mdev)) { |
1694 | atomic_inc(&mdev->current_epoch->epoch_size); | 2159 | int err2; |
1695 | return drbd_drain_block(mdev, data_size); | 2160 | |
2161 | err = wait_for_and_update_peer_seq(mdev, peer_seq); | ||
2162 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); | ||
2163 | atomic_inc(&tconn->current_epoch->epoch_size); | ||
2164 | err2 = drbd_drain_block(mdev, pi->size); | ||
2165 | if (!err) | ||
2166 | err = err2; | ||
2167 | return err; | ||
1696 | } | 2168 | } |
1697 | 2169 | ||
1698 | /* get_ldev(mdev) successful. | 2170 | /* |
1699 | * Corresponding put_ldev done either below (on various errors), | 2171 | * Corresponding put_ldev done either below (on various errors), or in |
1700 | * or in drbd_endio_write_sec, if we successfully submit the data at | 2172 | * drbd_peer_request_endio, if we successfully submit the data at the |
1701 | * the end of this function. */ | 2173 | * end of this function. |
2174 | */ | ||
1702 | 2175 | ||
1703 | sector = be64_to_cpu(p->sector); | 2176 | sector = be64_to_cpu(p->sector); |
1704 | e = read_in_block(mdev, p->block_id, sector, data_size); | 2177 | peer_req = read_in_block(mdev, p->block_id, sector, pi->size); |
1705 | if (!e) { | 2178 | if (!peer_req) { |
1706 | put_ldev(mdev); | 2179 | put_ldev(mdev); |
1707 | return false; | 2180 | return -EIO; |
1708 | } | 2181 | } |
1709 | 2182 | ||
1710 | e->w.cb = e_end_block; | 2183 | peer_req->w.cb = e_end_block; |
1711 | 2184 | ||
1712 | dp_flags = be32_to_cpu(p->dp_flags); | 2185 | dp_flags = be32_to_cpu(p->dp_flags); |
1713 | rw |= wire_flags_to_bio(mdev, dp_flags); | 2186 | rw |= wire_flags_to_bio(mdev, dp_flags); |
1714 | if (e->pages == NULL) { | 2187 | if (peer_req->pages == NULL) { |
1715 | D_ASSERT(e->size == 0); | 2188 | D_ASSERT(peer_req->i.size == 0); |
1716 | D_ASSERT(dp_flags & DP_FLUSH); | 2189 | D_ASSERT(dp_flags & DP_FLUSH); |
1717 | } | 2190 | } |
1718 | 2191 | ||
1719 | if (dp_flags & DP_MAY_SET_IN_SYNC) | 2192 | if (dp_flags & DP_MAY_SET_IN_SYNC) |
1720 | e->flags |= EE_MAY_SET_IN_SYNC; | 2193 | peer_req->flags |= EE_MAY_SET_IN_SYNC; |
1721 | 2194 | ||
1722 | spin_lock(&mdev->epoch_lock); | 2195 | spin_lock(&tconn->epoch_lock); |
1723 | e->epoch = mdev->current_epoch; | 2196 | peer_req->epoch = tconn->current_epoch; |
1724 | atomic_inc(&e->epoch->epoch_size); | 2197 | atomic_inc(&peer_req->epoch->epoch_size); |
1725 | atomic_inc(&e->epoch->active); | 2198 | atomic_inc(&peer_req->epoch->active); |
1726 | spin_unlock(&mdev->epoch_lock); | 2199 | spin_unlock(&tconn->epoch_lock); |
1727 | 2200 | ||
1728 | /* I'm the receiver, I do hold a net_cnt reference. */ | 2201 | rcu_read_lock(); |
1729 | if (!mdev->net_conf->two_primaries) { | 2202 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; |
1730 | spin_lock_irq(&mdev->req_lock); | 2203 | rcu_read_unlock(); |
1731 | } else { | 2204 | if (tp) { |
1732 | /* don't get the req_lock yet, | 2205 | peer_req->flags |= EE_IN_INTERVAL_TREE; |
1733 | * we may sleep in drbd_wait_peer_seq */ | 2206 | err = wait_for_and_update_peer_seq(mdev, peer_seq); |
1734 | const int size = e->size; | 2207 | if (err) |
1735 | const int discard = drbd_test_flag(mdev, DISCARD_CONCURRENT); | ||
1736 | DEFINE_WAIT(wait); | ||
1737 | struct drbd_request *i; | ||
1738 | struct hlist_node *n; | ||
1739 | struct hlist_head *slot; | ||
1740 | int first; | ||
1741 | |||
1742 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1743 | BUG_ON(mdev->ee_hash == NULL); | ||
1744 | BUG_ON(mdev->tl_hash == NULL); | ||
1745 | |||
1746 | /* conflict detection and handling: | ||
1747 | * 1. wait on the sequence number, | ||
1748 | * in case this data packet overtook ACK packets. | ||
1749 | * 2. check our hash tables for conflicting requests. | ||
1750 | * we only need to walk the tl_hash, since an ee can not | ||
1751 | * have a conflict with an other ee: on the submitting | ||
1752 | * node, the corresponding req had already been conflicting, | ||
1753 | * and a conflicting req is never sent. | ||
1754 | * | ||
1755 | * Note: for two_primaries, we are protocol C, | ||
1756 | * so there cannot be any request that is DONE | ||
1757 | * but still on the transfer log. | ||
1758 | * | ||
1759 | * unconditionally add to the ee_hash. | ||
1760 | * | ||
1761 | * if no conflicting request is found: | ||
1762 | * submit. | ||
1763 | * | ||
1764 | * if any conflicting request is found | ||
1765 | * that has not yet been acked, | ||
1766 | * AND I have the "discard concurrent writes" flag: | ||
1767 | * queue (via done_ee) the P_DISCARD_ACK; OUT. | ||
1768 | * | ||
1769 | * if any conflicting request is found: | ||
1770 | * block the receiver, waiting on misc_wait | ||
1771 | * until no more conflicting requests are there, | ||
1772 | * or we get interrupted (disconnect). | ||
1773 | * | ||
1774 | * we do not just write after local io completion of those | ||
1775 | * requests, but only after req is done completely, i.e. | ||
1776 | * we wait for the P_DISCARD_ACK to arrive! | ||
1777 | * | ||
1778 | * then proceed normally, i.e. submit. | ||
1779 | */ | ||
1780 | if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) | ||
1781 | goto out_interrupted; | 2208 | goto out_interrupted; |
1782 | 2209 | spin_lock_irq(&mdev->tconn->req_lock); | |
1783 | spin_lock_irq(&mdev->req_lock); | 2210 | err = handle_write_conflicts(mdev, peer_req); |
1784 | 2211 | if (err) { | |
1785 | hlist_add_head(&e->collision, ee_hash_slot(mdev, sector)); | 2212 | spin_unlock_irq(&mdev->tconn->req_lock); |
1786 | 2213 | if (err == -ENOENT) { | |
1787 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
1788 | slot = tl_hash_slot(mdev, sector); | ||
1789 | first = 1; | ||
1790 | for (;;) { | ||
1791 | int have_unacked = 0; | ||
1792 | int have_conflict = 0; | ||
1793 | prepare_to_wait(&mdev->misc_wait, &wait, | ||
1794 | TASK_INTERRUPTIBLE); | ||
1795 | hlist_for_each_entry(i, n, slot, collision) { | ||
1796 | if (OVERLAPS) { | ||
1797 | /* only ALERT on first iteration, | ||
1798 | * we may be woken up early... */ | ||
1799 | if (first) | ||
1800 | dev_alert(DEV, "%s[%u] Concurrent local write detected!" | ||
1801 | " new: %llus +%u; pending: %llus +%u\n", | ||
1802 | current->comm, current->pid, | ||
1803 | (unsigned long long)sector, size, | ||
1804 | (unsigned long long)i->sector, i->size); | ||
1805 | if (i->rq_state & RQ_NET_PENDING) | ||
1806 | ++have_unacked; | ||
1807 | ++have_conflict; | ||
1808 | } | ||
1809 | } | ||
1810 | #undef OVERLAPS | ||
1811 | if (!have_conflict) | ||
1812 | break; | ||
1813 | |||
1814 | /* Discard Ack only for the _first_ iteration */ | ||
1815 | if (first && discard && have_unacked) { | ||
1816 | dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", | ||
1817 | (unsigned long long)sector); | ||
1818 | inc_unacked(mdev); | ||
1819 | e->w.cb = e_send_discard_ack; | ||
1820 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
1821 | |||
1822 | spin_unlock_irq(&mdev->req_lock); | ||
1823 | |||
1824 | /* we could probably send that P_DISCARD_ACK ourselves, | ||
1825 | * but I don't like the receiver using the msock */ | ||
1826 | |||
1827 | put_ldev(mdev); | 2214 | put_ldev(mdev); |
1828 | wake_asender(mdev); | 2215 | return 0; |
1829 | finish_wait(&mdev->misc_wait, &wait); | ||
1830 | return true; | ||
1831 | } | 2216 | } |
2217 | goto out_interrupted; | ||
2218 | } | ||
2219 | } else | ||
2220 | spin_lock_irq(&mdev->tconn->req_lock); | ||
2221 | list_add(&peer_req->w.list, &mdev->active_ee); | ||
2222 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
1832 | 2223 | ||
1833 | if (signal_pending(current)) { | 2224 | if (mdev->state.conn == C_SYNC_TARGET) |
1834 | hlist_del_init(&e->collision); | 2225 | wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req)); |
1835 | |||
1836 | spin_unlock_irq(&mdev->req_lock); | ||
1837 | |||
1838 | finish_wait(&mdev->misc_wait, &wait); | ||
1839 | goto out_interrupted; | ||
1840 | } | ||
1841 | 2226 | ||
1842 | spin_unlock_irq(&mdev->req_lock); | 2227 | if (mdev->tconn->agreed_pro_version < 100) { |
1843 | if (first) { | 2228 | rcu_read_lock(); |
1844 | first = 0; | 2229 | switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) { |
1845 | dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " | 2230 | case DRBD_PROT_C: |
1846 | "sec=%llus\n", (unsigned long long)sector); | 2231 | dp_flags |= DP_SEND_WRITE_ACK; |
1847 | } else if (discard) { | 2232 | break; |
1848 | /* we had none on the first iteration. | 2233 | case DRBD_PROT_B: |
1849 | * there must be none now. */ | 2234 | dp_flags |= DP_SEND_RECEIVE_ACK; |
1850 | D_ASSERT(have_unacked == 0); | 2235 | break; |
1851 | } | ||
1852 | schedule(); | ||
1853 | spin_lock_irq(&mdev->req_lock); | ||
1854 | } | 2236 | } |
1855 | finish_wait(&mdev->misc_wait, &wait); | 2237 | rcu_read_unlock(); |
1856 | } | 2238 | } |
1857 | 2239 | ||
1858 | list_add(&e->w.list, &mdev->active_ee); | 2240 | if (dp_flags & DP_SEND_WRITE_ACK) { |
1859 | spin_unlock_irq(&mdev->req_lock); | 2241 | peer_req->flags |= EE_SEND_WRITE_ACK; |
1860 | |||
1861 | if (mdev->state.conn == C_SYNC_TARGET) | ||
1862 | wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); | ||
1863 | |||
1864 | switch (mdev->net_conf->wire_protocol) { | ||
1865 | case DRBD_PROT_C: | ||
1866 | inc_unacked(mdev); | 2242 | inc_unacked(mdev); |
1867 | /* corresponding dec_unacked() in e_end_block() | 2243 | /* corresponding dec_unacked() in e_end_block() |
1868 | * or in _drbd_clear_done_ee, respectively */ | 2244 | * or in _drbd_clear_done_ee, respectively */ |
1869 | break; | 2245 | } |
1870 | case DRBD_PROT_B: | 2246 | |
2247 | if (dp_flags & DP_SEND_RECEIVE_ACK) { | ||
1871 | /* I really don't like it that the receiver thread | 2248 | /* I really don't like it that the receiver thread |
1872 | * sends on the msock, but anyways */ | 2249 | * sends on the msock, but anyways */ |
1873 | drbd_send_ack(mdev, P_RECV_ACK, e); | 2250 | drbd_send_ack(mdev, P_RECV_ACK, peer_req); |
1874 | break; | ||
1875 | case DRBD_PROT_A: | ||
1876 | /* nothing to do */ | ||
1877 | break; | ||
1878 | } | 2251 | } |
1879 | 2252 | ||
1880 | if (mdev->state.pdsk < D_INCONSISTENT) { | 2253 | if (mdev->state.pdsk < D_INCONSISTENT) { |
1881 | /* In case we have the only disk of the cluster, */ | 2254 | /* In case we have the only disk of the cluster, */ |
1882 | drbd_set_out_of_sync(mdev, e->sector, e->size); | 2255 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); |
1883 | e->flags |= EE_CALL_AL_COMPLETE_IO; | 2256 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; |
1884 | e->flags &= ~EE_MAY_SET_IN_SYNC; | 2257 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; |
1885 | drbd_al_begin_io(mdev, e->sector); | 2258 | drbd_al_begin_io(mdev, &peer_req->i); |
1886 | } | 2259 | } |
1887 | 2260 | ||
1888 | if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) | 2261 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); |
1889 | return true; | 2262 | if (!err) |
2263 | return 0; | ||
1890 | 2264 | ||
1891 | /* don't care for the reason here */ | 2265 | /* don't care for the reason here */ |
1892 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 2266 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1893 | spin_lock_irq(&mdev->req_lock); | 2267 | spin_lock_irq(&mdev->tconn->req_lock); |
1894 | list_del(&e->w.list); | 2268 | list_del(&peer_req->w.list); |
1895 | hlist_del_init(&e->collision); | 2269 | drbd_remove_epoch_entry_interval(mdev, peer_req); |
1896 | spin_unlock_irq(&mdev->req_lock); | 2270 | spin_unlock_irq(&mdev->tconn->req_lock); |
1897 | if (e->flags & EE_CALL_AL_COMPLETE_IO) | 2271 | if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) |
1898 | drbd_al_complete_io(mdev, e->sector); | 2272 | drbd_al_complete_io(mdev, &peer_req->i); |
1899 | 2273 | ||
1900 | out_interrupted: | 2274 | out_interrupted: |
1901 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP); | 2275 | drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP); |
1902 | put_ldev(mdev); | 2276 | put_ldev(mdev); |
1903 | drbd_free_ee(mdev, e); | 2277 | drbd_free_peer_req(mdev, peer_req); |
1904 | return false; | 2278 | return err; |
1905 | } | 2279 | } |
1906 | 2280 | ||
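For peers speaking a protocol older than 100, the rewritten receive_Data() above derives per-request ack flags from the negotiated wire protocol once, and the later branches only test DP_SEND_WRITE_ACK / DP_SEND_RECEIVE_ACK. The sketch below is illustrative only and not part of the patch; the helper name is made up, and it assumes the DRBD_PROT_* and DP_* constants from the drbd headers.

/* Illustrative sketch (hypothetical helper, not in the patch): how the
 * negotiated wire protocol maps to the ack flags that receive_Data()
 * sets for pre-100 peers. */
static u32 proto_to_ack_flags(int wire_protocol)
{
	switch (wire_protocol) {
	case DRBD_PROT_C:	/* synchronous: ask the peer path for a write ack */
		return DP_SEND_WRITE_ACK;
	case DRBD_PROT_B:	/* memory synchronous: a receive ack (P_RECV_ACK) suffices */
		return DP_SEND_RECEIVE_ACK;
	default:		/* protocol A: asynchronous, no ack requested */
		return 0;
	}
}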
1907 | /* We may throttle resync, if the lower device seems to be busy, | 2281 | /* We may throttle resync, if the lower device seems to be busy, |
@@ -1922,9 +2296,14 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) | |||
1922 | struct lc_element *tmp; | 2296 | struct lc_element *tmp; |
1923 | int curr_events; | 2297 | int curr_events; |
1924 | int throttle = 0; | 2298 | int throttle = 0; |
2299 | unsigned int c_min_rate; | ||
2300 | |||
2301 | rcu_read_lock(); | ||
2302 | c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate; | ||
2303 | rcu_read_unlock(); | ||
1925 | 2304 | ||
1926 | /* feature disabled? */ | 2305 | /* feature disabled? */ |
1927 | if (mdev->sync_conf.c_min_rate == 0) | 2306 | if (c_min_rate == 0) |
1928 | return 0; | 2307 | return 0; |
1929 | 2308 | ||
1930 | spin_lock_irq(&mdev->al_lock); | 2309 | spin_lock_irq(&mdev->al_lock); |
@@ -1964,40 +2343,46 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) | |||
1964 | db = mdev->rs_mark_left[i] - rs_left; | 2343 | db = mdev->rs_mark_left[i] - rs_left; |
1965 | dbdt = Bit2KB(db/dt); | 2344 | dbdt = Bit2KB(db/dt); |
1966 | 2345 | ||
1967 | if (dbdt > mdev->sync_conf.c_min_rate) | 2346 | if (dbdt > c_min_rate) |
1968 | throttle = 1; | 2347 | throttle = 1; |
1969 | } | 2348 | } |
1970 | return throttle; | 2349 | return throttle; |
1971 | } | 2350 | } |
1972 | 2351 | ||
1973 | 2352 | ||
1974 | static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size) | 2353 | static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) |
1975 | { | 2354 | { |
2355 | struct drbd_conf *mdev; | ||
1976 | sector_t sector; | 2356 | sector_t sector; |
1977 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 2357 | sector_t capacity; |
1978 | struct drbd_epoch_entry *e; | 2358 | struct drbd_peer_request *peer_req; |
1979 | struct digest_info *di = NULL; | 2359 | struct digest_info *di = NULL; |
1980 | int size, verb; | 2360 | int size, verb; |
1981 | unsigned int fault_type; | 2361 | unsigned int fault_type; |
1982 | struct p_block_req *p = &mdev->data.rbuf.block_req; | 2362 | struct p_block_req *p = pi->data; |
2363 | |||
2364 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
2365 | if (!mdev) | ||
2366 | return -EIO; | ||
2367 | capacity = drbd_get_capacity(mdev->this_bdev); | ||
1983 | 2368 | ||
1984 | sector = be64_to_cpu(p->sector); | 2369 | sector = be64_to_cpu(p->sector); |
1985 | size = be32_to_cpu(p->blksize); | 2370 | size = be32_to_cpu(p->blksize); |
1986 | 2371 | ||
1987 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 2372 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
1988 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | 2373 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, |
1989 | (unsigned long long)sector, size); | 2374 | (unsigned long long)sector, size); |
1990 | return false; | 2375 | return -EINVAL; |
1991 | } | 2376 | } |
1992 | if (sector + (size>>9) > capacity) { | 2377 | if (sector + (size>>9) > capacity) { |
1993 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | 2378 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, |
1994 | (unsigned long long)sector, size); | 2379 | (unsigned long long)sector, size); |
1995 | return false; | 2380 | return -EINVAL; |
1996 | } | 2381 | } |
1997 | 2382 | ||
1998 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { | 2383 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { |
1999 | verb = 1; | 2384 | verb = 1; |
2000 | switch (cmd) { | 2385 | switch (pi->cmd) { |
2001 | case P_DATA_REQUEST: | 2386 | case P_DATA_REQUEST: |
2002 | drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); | 2387 | drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); |
2003 | break; | 2388 | break; |
@@ -2012,35 +2397,34 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2012 | drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); | 2397 | drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); |
2013 | break; | 2398 | break; |
2014 | default: | 2399 | default: |
2015 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | 2400 | BUG(); |
2016 | cmdname(cmd)); | ||
2017 | } | 2401 | } |
2018 | if (verb && __ratelimit(&drbd_ratelimit_state)) | 2402 | if (verb && __ratelimit(&drbd_ratelimit_state)) |
2019 | dev_err(DEV, "Can not satisfy peer's read request, " | 2403 | dev_err(DEV, "Can not satisfy peer's read request, " |
2020 | "no local data.\n"); | 2404 | "no local data.\n"); |
2021 | 2405 | ||
2022 | /* drain the payload, if any */ | 2406 | /* drain the payload, if any */ |
2023 | return drbd_drain_block(mdev, digest_size); | 2407 | return drbd_drain_block(mdev, pi->size); |
2024 | } | 2408 | } |
2025 | 2409 | ||
2026 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 2410 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
2027 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 2411 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
2028 | * which in turn might block on the other node at this very place. */ | 2412 | * which in turn might block on the other node at this very place. */ |
2029 | e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); | 2413 | peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO); |
2030 | if (!e) { | 2414 | if (!peer_req) { |
2031 | put_ldev(mdev); | 2415 | put_ldev(mdev); |
2032 | return false; | 2416 | return -ENOMEM; |
2033 | } | 2417 | } |
2034 | 2418 | ||
2035 | switch (cmd) { | 2419 | switch (pi->cmd) { |
2036 | case P_DATA_REQUEST: | 2420 | case P_DATA_REQUEST: |
2037 | e->w.cb = w_e_end_data_req; | 2421 | peer_req->w.cb = w_e_end_data_req; |
2038 | fault_type = DRBD_FAULT_DT_RD; | 2422 | fault_type = DRBD_FAULT_DT_RD; |
2039 | /* application IO, don't drbd_rs_begin_io */ | 2423 | /* application IO, don't drbd_rs_begin_io */ |
2040 | goto submit; | 2424 | goto submit; |
2041 | 2425 | ||
2042 | case P_RS_DATA_REQUEST: | 2426 | case P_RS_DATA_REQUEST: |
2043 | e->w.cb = w_e_end_rsdata_req; | 2427 | peer_req->w.cb = w_e_end_rsdata_req; |
2044 | fault_type = DRBD_FAULT_RS_RD; | 2428 | fault_type = DRBD_FAULT_RS_RD; |
2045 | /* used in the sector offset progress display */ | 2429 | /* used in the sector offset progress display */ |
2046 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 2430 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
@@ -2049,28 +2433,28 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2049 | case P_OV_REPLY: | 2433 | case P_OV_REPLY: |
2050 | case P_CSUM_RS_REQUEST: | 2434 | case P_CSUM_RS_REQUEST: |
2051 | fault_type = DRBD_FAULT_RS_RD; | 2435 | fault_type = DRBD_FAULT_RS_RD; |
2052 | di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); | 2436 | di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); |
2053 | if (!di) | 2437 | if (!di) |
2054 | goto out_free_e; | 2438 | goto out_free_e; |
2055 | 2439 | ||
2056 | di->digest_size = digest_size; | 2440 | di->digest_size = pi->size; |
2057 | di->digest = (((char *)di)+sizeof(struct digest_info)); | 2441 | di->digest = (((char *)di)+sizeof(struct digest_info)); |
2058 | 2442 | ||
2059 | e->digest = di; | 2443 | peer_req->digest = di; |
2060 | e->flags |= EE_HAS_DIGEST; | 2444 | peer_req->flags |= EE_HAS_DIGEST; |
2061 | 2445 | ||
2062 | if (drbd_recv(mdev, di->digest, digest_size) != digest_size) | 2446 | if (drbd_recv_all(mdev->tconn, di->digest, pi->size)) |
2063 | goto out_free_e; | 2447 | goto out_free_e; |
2064 | 2448 | ||
2065 | if (cmd == P_CSUM_RS_REQUEST) { | 2449 | if (pi->cmd == P_CSUM_RS_REQUEST) { |
2066 | D_ASSERT(mdev->agreed_pro_version >= 89); | 2450 | D_ASSERT(mdev->tconn->agreed_pro_version >= 89); |
2067 | e->w.cb = w_e_end_csum_rs_req; | 2451 | peer_req->w.cb = w_e_end_csum_rs_req; |
2068 | /* used in the sector offset progress display */ | 2452 | /* used in the sector offset progress display */ |
2069 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 2453 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
2070 | } else if (cmd == P_OV_REPLY) { | 2454 | } else if (pi->cmd == P_OV_REPLY) { |
2071 | /* track progress, we may need to throttle */ | 2455 | /* track progress, we may need to throttle */ |
2072 | atomic_add(size >> 9, &mdev->rs_sect_in); | 2456 | atomic_add(size >> 9, &mdev->rs_sect_in); |
2073 | e->w.cb = w_e_end_ov_reply; | 2457 | peer_req->w.cb = w_e_end_ov_reply; |
2074 | dec_rs_pending(mdev); | 2458 | dec_rs_pending(mdev); |
2075 | /* drbd_rs_begin_io done when we sent this request, | 2459 | /* drbd_rs_begin_io done when we sent this request, |
2076 | * but accounting still needs to be done. */ | 2460 | * but accounting still needs to be done. */ |
@@ -2080,7 +2464,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2080 | 2464 | ||
2081 | case P_OV_REQUEST: | 2465 | case P_OV_REQUEST: |
2082 | if (mdev->ov_start_sector == ~(sector_t)0 && | 2466 | if (mdev->ov_start_sector == ~(sector_t)0 && |
2083 | mdev->agreed_pro_version >= 90) { | 2467 | mdev->tconn->agreed_pro_version >= 90) { |
2084 | unsigned long now = jiffies; | 2468 | unsigned long now = jiffies; |
2085 | int i; | 2469 | int i; |
2086 | mdev->ov_start_sector = sector; | 2470 | mdev->ov_start_sector = sector; |
@@ -2094,15 +2478,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2094 | dev_info(DEV, "Online Verify start sector: %llu\n", | 2478 | dev_info(DEV, "Online Verify start sector: %llu\n", |
2095 | (unsigned long long)sector); | 2479 | (unsigned long long)sector); |
2096 | } | 2480 | } |
2097 | e->w.cb = w_e_end_ov_req; | 2481 | peer_req->w.cb = w_e_end_ov_req; |
2098 | fault_type = DRBD_FAULT_RS_RD; | 2482 | fault_type = DRBD_FAULT_RS_RD; |
2099 | break; | 2483 | break; |
2100 | 2484 | ||
2101 | default: | 2485 | default: |
2102 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | 2486 | BUG(); |
2103 | cmdname(cmd)); | ||
2104 | fault_type = DRBD_FAULT_MAX; | ||
2105 | goto out_free_e; | ||
2106 | } | 2487 | } |
2107 | 2488 | ||
2108 | /* Throttle, drbd_rs_begin_io and submit should become asynchronous | 2489 | /* Throttle, drbd_rs_begin_io and submit should become asynchronous |
@@ -2137,30 +2518,31 @@ submit_for_resync: | |||
2137 | 2518 | ||
2138 | submit: | 2519 | submit: |
2139 | inc_unacked(mdev); | 2520 | inc_unacked(mdev); |
2140 | spin_lock_irq(&mdev->req_lock); | 2521 | spin_lock_irq(&mdev->tconn->req_lock); |
2141 | list_add_tail(&e->w.list, &mdev->read_ee); | 2522 | list_add_tail(&peer_req->w.list, &mdev->read_ee); |
2142 | spin_unlock_irq(&mdev->req_lock); | 2523 | spin_unlock_irq(&mdev->tconn->req_lock); |
2143 | 2524 | ||
2144 | if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) | 2525 | if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0) |
2145 | return true; | 2526 | return 0; |
2146 | 2527 | ||
2147 | /* don't care for the reason here */ | 2528 | /* don't care for the reason here */ |
2148 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 2529 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
2149 | spin_lock_irq(&mdev->req_lock); | 2530 | spin_lock_irq(&mdev->tconn->req_lock); |
2150 | list_del(&e->w.list); | 2531 | list_del(&peer_req->w.list); |
2151 | spin_unlock_irq(&mdev->req_lock); | 2532 | spin_unlock_irq(&mdev->tconn->req_lock); |
2152 | /* no drbd_rs_complete_io(), we are dropping the connection anyways */ | 2533 | /* no drbd_rs_complete_io(), we are dropping the connection anyways */ |
2153 | 2534 | ||
2154 | out_free_e: | 2535 | out_free_e: |
2155 | put_ldev(mdev); | 2536 | put_ldev(mdev); |
2156 | drbd_free_ee(mdev, e); | 2537 | drbd_free_peer_req(mdev, peer_req); |
2157 | return false; | 2538 | return -EIO; |
2158 | } | 2539 | } |
2159 | 2540 | ||
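A pattern worth noting, since it recurs in almost every converted function here (the c_min_rate read above, the after_sb_*p reads in the drbd_asb_recover_*p() helpers below, the two_primaries read in receive_Data()): configuration now lives in RCU-protected objects, so readers take rcu_read_lock(), copy out the one field they need via rcu_dereference(), and drop the lock before doing any further work. The following is a stand-alone sketch of that reader-side idiom under a hypothetical example_conf structure and example_conf_ptr pointer, not the real net_conf/disk_conf types.

#include <linux/rcupdate.h>
#include <linux/types.h>

/* Hypothetical stand-in for net_conf / disk_conf. */
struct example_conf {
	unsigned int c_min_rate;
};

/* Hypothetical RCU-protected pointer, published by a writer elsewhere. */
static struct example_conf __rcu *example_conf_ptr;

/* Reader side: snapshot the field under rcu_read_lock(); once the lock
 * is dropped, a concurrent update may free the old object, so only the
 * copied value is used afterwards. */
static unsigned int example_get_c_min_rate(void)
{
	unsigned int c_min_rate;

	rcu_read_lock();
	c_min_rate = rcu_dereference(example_conf_ptr)->c_min_rate;
	rcu_read_unlock();

	return c_min_rate;
}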
2160 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | 2541 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) |
2161 | { | 2542 | { |
2162 | int self, peer, rv = -100; | 2543 | int self, peer, rv = -100; |
2163 | unsigned long ch_self, ch_peer; | 2544 | unsigned long ch_self, ch_peer; |
2545 | enum drbd_after_sb_p after_sb_0p; | ||
2164 | 2546 | ||
2165 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | 2547 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; |
2166 | peer = mdev->p_uuid[UI_BITMAP] & 1; | 2548 | peer = mdev->p_uuid[UI_BITMAP] & 1; |
@@ -2168,10 +2550,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2168 | ch_peer = mdev->p_uuid[UI_SIZE]; | 2550 | ch_peer = mdev->p_uuid[UI_SIZE]; |
2169 | ch_self = mdev->comm_bm_set; | 2551 | ch_self = mdev->comm_bm_set; |
2170 | 2552 | ||
2171 | switch (mdev->net_conf->after_sb_0p) { | 2553 | rcu_read_lock(); |
2554 | after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p; | ||
2555 | rcu_read_unlock(); | ||
2556 | switch (after_sb_0p) { | ||
2172 | case ASB_CONSENSUS: | 2557 | case ASB_CONSENSUS: |
2173 | case ASB_DISCARD_SECONDARY: | 2558 | case ASB_DISCARD_SECONDARY: |
2174 | case ASB_CALL_HELPER: | 2559 | case ASB_CALL_HELPER: |
2560 | case ASB_VIOLENTLY: | ||
2175 | dev_err(DEV, "Configuration error.\n"); | 2561 | dev_err(DEV, "Configuration error.\n"); |
2176 | break; | 2562 | break; |
2177 | case ASB_DISCONNECT: | 2563 | case ASB_DISCONNECT: |
@@ -2200,14 +2586,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2200 | "Using discard-least-changes instead\n"); | 2586 | "Using discard-least-changes instead\n"); |
2201 | case ASB_DISCARD_ZERO_CHG: | 2587 | case ASB_DISCARD_ZERO_CHG: |
2202 | if (ch_peer == 0 && ch_self == 0) { | 2588 | if (ch_peer == 0 && ch_self == 0) { |
2203 | rv = drbd_test_flag(mdev, DISCARD_CONCURRENT) | 2589 | rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) |
2204 | ? -1 : 1; | 2590 | ? -1 : 1; |
2205 | break; | 2591 | break; |
2206 | } else { | 2592 | } else { |
2207 | if (ch_peer == 0) { rv = 1; break; } | 2593 | if (ch_peer == 0) { rv = 1; break; } |
2208 | if (ch_self == 0) { rv = -1; break; } | 2594 | if (ch_self == 0) { rv = -1; break; } |
2209 | } | 2595 | } |
2210 | if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) | 2596 | if (after_sb_0p == ASB_DISCARD_ZERO_CHG) |
2211 | break; | 2597 | break; |
2212 | case ASB_DISCARD_LEAST_CHG: | 2598 | case ASB_DISCARD_LEAST_CHG: |
2213 | if (ch_self < ch_peer) | 2599 | if (ch_self < ch_peer) |
@@ -2216,7 +2602,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2216 | rv = 1; | 2602 | rv = 1; |
2217 | else /* ( ch_self == ch_peer ) */ | 2603 | else /* ( ch_self == ch_peer ) */ |
2218 | /* Well, then use something else. */ | 2604 | /* Well, then use something else. */ |
2219 | rv = drbd_test_flag(mdev, DISCARD_CONCURRENT) | 2605 | rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) |
2220 | ? -1 : 1; | 2606 | ? -1 : 1; |
2221 | break; | 2607 | break; |
2222 | case ASB_DISCARD_LOCAL: | 2608 | case ASB_DISCARD_LOCAL: |
@@ -2232,13 +2618,18 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2232 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | 2618 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) |
2233 | { | 2619 | { |
2234 | int hg, rv = -100; | 2620 | int hg, rv = -100; |
2621 | enum drbd_after_sb_p after_sb_1p; | ||
2235 | 2622 | ||
2236 | switch (mdev->net_conf->after_sb_1p) { | 2623 | rcu_read_lock(); |
2624 | after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p; | ||
2625 | rcu_read_unlock(); | ||
2626 | switch (after_sb_1p) { | ||
2237 | case ASB_DISCARD_YOUNGER_PRI: | 2627 | case ASB_DISCARD_YOUNGER_PRI: |
2238 | case ASB_DISCARD_OLDER_PRI: | 2628 | case ASB_DISCARD_OLDER_PRI: |
2239 | case ASB_DISCARD_LEAST_CHG: | 2629 | case ASB_DISCARD_LEAST_CHG: |
2240 | case ASB_DISCARD_LOCAL: | 2630 | case ASB_DISCARD_LOCAL: |
2241 | case ASB_DISCARD_REMOTE: | 2631 | case ASB_DISCARD_REMOTE: |
2632 | case ASB_DISCARD_ZERO_CHG: | ||
2242 | dev_err(DEV, "Configuration error.\n"); | 2633 | dev_err(DEV, "Configuration error.\n"); |
2243 | break; | 2634 | break; |
2244 | case ASB_DISCONNECT: | 2635 | case ASB_DISCONNECT: |
@@ -2281,8 +2672,12 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | |||
2281 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | 2672 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) |
2282 | { | 2673 | { |
2283 | int hg, rv = -100; | 2674 | int hg, rv = -100; |
2675 | enum drbd_after_sb_p after_sb_2p; | ||
2284 | 2676 | ||
2285 | switch (mdev->net_conf->after_sb_2p) { | 2677 | rcu_read_lock(); |
2678 | after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p; | ||
2679 | rcu_read_unlock(); | ||
2680 | switch (after_sb_2p) { | ||
2286 | case ASB_DISCARD_YOUNGER_PRI: | 2681 | case ASB_DISCARD_YOUNGER_PRI: |
2287 | case ASB_DISCARD_OLDER_PRI: | 2682 | case ASB_DISCARD_OLDER_PRI: |
2288 | case ASB_DISCARD_LEAST_CHG: | 2683 | case ASB_DISCARD_LEAST_CHG: |
@@ -2290,6 +2685,7 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | |||
2290 | case ASB_DISCARD_REMOTE: | 2685 | case ASB_DISCARD_REMOTE: |
2291 | case ASB_CONSENSUS: | 2686 | case ASB_CONSENSUS: |
2292 | case ASB_DISCARD_SECONDARY: | 2687 | case ASB_DISCARD_SECONDARY: |
2688 | case ASB_DISCARD_ZERO_CHG: | ||
2293 | dev_err(DEV, "Configuration error.\n"); | 2689 | dev_err(DEV, "Configuration error.\n"); |
2294 | break; | 2690 | break; |
2295 | case ASB_VIOLENTLY: | 2691 | case ASB_VIOLENTLY: |
@@ -2375,7 +2771,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2375 | 2771 | ||
2376 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { | 2772 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { |
2377 | 2773 | ||
2378 | if (mdev->agreed_pro_version < 91) | 2774 | if (mdev->tconn->agreed_pro_version < 91) |
2379 | return -1091; | 2775 | return -1091; |
2380 | 2776 | ||
2381 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && | 2777 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && |
@@ -2398,7 +2794,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2398 | 2794 | ||
2399 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { | 2795 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { |
2400 | 2796 | ||
2401 | if (mdev->agreed_pro_version < 91) | 2797 | if (mdev->tconn->agreed_pro_version < 91) |
2402 | return -1091; | 2798 | return -1091; |
2403 | 2799 | ||
2404 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && | 2800 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && |
@@ -2420,7 +2816,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2420 | } | 2816 | } |
2421 | 2817 | ||
2422 | /* Common power [off|failure] */ | 2818 | /* Common power [off|failure] */ |
2423 | rct = (drbd_test_flag(mdev, CRASHED_PRIMARY) ? 1 : 0) + | 2819 | rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + |
2424 | (mdev->p_uuid[UI_FLAGS] & 2); | 2820 | (mdev->p_uuid[UI_FLAGS] & 2); |
2425 | /* lowest bit is set when we were primary, | 2821 | /* lowest bit is set when we were primary, |
2426 | * next bit (weight 2) is set when peer was primary */ | 2822 | * next bit (weight 2) is set when peer was primary */ |
@@ -2431,7 +2827,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2431 | case 1: /* self_pri && !peer_pri */ return 1; | 2827 | case 1: /* self_pri && !peer_pri */ return 1; |
2432 | case 2: /* !self_pri && peer_pri */ return -1; | 2828 | case 2: /* !self_pri && peer_pri */ return -1; |
2433 | case 3: /* self_pri && peer_pri */ | 2829 | case 3: /* self_pri && peer_pri */ |
2434 | dc = drbd_test_flag(mdev, DISCARD_CONCURRENT); | 2830 | dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); |
2435 | return dc ? -1 : 1; | 2831 | return dc ? -1 : 1; |
2436 | } | 2832 | } |
2437 | } | 2833 | } |
@@ -2444,14 +2840,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2444 | *rule_nr = 51; | 2840 | *rule_nr = 51; |
2445 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | 2841 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); |
2446 | if (self == peer) { | 2842 | if (self == peer) { |
2447 | if (mdev->agreed_pro_version < 96 ? | 2843 | if (mdev->tconn->agreed_pro_version < 96 ? |
2448 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == | 2844 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == |
2449 | (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : | 2845 | (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : |
2450 | peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { | 2846 | peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { |
2451 | /* The last P_SYNC_UUID did not get through. Undo the last start of | 2847 | /* The last P_SYNC_UUID did not get through. Undo the last start of |
2452 | resync as sync source modifications of the peer's UUIDs. */ | 2848 | resync as sync source modifications of the peer's UUIDs. */ |
2453 | 2849 | ||
2454 | if (mdev->agreed_pro_version < 91) | 2850 | if (mdev->tconn->agreed_pro_version < 91) |
2455 | return -1091; | 2851 | return -1091; |
2456 | 2852 | ||
2457 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; | 2853 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; |
@@ -2481,14 +2877,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2481 | *rule_nr = 71; | 2877 | *rule_nr = 71; |
2482 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | 2878 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); |
2483 | if (self == peer) { | 2879 | if (self == peer) { |
2484 | if (mdev->agreed_pro_version < 96 ? | 2880 | if (mdev->tconn->agreed_pro_version < 96 ? |
2485 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == | 2881 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == |
2486 | (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : | 2882 | (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : |
2487 | self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { | 2883 | self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { |
2488 | /* The last P_SYNC_UUID did not get through. Undo the last start of | 2884 | /* The last P_SYNC_UUID did not get through. Undo the last start of |
2489 | resync as sync source modifications of our UUIDs. */ | 2885 | resync as sync source modifications of our UUIDs. */ |
2490 | 2886 | ||
2491 | if (mdev->agreed_pro_version < 91) | 2887 | if (mdev->tconn->agreed_pro_version < 91) |
2492 | return -1091; | 2888 | return -1091; |
2493 | 2889 | ||
2494 | __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); | 2890 | __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); |
@@ -2536,9 +2932,10 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2536 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, | 2932 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, |
2537 | enum drbd_disk_state peer_disk) __must_hold(local) | 2933 | enum drbd_disk_state peer_disk) __must_hold(local) |
2538 | { | 2934 | { |
2539 | int hg, rule_nr; | ||
2540 | enum drbd_conns rv = C_MASK; | 2935 | enum drbd_conns rv = C_MASK; |
2541 | enum drbd_disk_state mydisk; | 2936 | enum drbd_disk_state mydisk; |
2937 | struct net_conf *nc; | ||
2938 | int hg, rule_nr, rr_conflict, tentative; | ||
2542 | 2939 | ||
2543 | mydisk = mdev->state.disk; | 2940 | mydisk = mdev->state.disk; |
2544 | if (mydisk == D_NEGOTIATING) | 2941 | if (mydisk == D_NEGOTIATING) |
@@ -2578,7 +2975,10 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2578 | if (abs(hg) == 100) | 2975 | if (abs(hg) == 100) |
2579 | drbd_khelper(mdev, "initial-split-brain"); | 2976 | drbd_khelper(mdev, "initial-split-brain"); |
2580 | 2977 | ||
2581 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | 2978 | rcu_read_lock(); |
2979 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2980 | |||
2981 | if (hg == 100 || (hg == -100 && nc->always_asbp)) { | ||
2582 | int pcount = (mdev->state.role == R_PRIMARY) | 2982 | int pcount = (mdev->state.role == R_PRIMARY) |
2583 | + (peer_role == R_PRIMARY); | 2983 | + (peer_role == R_PRIMARY); |
2584 | int forced = (hg == -100); | 2984 | int forced = (hg == -100); |
@@ -2607,9 +3007,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2607 | } | 3007 | } |
2608 | 3008 | ||
2609 | if (hg == -100) { | 3009 | if (hg == -100) { |
2610 | if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) | 3010 | if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1)) |
2611 | hg = -1; | 3011 | hg = -1; |
2612 | if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) | 3012 | if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1)) |
2613 | hg = 1; | 3013 | hg = 1; |
2614 | 3014 | ||
2615 | if (abs(hg) < 100) | 3015 | if (abs(hg) < 100) |
@@ -2617,6 +3017,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2617 | "Sync from %s node\n", | 3017 | "Sync from %s node\n", |
2618 | (hg < 0) ? "peer" : "this"); | 3018 | (hg < 0) ? "peer" : "this"); |
2619 | } | 3019 | } |
3020 | rr_conflict = nc->rr_conflict; | ||
3021 | tentative = nc->tentative; | ||
3022 | rcu_read_unlock(); | ||
2620 | 3023 | ||
2621 | if (hg == -100) { | 3024 | if (hg == -100) { |
2622 | /* FIXME this log message is not correct if we end up here | 3025 | /* FIXME this log message is not correct if we end up here |
@@ -2635,7 +3038,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2635 | 3038 | ||
2636 | if (hg < 0 && /* by intention we do not use mydisk here. */ | 3039 | if (hg < 0 && /* by intention we do not use mydisk here. */ |
2637 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { | 3040 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { |
2638 | switch (mdev->net_conf->rr_conflict) { | 3041 | switch (rr_conflict) { |
2639 | case ASB_CALL_HELPER: | 3042 | case ASB_CALL_HELPER: |
2640 | drbd_khelper(mdev, "pri-lost"); | 3043 | drbd_khelper(mdev, "pri-lost"); |
2641 | /* fall through */ | 3044 | /* fall through */ |
@@ -2648,7 +3051,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2648 | } | 3051 | } |
2649 | } | 3052 | } |
2650 | 3053 | ||
2651 | if (mdev->net_conf->dry_run || drbd_test_flag(mdev, CONN_DRY_RUN)) { | 3054 | if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) { |
2652 | if (hg == 0) | 3055 | if (hg == 0) |
2653 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); | 3056 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); |
2654 | else | 3057 | else |
@@ -2680,33 +3083,29 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2680 | return rv; | 3083 | return rv; |
2681 | } | 3084 | } |
2682 | 3085 | ||
2683 | /* returns 1 if invalid */ | 3086 | static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) |
2684 | static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) | ||
2685 | { | 3087 | { |
2686 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ | 3088 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ |
2687 | if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || | 3089 | if (peer == ASB_DISCARD_REMOTE) |
2688 | (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) | 3090 | return ASB_DISCARD_LOCAL; |
2689 | return 0; | ||
2690 | 3091 | ||
2691 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ | 3092 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ |
2692 | if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || | 3093 | if (peer == ASB_DISCARD_LOCAL) |
2693 | self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) | 3094 | return ASB_DISCARD_REMOTE; |
2694 | return 1; | ||
2695 | 3095 | ||
2696 | /* everything else is valid if they are equal on both sides. */ | 3096 | /* everything else is valid if they are equal on both sides. */ |
2697 | if (peer == self) | 3097 | return peer; |
2698 | return 0; | ||
2699 | |||
2700 | /* everything else is invalid. */ | ||
2701 | return 1; | ||
2702 | } | 3098 | } |
2703 | 3099 | ||
2704 | static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3100 | static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi) |
2705 | { | 3101 | { |
2706 | struct p_protocol *p = &mdev->data.rbuf.protocol; | 3102 | struct p_protocol *p = pi->data; |
2707 | int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; | 3103 | enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; |
2708 | int p_want_lose, p_two_primaries, cf; | 3104 | int p_proto, p_discard_my_data, p_two_primaries, cf; |
2709 | char p_integrity_alg[SHARED_SECRET_MAX] = ""; | 3105 | struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; |
3106 | char integrity_alg[SHARED_SECRET_MAX] = ""; | ||
3107 | struct crypto_hash *peer_integrity_tfm = NULL; | ||
3108 | void *int_dig_in = NULL, *int_dig_vv = NULL; | ||
2710 | 3109 | ||
2711 | p_proto = be32_to_cpu(p->protocol); | 3110 | p_proto = be32_to_cpu(p->protocol); |
2712 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); | 3111 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); |
@@ -2714,63 +3113,138 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig | |||
2714 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); | 3113 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); |
2715 | p_two_primaries = be32_to_cpu(p->two_primaries); | 3114 | p_two_primaries = be32_to_cpu(p->two_primaries); |
2716 | cf = be32_to_cpu(p->conn_flags); | 3115 | cf = be32_to_cpu(p->conn_flags); |
2717 | p_want_lose = cf & CF_WANT_LOSE; | 3116 | p_discard_my_data = cf & CF_DISCARD_MY_DATA; |
2718 | |||
2719 | drbd_clear_flag(mdev, CONN_DRY_RUN); | ||
2720 | 3117 | ||
2721 | if (cf & CF_DRY_RUN) | 3118 | if (tconn->agreed_pro_version >= 87) { |
2722 | drbd_set_flag(mdev, CONN_DRY_RUN); | 3119 | int err; |
2723 | 3120 | ||
2724 | if (p_proto != mdev->net_conf->wire_protocol) { | 3121 | if (pi->size > sizeof(integrity_alg)) |
2725 | dev_err(DEV, "incompatible communication protocols\n"); | 3122 | return -EIO; |
2726 | goto disconnect; | 3123 | err = drbd_recv_all(tconn, integrity_alg, pi->size); |
3124 | if (err) | ||
3125 | return err; | ||
3126 | integrity_alg[SHARED_SECRET_MAX - 1] = 0; | ||
2727 | } | 3127 | } |
2728 | 3128 | ||
2729 | if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { | 3129 | if (pi->cmd != P_PROTOCOL_UPDATE) { |
2730 | dev_err(DEV, "incompatible after-sb-0pri settings\n"); | 3130 | clear_bit(CONN_DRY_RUN, &tconn->flags); |
2731 | goto disconnect; | ||
2732 | } | ||
2733 | 3131 | ||
2734 | if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { | 3132 | if (cf & CF_DRY_RUN) |
2735 | dev_err(DEV, "incompatible after-sb-1pri settings\n"); | 3133 | set_bit(CONN_DRY_RUN, &tconn->flags); |
2736 | goto disconnect; | ||
2737 | } | ||
2738 | 3134 | ||
2739 | if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { | 3135 | rcu_read_lock(); |
2740 | dev_err(DEV, "incompatible after-sb-2pri settings\n"); | 3136 | nc = rcu_dereference(tconn->net_conf); |
2741 | goto disconnect; | ||
2742 | } | ||
2743 | 3137 | ||
2744 | if (p_want_lose && mdev->net_conf->want_lose) { | 3138 | if (p_proto != nc->wire_protocol) { |
2745 | dev_err(DEV, "both sides have the 'want_lose' flag set\n"); | 3139 | conn_err(tconn, "incompatible %s settings\n", "protocol"); |
2746 | goto disconnect; | 3140 | goto disconnect_rcu_unlock; |
2747 | } | 3141 | } |
2748 | 3142 | ||
2749 | if (p_two_primaries != mdev->net_conf->two_primaries) { | 3143 | if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { |
2750 | dev_err(DEV, "incompatible setting of the two-primaries options\n"); | 3144 | conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri"); |
2751 | goto disconnect; | 3145 | goto disconnect_rcu_unlock; |
3146 | } | ||
3147 | |||
3148 | if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { | ||
3149 | conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri"); | ||
3150 | goto disconnect_rcu_unlock; | ||
3151 | } | ||
3152 | |||
3153 | if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { | ||
3154 | conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri"); | ||
3155 | goto disconnect_rcu_unlock; | ||
3156 | } | ||
3157 | |||
3158 | if (p_discard_my_data && nc->discard_my_data) { | ||
3159 | conn_err(tconn, "incompatible %s settings\n", "discard-my-data"); | ||
3160 | goto disconnect_rcu_unlock; | ||
3161 | } | ||
3162 | |||
3163 | if (p_two_primaries != nc->two_primaries) { | ||
3164 | conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries"); | ||
3165 | goto disconnect_rcu_unlock; | ||
3166 | } | ||
3167 | |||
3168 | if (strcmp(integrity_alg, nc->integrity_alg)) { | ||
3169 | conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg"); | ||
3170 | goto disconnect_rcu_unlock; | ||
3171 | } | ||
3172 | |||
3173 | rcu_read_unlock(); | ||
2752 | } | 3174 | } |
2753 | 3175 | ||
2754 | if (mdev->agreed_pro_version >= 87) { | 3176 | if (integrity_alg[0]) { |
2755 | unsigned char *my_alg = mdev->net_conf->integrity_alg; | 3177 | int hash_size; |
3178 | |||
3179 | /* | ||
3180 | * We can only change the peer data integrity algorithm | ||
3181 | * here. Changing our own data integrity algorithm | ||
3182 | * requires that we send a P_PROTOCOL_UPDATE packet at | ||
3183 | * the same time; otherwise, the peer has no way to | ||
3184 | * tell between which packets the algorithm should | ||
3185 | * change. | ||
3186 | */ | ||
2756 | 3187 | ||
2757 | if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) | 3188 | peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); |
2758 | return false; | 3189 | if (!peer_integrity_tfm) { |
3190 | conn_err(tconn, "peer data-integrity-alg %s not supported\n", | ||
3191 | integrity_alg); | ||
3192 | goto disconnect; | ||
3193 | } | ||
2759 | 3194 | ||
2760 | p_integrity_alg[SHARED_SECRET_MAX-1] = 0; | 3195 | hash_size = crypto_hash_digestsize(peer_integrity_tfm); |
2761 | if (strcmp(p_integrity_alg, my_alg)) { | 3196 | int_dig_in = kmalloc(hash_size, GFP_KERNEL); |
2762 | dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); | 3197 | int_dig_vv = kmalloc(hash_size, GFP_KERNEL); |
3198 | if (!(int_dig_in && int_dig_vv)) { | ||
3199 | conn_err(tconn, "Allocation of buffers for data integrity checking failed\n"); | ||
2763 | goto disconnect; | 3200 | goto disconnect; |
2764 | } | 3201 | } |
2765 | dev_info(DEV, "data-integrity-alg: %s\n", | ||
2766 | my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); | ||
2767 | } | 3202 | } |
2768 | 3203 | ||
2769 | return true; | 3204 | new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); |
3205 | if (!new_net_conf) { | ||
3206 | conn_err(tconn, "Allocation of new net_conf failed\n"); | ||
3207 | goto disconnect; | ||
3208 | } | ||
3209 | |||
3210 | mutex_lock(&tconn->data.mutex); | ||
3211 | mutex_lock(&tconn->conf_update); | ||
3212 | old_net_conf = tconn->net_conf; | ||
3213 | *new_net_conf = *old_net_conf; | ||
3214 | |||
3215 | new_net_conf->wire_protocol = p_proto; | ||
3216 | new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); | ||
3217 | new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); | ||
3218 | new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); | ||
3219 | new_net_conf->two_primaries = p_two_primaries; | ||
3220 | |||
3221 | rcu_assign_pointer(tconn->net_conf, new_net_conf); | ||
3222 | mutex_unlock(&tconn->conf_update); | ||
3223 | mutex_unlock(&tconn->data.mutex); | ||
3224 | |||
3225 | crypto_free_hash(tconn->peer_integrity_tfm); | ||
3226 | kfree(tconn->int_dig_in); | ||
3227 | kfree(tconn->int_dig_vv); | ||
3228 | tconn->peer_integrity_tfm = peer_integrity_tfm; | ||
3229 | tconn->int_dig_in = int_dig_in; | ||
3230 | tconn->int_dig_vv = int_dig_vv; | ||
3231 | |||
3232 | if (strcmp(old_net_conf->integrity_alg, integrity_alg)) | ||
3233 | conn_info(tconn, "peer data-integrity-alg: %s\n", | ||
3234 | integrity_alg[0] ? integrity_alg : "(none)"); | ||
2770 | 3235 | ||
3236 | synchronize_rcu(); | ||
3237 | kfree(old_net_conf); | ||
3238 | return 0; | ||
3239 | |||
3240 | disconnect_rcu_unlock: | ||
3241 | rcu_read_unlock(); | ||
2771 | disconnect: | 3242 | disconnect: |
2772 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3243 | crypto_free_hash(peer_integrity_tfm); |
2773 | return false; | 3244 | kfree(int_dig_in); |
3245 | kfree(int_dig_vv); | ||
3246 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
3247 | return -EIO; | ||
2774 | } | 3248 | } |
2775 | 3249 | ||
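The tail of the new receive_protocol() above is the matching writer side of that idiom: allocate a new net_conf, copy the old one, apply the peer's settings, publish the copy with rcu_assign_pointer(), wait out pre-existing readers with synchronize_rcu(), and only then kfree() the old object. Below is a schematic sketch of that sequence, reusing the hypothetical example_conf/example_conf_ptr from the reader-side sketch earlier; the real code additionally holds tconn->conf_update and tconn->data.mutex across the swap.

#include <linux/slab.h>

/* Writer side (sketch only): replace the published configuration.
 * Assumes the caller serializes concurrent updates, as receive_protocol()
 * does with conf_update and the data mutex. */
static int example_set_c_min_rate(unsigned int new_rate)
{
	struct example_conf *new_conf, *old_conf;

	new_conf = kmalloc(sizeof(*new_conf), GFP_KERNEL);
	if (!new_conf)
		return -ENOMEM;

	old_conf = rcu_dereference_protected(example_conf_ptr, 1);
	*new_conf = *old_conf;			/* start from the current settings */
	new_conf->c_min_rate = new_rate;

	rcu_assign_pointer(example_conf_ptr, new_conf);	/* publish the new object */
	synchronize_rcu();			/* wait for pre-existing readers */
	kfree(old_conf);			/* now safe to free the old copy */
	return 0;
}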
2776 | /* helper function | 3250 | /* helper function |
@@ -2792,24 +3266,64 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, | |||
2792 | alg, name, PTR_ERR(tfm)); | 3266 | alg, name, PTR_ERR(tfm)); |
2793 | return tfm; | 3267 | return tfm; |
2794 | } | 3268 | } |
2795 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
2796 | crypto_free_hash(tfm); | ||
2797 | dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); | ||
2798 | return ERR_PTR(-EINVAL); | ||
2799 | } | ||
2800 | return tfm; | 3269 | return tfm; |
2801 | } | 3270 | } |
2802 | 3271 | ||
2803 | static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) | 3272 | static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi) |
2804 | { | 3273 | { |
2805 | int ok = true; | 3274 | void *buffer = tconn->data.rbuf; |
2806 | struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; | 3275 | int size = pi->size; |
3276 | |||
3277 | while (size) { | ||
3278 | int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); | ||
3279 | s = drbd_recv(tconn, buffer, s); | ||
3280 | if (s <= 0) { | ||
3281 | if (s < 0) | ||
3282 | return s; | ||
3283 | break; | ||
3284 | } | ||
3285 | size -= s; | ||
3286 | } | ||
3287 | if (size) | ||
3288 | return -EIO; | ||
3289 | return 0; | ||
3290 | } | ||
3291 | |||
3292 | /* | ||
3293 | * config_unknown_volume - device configuration command for unknown volume | ||
3294 | * | ||
3295 | * When a device is added to an existing connection, the node on which the | ||
3296 | * device is added first will send configuration commands to its peer but the | ||
3297 | * peer will not know about the device yet. It will warn and ignore these | ||
3298 | * commands. Once the device is added on the second node, the second node will | ||
3299 | * send the same device configuration commands, but in the other direction. | ||
3300 | * | ||
3301 | * (We can also end up here if drbd is misconfigured.) | ||
3302 | */ | ||
3303 | static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3304 | { | ||
3305 | conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n", | ||
3306 | cmdname(pi->cmd), pi->vnr); | ||
3307 | return ignore_remaining_packet(tconn, pi); | ||
3308 | } | ||
3309 | |||
3310 | static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3311 | { | ||
3312 | struct drbd_conf *mdev; | ||
3313 | struct p_rs_param_95 *p; | ||
2807 | unsigned int header_size, data_size, exp_max_sz; | 3314 | unsigned int header_size, data_size, exp_max_sz; |
2808 | struct crypto_hash *verify_tfm = NULL; | 3315 | struct crypto_hash *verify_tfm = NULL; |
2809 | struct crypto_hash *csums_tfm = NULL; | 3316 | struct crypto_hash *csums_tfm = NULL; |
2810 | const int apv = mdev->agreed_pro_version; | 3317 | struct net_conf *old_net_conf, *new_net_conf = NULL; |
2811 | int *rs_plan_s = NULL; | 3318 | struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; |
3319 | const int apv = tconn->agreed_pro_version; | ||
3320 | struct fifo_buffer *old_plan = NULL, *new_plan = NULL; | ||
2812 | int fifo_size = 0; | 3321 | int fifo_size = 0; |
3322 | int err; | ||
3323 | |||
3324 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3325 | if (!mdev) | ||
3326 | return config_unknown_volume(tconn, pi); | ||
2813 | 3327 | ||
2814 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) | 3328 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) |
2815 | : apv == 88 ? sizeof(struct p_rs_param) | 3329 | : apv == 88 ? sizeof(struct p_rs_param) |
@@ -2817,32 +3331,49 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2817 | : apv <= 94 ? sizeof(struct p_rs_param_89) | 3331 | : apv <= 94 ? sizeof(struct p_rs_param_89) |
2818 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); | 3332 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); |
2819 | 3333 | ||
2820 | if (packet_size > exp_max_sz) { | 3334 | if (pi->size > exp_max_sz) { |
2821 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", | 3335 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", |
2822 | packet_size, exp_max_sz); | 3336 | pi->size, exp_max_sz); |
2823 | return false; | 3337 | return -EIO; |
2824 | } | 3338 | } |
2825 | 3339 | ||
2826 | if (apv <= 88) { | 3340 | if (apv <= 88) { |
2827 | header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80); | 3341 | header_size = sizeof(struct p_rs_param); |
2828 | data_size = packet_size - header_size; | 3342 | data_size = pi->size - header_size; |
2829 | } else if (apv <= 94) { | 3343 | } else if (apv <= 94) { |
2830 | header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80); | 3344 | header_size = sizeof(struct p_rs_param_89); |
2831 | data_size = packet_size - header_size; | 3345 | data_size = pi->size - header_size; |
2832 | D_ASSERT(data_size == 0); | 3346 | D_ASSERT(data_size == 0); |
2833 | } else { | 3347 | } else { |
2834 | header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80); | 3348 | header_size = sizeof(struct p_rs_param_95); |
2835 | data_size = packet_size - header_size; | 3349 | data_size = pi->size - header_size; |
2836 | D_ASSERT(data_size == 0); | 3350 | D_ASSERT(data_size == 0); |
2837 | } | 3351 | } |
2838 | 3352 | ||
2839 | /* initialize verify_alg and csums_alg */ | 3353 | /* initialize verify_alg and csums_alg */ |
3354 | p = pi->data; | ||
2840 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | 3355 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); |
2841 | 3356 | ||
2842 | if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) | 3357 | err = drbd_recv_all(mdev->tconn, p, header_size); |
2843 | return false; | 3358 | if (err) |
3359 | return err; | ||
2844 | 3360 | ||
2845 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | 3361 | mutex_lock(&mdev->tconn->conf_update); |
3362 | old_net_conf = mdev->tconn->net_conf; | ||
3363 | if (get_ldev(mdev)) { | ||
3364 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
3365 | if (!new_disk_conf) { | ||
3366 | put_ldev(mdev); | ||
3367 | mutex_unlock(&mdev->tconn->conf_update); | ||
3368 | dev_err(DEV, "Allocation of new disk_conf failed\n"); | ||
3369 | return -ENOMEM; | ||
3370 | } | ||
3371 | |||
3372 | old_disk_conf = mdev->ldev->disk_conf; | ||
3373 | *new_disk_conf = *old_disk_conf; | ||
3374 | |||
3375 | new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); | ||
3376 | } | ||
2846 | 3377 | ||
2847 | if (apv >= 88) { | 3378 | if (apv >= 88) { |
2848 | if (apv == 88) { | 3379 | if (apv == 88) { |
@@ -2850,12 +3381,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2850 | dev_err(DEV, "verify-alg of wrong size, " | 3381 | dev_err(DEV, "verify-alg of wrong size, " |
2851 | "peer wants %u, accepting only up to %u byte\n", | 3382 | "peer wants %u, accepting only up to %u byte\n", |
2852 | data_size, SHARED_SECRET_MAX); | 3383 | data_size, SHARED_SECRET_MAX); |
2853 | return false; | 3384 | err = -EIO; |
3385 | goto reconnect; | ||
2854 | } | 3386 | } |
2855 | 3387 | ||
2856 | if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) | 3388 | err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size); |
2857 | return false; | 3389 | if (err) |
2858 | 3390 | goto reconnect; | |
2859 | /* we expect NUL terminated string */ | 3391 | /* we expect NUL terminated string */ |
2860 | /* but just in case someone tries to be evil */ | 3392 | /* but just in case someone tries to be evil */ |
2861 | D_ASSERT(p->verify_alg[data_size-1] == 0); | 3393 | D_ASSERT(p->verify_alg[data_size-1] == 0); |
@@ -2870,10 +3402,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2870 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; | 3402 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; |
2871 | } | 3403 | } |
2872 | 3404 | ||
2873 | if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { | 3405 | if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { |
2874 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | 3406 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { |
2875 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", | 3407 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", |
2876 | mdev->sync_conf.verify_alg, p->verify_alg); | 3408 | old_net_conf->verify_alg, p->verify_alg); |
2877 | goto disconnect; | 3409 | goto disconnect; |
2878 | } | 3410 | } |
2879 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, | 3411 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, |
@@ -2884,10 +3416,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2884 | } | 3416 | } |
2885 | } | 3417 | } |
2886 | 3418 | ||
2887 | if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { | 3419 | if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { |
2888 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | 3420 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { |
2889 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", | 3421 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", |
2890 | mdev->sync_conf.csums_alg, p->csums_alg); | 3422 | old_net_conf->csums_alg, p->csums_alg); |
2891 | goto disconnect; | 3423 | goto disconnect; |
2892 | } | 3424 | } |
2893 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, | 3425 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, |
@@ -2898,57 +3430,91 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2898 | } | 3430 | } |
2899 | } | 3431 | } |
2900 | 3432 | ||
2901 | if (apv > 94) { | 3433 | if (apv > 94 && new_disk_conf) { |
2902 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | 3434 | new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); |
2903 | mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead); | 3435 | new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); |
2904 | mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); | 3436 | new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); |
2905 | mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); | 3437 | new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); |
2906 | mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); | 3438 | |
2907 | 3439 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | |
2908 | fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 3440 | if (fifo_size != mdev->rs_plan_s->size) { |
2909 | if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { | 3441 | new_plan = fifo_alloc(fifo_size); |
2910 | rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_NOIO); | 3442 | if (!new_plan) { |
2911 | if (!rs_plan_s) { | ||
2912 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | 3443 | dev_err(DEV, "kmalloc of fifo_buffer failed"); |
3444 | put_ldev(mdev); | ||
2913 | goto disconnect; | 3445 | goto disconnect; |
2914 | } | 3446 | } |
2915 | } | 3447 | } |
2916 | } | 3448 | } |
2917 | 3449 | ||
2918 | spin_lock(&mdev->peer_seq_lock); | 3450 | if (verify_tfm || csums_tfm) { |
2919 | /* lock against drbd_nl_syncer_conf() */ | 3451 | new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
2920 | if (verify_tfm) { | 3452 | if (!new_net_conf) { |
2921 | strcpy(mdev->sync_conf.verify_alg, p->verify_alg); | 3453 | dev_err(DEV, "Allocation of new net_conf failed\n"); |
2922 | mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; | 3454 | goto disconnect; |
2923 | crypto_free_hash(mdev->verify_tfm); | 3455 | } |
2924 | mdev->verify_tfm = verify_tfm; | 3456 | |
2925 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); | 3457 | *new_net_conf = *old_net_conf; |
2926 | } | 3458 | |
2927 | if (csums_tfm) { | 3459 | if (verify_tfm) { |
2928 | strcpy(mdev->sync_conf.csums_alg, p->csums_alg); | 3460 | strcpy(new_net_conf->verify_alg, p->verify_alg); |
2929 | mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; | 3461 | new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; |
2930 | crypto_free_hash(mdev->csums_tfm); | 3462 | crypto_free_hash(mdev->tconn->verify_tfm); |
2931 | mdev->csums_tfm = csums_tfm; | 3463 | mdev->tconn->verify_tfm = verify_tfm; |
2932 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | 3464 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); |
2933 | } | 3465 | } |
2934 | if (fifo_size != mdev->rs_plan_s.size) { | 3466 | if (csums_tfm) { |
2935 | kfree(mdev->rs_plan_s.values); | 3467 | strcpy(new_net_conf->csums_alg, p->csums_alg); |
2936 | mdev->rs_plan_s.values = rs_plan_s; | 3468 | new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; |
2937 | mdev->rs_plan_s.size = fifo_size; | 3469 | crypto_free_hash(mdev->tconn->csums_tfm); |
2938 | mdev->rs_planed = 0; | 3470 | mdev->tconn->csums_tfm = csums_tfm; |
3471 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | ||
3472 | } | ||
3473 | rcu_assign_pointer(tconn->net_conf, new_net_conf); | ||
2939 | } | 3474 | } |
2940 | spin_unlock(&mdev->peer_seq_lock); | ||
2941 | } | 3475 | } |
2942 | 3476 | ||
2943 | return ok; | 3477 | if (new_disk_conf) { |
3478 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
3479 | put_ldev(mdev); | ||
3480 | } | ||
3481 | |||
3482 | if (new_plan) { | ||
3483 | old_plan = mdev->rs_plan_s; | ||
3484 | rcu_assign_pointer(mdev->rs_plan_s, new_plan); | ||
3485 | } | ||
3486 | |||
3487 | mutex_unlock(&mdev->tconn->conf_update); | ||
3488 | synchronize_rcu(); | ||
3489 | if (new_net_conf) | ||
3490 | kfree(old_net_conf); | ||
3491 | kfree(old_disk_conf); | ||
3492 | kfree(old_plan); | ||
3493 | |||
3494 | return 0; | ||
3495 | |||
3496 | reconnect: | ||
3497 | if (new_disk_conf) { | ||
3498 | put_ldev(mdev); | ||
3499 | kfree(new_disk_conf); | ||
3500 | } | ||
3501 | mutex_unlock(&mdev->tconn->conf_update); | ||
3502 | return -EIO; | ||
3503 | |||
2944 | disconnect: | 3504 | disconnect: |
3505 | kfree(new_plan); | ||
3506 | if (new_disk_conf) { | ||
3507 | put_ldev(mdev); | ||
3508 | kfree(new_disk_conf); | ||
3509 | } | ||
3510 | mutex_unlock(&mdev->tconn->conf_update); | ||
2945 | /* just for completeness: actually not needed, | 3511 | /* just for completeness: actually not needed, |
2946 | * as this is not reached if csums_tfm was ok. */ | 3512 | * as this is not reached if csums_tfm was ok. */ |
2947 | crypto_free_hash(csums_tfm); | 3513 | crypto_free_hash(csums_tfm); |
2948 | /* but free the verify_tfm again, if csums_tfm did not work out */ | 3514 | /* but free the verify_tfm again, if csums_tfm did not work out */ |
2949 | crypto_free_hash(verify_tfm); | 3515 | crypto_free_hash(verify_tfm); |
2950 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3516 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
2951 | return false; | 3517 | return -EIO; |
2952 | } | 3518 | } |
2953 | 3519 | ||
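The rewritten receive_SyncParam() above no longer edits the live sync configuration in place under peer_seq_lock; like the rest of this series it copies the current disk_conf/net_conf, applies the peer's values to the copy, publishes the copy with rcu_assign_pointer() while holding conf_update, and frees the old object only after synchronize_rcu(). A minimal sketch of that copy-update-publish idiom, with illustrative names (my_conf, my_obj and update_rate are not part of the patch):

	/* Sketch only: the copy/update/publish idiom, not DRBD code. */
	#include <linux/errno.h>
	#include <linux/mutex.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct my_conf {
		unsigned int resync_rate;
	};

	struct my_obj {
		struct mutex conf_update;      /* serializes writers            */
		struct my_conf __rcu *conf;    /* readers use rcu_dereference() */
	};

	static int update_rate(struct my_obj *obj, unsigned int rate)
	{
		struct my_conf *new_conf, *old_conf;

		new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
		if (!new_conf)
			return -ENOMEM;

		mutex_lock(&obj->conf_update);
		old_conf = rcu_dereference_protected(obj->conf,
				lockdep_is_held(&obj->conf_update));
		*new_conf = *old_conf;            /* start from the current settings */
		new_conf->resync_rate = rate;     /* apply the peer's value          */
		rcu_assign_pointer(obj->conf, new_conf);
		mutex_unlock(&obj->conf_update);

		synchronize_rcu();                /* wait out readers of old_conf    */
		kfree(old_conf);
		return 0;
	}

Readers never take conf_update; they bracket rcu_dereference() with rcu_read_lock()/rcu_read_unlock(), as receive_sizes() below does when it reads disk_size.
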
2954 | /* warn if the arguments differ by more than 12.5% */ | 3520 | /* warn if the arguments differ by more than 12.5% */ |
@@ -2964,59 +3530,77 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, | |||
2964 | (unsigned long long)a, (unsigned long long)b); | 3530 | (unsigned long long)a, (unsigned long long)b); |
2965 | } | 3531 | } |
2966 | 3532 | ||
2967 | static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3533 | static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) |
2968 | { | 3534 | { |
2969 | struct p_sizes *p = &mdev->data.rbuf.sizes; | 3535 | struct drbd_conf *mdev; |
3536 | struct p_sizes *p = pi->data; | ||
2970 | enum determine_dev_size dd = unchanged; | 3537 | enum determine_dev_size dd = unchanged; |
2971 | sector_t p_size, p_usize, my_usize; | 3538 | sector_t p_size, p_usize, my_usize; |
2972 | int ldsc = 0; /* local disk size changed */ | 3539 | int ldsc = 0; /* local disk size changed */ |
2973 | enum dds_flags ddsf; | 3540 | enum dds_flags ddsf; |
2974 | 3541 | ||
3542 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3543 | if (!mdev) | ||
3544 | return config_unknown_volume(tconn, pi); | ||
3545 | |||
2975 | p_size = be64_to_cpu(p->d_size); | 3546 | p_size = be64_to_cpu(p->d_size); |
2976 | p_usize = be64_to_cpu(p->u_size); | 3547 | p_usize = be64_to_cpu(p->u_size); |
2977 | 3548 | ||
2978 | if (p_size == 0 && mdev->state.disk == D_DISKLESS) { | ||
2979 | dev_err(DEV, "some backing storage is needed\n"); | ||
2980 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2981 | return false; | ||
2982 | } | ||
2983 | |||
2984 | /* just store the peer's disk size for now. | 3549 | /* just store the peer's disk size for now. |
2985 | * we still need to figure out whether we accept that. */ | 3550 | * we still need to figure out whether we accept that. */ |
2986 | mdev->p_size = p_size; | 3551 | mdev->p_size = p_size; |
2987 | 3552 | ||
2988 | if (get_ldev(mdev)) { | 3553 | if (get_ldev(mdev)) { |
3554 | rcu_read_lock(); | ||
3555 | my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
3556 | rcu_read_unlock(); | ||
3557 | |||
2989 | warn_if_differ_considerably(mdev, "lower level device sizes", | 3558 | warn_if_differ_considerably(mdev, "lower level device sizes", |
2990 | p_size, drbd_get_max_capacity(mdev->ldev)); | 3559 | p_size, drbd_get_max_capacity(mdev->ldev)); |
2991 | warn_if_differ_considerably(mdev, "user requested size", | 3560 | warn_if_differ_considerably(mdev, "user requested size", |
2992 | p_usize, mdev->ldev->dc.disk_size); | 3561 | p_usize, my_usize); |
2993 | 3562 | ||
2994 | /* if this is the first connect, or an otherwise expected | 3563 | /* if this is the first connect, or an otherwise expected |
2995 | * param exchange, choose the minimum */ | 3564 | * param exchange, choose the minimum */ |
2996 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | 3565 | if (mdev->state.conn == C_WF_REPORT_PARAMS) |
2997 | p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, | 3566 | p_usize = min_not_zero(my_usize, p_usize); |
2998 | p_usize); | ||
2999 | |||
3000 | my_usize = mdev->ldev->dc.disk_size; | ||
3001 | |||
3002 | if (mdev->ldev->dc.disk_size != p_usize) { | ||
3003 | mdev->ldev->dc.disk_size = p_usize; | ||
3004 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
3005 | (unsigned long)mdev->ldev->dc.disk_size); | ||
3006 | } | ||
3007 | 3567 | ||
3008 | /* Never shrink a device with usable data during connect. | 3568 | /* Never shrink a device with usable data during connect. |
3009 | But allow online shrinking if we are connected. */ | 3569 | But allow online shrinking if we are connected. */ |
3010 | if (drbd_new_dev_size(mdev, mdev->ldev, 0) < | 3570 | if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) < |
3011 | drbd_get_capacity(mdev->this_bdev) && | 3571 | drbd_get_capacity(mdev->this_bdev) && |
3012 | mdev->state.disk >= D_OUTDATED && | 3572 | mdev->state.disk >= D_OUTDATED && |
3013 | mdev->state.conn < C_CONNECTED) { | 3573 | mdev->state.conn < C_CONNECTED) { |
3014 | dev_err(DEV, "The peer's disk size is too small!\n"); | 3574 | dev_err(DEV, "The peer's disk size is too small!\n"); |
3015 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3575 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3016 | mdev->ldev->dc.disk_size = my_usize; | ||
3017 | put_ldev(mdev); | 3576 | put_ldev(mdev); |
3018 | return false; | 3577 | return -EIO; |
3578 | } | ||
3579 | |||
3580 | if (my_usize != p_usize) { | ||
3581 | struct disk_conf *old_disk_conf, *new_disk_conf = NULL; | ||
3582 | |||
3583 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
3584 | if (!new_disk_conf) { | ||
3585 | dev_err(DEV, "Allocation of new disk_conf failed\n"); | ||
3586 | put_ldev(mdev); | ||
3587 | return -ENOMEM; | ||
3588 | } | ||
3589 | |||
3590 | mutex_lock(&mdev->tconn->conf_update); | ||
3591 | old_disk_conf = mdev->ldev->disk_conf; | ||
3592 | *new_disk_conf = *old_disk_conf; | ||
3593 | new_disk_conf->disk_size = p_usize; | ||
3594 | |||
3595 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
3596 | mutex_unlock(&mdev->tconn->conf_update); | ||
3597 | synchronize_rcu(); | ||
3598 | kfree(old_disk_conf); | ||
3599 | |||
3600 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
3601 | (unsigned long)my_usize); | ||
3019 | } | 3602 | } |
3603 | |||
3020 | put_ldev(mdev); | 3604 | put_ldev(mdev); |
3021 | } | 3605 | } |
3022 | 3606 | ||
@@ -3025,7 +3609,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3025 | dd = drbd_determine_dev_size(mdev, ddsf); | 3609 | dd = drbd_determine_dev_size(mdev, ddsf); |
3026 | put_ldev(mdev); | 3610 | put_ldev(mdev); |
3027 | if (dd == dev_size_error) | 3611 | if (dd == dev_size_error) |
3028 | return false; | 3612 | return -EIO; |
3029 | drbd_md_sync(mdev); | 3613 | drbd_md_sync(mdev); |
3030 | } else { | 3614 | } else { |
3031 | /* I am diskless, need to accept the peer's size. */ | 3615 | /* I am diskless, need to accept the peer's size. */ |
@@ -3051,7 +3635,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3051 | * needs to know my new size... */ | 3635 | * needs to know my new size... */ |
3052 | drbd_send_sizes(mdev, 0, ddsf); | 3636 | drbd_send_sizes(mdev, 0, ddsf); |
3053 | } | 3637 | } |
3054 | if (drbd_test_and_clear_flag(mdev, RESIZE_PENDING) || | 3638 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || |
3055 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | 3639 | (dd == grew && mdev->state.conn == C_CONNECTED)) { |
3056 | if (mdev->state.pdsk >= D_INCONSISTENT && | 3640 | if (mdev->state.pdsk >= D_INCONSISTENT && |
3057 | mdev->state.disk >= D_INCONSISTENT) { | 3641 | mdev->state.disk >= D_INCONSISTENT) { |
@@ -3060,19 +3644,24 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3060 | else | 3644 | else |
3061 | resync_after_online_grow(mdev); | 3645 | resync_after_online_grow(mdev); |
3062 | } else | 3646 | } else |
3063 | drbd_set_flag(mdev, RESYNC_AFTER_NEG); | 3647 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
3064 | } | 3648 | } |
3065 | } | 3649 | } |
3066 | 3650 | ||
3067 | return true; | 3651 | return 0; |
3068 | } | 3652 | } |
3069 | 3653 | ||
3070 | static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3654 | static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) |
3071 | { | 3655 | { |
3072 | struct p_uuids *p = &mdev->data.rbuf.uuids; | 3656 | struct drbd_conf *mdev; |
3657 | struct p_uuids *p = pi->data; | ||
3073 | u64 *p_uuid; | 3658 | u64 *p_uuid; |
3074 | int i, updated_uuids = 0; | 3659 | int i, updated_uuids = 0; |
3075 | 3660 | ||
3661 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3662 | if (!mdev) | ||
3663 | return config_unknown_volume(tconn, pi); | ||
3664 | |||
3076 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); | 3665 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); |
3077 | 3666 | ||
3078 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) | 3667 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) |
@@ -3087,14 +3676,14 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3087 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { | 3676 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { |
3088 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", | 3677 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", |
3089 | (unsigned long long)mdev->ed_uuid); | 3678 | (unsigned long long)mdev->ed_uuid); |
3090 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3679 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3091 | return false; | 3680 | return -EIO; |
3092 | } | 3681 | } |
3093 | 3682 | ||
3094 | if (get_ldev(mdev)) { | 3683 | if (get_ldev(mdev)) { |
3095 | int skip_initial_sync = | 3684 | int skip_initial_sync = |
3096 | mdev->state.conn == C_CONNECTED && | 3685 | mdev->state.conn == C_CONNECTED && |
3097 | mdev->agreed_pro_version >= 90 && | 3686 | mdev->tconn->agreed_pro_version >= 90 && |
3098 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && | 3687 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && |
3099 | (p_uuid[UI_FLAGS] & 8); | 3688 | (p_uuid[UI_FLAGS] & 8); |
3100 | if (skip_initial_sync) { | 3689 | if (skip_initial_sync) { |
@@ -3121,14 +3710,15 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3121 | ongoing cluster wide state change is finished. That is important if | 3710 | ongoing cluster wide state change is finished. That is important if |
3122 | we are primary and are detaching from our disk. We need to see the | 3711 | we are primary and are detaching from our disk. We need to see the |
3123 | new disk state... */ | 3712 | new disk state... */ |
3124 | wait_event(mdev->misc_wait, !drbd_test_flag(mdev, CLUSTER_ST_CHANGE)); | 3713 | mutex_lock(mdev->state_mutex); |
3714 | mutex_unlock(mdev->state_mutex); | ||
3125 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) | 3715 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) |
3126 | updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); | 3716 | updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); |
3127 | 3717 | ||
3128 | if (updated_uuids) | 3718 | if (updated_uuids) |
3129 | drbd_print_uuids(mdev, "receiver updated UUIDs to"); | 3719 | drbd_print_uuids(mdev, "receiver updated UUIDs to"); |
3130 | 3720 | ||
3131 | return true; | 3721 | return 0; |
3132 | } | 3722 | } |
3133 | 3723 | ||
3134 | /** | 3724 | /** |
@@ -3140,6 +3730,7 @@ static union drbd_state convert_state(union drbd_state ps) | |||
3140 | union drbd_state ms; | 3730 | union drbd_state ms; |
3141 | 3731 | ||
3142 | static enum drbd_conns c_tab[] = { | 3732 | static enum drbd_conns c_tab[] = { |
3733 | [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, | ||
3143 | [C_CONNECTED] = C_CONNECTED, | 3734 | [C_CONNECTED] = C_CONNECTED, |
3144 | 3735 | ||
3145 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, | 3736 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, |
@@ -3161,40 +3752,74 @@ static union drbd_state convert_state(union drbd_state ps) | |||
3161 | return ms; | 3752 | return ms; |
3162 | } | 3753 | } |
3163 | 3754 | ||
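convert_state() translates a state-change request that the peer expressed from its own point of view into the local point of view; the hunk above only adds C_WF_REPORT_PARAMS as a fixed point of that mapping. The connection part is a plain lookup table in which the source/target sides are swapped. A toy version of such a mirroring table (illustrative enum values, not the driver's):

	#include <stdio.h>

	enum conn { WF_REPORT_PARAMS, CONNECTED, STARTING_SYNC_S, STARTING_SYNC_T, NUM_CONN };

	/* peer's view -> my view: states that name a "side" are mirrored */
	static const enum conn c_tab[NUM_CONN] = {
		[WF_REPORT_PARAMS] = WF_REPORT_PARAMS,
		[CONNECTED]        = CONNECTED,
		[STARTING_SYNC_S]  = STARTING_SYNC_T,  /* peer starts as SyncSource ... */
		[STARTING_SYNC_T]  = STARTING_SYNC_S,  /* ... so I am the SyncTarget    */
	};

	int main(void)
	{
		printf("%d\n", c_tab[STARTING_SYNC_S]); /* prints 3, i.e. STARTING_SYNC_T */
		return 0;
	}
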
3164 | static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3755 | static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi) |
3165 | { | 3756 | { |
3166 | struct p_req_state *p = &mdev->data.rbuf.req_state; | 3757 | struct drbd_conf *mdev; |
3758 | struct p_req_state *p = pi->data; | ||
3167 | union drbd_state mask, val; | 3759 | union drbd_state mask, val; |
3168 | enum drbd_state_rv rv; | 3760 | enum drbd_state_rv rv; |
3169 | 3761 | ||
3762 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3763 | if (!mdev) | ||
3764 | return -EIO; | ||
3765 | |||
3170 | mask.i = be32_to_cpu(p->mask); | 3766 | mask.i = be32_to_cpu(p->mask); |
3171 | val.i = be32_to_cpu(p->val); | 3767 | val.i = be32_to_cpu(p->val); |
3172 | 3768 | ||
3173 | if (drbd_test_flag(mdev, DISCARD_CONCURRENT) && | 3769 | if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) && |
3174 | drbd_test_flag(mdev, CLUSTER_ST_CHANGE)) { | 3770 | mutex_is_locked(mdev->state_mutex)) { |
3175 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); | 3771 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); |
3176 | return true; | 3772 | return 0; |
3177 | } | 3773 | } |
3178 | 3774 | ||
3179 | mask = convert_state(mask); | 3775 | mask = convert_state(mask); |
3180 | val = convert_state(val); | 3776 | val = convert_state(val); |
3181 | 3777 | ||
3182 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); | 3778 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); |
3183 | |||
3184 | drbd_send_sr_reply(mdev, rv); | 3779 | drbd_send_sr_reply(mdev, rv); |
3780 | |||
3185 | drbd_md_sync(mdev); | 3781 | drbd_md_sync(mdev); |
3186 | 3782 | ||
3187 | return true; | 3783 | return 0; |
3188 | } | 3784 | } |
3189 | 3785 | ||
3190 | static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3786 | static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi) |
3191 | { | 3787 | { |
3192 | struct p_state *p = &mdev->data.rbuf.state; | 3788 | struct p_req_state *p = pi->data; |
3789 | union drbd_state mask, val; | ||
3790 | enum drbd_state_rv rv; | ||
3791 | |||
3792 | mask.i = be32_to_cpu(p->mask); | ||
3793 | val.i = be32_to_cpu(p->val); | ||
3794 | |||
3795 | if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) && | ||
3796 | mutex_is_locked(&tconn->cstate_mutex)) { | ||
3797 | conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG); | ||
3798 | return 0; | ||
3799 | } | ||
3800 | |||
3801 | mask = convert_state(mask); | ||
3802 | val = convert_state(val); | ||
3803 | |||
3804 | rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); | ||
3805 | conn_send_sr_reply(tconn, rv); | ||
3806 | |||
3807 | return 0; | ||
3808 | } | ||
3809 | |||
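Both receive_req_state() and the new connection-wide receive_req_conn_state() handle the case where the two nodes fire a cluster-wide state change at the same time: only the side that owns RESOLVE_CONFLICTS may reject the peer's request with SS_CONCURRENT_ST_CHG while its own request is pending (signalled by the held state mutex), so exactly one side backs off. A compressed model of that tie-break; names and return values below are illustrative, not DRBD's:

	#include <stdbool.h>
	#include <stdio.h>

	enum reply { REQ_ACCEPTED, REQ_REJECT_CONCURRENT };

	/* One side is designated the conflict resolver during the handshake
	 * (RESOLVE_CONFLICTS in the patch).  Only that side may refuse the
	 * peer's request while its own request is in flight; the other side
	 * always yields, so the two requests cannot reject each other. */
	static enum reply handle_peer_request(bool i_am_resolver, bool my_request_in_flight)
	{
		if (i_am_resolver && my_request_in_flight)
			return REQ_REJECT_CONCURRENT;
		return REQ_ACCEPTED;
	}

	int main(void)
	{
		printf("%d\n", handle_peer_request(true, true));   /* 1: peer must retry */
		printf("%d\n", handle_peer_request(false, true));  /* 0: I yield         */
		return 0;
	}
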
3810 | static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3811 | { | ||
3812 | struct drbd_conf *mdev; | ||
3813 | struct p_state *p = pi->data; | ||
3193 | union drbd_state os, ns, peer_state; | 3814 | union drbd_state os, ns, peer_state; |
3194 | enum drbd_disk_state real_peer_disk; | 3815 | enum drbd_disk_state real_peer_disk; |
3195 | enum chg_state_flags cs_flags; | 3816 | enum chg_state_flags cs_flags; |
3196 | int rv; | 3817 | int rv; |
3197 | 3818 | ||
3819 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3820 | if (!mdev) | ||
3821 | return config_unknown_volume(tconn, pi); | ||
3822 | |||
3198 | peer_state.i = be32_to_cpu(p->state); | 3823 | peer_state.i = be32_to_cpu(p->state); |
3199 | 3824 | ||
3200 | real_peer_disk = peer_state.disk; | 3825 | real_peer_disk = peer_state.disk; |
@@ -3203,16 +3828,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3203 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); | 3828 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); |
3204 | } | 3829 | } |
3205 | 3830 | ||
3206 | spin_lock_irq(&mdev->req_lock); | 3831 | spin_lock_irq(&mdev->tconn->req_lock); |
3207 | retry: | 3832 | retry: |
3208 | os = ns = mdev->state; | 3833 | os = ns = drbd_read_state(mdev); |
3209 | spin_unlock_irq(&mdev->req_lock); | 3834 | spin_unlock_irq(&mdev->tconn->req_lock); |
3210 | 3835 | ||
3211 | /* If some other part of the code (asender thread, timeout) | 3836 | /* If some other part of the code (asender thread, timeout) |
3212 | * already decided to close the connection again, | 3837 | * already decided to close the connection again, |
3213 | * we must not "re-establish" it here. */ | 3838 | * we must not "re-establish" it here. */ |
3214 | if (os.conn <= C_TEAR_DOWN) | 3839 | if (os.conn <= C_TEAR_DOWN) |
3215 | return false; | 3840 | return -ECONNRESET; |
3216 | 3841 | ||
3217 | /* If this is the "end of sync" confirmation, usually the peer disk | 3842 | /* If this is the "end of sync" confirmation, usually the peer disk |
3218 | * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits | 3843 | * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits |
@@ -3240,16 +3865,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3240 | peer_state.conn == C_CONNECTED) { | 3865 | peer_state.conn == C_CONNECTED) { |
3241 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) | 3866 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) |
3242 | drbd_resync_finished(mdev); | 3867 | drbd_resync_finished(mdev); |
3243 | return true; | 3868 | return 0; |
3244 | } | 3869 | } |
3245 | } | 3870 | } |
3246 | 3871 | ||
3247 | /* explicit verify finished notification, stop sector reached. */ | 3872 | /* explicit verify finished notification, stop sector reached. */ |
3248 | if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && | 3873 | if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && |
3249 | peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { | 3874 | peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { |
3250 | ov_oos_print(mdev); | 3875 | ov_out_of_sync_print(mdev); |
3251 | drbd_resync_finished(mdev); | 3876 | drbd_resync_finished(mdev); |
3252 | return true; | 3877 | return 0; |
3253 | } | 3878 | } |
3254 | 3879 | ||
3255 | /* peer says his disk is inconsistent, while we think it is uptodate, | 3880 | /* peer says his disk is inconsistent, while we think it is uptodate, |
@@ -3280,7 +3905,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3280 | os.disk == D_NEGOTIATING)); | 3905 | os.disk == D_NEGOTIATING)); |
3281 | /* if we have both been inconsistent, and the peer has been | 3906 | /* if we have both been inconsistent, and the peer has been |
3282 | * forced to be UpToDate with --overwrite-data */ | 3907 | * forced to be UpToDate with --overwrite-data */ |
3283 | cr |= drbd_test_flag(mdev, CONSIDER_RESYNC); | 3908 | cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); |
3284 | /* if we had been plain connected, and the admin requested to | 3909 | /* if we had been plain connected, and the admin requested to |
3285 | * start a sync by "invalidate" or "invalidate-remote" */ | 3910 | * start a sync by "invalidate" or "invalidate-remote" */ |
3286 | cr |= (os.conn == C_CONNECTED && | 3911 | cr |= (os.conn == C_CONNECTED && |
@@ -3300,44 +3925,44 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3300 | peer_state.disk = D_DISKLESS; | 3925 | peer_state.disk = D_DISKLESS; |
3301 | real_peer_disk = D_DISKLESS; | 3926 | real_peer_disk = D_DISKLESS; |
3302 | } else { | 3927 | } else { |
3303 | if (drbd_test_and_clear_flag(mdev, CONN_DRY_RUN)) | 3928 | if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags)) |
3304 | return false; | 3929 | return -EIO; |
3305 | D_ASSERT(os.conn == C_WF_REPORT_PARAMS); | 3930 | D_ASSERT(os.conn == C_WF_REPORT_PARAMS); |
3306 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3931 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3307 | return false; | 3932 | return -EIO; |
3308 | } | 3933 | } |
3309 | } | 3934 | } |
3310 | } | 3935 | } |
3311 | 3936 | ||
3312 | spin_lock_irq(&mdev->req_lock); | 3937 | spin_lock_irq(&mdev->tconn->req_lock); |
3313 | if (mdev->state.i != os.i) | 3938 | if (os.i != drbd_read_state(mdev).i) |
3314 | goto retry; | 3939 | goto retry; |
3315 | drbd_clear_flag(mdev, CONSIDER_RESYNC); | 3940 | clear_bit(CONSIDER_RESYNC, &mdev->flags); |
3316 | ns.peer = peer_state.role; | 3941 | ns.peer = peer_state.role; |
3317 | ns.pdsk = real_peer_disk; | 3942 | ns.pdsk = real_peer_disk; |
3318 | ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); | 3943 | ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); |
3319 | if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) | 3944 | if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) |
3320 | ns.disk = mdev->new_state_tmp.disk; | 3945 | ns.disk = mdev->new_state_tmp.disk; |
3321 | cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); | 3946 | cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); |
3322 | if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && | 3947 | if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && |
3323 | drbd_test_flag(mdev, NEW_CUR_UUID)) { | 3948 | test_bit(NEW_CUR_UUID, &mdev->flags)) { |
3324 | /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this | 3949 | /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this |
3325 | for temporal network outages! */ | 3950 | for temporal network outages! */ |
3326 | spin_unlock_irq(&mdev->req_lock); | 3951 | spin_unlock_irq(&mdev->tconn->req_lock); |
3327 | dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); | 3952 | dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); |
3328 | tl_clear(mdev); | 3953 | tl_clear(mdev->tconn); |
3329 | drbd_uuid_new_current(mdev); | 3954 | drbd_uuid_new_current(mdev); |
3330 | drbd_clear_flag(mdev, NEW_CUR_UUID); | 3955 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
3331 | drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); | 3956 | conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); |
3332 | return false; | 3957 | return -EIO; |
3333 | } | 3958 | } |
3334 | rv = _drbd_set_state(mdev, ns, cs_flags, NULL); | 3959 | rv = _drbd_set_state(mdev, ns, cs_flags, NULL); |
3335 | ns = mdev->state; | 3960 | ns = drbd_read_state(mdev); |
3336 | spin_unlock_irq(&mdev->req_lock); | 3961 | spin_unlock_irq(&mdev->tconn->req_lock); |
3337 | 3962 | ||
3338 | if (rv < SS_SUCCESS) { | 3963 | if (rv < SS_SUCCESS) { |
3339 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3964 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3340 | return false; | 3965 | return -EIO; |
3341 | } | 3966 | } |
3342 | 3967 | ||
3343 | if (os.conn > C_WF_REPORT_PARAMS) { | 3968 | if (os.conn > C_WF_REPORT_PARAMS) { |
@@ -3351,16 +3976,21 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3351 | } | 3976 | } |
3352 | } | 3977 | } |
3353 | 3978 | ||
3354 | mdev->net_conf->want_lose = 0; | 3979 | clear_bit(DISCARD_MY_DATA, &mdev->flags); |
3355 | 3980 | ||
3356 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | 3981 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ |
3357 | 3982 | ||
3358 | return true; | 3983 | return 0; |
3359 | } | 3984 | } |
3360 | 3985 | ||
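receive_state() keeps the snapshot/recheck/retry structure: it samples the device state under req_lock, drops the lock for the possibly lengthy evaluation of the peer's packet, and re-takes the lock to commit, jumping back to retry if the state changed in between. The same shape in a self-contained user-space sketch (a pthread mutex standing in for the spinlock):

	#include <pthread.h>

	struct dev {
		pthread_mutex_t lock;
		int state;
	};

	/* Take a snapshot under the lock, evaluate it unlocked (the evaluation
	 * may be slow), then commit only if nothing changed in the meantime. */
	static void update_state(struct dev *d, int (*decide)(int old_state))
	{
		int os, ns;

		pthread_mutex_lock(&d->lock);
	retry:
		os = d->state;                     /* consistent snapshot          */
		pthread_mutex_unlock(&d->lock);

		ns = decide(os);                   /* possibly slow, runs unlocked */

		pthread_mutex_lock(&d->lock);
		if (d->state != os)                /* changed behind our back?     */
			goto retry;
		d->state = ns;                     /* commit                       */
		pthread_mutex_unlock(&d->lock);
	}

	static int bump(int s) { return s + 1; }

	int main(void)
	{
		struct dev d = { PTHREAD_MUTEX_INITIALIZER, 0 };
		update_state(&d, bump);
		return d.state;   /* 1 */
	}
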
3361 | static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3986 | static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) |
3362 | { | 3987 | { |
3363 | struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid; | 3988 | struct drbd_conf *mdev; |
3989 | struct p_rs_uuid *p = pi->data; | ||
3990 | |||
3991 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3992 | if (!mdev) | ||
3993 | return -EIO; | ||
3364 | 3994 | ||
3365 | wait_event(mdev->misc_wait, | 3995 | wait_event(mdev->misc_wait, |
3366 | mdev->state.conn == C_WF_SYNC_UUID || | 3996 | mdev->state.conn == C_WF_SYNC_UUID || |
@@ -3383,7 +4013,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
3383 | } else | 4013 | } else |
3384 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); | 4014 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); |
3385 | 4015 | ||
3386 | return true; | 4016 | return 0; |
3387 | } | 4017 | } |
3388 | 4018 | ||
3389 | /** | 4019 | /** |
@@ -3393,27 +4023,27 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
3393 | * code upon failure. | 4023 | * code upon failure. |
3394 | */ | 4024 | */ |
3395 | static int | 4025 | static int |
3396 | receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | 4026 | receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size, |
3397 | unsigned long *buffer, struct bm_xfer_ctx *c) | 4027 | unsigned long *p, struct bm_xfer_ctx *c) |
3398 | { | 4028 | { |
3399 | unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | 4029 | unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - |
3400 | unsigned want = num_words * sizeof(long); | 4030 | drbd_header_size(mdev->tconn); |
4031 | unsigned int num_words = min_t(size_t, data_size / sizeof(*p), | ||
4032 | c->bm_words - c->word_offset); | ||
4033 | unsigned int want = num_words * sizeof(*p); | ||
3401 | int err; | 4034 | int err; |
3402 | 4035 | ||
3403 | if (want != data_size) { | 4036 | if (want != size) { |
3404 | dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); | 4037 | dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size); |
3405 | return -EIO; | 4038 | return -EIO; |
3406 | } | 4039 | } |
3407 | if (want == 0) | 4040 | if (want == 0) |
3408 | return 0; | 4041 | return 0; |
3409 | err = drbd_recv(mdev, buffer, want); | 4042 | err = drbd_recv_all(mdev->tconn, p, want); |
3410 | if (err != want) { | 4043 | if (err) |
3411 | if (err >= 0) | ||
3412 | err = -EIO; | ||
3413 | return err; | 4044 | return err; |
3414 | } | ||
3415 | 4045 | ||
3416 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); | 4046 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, p); |
3417 | 4047 | ||
3418 | c->word_offset += num_words; | 4048 | c->word_offset += num_words; |
3419 | c->bit_offset = c->word_offset * BITS_PER_LONG; | 4049 | c->bit_offset = c->word_offset * BITS_PER_LONG; |
@@ -3423,6 +4053,21 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | |||
3423 | return 1; | 4053 | return 1; |
3424 | } | 4054 | } |
3425 | 4055 | ||
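receive_bitmap_plain() no longer receives into a caller-supplied page; it derives the per-packet word count from the socket buffer size minus the (now variable) header size and checks that the peer sent exactly that many bytes. The arithmetic, spelled out in plain C (the 4096-byte buffer and 8-byte header are illustrative stand-ins):

	#include <stdio.h>

	#define SOCKET_BUFFER_SIZE 4096u   /* stands in for DRBD_SOCKET_BUFFER_SIZE */

	static unsigned long words_in_this_chunk(unsigned int header_size,
						 unsigned long bm_words,
						 unsigned long word_offset)
	{
		unsigned int data_size = SOCKET_BUFFER_SIZE - header_size;
		unsigned long remaining = bm_words - word_offset;
		unsigned long per_packet = data_size / sizeof(unsigned long);

		return remaining < per_packet ? remaining : per_packet;
	}

	int main(void)
	{
		/* e.g. 8-byte header, 10000-word bitmap, first chunk */
		unsigned long n = words_in_this_chunk(8, 10000, 0);

		printf("%lu words, %zu bytes expected on the wire\n",
		       n, n * sizeof(unsigned long));
		return 0;
	}
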
4056 | static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) | ||
4057 | { | ||
4058 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
4059 | } | ||
4060 | |||
4061 | static int dcbp_get_start(struct p_compressed_bm *p) | ||
4062 | { | ||
4063 | return (p->encoding & 0x80) != 0; | ||
4064 | } | ||
4065 | |||
4066 | static int dcbp_get_pad_bits(struct p_compressed_bm *p) | ||
4067 | { | ||
4068 | return (p->encoding >> 4) & 0x7; | ||
4069 | } | ||
4070 | |||
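The new dcbp_get_code(), dcbp_get_start() and dcbp_get_pad_bits() helpers unpack the single encoding byte of a compressed-bitmap packet: bits 0-3 carry the encoding code, bits 4-6 the number of pad bits in the last byte, and bit 7 the value of the first run. A stand-alone round-trip of that layout (dcbp_pack() is an illustrative helper, not part of the patch):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* encoding byte: [7] start value | [6:4] pad bits | [3:0] code */
	static uint8_t dcbp_pack(int code, int pad_bits, int start)
	{
		return (uint8_t)((code & 0x0f) | ((pad_bits & 0x7) << 4) | (start ? 0x80 : 0));
	}

	int main(void)
	{
		uint8_t enc = dcbp_pack(/*code=*/2, /*pad_bits=*/5, /*start=*/1);

		assert((enc & 0x0f) == 2);         /* dcbp_get_code()     */
		assert(((enc >> 4) & 0x7) == 5);   /* dcbp_get_pad_bits() */
		assert((enc & 0x80) != 0);         /* dcbp_get_start()    */
		printf("encoding byte: 0x%02x\n", enc);   /* 0xd2 */
		return 0;
	}
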
3426 | /** | 4071 | /** |
3427 | * recv_bm_rle_bits | 4072 | * recv_bm_rle_bits |
3428 | * | 4073 | * |
@@ -3432,7 +4077,8 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | |||
3432 | static int | 4077 | static int |
3433 | recv_bm_rle_bits(struct drbd_conf *mdev, | 4078 | recv_bm_rle_bits(struct drbd_conf *mdev, |
3434 | struct p_compressed_bm *p, | 4079 | struct p_compressed_bm *p, |
3435 | struct bm_xfer_ctx *c) | 4080 | struct bm_xfer_ctx *c, |
4081 | unsigned int len) | ||
3436 | { | 4082 | { |
3437 | struct bitstream bs; | 4083 | struct bitstream bs; |
3438 | u64 look_ahead; | 4084 | u64 look_ahead; |
@@ -3440,12 +4086,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
3440 | u64 tmp; | 4086 | u64 tmp; |
3441 | unsigned long s = c->bit_offset; | 4087 | unsigned long s = c->bit_offset; |
3442 | unsigned long e; | 4088 | unsigned long e; |
3443 | int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head)); | 4089 | int toggle = dcbp_get_start(p); |
3444 | int toggle = DCBP_get_start(p); | ||
3445 | int have; | 4090 | int have; |
3446 | int bits; | 4091 | int bits; |
3447 | 4092 | ||
3448 | bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); | 4093 | bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); |
3449 | 4094 | ||
3450 | bits = bitstream_get_bits(&bs, &look_ahead, 64); | 4095 | bits = bitstream_get_bits(&bs, &look_ahead, 64); |
3451 | if (bits < 0) | 4096 | if (bits < 0) |
@@ -3497,17 +4142,18 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
3497 | static int | 4142 | static int |
3498 | decode_bitmap_c(struct drbd_conf *mdev, | 4143 | decode_bitmap_c(struct drbd_conf *mdev, |
3499 | struct p_compressed_bm *p, | 4144 | struct p_compressed_bm *p, |
3500 | struct bm_xfer_ctx *c) | 4145 | struct bm_xfer_ctx *c, |
4146 | unsigned int len) | ||
3501 | { | 4147 | { |
3502 | if (DCBP_get_code(p) == RLE_VLI_Bits) | 4148 | if (dcbp_get_code(p) == RLE_VLI_Bits) |
3503 | return recv_bm_rle_bits(mdev, p, c); | 4149 | return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p)); |
3504 | 4150 | ||
3505 | /* other variants had been implemented for evaluation, | 4151 | /* other variants had been implemented for evaluation, |
3506 | * but have been dropped as this one turned out to be "best" | 4152 | * but have been dropped as this one turned out to be "best" |
3507 | * during all our tests. */ | 4153 | * during all our tests. */ |
3508 | 4154 | ||
3509 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); | 4155 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); |
3510 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | 4156 | conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
3511 | return -EIO; | 4157 | return -EIO; |
3512 | } | 4158 | } |
3513 | 4159 | ||
@@ -3515,11 +4161,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, | |||
3515 | const char *direction, struct bm_xfer_ctx *c) | 4161 | const char *direction, struct bm_xfer_ctx *c) |
3516 | { | 4162 | { |
3517 | /* what would it take to transfer it "plaintext" */ | 4163 | /* what would it take to transfer it "plaintext" */ |
3518 | unsigned plain = sizeof(struct p_header80) * | 4164 | unsigned int header_size = drbd_header_size(mdev->tconn); |
3519 | ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) | 4165 | unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; |
3520 | + c->bm_words * sizeof(long); | 4166 | unsigned int plain = |
3521 | unsigned total = c->bytes[0] + c->bytes[1]; | 4167 | header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + |
3522 | unsigned r; | 4168 | c->bm_words * sizeof(unsigned long); |
4169 | unsigned int total = c->bytes[0] + c->bytes[1]; | ||
4170 | unsigned int r; | ||
3523 | 4171 | ||
3524 | /* total can not be zero. but just in case: */ | 4172 | /* total can not be zero. but just in case: */ |
3525 | if (total == 0) | 4173 | if (total == 0) |
@@ -3553,67 +4201,63 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, | |||
3553 | in order to be agnostic to the 32 vs 64 bits issue. | 4201 | in order to be agnostic to the 32 vs 64 bits issue. |
3554 | 4202 | ||
3555 | returns 0 on failure, 1 if we successfully received it. */ | 4203 | returns 0 on failure, 1 if we successfully received it. */ |
3556 | static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4204 | static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) |
3557 | { | 4205 | { |
4206 | struct drbd_conf *mdev; | ||
3558 | struct bm_xfer_ctx c; | 4207 | struct bm_xfer_ctx c; |
3559 | void *buffer; | ||
3560 | int err; | 4208 | int err; |
3561 | int ok = false; | 4209 | |
3562 | struct p_header80 *h = &mdev->data.rbuf.header.h80; | 4210 | mdev = vnr_to_mdev(tconn, pi->vnr); |
4211 | if (!mdev) | ||
4212 | return -EIO; | ||
3563 | 4213 | ||
3564 | drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); | 4214 | drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); |
3565 | /* you are supposed to send additional out-of-sync information | 4215 | /* you are supposed to send additional out-of-sync information |
3566 | * if you actually set bits during this phase */ | 4216 | * if you actually set bits during this phase */ |
3567 | 4217 | ||
3568 | /* maybe we should use some per thread scratch page, | ||
3569 | * and allocate that during initial device creation? */ | ||
3570 | buffer = (unsigned long *) __get_free_page(GFP_NOIO); | ||
3571 | if (!buffer) { | ||
3572 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
3573 | goto out; | ||
3574 | } | ||
3575 | |||
3576 | c = (struct bm_xfer_ctx) { | 4218 | c = (struct bm_xfer_ctx) { |
3577 | .bm_bits = drbd_bm_bits(mdev), | 4219 | .bm_bits = drbd_bm_bits(mdev), |
3578 | .bm_words = drbd_bm_words(mdev), | 4220 | .bm_words = drbd_bm_words(mdev), |
3579 | }; | 4221 | }; |
3580 | 4222 | ||
3581 | for(;;) { | 4223 | for(;;) { |
3582 | if (cmd == P_BITMAP) { | 4224 | if (pi->cmd == P_BITMAP) |
3583 | err = receive_bitmap_plain(mdev, data_size, buffer, &c); | 4225 | err = receive_bitmap_plain(mdev, pi->size, pi->data, &c); |
3584 | } else if (cmd == P_COMPRESSED_BITMAP) { | 4226 | else if (pi->cmd == P_COMPRESSED_BITMAP) { |
3585 | /* MAYBE: sanity check that we speak proto >= 90, | 4227 | /* MAYBE: sanity check that we speak proto >= 90, |
3586 | * and the feature is enabled! */ | 4228 | * and the feature is enabled! */ |
3587 | struct p_compressed_bm *p; | 4229 | struct p_compressed_bm *p = pi->data; |
3588 | 4230 | ||
3589 | if (data_size > BM_PACKET_PAYLOAD_BYTES) { | 4231 | if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) { |
3590 | dev_err(DEV, "ReportCBitmap packet too large\n"); | 4232 | dev_err(DEV, "ReportCBitmap packet too large\n"); |
4233 | err = -EIO; | ||
3591 | goto out; | 4234 | goto out; |
3592 | } | 4235 | } |
3593 | /* use the page buff */ | 4236 | if (pi->size <= sizeof(*p)) { |
3594 | p = buffer; | 4237 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size); |
3595 | memcpy(p, h, sizeof(*h)); | 4238 | err = -EIO; |
3596 | if (drbd_recv(mdev, p->head.payload, data_size) != data_size) | ||
3597 | goto out; | ||
3598 | if (data_size <= (sizeof(*p) - sizeof(p->head))) { | ||
3599 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); | ||
3600 | goto out; | 4239 | goto out; |
3601 | } | 4240 | } |
3602 | err = decode_bitmap_c(mdev, p, &c); | 4241 | err = drbd_recv_all(mdev->tconn, p, pi->size); |
4242 | if (err) | ||
4243 | goto out; | ||
4244 | err = decode_bitmap_c(mdev, p, &c, pi->size); | ||
3603 | } else { | 4245 | } else { |
3604 | dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); | 4246 | dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); |
4247 | err = -EIO; | ||
3605 | goto out; | 4248 | goto out; |
3606 | } | 4249 | } |
3607 | 4250 | ||
3608 | c.packets[cmd == P_BITMAP]++; | 4251 | c.packets[pi->cmd == P_BITMAP]++; |
3609 | c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; | 4252 | c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size; |
3610 | 4253 | ||
3611 | if (err <= 0) { | 4254 | if (err <= 0) { |
3612 | if (err < 0) | 4255 | if (err < 0) |
3613 | goto out; | 4256 | goto out; |
3614 | break; | 4257 | break; |
3615 | } | 4258 | } |
3616 | if (!drbd_recv_header(mdev, &cmd, &data_size)) | 4259 | err = drbd_recv_header(mdev->tconn, pi); |
4260 | if (err) | ||
3617 | goto out; | 4261 | goto out; |
3618 | } | 4262 | } |
3619 | 4263 | ||
@@ -3622,8 +4266,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne | |||
3622 | if (mdev->state.conn == C_WF_BITMAP_T) { | 4266 | if (mdev->state.conn == C_WF_BITMAP_T) { |
3623 | enum drbd_state_rv rv; | 4267 | enum drbd_state_rv rv; |
3624 | 4268 | ||
3625 | ok = !drbd_send_bitmap(mdev); | 4269 | err = drbd_send_bitmap(mdev); |
3626 | if (!ok) | 4270 | if (err) |
3627 | goto out; | 4271 | goto out; |
3628 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ | 4272 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ |
3629 | rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | 4273 | rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); |
@@ -3634,47 +4278,40 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne | |||
3634 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", | 4278 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", |
3635 | drbd_conn_str(mdev->state.conn)); | 4279 | drbd_conn_str(mdev->state.conn)); |
3636 | } | 4280 | } |
4281 | err = 0; | ||
3637 | 4282 | ||
3638 | ok = true; | ||
3639 | out: | 4283 | out: |
3640 | drbd_bm_unlock(mdev); | 4284 | drbd_bm_unlock(mdev); |
3641 | if (ok && mdev->state.conn == C_WF_BITMAP_S) | 4285 | if (!err && mdev->state.conn == C_WF_BITMAP_S) |
3642 | drbd_start_resync(mdev, C_SYNC_SOURCE); | 4286 | drbd_start_resync(mdev, C_SYNC_SOURCE); |
3643 | free_page((unsigned long) buffer); | 4287 | return err; |
3644 | return ok; | ||
3645 | } | 4288 | } |
3646 | 4289 | ||
3647 | static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4290 | static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi) |
3648 | { | 4291 | { |
3649 | /* TODO zero copy sink :) */ | 4292 | conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n", |
3650 | static char sink[128]; | 4293 | pi->cmd, pi->size); |
3651 | int size, want, r; | ||
3652 | 4294 | ||
3653 | dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", | 4295 | return ignore_remaining_packet(tconn, pi); |
3654 | cmd, data_size); | ||
3655 | |||
3656 | size = data_size; | ||
3657 | while (size > 0) { | ||
3658 | want = min_t(int, size, sizeof(sink)); | ||
3659 | r = drbd_recv(mdev, sink, want); | ||
3660 | ERR_IF(r <= 0) break; | ||
3661 | size -= r; | ||
3662 | } | ||
3663 | return size == 0; | ||
3664 | } | 4296 | } |
3665 | 4297 | ||
3666 | static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4298 | static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi) |
3667 | { | 4299 | { |
3668 | /* Make sure we've acked all the TCP data associated | 4300 | /* Make sure we've acked all the TCP data associated |
3669 | * with the data requests being unplugged */ | 4301 | * with the data requests being unplugged */ |
3670 | drbd_tcp_quickack(mdev->data.socket); | 4302 | drbd_tcp_quickack(tconn->data.socket); |
3671 | 4303 | ||
3672 | return true; | 4304 | return 0; |
3673 | } | 4305 | } |
3674 | 4306 | ||
3675 | static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4307 | static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi) |
3676 | { | 4308 | { |
3677 | struct p_block_desc *p = &mdev->data.rbuf.block_desc; | 4309 | struct drbd_conf *mdev; |
4310 | struct p_block_desc *p = pi->data; | ||
4311 | |||
4312 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4313 | if (!mdev) | ||
4314 | return -EIO; | ||
3678 | 4315 | ||
3679 | switch (mdev->state.conn) { | 4316 | switch (mdev->state.conn) { |
3680 | case C_WF_SYNC_UUID: | 4317 | case C_WF_SYNC_UUID: |
@@ -3688,15 +4325,13 @@ static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
3688 | 4325 | ||
3689 | drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); | 4326 | drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); |
3690 | 4327 | ||
3691 | return true; | 4328 | return 0; |
3692 | } | 4329 | } |
3693 | 4330 | ||
3694 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); | ||
3695 | |||
3696 | struct data_cmd { | 4331 | struct data_cmd { |
3697 | int expect_payload; | 4332 | int expect_payload; |
3698 | size_t pkt_size; | 4333 | size_t pkt_size; |
3699 | drbd_cmd_handler_f function; | 4334 | int (*fn)(struct drbd_tconn *, struct packet_info *); |
3700 | }; | 4335 | }; |
3701 | 4336 | ||
3702 | static struct data_cmd drbd_cmd_handler[] = { | 4337 | static struct data_cmd drbd_cmd_handler[] = { |
@@ -3704,13 +4339,13 @@ static struct data_cmd drbd_cmd_handler[] = { | |||
3704 | [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, | 4339 | [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, |
3705 | [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , | 4340 | [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , |
3706 | [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , | 4341 | [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , |
3707 | [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , | 4342 | [P_BITMAP] = { 1, 0, receive_bitmap } , |
3708 | [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , | 4343 | [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , |
3709 | [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote }, | 4344 | [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, |
3710 | [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, | 4345 | [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, |
3711 | [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, | 4346 | [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, |
3712 | [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam }, | 4347 | [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, |
3713 | [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam }, | 4348 | [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, |
3714 | [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, | 4349 | [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, |
3715 | [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, | 4350 | [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, |
3716 | [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, | 4351 | [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, |
@@ -3722,124 +4357,75 @@ static struct data_cmd drbd_cmd_handler[] = { | |||
3722 | [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, | 4357 | [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, |
3723 | [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, | 4358 | [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, |
3724 | [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, | 4359 | [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, |
3725 | /* anything missing from this table is in | 4360 | [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, |
3726 | * the asender_tbl, see get_asender_cmd */ | 4361 | [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, |
3727 | [P_MAX_CMD] = { 0, 0, NULL }, | ||
3728 | }; | 4362 | }; |
3729 | 4363 | ||
3730 | /* All handler functions that expect a sub-header get that sub-heder in | 4364 | static void drbdd(struct drbd_tconn *tconn) |
3731 | mdev->data.rbuf.header.head.payload. | ||
3732 | |||
3733 | Usually in mdev->data.rbuf.header.head the callback can find the usual | ||
3734 | p_header, but they may not rely on that. Since there is also p_header95 ! | ||
3735 | */ | ||
3736 | |||
3737 | static void drbdd(struct drbd_conf *mdev) | ||
3738 | { | 4365 | { |
3739 | union p_header *header = &mdev->data.rbuf.header; | 4366 | struct packet_info pi; |
3740 | unsigned int packet_size; | ||
3741 | enum drbd_packets cmd; | ||
3742 | size_t shs; /* sub header size */ | 4367 | size_t shs; /* sub header size */ |
3743 | int rv; | 4368 | int err; |
4369 | |||
4370 | while (get_t_state(&tconn->receiver) == RUNNING) { | ||
4371 | struct data_cmd *cmd; | ||
3744 | 4372 | ||
3745 | while (get_t_state(&mdev->receiver) == Running) { | 4373 | drbd_thread_current_set_cpu(&tconn->receiver); |
3746 | drbd_thread_current_set_cpu(mdev); | 4374 | if (drbd_recv_header(tconn, &pi)) |
3747 | if (!drbd_recv_header(mdev, &cmd, &packet_size)) | ||
3748 | goto err_out; | 4375 | goto err_out; |
3749 | 4376 | ||
3750 | if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) { | 4377 | cmd = &drbd_cmd_handler[pi.cmd]; |
3751 | dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size); | 4378 | if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { |
4379 | conn_err(tconn, "Unexpected data packet %s (0x%04x)", | ||
4380 | cmdname(pi.cmd), pi.cmd); | ||
3752 | goto err_out; | 4381 | goto err_out; |
3753 | } | 4382 | } |
3754 | 4383 | ||
3755 | shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); | 4384 | shs = cmd->pkt_size; |
3756 | if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { | 4385 | if (pi.size > shs && !cmd->expect_payload) { |
3757 | dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); | 4386 | conn_err(tconn, "No payload expected %s l:%d\n", |
4387 | cmdname(pi.cmd), pi.size); | ||
3758 | goto err_out; | 4388 | goto err_out; |
3759 | } | 4389 | } |
3760 | 4390 | ||
3761 | if (shs) { | 4391 | if (shs) { |
3762 | rv = drbd_recv(mdev, &header->h80.payload, shs); | 4392 | err = drbd_recv_all_warn(tconn, pi.data, shs); |
3763 | if (unlikely(rv != shs)) { | 4393 | if (err) |
3764 | if (!signal_pending(current)) | ||
3765 | dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv); | ||
3766 | goto err_out; | 4394 | goto err_out; |
3767 | } | 4395 | pi.size -= shs; |
3768 | } | 4396 | } |
3769 | 4397 | ||
3770 | rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); | 4398 | err = cmd->fn(tconn, &pi); |
3771 | 4399 | if (err) { | |
3772 | if (unlikely(!rv)) { | 4400 | conn_err(tconn, "error receiving %s, e: %d l: %d!\n", |
3773 | dev_err(DEV, "error receiving %s, l: %d!\n", | 4401 | cmdname(pi.cmd), err, pi.size); |
3774 | cmdname(cmd), packet_size); | ||
3775 | goto err_out; | 4402 | goto err_out; |
3776 | } | 4403 | } |
3777 | } | 4404 | } |
4405 | return; | ||
3778 | 4406 | ||
3779 | if (0) { | 4407 | err_out: |
3780 | err_out: | 4408 | conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
3781 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3782 | } | ||
3783 | /* If we leave here, we probably want to update at least the | ||
3784 | * "Connected" indicator on stable storage. Do so explicitly here. */ | ||
3785 | drbd_md_sync(mdev); | ||
3786 | } | 4409 | } |
3787 | 4410 | ||
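drbdd() is now the connection-wide receive loop: read a header, look the command up in drbd_cmd_handler[], reject unknown commands and unexpected payloads, pull in the fixed-size part of the packet, then call the handler, which returns 0 or a negative errno. The shape of such a table-driven dispatcher, reduced to plain C (packet types and handlers below are illustrative):

	#include <stddef.h>
	#include <stdio.h>

	struct packet_info { unsigned int cmd, size; void *data; };

	struct data_cmd {
		int expect_payload;                          /* may a variable payload follow?  */
		size_t pkt_size;                             /* fixed part to read before fn()  */
		int (*fn)(struct packet_info *);
	};

	static int handle_ping(struct packet_info *pi)  { (void)pi; return 0; }
	static int handle_data(struct packet_info *pi)  { (void)pi; return 0; }

	enum { P_PING, P_DATA };

	static const struct data_cmd handlers[] = {
		[P_PING] = { 0, 0,                 handle_ping },
		[P_DATA] = { 1, sizeof(long long), handle_data },
	};

	static int dispatch(struct packet_info *pi)
	{
		const struct data_cmd *cmd;

		if (pi->cmd >= sizeof(handlers) / sizeof(handlers[0]) || !handlers[pi->cmd].fn)
			return -1;                           /* unknown packet: protocol error  */
		cmd = &handlers[pi->cmd];
		if (pi->size > cmd->pkt_size && !cmd->expect_payload)
			return -1;                           /* payload where none is allowed   */
		/* the real loop would now read cmd->pkt_size bytes into pi->data */
		return cmd->fn(pi);
	}

	int main(void)
	{
		struct packet_info pi = { .cmd = P_PING, .size = 0 };

		printf("%d\n", dispatch(&pi));   /*  0 */
		pi.size = 16;
		printf("%d\n", dispatch(&pi));   /* -1: P_PING must not carry a payload */
		return 0;
	}
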
3788 | void drbd_flush_workqueue(struct drbd_conf *mdev) | 4411 | void conn_flush_workqueue(struct drbd_tconn *tconn) |
3789 | { | 4412 | { |
3790 | struct drbd_wq_barrier barr; | 4413 | struct drbd_wq_barrier barr; |
3791 | 4414 | ||
3792 | barr.w.cb = w_prev_work_done; | 4415 | barr.w.cb = w_prev_work_done; |
4416 | barr.w.tconn = tconn; | ||
3793 | init_completion(&barr.done); | 4417 | init_completion(&barr.done); |
3794 | drbd_queue_work(&mdev->data.work, &barr.w); | 4418 | drbd_queue_work(&tconn->sender_work, &barr.w); |
3795 | wait_for_completion(&barr.done); | 4419 | wait_for_completion(&barr.done); |
3796 | } | 4420 | } |
3797 | 4421 | ||
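conn_flush_workqueue() drains the sender work queue by queuing a barrier item whose only job is to complete a completion, then sleeping until it has run; since the queue is served in order by a single worker, everything queued before the barrier has finished by then. The same idea expressed with a standard ordered workqueue (a sketch, not the driver's private queue):

	/* Sketch only: flush a FIFO work queue by waiting for a marker item.
	 * Valid as a "flush" only if the queue is processed in order by one
	 * worker, which is the case for DRBD's sender work queue. */
	#include <linux/completion.h>
	#include <linux/workqueue.h>

	struct flush_barrier {
		struct work_struct w;
		struct completion done;
	};

	static void barrier_fn(struct work_struct *w)
	{
		struct flush_barrier *b = container_of(w, struct flush_barrier, w);

		complete(&b->done);      /* everything queued before us has run */
	}

	static void my_flush(struct workqueue_struct *wq)
	{
		struct flush_barrier b;

		INIT_WORK(&b.w, barrier_fn);
		init_completion(&b.done);
		queue_work(wq, &b.w);
		wait_for_completion(&b.done);
	}
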
3798 | void drbd_free_tl_hash(struct drbd_conf *mdev) | 4422 | static void conn_disconnect(struct drbd_tconn *tconn) |
3799 | { | 4423 | { |
3800 | struct hlist_head *h; | 4424 | struct drbd_conf *mdev; |
3801 | 4425 | enum drbd_conns oc; | |
3802 | spin_lock_irq(&mdev->req_lock); | 4426 | int vnr; |
3803 | 4427 | ||
3804 | if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { | 4428 | if (tconn->cstate == C_STANDALONE) |
3805 | spin_unlock_irq(&mdev->req_lock); | ||
3806 | return; | ||
3807 | } | ||
3808 | /* paranoia code */ | ||
3809 | for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) | ||
3810 | if (h->first) | ||
3811 | dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", | ||
3812 | (int)(h - mdev->ee_hash), h->first); | ||
3813 | kfree(mdev->ee_hash); | ||
3814 | mdev->ee_hash = NULL; | ||
3815 | mdev->ee_hash_s = 0; | ||
3816 | |||
3817 | /* We may not have had the chance to wait for all locally pending | ||
3818 | * application requests. The hlist_add_fake() prevents access after | ||
3819 | * free on master bio completion. */ | ||
3820 | for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { | ||
3821 | struct drbd_request *req; | ||
3822 | struct hlist_node *pos, *n; | ||
3823 | hlist_for_each_entry_safe(req, pos, n, h, collision) { | ||
3824 | hlist_del_init(&req->collision); | ||
3825 | hlist_add_fake(&req->collision); | ||
3826 | } | ||
3827 | } | ||
3828 | |||
3829 | kfree(mdev->tl_hash); | ||
3830 | mdev->tl_hash = NULL; | ||
3831 | mdev->tl_hash_s = 0; | ||
3832 | spin_unlock_irq(&mdev->req_lock); | ||
3833 | } | ||
3834 | |||
3835 | static void drbd_disconnect(struct drbd_conf *mdev) | ||
3836 | { | ||
3837 | enum drbd_fencing_p fp; | ||
3838 | union drbd_state os, ns; | ||
3839 | int rv = SS_UNKNOWN_ERROR; | ||
3840 | unsigned int i; | ||
3841 | |||
3842 | if (mdev->state.conn == C_STANDALONE) | ||
3843 | return; | 4429 | return; |
3844 | 4430 | ||
3845 | /* We are about to start the cleanup after connection loss. | 4431 | /* We are about to start the cleanup after connection loss. |
@@ -3847,18 +4433,54 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3847 | * Usually we should be in some network failure state already, | 4433 | * Usually we should be in some network failure state already, |
3848 | * but just in case we are not, we fix it up here. | 4434 | * but just in case we are not, we fix it up here. |
3849 | */ | 4435 | */ |
3850 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | 4436 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
3851 | 4437 | ||
3852 | /* asender does not clean up anything. it must not interfere, either */ | 4438 | /* asender does not clean up anything. it must not interfere, either */ |
3853 | drbd_thread_stop(&mdev->asender); | 4439 | drbd_thread_stop(&tconn->asender); |
3854 | drbd_free_sock(mdev); | 4440 | drbd_free_sock(tconn); |
4441 | |||
4442 | rcu_read_lock(); | ||
4443 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
4444 | kref_get(&mdev->kref); | ||
4445 | rcu_read_unlock(); | ||
4446 | drbd_disconnected(mdev); | ||
4447 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
4448 | rcu_read_lock(); | ||
4449 | } | ||
4450 | rcu_read_unlock(); | ||
4451 | |||
4452 | if (!list_empty(&tconn->current_epoch->list)) | ||
4453 | conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n"); | ||
4454 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | ||
4455 | atomic_set(&tconn->current_epoch->epoch_size, 0); | ||
4456 | tconn->send.seen_any_write_yet = false; | ||
4457 | |||
4458 | conn_info(tconn, "Connection closed\n"); | ||
4459 | |||
4460 | if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN) | ||
4461 | conn_try_outdate_peer_async(tconn); | ||
4462 | |||
4463 | spin_lock_irq(&tconn->req_lock); | ||
4464 | oc = tconn->cstate; | ||
4465 | if (oc >= C_UNCONNECTED) | ||
4466 | _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ||
4467 | |||
4468 | spin_unlock_irq(&tconn->req_lock); | ||
4469 | |||
4470 | if (oc == C_DISCONNECTING) | ||
4471 | conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); | ||
4472 | } | ||
4473 | |||
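conn_disconnect() above walks tconn->volumes under rcu_read_lock(), but drbd_disconnected() can sleep, so each device is pinned with kref_get() before the RCU read side is dropped and released with kref_put() afterwards. A rough userspace analogue of that pin-then-drop-the-lock pattern follows; the rwlock standing in for RCU and the plain atomic counter standing in for struct kref are assumptions for illustration only.

/* Sketch: take a reference before releasing the read lock, because the
 * per-item work may block. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct device {
	atomic_int refs;
	int id;
};

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct device devices[3] = { {1, 0}, {1, 1}, {1, 2} };

static void device_put(struct device *d)
{
	if (atomic_fetch_sub(&d->refs, 1) == 1)
		printf("device %d destroyed\n", d->id);
}

static void blocking_cleanup(struct device *d)
{
	/* may sleep; must not run under the read lock */
	printf("cleaning up device %d\n", d->id);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct device *d;

		pthread_rwlock_rdlock(&table_lock);	/* rcu_read_lock() analogue */
		d = &devices[i];
		atomic_fetch_add(&d->refs, 1);		/* kref_get() analogue */
		pthread_rwlock_unlock(&table_lock);	/* rcu_read_unlock() analogue */

		blocking_cleanup(d);			/* drbd_disconnected() analogue */
		device_put(d);				/* kref_put() analogue */
	}

	/* drop the table's own reference, which finally destroys each device */
	for (int i = 0; i < 3; i++)
		device_put(&devices[i]);
	return 0;
}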
4474 | static int drbd_disconnected(struct drbd_conf *mdev) | ||
4475 | { | ||
4476 | unsigned int i; | ||
3855 | 4477 | ||
3856 | /* wait for current activity to cease. */ | 4478 | /* wait for current activity to cease. */ |
3857 | spin_lock_irq(&mdev->req_lock); | 4479 | spin_lock_irq(&mdev->tconn->req_lock); |
3858 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | 4480 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); |
3859 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); | 4481 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); |
3860 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); | 4482 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); |
3861 | spin_unlock_irq(&mdev->req_lock); | 4483 | spin_unlock_irq(&mdev->tconn->req_lock); |
3862 | 4484 | ||
3863 | /* We do not have data structures that would allow us to | 4485 | /* We do not have data structures that would allow us to |
3864 | * get the rs_pending_cnt down to 0 again. | 4486 | * get the rs_pending_cnt down to 0 again. |
@@ -3876,7 +4498,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3876 | atomic_set(&mdev->rs_pending_cnt, 0); | 4498 | atomic_set(&mdev->rs_pending_cnt, 0); |
3877 | wake_up(&mdev->misc_wait); | 4499 | wake_up(&mdev->misc_wait); |
3878 | 4500 | ||
3879 | /* make sure syncer is stopped and w_resume_next_sg queued */ | ||
3880 | del_timer_sync(&mdev->resync_timer); | 4501 | del_timer_sync(&mdev->resync_timer); |
3881 | resync_timer_fn((unsigned long)mdev); | 4502 | resync_timer_fn((unsigned long)mdev); |
3882 | 4503 | ||
@@ -3885,53 +4506,28 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3885 | * to be "canceled" */ | 4506 | * to be "canceled" */ |
3886 | drbd_flush_workqueue(mdev); | 4507 | drbd_flush_workqueue(mdev); |
3887 | 4508 | ||
3888 | /* This also does reclaim_net_ee(). If we do this too early, we might | 4509 | drbd_finish_peer_reqs(mdev); |
3889 | * miss some resync ee and pages.*/ | 4510 | |
3890 | drbd_process_done_ee(mdev); | 4511 | /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() |
4512 | might have queued work again. The one before drbd_finish_peer_reqs() is | ||
4513 | necessary to reclaim net_ee in drbd_finish_peer_reqs(). */ | ||
4514 | drbd_flush_workqueue(mdev); | ||
4515 | |||
4516 | /* need to do it again, drbd_finish_peer_reqs() may have populated it | ||
4517 | * again via drbd_try_clear_on_disk_bm(). */ | ||
4518 | drbd_rs_cancel_all(mdev); | ||
3891 | 4519 | ||
3892 | kfree(mdev->p_uuid); | 4520 | kfree(mdev->p_uuid); |
3893 | mdev->p_uuid = NULL; | 4521 | mdev->p_uuid = NULL; |
3894 | 4522 | ||
3895 | if (!is_susp(mdev->state)) | 4523 | if (!drbd_suspended(mdev)) |
3896 | tl_clear(mdev); | 4524 | tl_clear(mdev->tconn); |
3897 | |||
3898 | dev_info(DEV, "Connection closed\n"); | ||
3899 | 4525 | ||
3900 | drbd_md_sync(mdev); | 4526 | drbd_md_sync(mdev); |
3901 | 4527 | ||
3902 | fp = FP_DONT_CARE; | ||
3903 | if (get_ldev(mdev)) { | ||
3904 | fp = mdev->ldev->dc.fencing; | ||
3905 | put_ldev(mdev); | ||
3906 | } | ||
3907 | |||
3908 | if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) | ||
3909 | drbd_try_outdate_peer_async(mdev); | ||
3910 | |||
3911 | spin_lock_irq(&mdev->req_lock); | ||
3912 | os = mdev->state; | ||
3913 | if (os.conn >= C_UNCONNECTED) { | ||
3914 | /* Do not restart in case we are C_DISCONNECTING */ | ||
3915 | ns = os; | ||
3916 | ns.conn = C_UNCONNECTED; | ||
3917 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
3918 | } | ||
3919 | spin_unlock_irq(&mdev->req_lock); | ||
3920 | |||
3921 | if (os.conn == C_DISCONNECTING) { | ||
3922 | wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); | ||
3923 | |||
3924 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3925 | mdev->cram_hmac_tfm = NULL; | ||
3926 | |||
3927 | kfree(mdev->net_conf); | ||
3928 | mdev->net_conf = NULL; | ||
3929 | drbd_request_state(mdev, NS(conn, C_STANDALONE)); | ||
3930 | } | ||
3931 | |||
3932 | /* serialize with bitmap writeout triggered by the state change, | 4528 | /* serialize with bitmap writeout triggered by the state change, |
3933 | * if any. */ | 4529 | * if any. */ |
3934 | wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO)); | 4530 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
3935 | 4531 | ||
3936 | /* tcp_close and release of sendpage pages can be deferred. I don't | 4532 | /* tcp_close and release of sendpage pages can be deferred. I don't |
3937 | * want to use SO_LINGER, because apparently it can be deferred for | 4533 | * want to use SO_LINGER, because apparently it can be deferred for |
@@ -3940,7 +4536,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3940 | * Actually we don't care for exactly when the network stack does its | 4536 | * Actually we don't care for exactly when the network stack does its |
3941 | * put_page(), but release our reference on these pages right here. | 4537 | * put_page(), but release our reference on these pages right here. |
3942 | */ | 4538 | */ |
3943 | i = drbd_release_ee(mdev, &mdev->net_ee); | 4539 | i = drbd_free_peer_reqs(mdev, &mdev->net_ee); |
3944 | if (i) | 4540 | if (i) |
3945 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); | 4541 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); |
3946 | i = atomic_read(&mdev->pp_in_use_by_net); | 4542 | i = atomic_read(&mdev->pp_in_use_by_net); |
@@ -3955,9 +4551,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3955 | D_ASSERT(list_empty(&mdev->sync_ee)); | 4551 | D_ASSERT(list_empty(&mdev->sync_ee)); |
3956 | D_ASSERT(list_empty(&mdev->done_ee)); | 4552 | D_ASSERT(list_empty(&mdev->done_ee)); |
3957 | 4553 | ||
3958 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | 4554 | return 0; |
3959 | atomic_set(&mdev->current_epoch->epoch_size, 0); | ||
3960 | D_ASSERT(list_empty(&mdev->current_epoch->list)); | ||
3961 | } | 4555 | } |
3962 | 4556 | ||
3963 | /* | 4557 | /* |
@@ -3969,29 +4563,19 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3969 | * | 4563 | * |
3970 | * for now, they are expected to be zero, but ignored. | 4564 | * for now, they are expected to be zero, but ignored. |
3971 | */ | 4565 | */ |
3972 | static int drbd_send_handshake(struct drbd_conf *mdev) | 4566 | static int drbd_send_features(struct drbd_tconn *tconn) |
3973 | { | 4567 | { |
3974 | /* ASSERT current == mdev->receiver ... */ | 4568 | struct drbd_socket *sock; |
3975 | struct p_handshake *p = &mdev->data.sbuf.handshake; | 4569 | struct p_connection_features *p; |
3976 | int ok; | ||
3977 | |||
3978 | if (mutex_lock_interruptible(&mdev->data.mutex)) { | ||
3979 | dev_err(DEV, "interrupted during initial handshake\n"); | ||
3980 | return 0; /* interrupted. not ok. */ | ||
3981 | } | ||
3982 | |||
3983 | if (mdev->data.socket == NULL) { | ||
3984 | mutex_unlock(&mdev->data.mutex); | ||
3985 | return 0; | ||
3986 | } | ||
3987 | 4570 | ||
4571 | sock = &tconn->data; | ||
4572 | p = conn_prepare_command(tconn, sock); | ||
4573 | if (!p) | ||
4574 | return -EIO; | ||
3988 | memset(p, 0, sizeof(*p)); | 4575 | memset(p, 0, sizeof(*p)); |
3989 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); | 4576 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); |
3990 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); | 4577 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); |
3991 | ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, | 4578 | return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); |
3992 | (struct p_header80 *)p, sizeof(*p), 0 ); | ||
3993 | mutex_unlock(&mdev->data.mutex); | ||
3994 | return ok; | ||
3995 | } | 4579 | } |
3996 | 4580 | ||
3997 | /* | 4581 | /* |
@@ -4001,42 +4585,38 @@ static int drbd_send_handshake(struct drbd_conf *mdev) | |||
4001 | * -1 peer talks different language, | 4585 | * -1 peer talks different language, |
4002 | * no point in trying again, please go standalone. | 4586 | * no point in trying again, please go standalone. |
4003 | */ | 4587 | */ |
4004 | static int drbd_do_handshake(struct drbd_conf *mdev) | 4588 | static int drbd_do_features(struct drbd_tconn *tconn) |
4005 | { | 4589 | { |
4006 | /* ASSERT current == mdev->receiver ... */ | 4590 | /* ASSERT current == tconn->receiver ... */ |
4007 | struct p_handshake *p = &mdev->data.rbuf.handshake; | 4591 | struct p_connection_features *p; |
4008 | const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80); | 4592 | const int expect = sizeof(struct p_connection_features); |
4009 | unsigned int length; | 4593 | struct packet_info pi; |
4010 | enum drbd_packets cmd; | 4594 | int err; |
4011 | int rv; | ||
4012 | 4595 | ||
4013 | rv = drbd_send_handshake(mdev); | 4596 | err = drbd_send_features(tconn); |
4014 | if (!rv) | 4597 | if (err) |
4015 | return 0; | 4598 | return 0; |
4016 | 4599 | ||
4017 | rv = drbd_recv_header(mdev, &cmd, &length); | 4600 | err = drbd_recv_header(tconn, &pi); |
4018 | if (!rv) | 4601 | if (err) |
4019 | return 0; | 4602 | return 0; |
4020 | 4603 | ||
4021 | if (cmd != P_HAND_SHAKE) { | 4604 | if (pi.cmd != P_CONNECTION_FEATURES) { |
4022 | dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", | 4605 | conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", |
4023 | cmdname(cmd), cmd); | 4606 | cmdname(pi.cmd), pi.cmd); |
4024 | return -1; | 4607 | return -1; |
4025 | } | 4608 | } |
4026 | 4609 | ||
4027 | if (length != expect) { | 4610 | if (pi.size != expect) { |
4028 | dev_err(DEV, "expected HandShake length: %u, received: %u\n", | 4611 | conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n", |
4029 | expect, length); | 4612 | expect, pi.size); |
4030 | return -1; | 4613 | return -1; |
4031 | } | 4614 | } |
4032 | 4615 | ||
4033 | rv = drbd_recv(mdev, &p->head.payload, expect); | 4616 | p = pi.data; |
4034 | 4617 | err = drbd_recv_all_warn(tconn, p, expect); | |
4035 | if (rv != expect) { | 4618 | if (err) |
4036 | if (!signal_pending(current)) | ||
4037 | dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv); | ||
4038 | return 0; | 4619 | return 0; |
4039 | } | ||
4040 | 4620 | ||
4041 | p->protocol_min = be32_to_cpu(p->protocol_min); | 4621 | p->protocol_min = be32_to_cpu(p->protocol_min); |
4042 | p->protocol_max = be32_to_cpu(p->protocol_max); | 4622 | p->protocol_max = be32_to_cpu(p->protocol_max); |
@@ -4047,15 +4627,15 @@ static int drbd_do_handshake(struct drbd_conf *mdev) | |||
4047 | PRO_VERSION_MIN > p->protocol_max) | 4627 | PRO_VERSION_MIN > p->protocol_max) |
4048 | goto incompat; | 4628 | goto incompat; |
4049 | 4629 | ||
4050 | mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); | 4630 | tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); |
4051 | 4631 | ||
4052 | dev_info(DEV, "Handshake successful: " | 4632 | conn_info(tconn, "Handshake successful: " |
4053 | "Agreed network protocol version %d\n", mdev->agreed_pro_version); | 4633 | "Agreed network protocol version %d\n", tconn->agreed_pro_version); |
4054 | 4634 | ||
4055 | return 1; | 4635 | return 1; |
4056 | 4636 | ||
4057 | incompat: | 4637 | incompat: |
4058 | dev_err(DEV, "incompatible DRBD dialects: " | 4638 | conn_err(tconn, "incompatible DRBD dialects: " |
4059 | "I support %d-%d, peer supports %d-%d\n", | 4639 | "I support %d-%d, peer supports %d-%d\n", |
4060 | PRO_VERSION_MIN, PRO_VERSION_MAX, | 4640 | PRO_VERSION_MIN, PRO_VERSION_MAX, |
4061 | p->protocol_min, p->protocol_max); | 4641 | p->protocol_min, p->protocol_max); |
@@ -4063,7 +4643,7 @@ static int drbd_do_handshake(struct drbd_conf *mdev) | |||
4063 | } | 4643 | } |
4064 | 4644 | ||
4065 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | 4645 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) |
4066 | static int drbd_do_auth(struct drbd_conf *mdev) | 4646 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4067 | { | 4647 | { |
4068 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); | 4648 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); |
4069 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | 4649 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); |
@@ -4078,121 +4658,139 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4078 | -1 - auth failed, don't try again. | 4658 | -1 - auth failed, don't try again. |
4079 | */ | 4659 | */ |
4080 | 4660 | ||
4081 | static int drbd_do_auth(struct drbd_conf *mdev) | 4661 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4082 | { | 4662 | { |
4663 | struct drbd_socket *sock; | ||
4083 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ | 4664 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ |
4084 | struct scatterlist sg; | 4665 | struct scatterlist sg; |
4085 | char *response = NULL; | 4666 | char *response = NULL; |
4086 | char *right_response = NULL; | 4667 | char *right_response = NULL; |
4087 | char *peers_ch = NULL; | 4668 | char *peers_ch = NULL; |
4088 | unsigned int key_len = strlen(mdev->net_conf->shared_secret); | 4669 | unsigned int key_len; |
4670 | char secret[SHARED_SECRET_MAX]; /* 64 byte */ | ||
4089 | unsigned int resp_size; | 4671 | unsigned int resp_size; |
4090 | struct hash_desc desc; | 4672 | struct hash_desc desc; |
4091 | enum drbd_packets cmd; | 4673 | struct packet_info pi; |
4092 | unsigned int length; | 4674 | struct net_conf *nc; |
4093 | int rv; | 4675 | int err, rv; |
4676 | |||
4677 | /* FIXME: Put the challenge/response into the preallocated socket buffer. */ | ||
4094 | 4678 | ||
4095 | desc.tfm = mdev->cram_hmac_tfm; | 4679 | rcu_read_lock(); |
4680 | nc = rcu_dereference(tconn->net_conf); | ||
4681 | key_len = strlen(nc->shared_secret); | ||
4682 | memcpy(secret, nc->shared_secret, key_len); | ||
4683 | rcu_read_unlock(); | ||
4684 | |||
4685 | desc.tfm = tconn->cram_hmac_tfm; | ||
4096 | desc.flags = 0; | 4686 | desc.flags = 0; |
4097 | 4687 | ||
4098 | rv = crypto_hash_setkey(mdev->cram_hmac_tfm, | 4688 | rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len); |
4099 | (u8 *)mdev->net_conf->shared_secret, key_len); | ||
4100 | if (rv) { | 4689 | if (rv) { |
4101 | dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); | 4690 | conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv); |
4102 | rv = -1; | 4691 | rv = -1; |
4103 | goto fail; | 4692 | goto fail; |
4104 | } | 4693 | } |
4105 | 4694 | ||
4106 | get_random_bytes(my_challenge, CHALLENGE_LEN); | 4695 | get_random_bytes(my_challenge, CHALLENGE_LEN); |
4107 | 4696 | ||
4108 | rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); | 4697 | sock = &tconn->data; |
4698 | if (!conn_prepare_command(tconn, sock)) { | ||
4699 | rv = 0; | ||
4700 | goto fail; | ||
4701 | } | ||
4702 | rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0, | ||
4703 | my_challenge, CHALLENGE_LEN); | ||
4109 | if (!rv) | 4704 | if (!rv) |
4110 | goto fail; | 4705 | goto fail; |
4111 | 4706 | ||
4112 | rv = drbd_recv_header(mdev, &cmd, &length); | 4707 | err = drbd_recv_header(tconn, &pi); |
4113 | if (!rv) | 4708 | if (err) { |
4709 | rv = 0; | ||
4114 | goto fail; | 4710 | goto fail; |
4711 | } | ||
4115 | 4712 | ||
4116 | if (cmd != P_AUTH_CHALLENGE) { | 4713 | if (pi.cmd != P_AUTH_CHALLENGE) { |
4117 | dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", | 4714 | conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n", |
4118 | cmdname(cmd), cmd); | 4715 | cmdname(pi.cmd), pi.cmd); |
4119 | rv = 0; | 4716 | rv = 0; |
4120 | goto fail; | 4717 | goto fail; |
4121 | } | 4718 | } |
4122 | 4719 | ||
4123 | if (length > CHALLENGE_LEN * 2) { | 4720 | if (pi.size > CHALLENGE_LEN * 2) { |
4124 | dev_err(DEV, "expected AuthChallenge payload too big.\n"); | 4721 | conn_err(tconn, "expected AuthChallenge payload too big.\n"); |
4125 | rv = -1; | 4722 | rv = -1; |
4126 | goto fail; | 4723 | goto fail; |
4127 | } | 4724 | } |
4128 | 4725 | ||
4129 | peers_ch = kmalloc(length, GFP_NOIO); | 4726 | peers_ch = kmalloc(pi.size, GFP_NOIO); |
4130 | if (peers_ch == NULL) { | 4727 | if (peers_ch == NULL) { |
4131 | dev_err(DEV, "kmalloc of peers_ch failed\n"); | 4728 | conn_err(tconn, "kmalloc of peers_ch failed\n"); |
4132 | rv = -1; | 4729 | rv = -1; |
4133 | goto fail; | 4730 | goto fail; |
4134 | } | 4731 | } |
4135 | 4732 | ||
4136 | rv = drbd_recv(mdev, peers_ch, length); | 4733 | err = drbd_recv_all_warn(tconn, peers_ch, pi.size); |
4137 | 4734 | if (err) { | |
4138 | if (rv != length) { | ||
4139 | if (!signal_pending(current)) | ||
4140 | dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv); | ||
4141 | rv = 0; | 4735 | rv = 0; |
4142 | goto fail; | 4736 | goto fail; |
4143 | } | 4737 | } |
4144 | 4738 | ||
4145 | resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); | 4739 | resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm); |
4146 | response = kmalloc(resp_size, GFP_NOIO); | 4740 | response = kmalloc(resp_size, GFP_NOIO); |
4147 | if (response == NULL) { | 4741 | if (response == NULL) { |
4148 | dev_err(DEV, "kmalloc of response failed\n"); | 4742 | conn_err(tconn, "kmalloc of response failed\n"); |
4149 | rv = -1; | 4743 | rv = -1; |
4150 | goto fail; | 4744 | goto fail; |
4151 | } | 4745 | } |
4152 | 4746 | ||
4153 | sg_init_table(&sg, 1); | 4747 | sg_init_table(&sg, 1); |
4154 | sg_set_buf(&sg, peers_ch, length); | 4748 | sg_set_buf(&sg, peers_ch, pi.size); |
4155 | 4749 | ||
4156 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); | 4750 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); |
4157 | if (rv) { | 4751 | if (rv) { |
4158 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | 4752 | conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); |
4159 | rv = -1; | 4753 | rv = -1; |
4160 | goto fail; | 4754 | goto fail; |
4161 | } | 4755 | } |
4162 | 4756 | ||
4163 | rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); | 4757 | if (!conn_prepare_command(tconn, sock)) { |
4164 | if (!rv) | 4758 | rv = 0; |
4165 | goto fail; | 4759 | goto fail; |
4166 | 4760 | } | |
4167 | rv = drbd_recv_header(mdev, &cmd, &length); | 4761 | rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0, |
4762 | response, resp_size); | ||
4168 | if (!rv) | 4763 | if (!rv) |
4169 | goto fail; | 4764 | goto fail; |
4170 | 4765 | ||
4171 | if (cmd != P_AUTH_RESPONSE) { | 4766 | err = drbd_recv_header(tconn, &pi); |
4172 | dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", | 4767 | if (err) { |
4173 | cmdname(cmd), cmd); | ||
4174 | rv = 0; | 4768 | rv = 0; |
4175 | goto fail; | 4769 | goto fail; |
4176 | } | 4770 | } |
4177 | 4771 | ||
4178 | if (length != resp_size) { | 4772 | if (pi.cmd != P_AUTH_RESPONSE) { |
4179 | dev_err(DEV, "expected AuthResponse payload of wrong size\n"); | 4773 | conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n", |
4774 | cmdname(pi.cmd), pi.cmd); | ||
4180 | rv = 0; | 4775 | rv = 0; |
4181 | goto fail; | 4776 | goto fail; |
4182 | } | 4777 | } |
4183 | 4778 | ||
4184 | rv = drbd_recv(mdev, response , resp_size); | 4779 | if (pi.size != resp_size) { |
4780 | conn_err(tconn, "expected AuthResponse payload of wrong size\n"); | ||
4781 | rv = 0; | ||
4782 | goto fail; | ||
4783 | } | ||
4185 | 4784 | ||
4186 | if (rv != resp_size) { | 4785 | err = drbd_recv_all_warn(tconn, response, resp_size); |
4187 | if (!signal_pending(current)) | 4786 | if (err) { |
4188 | dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv); | ||
4189 | rv = 0; | 4787 | rv = 0; |
4190 | goto fail; | 4788 | goto fail; |
4191 | } | 4789 | } |
4192 | 4790 | ||
4193 | right_response = kmalloc(resp_size, GFP_NOIO); | 4791 | right_response = kmalloc(resp_size, GFP_NOIO); |
4194 | if (right_response == NULL) { | 4792 | if (right_response == NULL) { |
4195 | dev_err(DEV, "kmalloc of right_response failed\n"); | 4793 | conn_err(tconn, "kmalloc of right_response failed\n"); |
4196 | rv = -1; | 4794 | rv = -1; |
4197 | goto fail; | 4795 | goto fail; |
4198 | } | 4796 | } |
@@ -4201,7 +4799,7 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4201 | 4799 | ||
4202 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); | 4800 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); |
4203 | if (rv) { | 4801 | if (rv) { |
4204 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | 4802 | conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); |
4205 | rv = -1; | 4803 | rv = -1; |
4206 | goto fail; | 4804 | goto fail; |
4207 | } | 4805 | } |
@@ -4209,8 +4807,8 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4209 | rv = !memcmp(response, right_response, resp_size); | 4807 | rv = !memcmp(response, right_response, resp_size); |
4210 | 4808 | ||
4211 | if (rv) | 4809 | if (rv) |
4212 | dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", | 4810 | conn_info(tconn, "Peer authenticated using %d bytes HMAC\n", |
4213 | resp_size, mdev->net_conf->cram_hmac_alg); | 4811 | resp_size); |
4214 | else | 4812 | else |
4215 | rv = -1; | 4813 | rv = -1; |
4216 | 4814 | ||
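drbd_do_auth() above is a classic challenge/response exchange: each side sends a random challenge, and the peer proves knowledge of the shared secret by returning an HMAC over that challenge, which is then recomputed locally and compared. The sketch below shows only the verification step in userspace, assuming OpenSSL's HMAC-SHA1 as the digest (DRBD uses whatever cram-hmac-alg is configured) and omitting the packet exchange entirely.

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <openssl/rand.h>
#include <string.h>
#include <stdio.h>

#define CHALLENGE_LEN 64

static int verify_peer(const unsigned char *secret, unsigned int key_len,
		       const unsigned char *my_challenge,
		       const unsigned char *peer_response, unsigned int resp_size)
{
	unsigned char right_response[EVP_MAX_MD_SIZE];
	unsigned int right_size;

	/* recompute what the peer should have sent for my challenge */
	HMAC(EVP_sha1(), secret, key_len, my_challenge, CHALLENGE_LEN,
	     right_response, &right_size);
	return resp_size == right_size &&
	       memcmp(peer_response, right_response, resp_size) == 0;
}

int main(void)
{
	const unsigned char secret[] = "shared-secret";
	unsigned char challenge[CHALLENGE_LEN];
	unsigned char response[EVP_MAX_MD_SIZE];
	unsigned int resp_size;

	RAND_bytes(challenge, CHALLENGE_LEN);

	/* what an honest peer holding the same secret would answer */
	HMAC(EVP_sha1(), secret, sizeof(secret) - 1, challenge, CHALLENGE_LEN,
	     response, &resp_size);

	printf("peer authenticated: %s\n",
	       verify_peer(secret, sizeof(secret) - 1, challenge,
			   response, resp_size) ? "yes" : "no");
	return 0;
}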
@@ -4225,82 +4823,106 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4225 | 4823 | ||
4226 | int drbdd_init(struct drbd_thread *thi) | 4824 | int drbdd_init(struct drbd_thread *thi) |
4227 | { | 4825 | { |
4228 | struct drbd_conf *mdev = thi->mdev; | 4826 | struct drbd_tconn *tconn = thi->tconn; |
4229 | unsigned int minor = mdev_to_minor(mdev); | ||
4230 | int h; | 4827 | int h; |
4231 | 4828 | ||
4232 | sprintf(current->comm, "drbd%d_receiver", minor); | 4829 | conn_info(tconn, "receiver (re)started\n"); |
4233 | |||
4234 | dev_info(DEV, "receiver (re)started\n"); | ||
4235 | 4830 | ||
4236 | do { | 4831 | do { |
4237 | h = drbd_connect(mdev); | 4832 | h = conn_connect(tconn); |
4238 | if (h == 0) { | 4833 | if (h == 0) { |
4239 | drbd_disconnect(mdev); | 4834 | conn_disconnect(tconn); |
4240 | schedule_timeout_interruptible(HZ); | 4835 | schedule_timeout_interruptible(HZ); |
4241 | } | 4836 | } |
4242 | if (h == -1) { | 4837 | if (h == -1) { |
4243 | dev_warn(DEV, "Discarding network configuration.\n"); | 4838 | conn_warn(tconn, "Discarding network configuration.\n"); |
4244 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 4839 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
4245 | } | 4840 | } |
4246 | } while (h == 0); | 4841 | } while (h == 0); |
4247 | 4842 | ||
4248 | if (h > 0) { | 4843 | if (h > 0) |
4249 | if (get_net_conf(mdev)) { | 4844 | drbdd(tconn); |
4250 | drbdd(mdev); | ||
4251 | put_net_conf(mdev); | ||
4252 | } | ||
4253 | } | ||
4254 | 4845 | ||
4255 | drbd_disconnect(mdev); | 4846 | conn_disconnect(tconn); |
4256 | 4847 | ||
4257 | dev_info(DEV, "receiver terminated\n"); | 4848 | conn_info(tconn, "receiver terminated\n"); |
4258 | return 0; | 4849 | return 0; |
4259 | } | 4850 | } |
4260 | 4851 | ||
4261 | /* ********* acknowledge sender ******** */ | 4852 | /* ********* acknowledge sender ******** */ |
4262 | 4853 | ||
4263 | static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h) | 4854 | static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4264 | { | 4855 | { |
4265 | struct p_req_state_reply *p = (struct p_req_state_reply *)h; | 4856 | struct p_req_state_reply *p = pi->data; |
4857 | int retcode = be32_to_cpu(p->retcode); | ||
4858 | |||
4859 | if (retcode >= SS_SUCCESS) { | ||
4860 | set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags); | ||
4861 | } else { | ||
4862 | set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags); | ||
4863 | conn_err(tconn, "Requested state change failed by peer: %s (%d)\n", | ||
4864 | drbd_set_st_err_str(retcode), retcode); | ||
4865 | } | ||
4866 | wake_up(&tconn->ping_wait); | ||
4266 | 4867 | ||
4868 | return 0; | ||
4869 | } | ||
4870 | |||
4871 | static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) | ||
4872 | { | ||
4873 | struct drbd_conf *mdev; | ||
4874 | struct p_req_state_reply *p = pi->data; | ||
4267 | int retcode = be32_to_cpu(p->retcode); | 4875 | int retcode = be32_to_cpu(p->retcode); |
4268 | 4876 | ||
4877 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4878 | if (!mdev) | ||
4879 | return -EIO; | ||
4880 | |||
4881 | if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) { | ||
4882 | D_ASSERT(tconn->agreed_pro_version < 100); | ||
4883 | return got_conn_RqSReply(tconn, pi); | ||
4884 | } | ||
4885 | |||
4269 | if (retcode >= SS_SUCCESS) { | 4886 | if (retcode >= SS_SUCCESS) { |
4270 | drbd_set_flag(mdev, CL_ST_CHG_SUCCESS); | 4887 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); |
4271 | } else { | 4888 | } else { |
4272 | drbd_set_flag(mdev, CL_ST_CHG_FAIL); | 4889 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); |
4273 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", | 4890 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", |
4274 | drbd_set_st_err_str(retcode), retcode); | 4891 | drbd_set_st_err_str(retcode), retcode); |
4275 | } | 4892 | } |
4276 | wake_up(&mdev->state_wait); | 4893 | wake_up(&mdev->state_wait); |
4277 | 4894 | ||
4278 | return true; | 4895 | return 0; |
4279 | } | 4896 | } |
4280 | 4897 | ||
4281 | static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) | 4898 | static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi) |
4282 | { | 4899 | { |
4283 | return drbd_send_ping_ack(mdev); | 4900 | return drbd_send_ping_ack(tconn); |
4284 | 4901 | ||
4285 | } | 4902 | } |
4286 | 4903 | ||
4287 | static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h) | 4904 | static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4288 | { | 4905 | { |
4289 | /* restore idle timeout */ | 4906 | /* restore idle timeout */ |
4290 | mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | 4907 | tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ; |
4291 | if (!drbd_test_and_set_flag(mdev, GOT_PING_ACK)) | 4908 | if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags)) |
4292 | wake_up(&mdev->misc_wait); | 4909 | wake_up(&tconn->ping_wait); |
4293 | 4910 | ||
4294 | return true; | 4911 | return 0; |
4295 | } | 4912 | } |
4296 | 4913 | ||
4297 | static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) | 4914 | static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi) |
4298 | { | 4915 | { |
4299 | struct p_block_ack *p = (struct p_block_ack *)h; | 4916 | struct drbd_conf *mdev; |
4917 | struct p_block_ack *p = pi->data; | ||
4300 | sector_t sector = be64_to_cpu(p->sector); | 4918 | sector_t sector = be64_to_cpu(p->sector); |
4301 | int blksize = be32_to_cpu(p->blksize); | 4919 | int blksize = be32_to_cpu(p->blksize); |
4302 | 4920 | ||
4303 | D_ASSERT(mdev->agreed_pro_version >= 89); | 4921 | mdev = vnr_to_mdev(tconn, pi->vnr); |
4922 | if (!mdev) | ||
4923 | return -EIO; | ||
4924 | |||
4925 | D_ASSERT(mdev->tconn->agreed_pro_version >= 89); | ||
4304 | 4926 | ||
4305 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4927 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4306 | 4928 | ||
@@ -4314,162 +4936,139 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) | |||
4314 | dec_rs_pending(mdev); | 4936 | dec_rs_pending(mdev); |
4315 | atomic_add(blksize >> 9, &mdev->rs_sect_in); | 4937 | atomic_add(blksize >> 9, &mdev->rs_sect_in); |
4316 | 4938 | ||
4317 | return true; | 4939 | return 0; |
4318 | } | ||
4319 | |||
4320 | /* when we receive the ACK for a write request, | ||
4321 | * verify that we actually know about it */ | ||
4322 | static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, | ||
4323 | u64 id, sector_t sector) | ||
4324 | { | ||
4325 | struct hlist_head *slot = tl_hash_slot(mdev, sector); | ||
4326 | struct hlist_node *n; | ||
4327 | struct drbd_request *req; | ||
4328 | |||
4329 | hlist_for_each_entry(req, n, slot, collision) { | ||
4330 | if ((unsigned long)req == (unsigned long)id) { | ||
4331 | if (req->sector != sector) { | ||
4332 | dev_err(DEV, "_ack_id_to_req: found req %p but it has " | ||
4333 | "wrong sector (%llus versus %llus)\n", req, | ||
4334 | (unsigned long long)req->sector, | ||
4335 | (unsigned long long)sector); | ||
4336 | break; | ||
4337 | } | ||
4338 | return req; | ||
4339 | } | ||
4340 | } | ||
4341 | return NULL; | ||
4342 | } | 4940 | } |
4343 | 4941 | ||
4344 | typedef struct drbd_request *(req_validator_fn) | 4942 | static int |
4345 | (struct drbd_conf *mdev, u64 id, sector_t sector); | 4943 | validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector, |
4346 | 4944 | struct rb_root *root, const char *func, | |
4347 | static int validate_req_change_req_state(struct drbd_conf *mdev, | 4945 | enum drbd_req_event what, bool missing_ok) |
4348 | u64 id, sector_t sector, req_validator_fn validator, | ||
4349 | const char *func, enum drbd_req_event what) | ||
4350 | { | 4946 | { |
4351 | struct drbd_request *req; | 4947 | struct drbd_request *req; |
4352 | struct bio_and_error m; | 4948 | struct bio_and_error m; |
4353 | 4949 | ||
4354 | spin_lock_irq(&mdev->req_lock); | 4950 | spin_lock_irq(&mdev->tconn->req_lock); |
4355 | req = validator(mdev, id, sector); | 4951 | req = find_request(mdev, root, id, sector, missing_ok, func); |
4356 | if (unlikely(!req)) { | 4952 | if (unlikely(!req)) { |
4357 | spin_unlock_irq(&mdev->req_lock); | 4953 | spin_unlock_irq(&mdev->tconn->req_lock); |
4358 | 4954 | return -EIO; | |
4359 | dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func, | ||
4360 | (void *)(unsigned long)id, (unsigned long long)sector); | ||
4361 | return false; | ||
4362 | } | 4955 | } |
4363 | __req_mod(req, what, &m); | 4956 | __req_mod(req, what, &m); |
4364 | spin_unlock_irq(&mdev->req_lock); | 4957 | spin_unlock_irq(&mdev->tconn->req_lock); |
4365 | 4958 | ||
4366 | if (m.bio) | 4959 | if (m.bio) |
4367 | complete_master_bio(mdev, &m); | 4960 | complete_master_bio(mdev, &m); |
4368 | return true; | 4961 | return 0; |
4369 | } | 4962 | } |
4370 | 4963 | ||
4371 | static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) | 4964 | static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4372 | { | 4965 | { |
4373 | struct p_block_ack *p = (struct p_block_ack *)h; | 4966 | struct drbd_conf *mdev; |
4967 | struct p_block_ack *p = pi->data; | ||
4374 | sector_t sector = be64_to_cpu(p->sector); | 4968 | sector_t sector = be64_to_cpu(p->sector); |
4375 | int blksize = be32_to_cpu(p->blksize); | 4969 | int blksize = be32_to_cpu(p->blksize); |
4376 | enum drbd_req_event what; | 4970 | enum drbd_req_event what; |
4377 | 4971 | ||
4972 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4973 | if (!mdev) | ||
4974 | return -EIO; | ||
4975 | |||
4378 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4976 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4379 | 4977 | ||
4380 | if (is_syncer_block_id(p->block_id)) { | 4978 | if (p->block_id == ID_SYNCER) { |
4381 | drbd_set_in_sync(mdev, sector, blksize); | 4979 | drbd_set_in_sync(mdev, sector, blksize); |
4382 | dec_rs_pending(mdev); | 4980 | dec_rs_pending(mdev); |
4383 | return true; | 4981 | return 0; |
4384 | } | 4982 | } |
4385 | switch (be16_to_cpu(h->command)) { | 4983 | switch (pi->cmd) { |
4386 | case P_RS_WRITE_ACK: | 4984 | case P_RS_WRITE_ACK: |
4387 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 4985 | what = WRITE_ACKED_BY_PEER_AND_SIS; |
4388 | what = write_acked_by_peer_and_sis; | ||
4389 | break; | 4986 | break; |
4390 | case P_WRITE_ACK: | 4987 | case P_WRITE_ACK: |
4391 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 4988 | what = WRITE_ACKED_BY_PEER; |
4392 | what = write_acked_by_peer; | ||
4393 | break; | 4989 | break; |
4394 | case P_RECV_ACK: | 4990 | case P_RECV_ACK: |
4395 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); | 4991 | what = RECV_ACKED_BY_PEER; |
4396 | what = recv_acked_by_peer; | 4992 | break; |
4993 | case P_SUPERSEDED: | ||
4994 | what = CONFLICT_RESOLVED; | ||
4397 | break; | 4995 | break; |
4398 | case P_DISCARD_ACK: | 4996 | case P_RETRY_WRITE: |
4399 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 4997 | what = POSTPONE_WRITE; |
4400 | what = conflict_discarded_by_peer; | ||
4401 | break; | 4998 | break; |
4402 | default: | 4999 | default: |
4403 | D_ASSERT(0); | 5000 | BUG(); |
4404 | return false; | ||
4405 | } | 5001 | } |
4406 | 5002 | ||
4407 | return validate_req_change_req_state(mdev, p->block_id, sector, | 5003 | return validate_req_change_req_state(mdev, p->block_id, sector, |
4408 | _ack_id_to_req, __func__ , what); | 5004 | &mdev->write_requests, __func__, |
5005 | what, false); | ||
4409 | } | 5006 | } |
4410 | 5007 | ||
4411 | static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h) | 5008 | static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4412 | { | 5009 | { |
4413 | struct p_block_ack *p = (struct p_block_ack *)h; | 5010 | struct drbd_conf *mdev; |
5011 | struct p_block_ack *p = pi->data; | ||
4414 | sector_t sector = be64_to_cpu(p->sector); | 5012 | sector_t sector = be64_to_cpu(p->sector); |
4415 | int size = be32_to_cpu(p->blksize); | 5013 | int size = be32_to_cpu(p->blksize); |
4416 | struct drbd_request *req; | 5014 | int err; |
4417 | struct bio_and_error m; | 5015 | |
5016 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5017 | if (!mdev) | ||
5018 | return -EIO; | ||
4418 | 5019 | ||
4419 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5020 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4420 | 5021 | ||
4421 | if (is_syncer_block_id(p->block_id)) { | 5022 | if (p->block_id == ID_SYNCER) { |
4422 | dec_rs_pending(mdev); | 5023 | dec_rs_pending(mdev); |
4423 | drbd_rs_failed_io(mdev, sector, size); | 5024 | drbd_rs_failed_io(mdev, sector, size); |
4424 | return true; | 5025 | return 0; |
4425 | } | 5026 | } |
4426 | 5027 | ||
4427 | spin_lock_irq(&mdev->req_lock); | 5028 | err = validate_req_change_req_state(mdev, p->block_id, sector, |
4428 | req = _ack_id_to_req(mdev, p->block_id, sector); | 5029 | &mdev->write_requests, __func__, |
4429 | if (!req) { | 5030 | NEG_ACKED, true); |
4430 | spin_unlock_irq(&mdev->req_lock); | 5031 | if (err) { |
4431 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A || | 5032 | /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. |
4432 | mdev->net_conf->wire_protocol == DRBD_PROT_B) { | 5033 | The master bio might already be completed, therefore the |
4433 | /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. | 5034 | request is no longer in the collision hash. */ |
4434 | The master bio might already be completed, therefore the | 5035 | /* In Protocol B we might already have got a P_RECV_ACK |
4435 | request is no longer in the collision hash. | 5036 | but then get a P_NEG_ACK afterwards. */ |
4436 | => Do not try to validate block_id as request. */ | 5037 | drbd_set_out_of_sync(mdev, sector, size); |
4437 | /* In Protocol B we might already have got a P_RECV_ACK | ||
4438 | but then get a P_NEG_ACK after wards. */ | ||
4439 | drbd_set_out_of_sync(mdev, sector, size); | ||
4440 | return true; | ||
4441 | } else { | ||
4442 | dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__, | ||
4443 | (void *)(unsigned long)p->block_id, (unsigned long long)sector); | ||
4444 | return false; | ||
4445 | } | ||
4446 | } | 5038 | } |
4447 | __req_mod(req, neg_acked, &m); | 5039 | return 0; |
4448 | spin_unlock_irq(&mdev->req_lock); | ||
4449 | |||
4450 | if (m.bio) | ||
4451 | complete_master_bio(mdev, &m); | ||
4452 | return true; | ||
4453 | } | 5040 | } |
4454 | 5041 | ||
4455 | static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) | 5042 | static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4456 | { | 5043 | { |
4457 | struct p_block_ack *p = (struct p_block_ack *)h; | 5044 | struct drbd_conf *mdev; |
5045 | struct p_block_ack *p = pi->data; | ||
4458 | sector_t sector = be64_to_cpu(p->sector); | 5046 | sector_t sector = be64_to_cpu(p->sector); |
4459 | 5047 | ||
5048 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5049 | if (!mdev) | ||
5050 | return -EIO; | ||
5051 | |||
4460 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5052 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4461 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", | 5053 | |
5054 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n", | ||
4462 | (unsigned long long)sector, be32_to_cpu(p->blksize)); | 5055 | (unsigned long long)sector, be32_to_cpu(p->blksize)); |
4463 | 5056 | ||
4464 | return validate_req_change_req_state(mdev, p->block_id, sector, | 5057 | return validate_req_change_req_state(mdev, p->block_id, sector, |
4465 | _ar_id_to_req, __func__ , neg_acked); | 5058 | &mdev->read_requests, __func__, |
5059 | NEG_ACKED, false); | ||
4466 | } | 5060 | } |
4467 | 5061 | ||
4468 | static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) | 5062 | static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4469 | { | 5063 | { |
5064 | struct drbd_conf *mdev; | ||
4470 | sector_t sector; | 5065 | sector_t sector; |
4471 | int size; | 5066 | int size; |
4472 | struct p_block_ack *p = (struct p_block_ack *)h; | 5067 | struct p_block_ack *p = pi->data; |
5068 | |||
5069 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5070 | if (!mdev) | ||
5071 | return -EIO; | ||
4473 | 5072 | ||
4474 | sector = be64_to_cpu(p->sector); | 5073 | sector = be64_to_cpu(p->sector); |
4475 | size = be32_to_cpu(p->blksize); | 5074 | size = be32_to_cpu(p->blksize); |
@@ -4480,57 +5079,66 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) | |||
4480 | 5079 | ||
4481 | if (get_ldev_if_state(mdev, D_FAILED)) { | 5080 | if (get_ldev_if_state(mdev, D_FAILED)) { |
4482 | drbd_rs_complete_io(mdev, sector); | 5081 | drbd_rs_complete_io(mdev, sector); |
4483 | switch (be16_to_cpu(h->command)) { | 5082 | switch (pi->cmd) { |
4484 | case P_NEG_RS_DREPLY: | 5083 | case P_NEG_RS_DREPLY: |
4485 | drbd_rs_failed_io(mdev, sector, size); | 5084 | drbd_rs_failed_io(mdev, sector, size); |
4486 | case P_RS_CANCEL: | 5085 | case P_RS_CANCEL: |
4487 | break; | 5086 | break; |
4488 | default: | 5087 | default: |
4489 | D_ASSERT(0); | 5088 | BUG(); |
4490 | put_ldev(mdev); | ||
4491 | return false; | ||
4492 | } | 5089 | } |
4493 | put_ldev(mdev); | 5090 | put_ldev(mdev); |
4494 | } | 5091 | } |
4495 | 5092 | ||
4496 | return true; | 5093 | return 0; |
4497 | } | 5094 | } |
4498 | 5095 | ||
4499 | static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) | 5096 | static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4500 | { | 5097 | { |
4501 | struct p_barrier_ack *p = (struct p_barrier_ack *)h; | 5098 | struct p_barrier_ack *p = pi->data; |
4502 | 5099 | struct drbd_conf *mdev; | |
4503 | tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); | 5100 | int vnr; |
4504 | 5101 | ||
4505 | if (mdev->state.conn == C_AHEAD && | 5102 | tl_release(tconn, p->barrier, be32_to_cpu(p->set_size)); |
4506 | atomic_read(&mdev->ap_in_flight) == 0 && | 5103 | |
4507 | !drbd_test_and_set_flag(mdev, AHEAD_TO_SYNC_SOURCE)) { | 5104 | rcu_read_lock(); |
4508 | mdev->start_resync_timer.expires = jiffies + HZ; | 5105 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
4509 | add_timer(&mdev->start_resync_timer); | 5106 | if (mdev->state.conn == C_AHEAD && |
5107 | atomic_read(&mdev->ap_in_flight) == 0 && | ||
5108 | !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { | ||
5109 | mdev->start_resync_timer.expires = jiffies + HZ; | ||
5110 | add_timer(&mdev->start_resync_timer); | ||
5111 | } | ||
4510 | } | 5112 | } |
5113 | rcu_read_unlock(); | ||
4511 | 5114 | ||
4512 | return true; | 5115 | return 0; |
4513 | } | 5116 | } |
4514 | 5117 | ||
4515 | static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) | 5118 | static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi) |
4516 | { | 5119 | { |
4517 | struct p_block_ack *p = (struct p_block_ack *)h; | 5120 | struct drbd_conf *mdev; |
5121 | struct p_block_ack *p = pi->data; | ||
4518 | struct drbd_work *w; | 5122 | struct drbd_work *w; |
4519 | sector_t sector; | 5123 | sector_t sector; |
4520 | int size; | 5124 | int size; |
4521 | 5125 | ||
5126 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5127 | if (!mdev) | ||
5128 | return -EIO; | ||
5129 | |||
4522 | sector = be64_to_cpu(p->sector); | 5130 | sector = be64_to_cpu(p->sector); |
4523 | size = be32_to_cpu(p->blksize); | 5131 | size = be32_to_cpu(p->blksize); |
4524 | 5132 | ||
4525 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5133 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4526 | 5134 | ||
4527 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) | 5135 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) |
4528 | drbd_ov_oos_found(mdev, sector, size); | 5136 | drbd_ov_out_of_sync_found(mdev, sector, size); |
4529 | else | 5137 | else |
4530 | ov_oos_print(mdev); | 5138 | ov_out_of_sync_print(mdev); |
4531 | 5139 | ||
4532 | if (!get_ldev(mdev)) | 5140 | if (!get_ldev(mdev)) |
4533 | return true; | 5141 | return 0; |
4534 | 5142 | ||
4535 | drbd_rs_complete_io(mdev, sector); | 5143 | drbd_rs_complete_io(mdev, sector); |
4536 | dec_rs_pending(mdev); | 5144 | dec_rs_pending(mdev); |
@@ -4545,114 +5153,137 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) | |||
4545 | w = kmalloc(sizeof(*w), GFP_NOIO); | 5153 | w = kmalloc(sizeof(*w), GFP_NOIO); |
4546 | if (w) { | 5154 | if (w) { |
4547 | w->cb = w_ov_finished; | 5155 | w->cb = w_ov_finished; |
4548 | drbd_queue_work_front(&mdev->data.work, w); | 5156 | w->mdev = mdev; |
5157 | drbd_queue_work(&mdev->tconn->sender_work, w); | ||
4549 | } else { | 5158 | } else { |
4550 | dev_err(DEV, "kmalloc(w) failed."); | 5159 | dev_err(DEV, "kmalloc(w) failed."); |
4551 | ov_oos_print(mdev); | 5160 | ov_out_of_sync_print(mdev); |
4552 | drbd_resync_finished(mdev); | 5161 | drbd_resync_finished(mdev); |
4553 | } | 5162 | } |
4554 | } | 5163 | } |
4555 | put_ldev(mdev); | 5164 | put_ldev(mdev); |
4556 | return true; | 5165 | return 0; |
4557 | } | 5166 | } |
4558 | 5167 | ||
4559 | static int got_skip(struct drbd_conf *mdev, struct p_header80 *h) | 5168 | static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi) |
4560 | { | 5169 | { |
4561 | return true; | 5170 | return 0; |
5171 | } | ||
5172 | |||
5173 | static int tconn_finish_peer_reqs(struct drbd_tconn *tconn) | ||
5174 | { | ||
5175 | struct drbd_conf *mdev; | ||
5176 | int vnr, not_empty = 0; | ||
5177 | |||
5178 | do { | ||
5179 | clear_bit(SIGNAL_ASENDER, &tconn->flags); | ||
5180 | flush_signals(current); | ||
5181 | |||
5182 | rcu_read_lock(); | ||
5183 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
5184 | kref_get(&mdev->kref); | ||
5185 | rcu_read_unlock(); | ||
5186 | if (drbd_finish_peer_reqs(mdev)) { | ||
5187 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
5188 | return 1; | ||
5189 | } | ||
5190 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
5191 | rcu_read_lock(); | ||
5192 | } | ||
5193 | set_bit(SIGNAL_ASENDER, &tconn->flags); | ||
5194 | |||
5195 | spin_lock_irq(&tconn->req_lock); | ||
5196 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
5197 | not_empty = !list_empty(&mdev->done_ee); | ||
5198 | if (not_empty) | ||
5199 | break; | ||
5200 | } | ||
5201 | spin_unlock_irq(&tconn->req_lock); | ||
5202 | rcu_read_unlock(); | ||
5203 | } while (not_empty); | ||
5204 | |||
5205 | return 0; | ||
4562 | } | 5206 | } |
4563 | 5207 | ||
4564 | struct asender_cmd { | 5208 | struct asender_cmd { |
4565 | size_t pkt_size; | 5209 | size_t pkt_size; |
4566 | int (*process)(struct drbd_conf *mdev, struct p_header80 *h); | 5210 | int (*fn)(struct drbd_tconn *tconn, struct packet_info *); |
4567 | }; | 5211 | }; |
4568 | 5212 | ||
4569 | static struct asender_cmd *get_asender_cmd(int cmd) | 5213 | static struct asender_cmd asender_tbl[] = { |
4570 | { | 5214 | [P_PING] = { 0, got_Ping }, |
4571 | static struct asender_cmd asender_tbl[] = { | 5215 | [P_PING_ACK] = { 0, got_PingAck }, |
4572 | /* anything missing from this table is in | ||
4573 | * the drbd_cmd_handler (drbd_default_handler) table, | ||
4574 | * see the beginning of drbdd() */ | ||
4575 | [P_PING] = { sizeof(struct p_header80), got_Ping }, | ||
4576 | [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck }, | ||
4577 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5216 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4578 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5217 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4579 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5218 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4580 | [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5219 | [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, |
4581 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, | 5220 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, |
4582 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, | 5221 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, |
4583 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, | 5222 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, |
4584 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, | 5223 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, |
4585 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | 5224 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, |
4586 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | 5225 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, |
4587 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | 5226 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, |
4588 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, | 5227 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, |
4589 | [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply}, | 5228 | [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, |
4590 | [P_MAX_CMD] = { 0, NULL }, | 5229 | [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, |
4591 | }; | 5230 | [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, |
4592 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | 5231 | }; |
4593 | return NULL; | ||
4594 | return &asender_tbl[cmd]; | ||
4595 | } | ||
4596 | 5232 | ||
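The asender_tbl[] above replaces the old get_asender_cmd() helper with direct table indexing plus an explicit bounds-and-handler check in the caller, followed by a payload-size check against the table entry. A small sketch of that dispatch shape is below; the packet names and sizes are made up for illustration.

#include <stdio.h>
#include <stddef.h>

enum cmd { P_PING, P_PING_ACK, P_BLOCK_ACK, P_CMD_MAX };

struct packet_info {
	unsigned int cmd;
	unsigned int size;
};

static int got_Ping(struct packet_info *pi)    { (void)pi; printf("ping\n"); return 0; }
static int got_PingAck(struct packet_info *pi) { (void)pi; printf("pong\n"); return 0; }

static const struct asender_cmd {
	size_t pkt_size;
	int (*fn)(struct packet_info *);
} asender_tbl[] = {
	[P_PING]     = { 0, got_Ping },
	[P_PING_ACK] = { 0, got_PingAck },
	/* P_BLOCK_ACK has no entry: dispatch() must reject it */
};

static int dispatch(struct packet_info *pi)
{
	if (pi->cmd >= sizeof(asender_tbl) / sizeof(asender_tbl[0]) ||
	    !asender_tbl[pi->cmd].fn) {
		fprintf(stderr, "unexpected meta packet 0x%04x\n", pi->cmd);
		return -1;
	}
	if (pi->size != asender_tbl[pi->cmd].pkt_size) {
		fprintf(stderr, "wrong payload size for 0x%04x\n", pi->cmd);
		return -1;
	}
	return asender_tbl[pi->cmd].fn(pi);
}

int main(void)
{
	struct packet_info ping = { P_PING, 0 };
	struct packet_info bad  = { P_BLOCK_ACK, 24 };

	dispatch(&ping);	/* handled */
	dispatch(&bad);		/* rejected as a protocol error */
	return 0;
}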
4597 | int drbd_asender(struct drbd_thread *thi) | 5233 | int drbd_asender(struct drbd_thread *thi) |
4598 | { | 5234 | { |
4599 | struct drbd_conf *mdev = thi->mdev; | 5235 | struct drbd_tconn *tconn = thi->tconn; |
4600 | struct p_header80 *h = &mdev->meta.rbuf.header.h80; | ||
4601 | struct asender_cmd *cmd = NULL; | 5236 | struct asender_cmd *cmd = NULL; |
4602 | 5237 | struct packet_info pi; | |
4603 | int rv, len; | 5238 | int rv; |
4604 | void *buf = h; | 5239 | void *buf = tconn->meta.rbuf; |
4605 | int received = 0; | 5240 | int received = 0; |
4606 | int expect = sizeof(struct p_header80); | 5241 | unsigned int header_size = drbd_header_size(tconn); |
4607 | int empty; | 5242 | int expect = header_size; |
4608 | int ping_timeout_active = 0; | 5243 | bool ping_timeout_active = false; |
4609 | 5244 | struct net_conf *nc; | |
4610 | sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); | 5245 | int ping_timeo, tcp_cork, ping_int; |
4611 | 5246 | ||
4612 | current->policy = SCHED_RR; /* Make this a realtime task! */ | 5247 | current->policy = SCHED_RR; /* Make this a realtime task! */ |
4613 | current->rt_priority = 2; /* more important than all other tasks */ | 5248 | current->rt_priority = 2; /* more important than all other tasks */ |
4614 | 5249 | ||
4615 | while (get_t_state(thi) == Running) { | 5250 | while (get_t_state(thi) == RUNNING) { |
4616 | drbd_thread_current_set_cpu(mdev); | 5251 | drbd_thread_current_set_cpu(thi); |
4617 | if (drbd_test_and_clear_flag(mdev, SEND_PING)) { | ||
4618 | ERR_IF(!drbd_send_ping(mdev)) goto reconnect; | ||
4619 | mdev->meta.socket->sk->sk_rcvtimeo = | ||
4620 | mdev->net_conf->ping_timeo*HZ/10; | ||
4621 | ping_timeout_active = 1; | ||
4622 | } | ||
4623 | 5252 | ||
4624 | /* conditionally cork; | 5253 | rcu_read_lock(); |
4625 | * it may hurt latency if we cork without much to send */ | 5254 | nc = rcu_dereference(tconn->net_conf); |
4626 | if (!mdev->net_conf->no_cork && | 5255 | ping_timeo = nc->ping_timeo; |
4627 | 3 < atomic_read(&mdev->unacked_cnt)) | 5256 | tcp_cork = nc->tcp_cork; |
4628 | drbd_tcp_cork(mdev->meta.socket); | 5257 | ping_int = nc->ping_int; |
4629 | while (1) { | 5258 | rcu_read_unlock(); |
4630 | drbd_clear_flag(mdev, SIGNAL_ASENDER); | 5259 | |
4631 | flush_signals(current); | 5260 | if (test_and_clear_bit(SEND_PING, &tconn->flags)) { |
4632 | if (!drbd_process_done_ee(mdev)) | 5261 | if (drbd_send_ping(tconn)) { |
5262 | conn_err(tconn, "drbd_send_ping has failed\n"); | ||
4633 | goto reconnect; | 5263 | goto reconnect; |
4634 | /* to avoid race with newly queued ACKs */ | 5264 | } |
4635 | drbd_set_flag(mdev, SIGNAL_ASENDER); | 5265 | tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; |
4636 | spin_lock_irq(&mdev->req_lock); | 5266 | ping_timeout_active = true; |
4637 | empty = list_empty(&mdev->done_ee); | 5267 | } |
4638 | spin_unlock_irq(&mdev->req_lock); | 5268 | |
4639 | /* new ack may have been queued right here, | 5269 | /* TODO: conditionally cork; it may hurt latency if we cork without |
4640 | * but then there is also a signal pending, | 5270 | much to send */ |
4641 | * and we start over... */ | 5271 | if (tcp_cork) |
4642 | if (empty) | 5272 | drbd_tcp_cork(tconn->meta.socket); |
4643 | break; | 5273 | if (tconn_finish_peer_reqs(tconn)) { |
5274 | conn_err(tconn, "tconn_finish_peer_reqs() failed\n"); | ||
5275 | goto reconnect; | ||
4644 | } | 5276 | } |
4645 | /* but unconditionally uncork unless disabled */ | 5277 | /* but unconditionally uncork unless disabled */ |
4646 | if (!mdev->net_conf->no_cork) | 5278 | if (tcp_cork) |
4647 | drbd_tcp_uncork(mdev->meta.socket); | 5279 | drbd_tcp_uncork(tconn->meta.socket); |
4648 | 5280 | ||
4649 | /* short circuit, recv_msg would return EINTR anyways. */ | 5281 | /* short circuit, recv_msg would return EINTR anyways. */ |
4650 | if (signal_pending(current)) | 5282 | if (signal_pending(current)) |
4651 | continue; | 5283 | continue; |
4652 | 5284 | ||
4653 | rv = drbd_recv_short(mdev, mdev->meta.socket, | 5285 | rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0); |
4654 | buf, expect-received, 0); | 5286 | clear_bit(SIGNAL_ASENDER, &tconn->flags); |
4655 | drbd_clear_flag(mdev, SIGNAL_ASENDER); | ||
4656 | 5287 | ||
4657 | flush_signals(current); | 5288 | flush_signals(current); |
4658 | 5289 | ||
@@ -4670,87 +5301,91 @@ int drbd_asender(struct drbd_thread *thi) | |||
4670 | received += rv; | 5301 | received += rv; |
4671 | buf += rv; | 5302 | buf += rv; |
4672 | } else if (rv == 0) { | 5303 | } else if (rv == 0) { |
4673 | if (drbd_test_flag(mdev, DISCONNECT_SENT)) { | 5304 | if (test_bit(DISCONNECT_SENT, &tconn->flags)) { |
4674 | long t; /* time_left */ | 5305 | long t; |
4675 | t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED, | 5306 | rcu_read_lock(); |
4676 | mdev->net_conf->ping_timeo * HZ/10); | 5307 | t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; |
5308 | rcu_read_unlock(); | ||
5309 | |||
5310 | t = wait_event_timeout(tconn->ping_wait, | ||
5311 | tconn->cstate < C_WF_REPORT_PARAMS, | ||
5312 | t); | ||
4677 | if (t) | 5313 | if (t) |
4678 | break; | 5314 | break; |
4679 | } | 5315 | } |
4680 | dev_err(DEV, "meta connection shut down by peer.\n"); | 5316 | conn_err(tconn, "meta connection shut down by peer.\n"); |
4681 | goto reconnect; | 5317 | goto reconnect; |
4682 | } else if (rv == -EAGAIN) { | 5318 | } else if (rv == -EAGAIN) { |
4683 | /* If the data socket received something meanwhile, | 5319 | /* If the data socket received something meanwhile, |
4684 | * that is good enough: peer is still alive. */ | 5320 | * that is good enough: peer is still alive. */ |
4685 | if (time_after(mdev->last_received, | 5321 | if (time_after(tconn->last_received, |
4686 | jiffies - mdev->meta.socket->sk->sk_rcvtimeo)) | 5322 | jiffies - tconn->meta.socket->sk->sk_rcvtimeo)) |
4687 | continue; | 5323 | continue; |
4688 | if (ping_timeout_active) { | 5324 | if (ping_timeout_active) { |
4689 | dev_err(DEV, "PingAck did not arrive in time.\n"); | 5325 | conn_err(tconn, "PingAck did not arrive in time.\n"); |
4690 | goto reconnect; | 5326 | goto reconnect; |
4691 | } | 5327 | } |
4692 | drbd_set_flag(mdev, SEND_PING); | 5328 | set_bit(SEND_PING, &tconn->flags); |
4693 | continue; | 5329 | continue; |
4694 | } else if (rv == -EINTR) { | 5330 | } else if (rv == -EINTR) { |
4695 | continue; | 5331 | continue; |
4696 | } else { | 5332 | } else { |
4697 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | 5333 | conn_err(tconn, "sock_recvmsg returned %d\n", rv); |
4698 | goto reconnect; | 5334 | goto reconnect; |
4699 | } | 5335 | } |
4700 | 5336 | ||
4701 | if (received == expect && cmd == NULL) { | 5337 | if (received == expect && cmd == NULL) { |
4702 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | 5338 | if (decode_header(tconn, tconn->meta.rbuf, &pi)) |
4703 | dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n", | ||
4704 | be32_to_cpu(h->magic), | ||
4705 | be16_to_cpu(h->command), | ||
4706 | be16_to_cpu(h->length)); | ||
4707 | goto reconnect; | 5339 | goto reconnect; |
4708 | } | 5340 | cmd = &asender_tbl[pi.cmd]; |
4709 | cmd = get_asender_cmd(be16_to_cpu(h->command)); | 5341 | if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { |
4710 | len = be16_to_cpu(h->length); | 5342 | conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n", |
4711 | if (unlikely(cmd == NULL)) { | 5343 | cmdname(pi.cmd), pi.cmd); |
4712 | dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n", | ||
4713 | be32_to_cpu(h->magic), | ||
4714 | be16_to_cpu(h->command), | ||
4715 | be16_to_cpu(h->length)); | ||
4716 | goto disconnect; | 5344 | goto disconnect; |
4717 | } | 5345 | } |
4718 | expect = cmd->pkt_size; | 5346 | expect = header_size + cmd->pkt_size; |
4719 | ERR_IF(len != expect-sizeof(struct p_header80)) | 5347 | if (pi.size != expect - header_size) { |
5348 | conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n", | ||
5349 | pi.cmd, pi.size); | ||
4720 | goto reconnect; | 5350 | goto reconnect; |
5351 | } | ||
4721 | } | 5352 | } |
4722 | if (received == expect) { | 5353 | if (received == expect) { |
4723 | mdev->last_received = jiffies; | 5354 | bool err; |
4724 | D_ASSERT(cmd != NULL); | 5355 | |
4725 | if (!cmd->process(mdev, h)) | 5356 | err = cmd->fn(tconn, &pi); |
5357 | if (err) { | ||
5358 | conn_err(tconn, "%pf failed\n", cmd->fn); | ||
4726 | goto reconnect; | 5359 | goto reconnect; |
5360 | } | ||
5361 | |||
5362 | tconn->last_received = jiffies; | ||
4727 | 5363 | ||
4728 | /* the idle_timeout (ping-int) | 5364 | if (cmd == &asender_tbl[P_PING_ACK]) { |
4729 | * has been restored in got_PingAck() */ | 5365 | /* restore idle timeout */ |
4730 | if (cmd == get_asender_cmd(P_PING_ACK)) | 5366 | tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; |
4731 | ping_timeout_active = 0; | 5367 | ping_timeout_active = false; |
5368 | } | ||
4732 | 5369 | ||
4733 | buf = h; | 5370 | buf = tconn->meta.rbuf; |
4734 | received = 0; | 5371 | received = 0; |
4735 | expect = sizeof(struct p_header80); | 5372 | expect = header_size; |
4736 | cmd = NULL; | 5373 | cmd = NULL; |
4737 | } | 5374 | } |
4738 | } | 5375 | } |
4739 | 5376 | ||
4740 | if (0) { | 5377 | if (0) { |
4741 | reconnect: | 5378 | reconnect: |
4742 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | 5379 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
4743 | drbd_md_sync(mdev); | 5380 | conn_md_sync(tconn); |
4744 | } | 5381 | } |
4745 | if (0) { | 5382 | if (0) { |
4746 | disconnect: | 5383 | disconnect: |
4747 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 5384 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
4748 | drbd_md_sync(mdev); | ||
4749 | } | 5385 | } |
4750 | drbd_clear_flag(mdev, SIGNAL_ASENDER); | 5386 | clear_bit(SIGNAL_ASENDER, &tconn->flags); |
4751 | 5387 | ||
4752 | D_ASSERT(mdev->state.conn < C_CONNECTED); | 5388 | conn_info(tconn, "asender terminated\n"); |
4753 | dev_info(DEV, "asender terminated\n"); | ||
4754 | 5389 | ||
4755 | return 0; | 5390 | return 0; |
4756 | } | 5391 | } |
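The asender loop above reads a fixed-size header into the meta receive buffer, decodes it, and dispatches through asender_tbl[], going to reconnect or disconnect on malformed, unknown, or wrongly sized packets. Below is a minimal user-space sketch of that table-driven dispatch pattern, given only for orientation; struct meta_ctx, meta_cmd, handle_ping_ack and the other names are illustrative stand-ins, not DRBD symbols.

/* Sketch (not DRBD code): table-driven dispatch of meta-socket commands. */
#include <stddef.h>

struct meta_ctx;                               /* stands in for drbd_tconn */

struct meta_cmd {
	size_t pkt_size;                       /* payload size expected after the header */
	int (*fn)(struct meta_ctx *, void *);  /* handler, returns 0 on success */
};

static int handle_ping_ack(struct meta_ctx *c, void *buf) { (void)c; (void)buf; return 0; }
static int handle_barrier_ack(struct meta_ctx *c, void *buf) { (void)c; (void)buf; return 0; }

enum { CMD_PING_ACK, CMD_BARRIER_ACK, CMD_MAX };

static const struct meta_cmd meta_tbl[CMD_MAX] = {
	[CMD_PING_ACK]    = { 0, handle_ping_ack },
	[CMD_BARRIER_ACK] = { 8, handle_barrier_ack },
};

/* Mirrors the bounds check and the !cmd->fn check in the loop above:
 * an unknown command makes the caller drop the connection. */
static int dispatch_meta(struct meta_ctx *c, unsigned cmd, void *payload)
{
	if (cmd >= CMD_MAX || !meta_tbl[cmd].fn)
		return -1;
	return meta_tbl[cmd].fn(c, payload);
}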
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 135ea76ed502..f58a4a4b4dfb 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include "drbd_req.h" | 31 | #include "drbd_req.h" |
32 | 32 | ||
33 | 33 | ||
34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); | ||
35 | |||
34 | /* Update disk stats at start of I/O request */ | 36 | /* Update disk stats at start of I/O request */ |
35 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | 37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) |
36 | { | 38 | { |
@@ -40,6 +42,8 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req | |||
40 | part_round_stats(cpu, &mdev->vdisk->part0); | 42 | part_round_stats(cpu, &mdev->vdisk->part0); |
41 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | 43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); |
42 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | 44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); |
45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like | ||
46 | the compiler warning about cpu only assigned but never used... */ | ||
43 | part_inc_in_flight(&mdev->vdisk->part0, rw); | 47 | part_inc_in_flight(&mdev->vdisk->part0, rw); |
44 | part_stat_unlock(); | 48 | part_stat_unlock(); |
45 | } | 49 | } |
@@ -57,9 +61,51 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) | |||
57 | part_stat_unlock(); | 61 | part_stat_unlock(); |
58 | } | 62 | } |
59 | 63 | ||
60 | static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) | 64 | static struct drbd_request *drbd_req_new(struct drbd_conf *mdev, |
65 | struct bio *bio_src) | ||
66 | { | ||
67 | struct drbd_request *req; | ||
68 | |||
69 | req = mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
70 | if (!req) | ||
71 | return NULL; | ||
72 | |||
73 | drbd_req_make_private_bio(req, bio_src); | ||
74 | req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; | ||
75 | req->w.mdev = mdev; | ||
76 | req->master_bio = bio_src; | ||
77 | req->epoch = 0; | ||
78 | |||
79 | drbd_clear_interval(&req->i); | ||
80 | req->i.sector = bio_src->bi_sector; | ||
81 | req->i.size = bio_src->bi_size; | ||
82 | req->i.local = true; | ||
83 | req->i.waiting = false; | ||
84 | |||
85 | INIT_LIST_HEAD(&req->tl_requests); | ||
86 | INIT_LIST_HEAD(&req->w.list); | ||
87 | |||
88 | /* one reference to be put by __drbd_make_request */ | ||
89 | atomic_set(&req->completion_ref, 1); | ||
90 | /* one kref as long as completion_ref > 0 */ | ||
91 | kref_init(&req->kref); | ||
92 | return req; | ||
93 | } | ||
94 | |||
95 | void drbd_req_destroy(struct kref *kref) | ||
61 | { | 96 | { |
62 | const unsigned long s = req->rq_state; | 97 | struct drbd_request *req = container_of(kref, struct drbd_request, kref); |
98 | struct drbd_conf *mdev = req->w.mdev; | ||
99 | const unsigned s = req->rq_state; | ||
100 | |||
101 | if ((req->master_bio && !(s & RQ_POSTPONED)) || | ||
102 | atomic_read(&req->completion_ref) || | ||
103 | (s & RQ_LOCAL_PENDING) || | ||
104 | ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { | ||
105 | dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", | ||
106 | s, atomic_read(&req->completion_ref)); | ||
107 | return; | ||
108 | } | ||
63 | 109 | ||
64 | /* remove it from the transfer log. | 110 | /* remove it from the transfer log. |
65 | * well, only if it had been there in the first | 111 | * well, only if it had been there in the first |
@@ -67,24 +113,33 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const | |||
67 | * and never sent), it should still be "empty" as | 113 | * and never sent), it should still be "empty" as |
68 | * initialized in drbd_req_new(), so we can list_del() it | 114 | * initialized in drbd_req_new(), so we can list_del() it |
69 | * here unconditionally */ | 115 | * here unconditionally */ |
70 | list_del(&req->tl_requests); | 116 | list_del_init(&req->tl_requests); |
71 | 117 | ||
72 | /* if it was a write, we may have to set the corresponding | 118 | /* if it was a write, we may have to set the corresponding |
73 | * bit(s) out-of-sync first. If it had a local part, we need to | 119 | * bit(s) out-of-sync first. If it had a local part, we need to |
74 | * release the reference to the activity log. */ | 120 | * release the reference to the activity log. */ |
75 | if (rw == WRITE) { | 121 | if (s & RQ_WRITE) { |
76 | /* Set out-of-sync unless both OK flags are set | 122 | /* Set out-of-sync unless both OK flags are set |
77 | * (local only or remote failed). | 123 | * (local only or remote failed). |
78 | * Other places where we set out-of-sync: | 124 | * Other places where we set out-of-sync: |
79 | * READ with local io-error */ | 125 | * READ with local io-error */ |
80 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
81 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
82 | 126 | ||
83 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | 127 | /* There is a special case: |
84 | drbd_set_in_sync(mdev, req->sector, req->size); | 128 | * we may notice late that IO was suspended, |
129 | * and postpone, or schedule for retry, a write, | ||
130 | * before it even was submitted or sent. | ||
131 | * In that case we do not want to touch the bitmap at all. | ||
132 | */ | ||
133 | if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { | ||
134 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
135 | drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); | ||
136 | |||
137 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | ||
138 | drbd_set_in_sync(mdev, req->i.sector, req->i.size); | ||
139 | } | ||
85 | 140 | ||
86 | /* one might be tempted to move the drbd_al_complete_io | 141 | /* one might be tempted to move the drbd_al_complete_io |
87 | * to the local io completion callback drbd_endio_pri. | 142 | * to the local io completion callback drbd_request_endio. |
88 | * but, if this was a mirror write, we may only | 143 | * but, if this was a mirror write, we may only |
89 | * drbd_al_complete_io after this is RQ_NET_DONE, | 144 | * drbd_al_complete_io after this is RQ_NET_DONE, |
90 | * otherwise the extent could be dropped from the al | 145 | * otherwise the extent could be dropped from the al |
@@ -93,109 +148,35 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const | |||
93 | * but after the extent has been dropped from the al, | 148 | * but after the extent has been dropped from the al, |
94 | * we would forget to resync the corresponding extent. | 149 | * we would forget to resync the corresponding extent. |
95 | */ | 150 | */ |
96 | if (s & RQ_LOCAL_MASK) { | 151 | if (s & RQ_IN_ACT_LOG) { |
97 | if (get_ldev_if_state(mdev, D_FAILED)) { | 152 | if (get_ldev_if_state(mdev, D_FAILED)) { |
98 | if (s & RQ_IN_ACT_LOG) | 153 | drbd_al_complete_io(mdev, &req->i); |
99 | drbd_al_complete_io(mdev, req->sector); | ||
100 | put_ldev(mdev); | 154 | put_ldev(mdev); |
101 | } else if (__ratelimit(&drbd_ratelimit_state)) { | 155 | } else if (__ratelimit(&drbd_ratelimit_state)) { |
102 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " | 156 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), " |
103 | "but my Disk seems to have failed :(\n", | 157 | "but my Disk seems to have failed :(\n", |
104 | (unsigned long long) req->sector); | 158 | (unsigned long long) req->i.sector, req->i.size); |
105 | } | 159 | } |
106 | } | 160 | } |
107 | } | 161 | } |
108 | 162 | ||
109 | drbd_req_free(req); | 163 | mempool_free(req, drbd_request_mempool); |
110 | } | 164 | } |
111 | 165 | ||
112 | static void queue_barrier(struct drbd_conf *mdev) | 166 | static void wake_all_senders(struct drbd_tconn *tconn) { |
113 | { | 167 | wake_up(&tconn->sender_work.q_wait); |
114 | struct drbd_tl_epoch *b; | ||
115 | |||
116 | /* We are within the req_lock. Once we queued the barrier for sending, | ||
117 | * we set the CREATE_BARRIER bit. It is cleared as soon as a new | ||
118 | * barrier/epoch object is added. This is the only place this bit is | ||
119 | * set. It indicates that the barrier for this epoch is already queued, | ||
120 | * and no new epoch has been created yet. */ | ||
121 | if (drbd_test_flag(mdev, CREATE_BARRIER)) | ||
122 | return; | ||
123 | |||
124 | b = mdev->newest_tle; | ||
125 | b->w.cb = w_send_barrier; | ||
126 | /* inc_ap_pending done here, so we won't | ||
127 | * get imbalanced on connection loss. | ||
128 | * dec_ap_pending will be done in got_BarrierAck | ||
129 | * or (on connection loss) in tl_clear. */ | ||
130 | inc_ap_pending(mdev); | ||
131 | drbd_queue_work(&mdev->data.work, &b->w); | ||
132 | drbd_set_flag(mdev, CREATE_BARRIER); | ||
133 | } | 168 | } |
134 | 169 | ||
135 | static void _about_to_complete_local_write(struct drbd_conf *mdev, | 170 | /* must hold resource->req_lock */ |
136 | struct drbd_request *req) | 171 | static void start_new_tl_epoch(struct drbd_tconn *tconn) |
137 | { | 172 | { |
138 | const unsigned long s = req->rq_state; | 173 | /* no point closing an epoch, if it is empty, anyways. */ |
139 | struct drbd_request *i; | 174 | if (tconn->current_tle_writes == 0) |
140 | struct drbd_epoch_entry *e; | 175 | return; |
141 | struct hlist_node *n; | ||
142 | struct hlist_head *slot; | ||
143 | |||
144 | /* Before we can signal completion to the upper layers, | ||
145 | * we may need to close the current epoch. | ||
146 | * We can skip this, if this request has not even been sent, because we | ||
147 | * did not have a fully established connection yet/anymore, during | ||
148 | * bitmap exchange, or while we are C_AHEAD due to congestion policy. | ||
149 | */ | ||
150 | if (mdev->state.conn >= C_CONNECTED && | ||
151 | (s & RQ_NET_SENT) != 0 && | ||
152 | req->epoch == mdev->newest_tle->br_number) | ||
153 | queue_barrier(mdev); | ||
154 | |||
155 | /* we need to do the conflict detection stuff, | ||
156 | * if we have the ee_hash (two_primaries) and | ||
157 | * this has been on the network */ | ||
158 | if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { | ||
159 | const sector_t sector = req->sector; | ||
160 | const int size = req->size; | ||
161 | |||
162 | /* ASSERT: | ||
163 | * there must be no conflicting requests, since | ||
164 | * they must have been failed on the spot */ | ||
165 | #define OVERLAPS overlaps(sector, size, i->sector, i->size) | ||
166 | slot = tl_hash_slot(mdev, sector); | ||
167 | hlist_for_each_entry(i, n, slot, collision) { | ||
168 | if (OVERLAPS) { | ||
169 | dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " | ||
170 | "other: %p %llus +%u\n", | ||
171 | req, (unsigned long long)sector, size, | ||
172 | i, (unsigned long long)i->sector, i->size); | ||
173 | } | ||
174 | } | ||
175 | 176 | ||
176 | /* maybe "wake" those conflicting epoch entries | 177 | tconn->current_tle_writes = 0; |
177 | * that wait for this request to finish. | 178 | atomic_inc(&tconn->current_tle_nr); |
178 | * | 179 | wake_all_senders(tconn); |
179 | * currently, there can be only _one_ such ee | ||
180 | * (well, or some more, which would be pending | ||
181 | * P_DISCARD_ACK not yet sent by the asender...), | ||
182 | * since we block the receiver thread upon the | ||
183 | * first conflict detection, which will wait on | ||
184 | * misc_wait. maybe we want to assert that? | ||
185 | * | ||
186 | * anyways, if we found one, | ||
187 | * we just have to do a wake_up. */ | ||
188 | #undef OVERLAPS | ||
189 | #define OVERLAPS overlaps(sector, size, e->sector, e->size) | ||
190 | slot = ee_hash_slot(mdev, req->sector); | ||
191 | hlist_for_each_entry(e, n, slot, collision) { | ||
192 | if (OVERLAPS) { | ||
193 | wake_up(&mdev->misc_wait); | ||
194 | break; | ||
195 | } | ||
196 | } | ||
197 | } | ||
198 | #undef OVERLAPS | ||
199 | } | 180 | } |
200 | 181 | ||
201 | void complete_master_bio(struct drbd_conf *mdev, | 182 | void complete_master_bio(struct drbd_conf *mdev, |
@@ -205,17 +186,33 @@ void complete_master_bio(struct drbd_conf *mdev, | |||
205 | dec_ap_bio(mdev); | 186 | dec_ap_bio(mdev); |
206 | } | 187 | } |
207 | 188 | ||
189 | |||
190 | static void drbd_remove_request_interval(struct rb_root *root, | ||
191 | struct drbd_request *req) | ||
192 | { | ||
193 | struct drbd_conf *mdev = req->w.mdev; | ||
194 | struct drbd_interval *i = &req->i; | ||
195 | |||
196 | drbd_remove_interval(root, i); | ||
197 | |||
198 | /* Wake up any processes waiting for this request to complete. */ | ||
199 | if (i->waiting) | ||
200 | wake_up(&mdev->misc_wait); | ||
201 | } | ||
202 | |||
208 | /* Helper for __req_mod(). | 203 | /* Helper for __req_mod(). |
209 | * Set m->bio to the master bio, if it is fit to be completed, | 204 | * Set m->bio to the master bio, if it is fit to be completed, |
210 | * or leave it alone (it is initialized to NULL in __req_mod), | 205 | * or leave it alone (it is initialized to NULL in __req_mod), |
211 | * if it has already been completed, or cannot be completed yet. | 206 | * if it has already been completed, or cannot be completed yet. |
212 | * If m->bio is set, the error status to be returned is placed in m->error. | 207 | * If m->bio is set, the error status to be returned is placed in m->error. |
213 | */ | 208 | */ |
214 | void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | 209 | static |
210 | void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) | ||
215 | { | 211 | { |
216 | const unsigned long s = req->rq_state; | 212 | const unsigned s = req->rq_state; |
217 | struct drbd_conf *mdev = req->mdev; | 213 | struct drbd_conf *mdev = req->w.mdev; |
218 | int rw = req->rq_state & RQ_WRITE ? WRITE : READ; | 214 | int rw; |
215 | int error, ok; | ||
219 | 216 | ||
220 | /* we must not complete the master bio, while it is | 217 | /* we must not complete the master bio, while it is |
221 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) | 218 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) |
@@ -226,178 +223,219 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | |||
226 | * the receiver, | 223 | * the receiver, |
227 | * the bio_endio completion callbacks. | 224 | * the bio_endio completion callbacks. |
228 | */ | 225 | */ |
229 | if (s & RQ_NET_QUEUED) | 226 | if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || |
230 | return; | 227 | (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || |
231 | if (s & RQ_NET_PENDING) | 228 | (s & RQ_COMPLETION_SUSP)) { |
229 | dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); | ||
232 | return; | 230 | return; |
233 | if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) | 231 | } |
232 | |||
233 | if (!req->master_bio) { | ||
234 | dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); | ||
234 | return; | 235 | return; |
236 | } | ||
235 | 237 | ||
236 | if (req->master_bio) { | 238 | rw = bio_rw(req->master_bio); |
237 | /* this is data_received (remote read) | ||
238 | * or protocol C P_WRITE_ACK | ||
239 | * or protocol B P_RECV_ACK | ||
240 | * or protocol A "handed_over_to_network" (SendAck) | ||
241 | * or canceled or failed, | ||
242 | * or killed from the transfer log due to connection loss. | ||
243 | */ | ||
244 | 239 | ||
245 | /* | 240 | /* |
246 | * figure out whether to report success or failure. | 241 | * figure out whether to report success or failure. |
247 | * | 242 | * |
248 | * report success when at least one of the operations succeeded. | 243 | * report success when at least one of the operations succeeded. |
249 | * or, to put the other way, | 244 | * or, to put the other way, |
250 | * only report failure, when both operations failed. | 245 | * only report failure, when both operations failed. |
251 | * | 246 | * |
252 | * what to do about the failures is handled elsewhere. | 247 | * what to do about the failures is handled elsewhere. |
253 | * what we need to do here is just: complete the master_bio. | 248 | * what we need to do here is just: complete the master_bio. |
254 | * | 249 | * |
255 | * local completion error, if any, has been stored as ERR_PTR | 250 | * local completion error, if any, has been stored as ERR_PTR |
256 | * in private_bio within drbd_endio_pri. | 251 | * in private_bio within drbd_request_endio. |
257 | */ | 252 | */ |
258 | int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); | 253 | ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); |
259 | int error = PTR_ERR(req->private_bio); | 254 | error = PTR_ERR(req->private_bio); |
260 | 255 | ||
261 | /* remove the request from the conflict detection | 256 | /* remove the request from the conflict detection |
262 | * respective block_id verification hash */ | 257 | * respective block_id verification hash */ |
263 | if (!hlist_unhashed(&req->collision)) | 258 | if (!drbd_interval_empty(&req->i)) { |
264 | hlist_del(&req->collision); | 259 | struct rb_root *root; |
265 | else | ||
266 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
267 | 260 | ||
268 | /* for writes we need to do some extra housekeeping */ | ||
269 | if (rw == WRITE) | 261 | if (rw == WRITE) |
270 | _about_to_complete_local_write(mdev, req); | 262 | root = &mdev->write_requests; |
263 | else | ||
264 | root = &mdev->read_requests; | ||
265 | drbd_remove_request_interval(root, req); | ||
266 | } else if (!(s & RQ_POSTPONED)) | ||
267 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
271 | 268 | ||
272 | /* Update disk stats */ | 269 | /* Before we can signal completion to the upper layers, |
273 | _drbd_end_io_acct(mdev, req); | 270 | * we may need to close the current transfer log epoch. |
271 | * We are within the request lock, so we can simply compare | ||
272 | * the request epoch number with the current transfer log | ||
273 | * epoch number. If they match, increase the current_tle_nr, | ||
274 | * and reset the transfer log epoch write_cnt. | ||
275 | */ | ||
276 | if (rw == WRITE && | ||
277 | req->epoch == atomic_read(&mdev->tconn->current_tle_nr)) | ||
278 | start_new_tl_epoch(mdev->tconn); | ||
279 | |||
280 | /* Update disk stats */ | ||
281 | _drbd_end_io_acct(mdev, req); | ||
282 | |||
283 | /* If READ failed, | ||
284 | * have it be pushed back to the retry work queue, | ||
285 | * so it will re-enter __drbd_make_request(), | ||
286 | * and be re-assigned to a suitable local or remote path, | ||
287 | * or failed if we do not have access to good data anymore. | ||
288 | * | ||
289 | * Unless it was failed early by __drbd_make_request(), | ||
290 | * because no path was available, in which case | ||
291 | * it was not even added to the transfer_log. | ||
292 | * | ||
293 | * READA may fail, and will not be retried. | ||
294 | * | ||
295 | * WRITE should have used all available paths already. | ||
296 | */ | ||
297 | if (!ok && rw == READ && !list_empty(&req->tl_requests)) | ||
298 | req->rq_state |= RQ_POSTPONED; | ||
274 | 299 | ||
300 | if (!(req->rq_state & RQ_POSTPONED)) { | ||
275 | m->error = ok ? 0 : (error ?: -EIO); | 301 | m->error = ok ? 0 : (error ?: -EIO); |
276 | m->bio = req->master_bio; | 302 | m->bio = req->master_bio; |
277 | req->master_bio = NULL; | 303 | req->master_bio = NULL; |
278 | } | 304 | } |
305 | } | ||
279 | 306 | ||
280 | if (s & RQ_LOCAL_PENDING) | 307 | static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) |
281 | return; | 308 | { |
309 | struct drbd_conf *mdev = req->w.mdev; | ||
310 | D_ASSERT(m || (req->rq_state & RQ_POSTPONED)); | ||
311 | |||
312 | if (!atomic_sub_and_test(put, &req->completion_ref)) | ||
313 | return 0; | ||
282 | 314 | ||
283 | if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { | 315 | drbd_req_complete(req, m); |
284 | /* this is disconnected (local only) operation, | 316 | |
285 | * or protocol C P_WRITE_ACK, | 317 | if (req->rq_state & RQ_POSTPONED) { |
286 | * or protocol A or B P_BARRIER_ACK, | 318 | /* don't destroy the req object just yet, |
287 | * or killed from the transfer log due to connection loss. */ | 319 | * but queue it for retry */ |
288 | _req_is_done(mdev, req, rw); | 320 | drbd_restart_request(req); |
321 | return 0; | ||
289 | } | 322 | } |
290 | /* else: network part and not DONE yet. that is | 323 | |
291 | * protocol A or B, barrier ack still pending... */ | 324 | return 1; |
292 | } | 325 | } |
293 | 326 | ||
294 | static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) | 327 | /* I'd like this to be the only place that manipulates |
328 | * req->completion_ref and req->kref. */ | ||
329 | static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, | ||
330 | int clear, int set) | ||
295 | { | 331 | { |
296 | struct drbd_conf *mdev = req->mdev; | 332 | struct drbd_conf *mdev = req->w.mdev; |
333 | unsigned s = req->rq_state; | ||
334 | int c_put = 0; | ||
335 | int k_put = 0; | ||
297 | 336 | ||
298 | if (!is_susp(mdev->state)) | 337 | if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP)) |
299 | _req_may_be_done(req, m); | 338 | set |= RQ_COMPLETION_SUSP; |
300 | } | ||
301 | 339 | ||
302 | /* | 340 | /* apply */ |
303 | * checks whether there was an overlapping request | ||
304 | * or ee already registered. | ||
305 | * | ||
306 | * if so, return 1, in which case this request is completed on the spot, | ||
307 | * without ever being submitted or send. | ||
308 | * | ||
309 | * return 0 if it is ok to submit this request. | ||
310 | * | ||
311 | * NOTE: | ||
312 | * paranoia: assume something above us is broken, and issues different write | ||
313 | * requests for the same block simultaneously... | ||
314 | * | ||
315 | * To ensure these won't be reordered differently on both nodes, resulting in | ||
316 | * diverging data sets, we discard the later one(s). Not that this is supposed | ||
317 | * to happen, but this is the rationale why we also have to check for | ||
318 | * conflicting requests with local origin, and why we have to do so regardless | ||
319 | * of whether we allowed multiple primaries. | ||
320 | * | ||
321 | * BTW, in case we only have one primary, the ee_hash is empty anyways, and the | ||
322 | * second hlist_for_each_entry becomes a noop. This is even simpler than to | ||
323 | * grab a reference on the net_conf, and check for the two_primaries flag... | ||
324 | */ | ||
325 | static int _req_conflicts(struct drbd_request *req) | ||
326 | { | ||
327 | struct drbd_conf *mdev = req->mdev; | ||
328 | const sector_t sector = req->sector; | ||
329 | const int size = req->size; | ||
330 | struct drbd_request *i; | ||
331 | struct drbd_epoch_entry *e; | ||
332 | struct hlist_node *n; | ||
333 | struct hlist_head *slot; | ||
334 | 341 | ||
335 | D_ASSERT(hlist_unhashed(&req->collision)); | 342 | req->rq_state &= ~clear; |
343 | req->rq_state |= set; | ||
336 | 344 | ||
337 | if (!get_net_conf(mdev)) | 345 | /* no change? */ |
338 | return 0; | 346 | if (req->rq_state == s) |
347 | return; | ||
339 | 348 | ||
340 | /* BUG_ON */ | 349 | /* intent: get references */ |
341 | ERR_IF (mdev->tl_hash_s == 0) | 350 | |
342 | goto out_no_conflict; | 351 | if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) |
343 | BUG_ON(mdev->tl_hash == NULL); | 352 | atomic_inc(&req->completion_ref); |
344 | 353 | ||
345 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | 354 | if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { |
346 | slot = tl_hash_slot(mdev, sector); | 355 | inc_ap_pending(mdev); |
347 | hlist_for_each_entry(i, n, slot, collision) { | 356 | atomic_inc(&req->completion_ref); |
348 | if (OVERLAPS) { | ||
349 | dev_alert(DEV, "%s[%u] Concurrent local write detected! " | ||
350 | "[DISCARD L] new: %llus +%u; " | ||
351 | "pending: %llus +%u\n", | ||
352 | current->comm, current->pid, | ||
353 | (unsigned long long)sector, size, | ||
354 | (unsigned long long)i->sector, i->size); | ||
355 | goto out_conflict; | ||
356 | } | ||
357 | } | 357 | } |
358 | 358 | ||
359 | if (mdev->ee_hash_s) { | 359 | if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) |
360 | /* now, check for overlapping requests with remote origin */ | 360 | atomic_inc(&req->completion_ref); |
361 | BUG_ON(mdev->ee_hash == NULL); | 361 | |
362 | #undef OVERLAPS | 362 | if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) |
363 | #define OVERLAPS overlaps(e->sector, e->size, sector, size) | 363 | kref_get(&req->kref); /* wait for the DONE */ |
364 | slot = ee_hash_slot(mdev, sector); | 364 | |
365 | hlist_for_each_entry(e, n, slot, collision) { | 365 | if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) |
366 | if (OVERLAPS) { | 366 | atomic_add(req->i.size >> 9, &mdev->ap_in_flight); |
367 | dev_alert(DEV, "%s[%u] Concurrent remote write detected!" | 367 | |
368 | " [DISCARD L] new: %llus +%u; " | 368 | if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) |
369 | "pending: %llus +%u\n", | 369 | atomic_inc(&req->completion_ref); |
370 | current->comm, current->pid, | 370 | |
371 | (unsigned long long)sector, size, | 371 | /* progress: put references */ |
372 | (unsigned long long)e->sector, e->size); | 372 | |
373 | goto out_conflict; | 373 | if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) |
374 | } | 374 | ++c_put; |
375 | } | 375 | |
376 | if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { | ||
377 | D_ASSERT(req->rq_state & RQ_LOCAL_PENDING); | ||
378 | /* local completion may still come in later, | ||
379 | * we need to keep the req object around. */ | ||
380 | kref_get(&req->kref); | ||
381 | ++c_put; | ||
376 | } | 382 | } |
377 | #undef OVERLAPS | ||
378 | 383 | ||
379 | out_no_conflict: | 384 | if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { |
380 | /* this is like it should be, and what we expected. | 385 | if (req->rq_state & RQ_LOCAL_ABORTED) |
381 | * our users do behave after all... */ | 386 | ++k_put; |
382 | put_net_conf(mdev); | 387 | else |
383 | return 0; | 388 | ++c_put; |
389 | } | ||
384 | 390 | ||
385 | out_conflict: | 391 | if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { |
386 | put_net_conf(mdev); | 392 | dec_ap_pending(mdev); |
387 | return 1; | 393 | ++c_put; |
394 | } | ||
395 | |||
396 | if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) | ||
397 | ++c_put; | ||
398 | |||
399 | if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { | ||
400 | if (req->rq_state & RQ_NET_SENT) | ||
401 | atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); | ||
402 | ++k_put; | ||
403 | } | ||
404 | |||
405 | /* potentially complete and destroy */ | ||
406 | |||
407 | if (k_put || c_put) { | ||
408 | /* Completion does its own kref_put. If we are going to | ||
409 | * kref_sub below, we need req to be still around then. */ | ||
410 | int at_least = k_put + !!c_put; | ||
411 | int refcount = atomic_read(&req->kref.refcount); | ||
412 | if (refcount < at_least) | ||
413 | dev_err(DEV, | ||
414 | "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", | ||
415 | s, req->rq_state, refcount, at_least); | ||
416 | } | ||
417 | |||
418 | /* If we made progress, retry conflicting peer requests, if any. */ | ||
419 | if (req->i.waiting) | ||
420 | wake_up(&mdev->misc_wait); | ||
421 | |||
422 | if (c_put) | ||
423 | k_put += drbd_req_put_completion_ref(req, m, c_put); | ||
424 | if (k_put) | ||
425 | kref_sub(&req->kref, k_put, drbd_req_destroy); | ||
388 | } | 426 | } |
389 | 427 | ||
390 | static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req) | 428 | static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req) |
391 | { | 429 | { |
392 | char b[BDEVNAME_SIZE]; | 430 | char b[BDEVNAME_SIZE]; |
393 | 431 | ||
394 | if (__ratelimit(&drbd_ratelimit_state)) | 432 | if (!__ratelimit(&drbd_ratelimit_state)) |
395 | return; | 433 | return; |
396 | 434 | ||
397 | dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n", | 435 | dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n", |
398 | (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", | 436 | (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", |
399 | (unsigned long long)req->sector, | 437 | (unsigned long long)req->i.sector, |
400 | req->size >> 9, | 438 | req->i.size >> 9, |
401 | bdevname(mdev->ldev->backing_bdev, b)); | 439 | bdevname(mdev->ldev->backing_bdev, b)); |
402 | } | 440 | } |
403 | 441 | ||
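The mod_rq_state()/drbd_req_put_completion_ref() pair introduced above replaces the old ad-hoc counter updates with two reference counts per request: completion_ref counts the reasons the master bio may not complete yet, while the kref counts owners of the request object itself. The following user-space sketch shows only that two-counter idea under simplified assumptions; my_req, put_completion_ref and the rest are invented names, not the kernel implementation.

/* Sketch (not kernel code): completion_ref gates completion, kref gates freeing. */
#include <stdatomic.h>
#include <stdlib.h>

struct my_req {
	atomic_int completion_ref;   /* > 0: something still blocks completion */
	atomic_int kref;             /* > 0: something still holds the object  */
};

static void my_req_destroy(struct my_req *req) { free(req); }

static void my_req_complete(struct my_req *req) { (void)req; /* complete master bio here */ }

/* Drop 'put' completion references; on the last one, complete the request
 * and drop the kref that was held on behalf of completion_ref > 0. */
static void put_completion_ref(struct my_req *req, int put)
{
	if (atomic_fetch_sub(&req->completion_ref, put) != put)
		return;                       /* other references still pending */
	my_req_complete(req);
	if (atomic_fetch_sub(&req->kref, 1) == 1)
		my_req_destroy(req);
}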
@@ -416,9 +454,12 @@ static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *re | |||
416 | int __req_mod(struct drbd_request *req, enum drbd_req_event what, | 454 | int __req_mod(struct drbd_request *req, enum drbd_req_event what, |
417 | struct bio_and_error *m) | 455 | struct bio_and_error *m) |
418 | { | 456 | { |
419 | struct drbd_conf *mdev = req->mdev; | 457 | struct drbd_conf *mdev = req->w.mdev; |
420 | int rv = 0; | 458 | struct net_conf *nc; |
421 | m->bio = NULL; | 459 | int p, rv = 0; |
460 | |||
461 | if (m) | ||
462 | m->bio = NULL; | ||
422 | 463 | ||
423 | switch (what) { | 464 | switch (what) { |
424 | default: | 465 | default: |
@@ -427,118 +468,91 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
427 | 468 | ||
428 | /* does not happen... | 469 | /* does not happen... |
429 | * initialization done in drbd_req_new | 470 | * initialization done in drbd_req_new |
430 | case created: | 471 | case CREATED: |
431 | break; | 472 | break; |
432 | */ | 473 | */ |
433 | 474 | ||
434 | case to_be_send: /* via network */ | 475 | case TO_BE_SENT: /* via network */ |
435 | /* reached via drbd_make_request_common | 476 | /* reached via __drbd_make_request |
436 | * and from w_read_retry_remote */ | 477 | * and from w_read_retry_remote */ |
437 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | 478 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); |
438 | req->rq_state |= RQ_NET_PENDING; | 479 | rcu_read_lock(); |
439 | inc_ap_pending(mdev); | 480 | nc = rcu_dereference(mdev->tconn->net_conf); |
481 | p = nc->wire_protocol; | ||
482 | rcu_read_unlock(); | ||
483 | req->rq_state |= | ||
484 | p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : | ||
485 | p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; | ||
486 | mod_rq_state(req, m, 0, RQ_NET_PENDING); | ||
440 | break; | 487 | break; |
441 | 488 | ||
442 | case to_be_submitted: /* locally */ | 489 | case TO_BE_SUBMITTED: /* locally */ |
443 | /* reached via drbd_make_request_common */ | 490 | /* reached via __drbd_make_request */ |
444 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); | 491 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); |
445 | req->rq_state |= RQ_LOCAL_PENDING; | 492 | mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); |
446 | break; | 493 | break; |
447 | 494 | ||
448 | case completed_ok: | 495 | case COMPLETED_OK: |
449 | if (req->rq_state & RQ_WRITE) | 496 | if (req->rq_state & RQ_WRITE) |
450 | mdev->writ_cnt += req->size>>9; | 497 | mdev->writ_cnt += req->i.size >> 9; |
451 | else | 498 | else |
452 | mdev->read_cnt += req->size>>9; | 499 | mdev->read_cnt += req->i.size >> 9; |
453 | 500 | ||
454 | req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); | 501 | mod_rq_state(req, m, RQ_LOCAL_PENDING, |
455 | req->rq_state &= ~RQ_LOCAL_PENDING; | 502 | RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); |
456 | |||
457 | _req_may_be_done_not_susp(req, m); | ||
458 | break; | 503 | break; |
459 | 504 | ||
460 | case abort_disk_io: | 505 | case ABORT_DISK_IO: |
461 | req->rq_state |= RQ_LOCAL_ABORTED; | 506 | mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); |
462 | if (req->rq_state & RQ_WRITE) | ||
463 | _req_may_be_done_not_susp(req, m); | ||
464 | else | ||
465 | goto goto_queue_for_net_read; | ||
466 | break; | 507 | break; |
467 | 508 | ||
468 | case write_completed_with_error: | 509 | case WRITE_COMPLETED_WITH_ERROR: |
469 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
470 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
471 | |||
472 | drbd_report_io_error(mdev, req); | 510 | drbd_report_io_error(mdev, req); |
473 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); | 511 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); |
474 | _req_may_be_done_not_susp(req, m); | 512 | mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); |
475 | break; | 513 | break; |
476 | 514 | ||
477 | case read_ahead_completed_with_error: | 515 | case READ_COMPLETED_WITH_ERROR: |
478 | /* it is legal to fail READA */ | 516 | drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); |
479 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
480 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
481 | _req_may_be_done_not_susp(req, m); | ||
482 | break; | ||
483 | |||
484 | case read_completed_with_error: | ||
485 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
486 | |||
487 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
488 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
489 | |||
490 | if (req->rq_state & RQ_LOCAL_ABORTED) { | ||
491 | _req_may_be_done(req, m); | ||
492 | break; | ||
493 | } | ||
494 | |||
495 | drbd_report_io_error(mdev, req); | 517 | drbd_report_io_error(mdev, req); |
496 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); | 518 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); |
519 | /* fall through. */ | ||
520 | case READ_AHEAD_COMPLETED_WITH_ERROR: | ||
521 | /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ | ||
522 | mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | ||
523 | break; | ||
497 | 524 | ||
498 | goto_queue_for_net_read: | 525 | case QUEUE_FOR_NET_READ: |
499 | |||
500 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
501 | |||
502 | /* no point in retrying if there is no good remote data, | ||
503 | * or we have no connection. */ | ||
504 | if (mdev->state.pdsk != D_UP_TO_DATE) { | ||
505 | _req_may_be_done_not_susp(req, m); | ||
506 | break; | ||
507 | } | ||
508 | |||
509 | /* _req_mod(req,to_be_send); oops, recursion... */ | ||
510 | req->rq_state |= RQ_NET_PENDING; | ||
511 | inc_ap_pending(mdev); | ||
512 | /* fall through: _req_mod(req,queue_for_net_read); */ | ||
513 | |||
514 | case queue_for_net_read: | ||
515 | /* READ or READA, and | 526 | /* READ or READA, and |
516 | * no local disk, | 527 | * no local disk, |
517 | * or target area marked as invalid, | 528 | * or target area marked as invalid, |
518 | * or just got an io-error. */ | 529 | * or just got an io-error. */ |
519 | /* from drbd_make_request_common | 530 | /* from __drbd_make_request |
520 | * or from bio_endio during read io-error recovery */ | 531 | * or from bio_endio during read io-error recovery */ |
521 | 532 | ||
522 | /* so we can verify the handle in the answer packet | 533 | /* So we can verify the handle in the answer packet. |
523 | * corresponding hlist_del is in _req_may_be_done() */ | 534 | * Corresponding drbd_remove_request_interval is in |
524 | hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector)); | 535 | * drbd_req_complete() */ |
536 | D_ASSERT(drbd_interval_empty(&req->i)); | ||
537 | drbd_insert_interval(&mdev->read_requests, &req->i); | ||
525 | 538 | ||
526 | drbd_set_flag(mdev, UNPLUG_REMOTE); | 539 | set_bit(UNPLUG_REMOTE, &mdev->flags); |
527 | 540 | ||
528 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 541 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
529 | req->rq_state |= RQ_NET_QUEUED; | 542 | D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0); |
530 | req->w.cb = (req->rq_state & RQ_LOCAL_MASK) | 543 | mod_rq_state(req, m, 0, RQ_NET_QUEUED); |
531 | ? w_read_retry_remote | 544 | req->w.cb = w_send_read_req; |
532 | : w_send_read_req; | 545 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
533 | drbd_queue_work(&mdev->data.work, &req->w); | ||
534 | break; | 546 | break; |
535 | 547 | ||
536 | case queue_for_net_write: | 548 | case QUEUE_FOR_NET_WRITE: |
537 | /* assert something? */ | 549 | /* assert something? */ |
538 | /* from drbd_make_request_common only */ | 550 | /* from __drbd_make_request only */ |
539 | 551 | ||
540 | hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector)); | 552 | /* Corresponding drbd_remove_request_interval is in |
541 | /* corresponding hlist_del is in _req_may_be_done() */ | 553 | * drbd_req_complete() */ |
554 | D_ASSERT(drbd_interval_empty(&req->i)); | ||
555 | drbd_insert_interval(&mdev->write_requests, &req->i); | ||
542 | 556 | ||
543 | /* NOTE | 557 | /* NOTE |
544 | * In case the req ended up on the transfer log before being | 558 | * In case the req ended up on the transfer log before being |
@@ -549,7 +563,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
549 | * | 563 | * |
550 | * _req_add_to_epoch(req); this has to be after the | 564 | * _req_add_to_epoch(req); this has to be after the |
551 | * _maybe_start_new_epoch(req); which happened in | 565 | * _maybe_start_new_epoch(req); which happened in |
552 | * drbd_make_request_common, because we now may set the bit | 566 | * __drbd_make_request, because we now may set the bit |
553 | * again ourselves to close the current epoch. | 567 | * again ourselves to close the current epoch. |
554 | * | 568 | * |
555 | * Add req to the (now) current epoch (barrier). */ | 569 | * Add req to the (now) current epoch (barrier). */ |
@@ -557,204 +571,189 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
557 | /* otherwise we may lose an unplug, which may cause some remote | 571 | /* otherwise we may lose an unplug, which may cause some remote |
558 | * io-scheduler timeout to expire, increasing maximum latency, | 572 | * io-scheduler timeout to expire, increasing maximum latency, |
559 | * hurting performance. */ | 573 | * hurting performance. */ |
560 | drbd_set_flag(mdev, UNPLUG_REMOTE); | 574 | set_bit(UNPLUG_REMOTE, &mdev->flags); |
561 | |||
562 | /* see drbd_make_request_common, | ||
563 | * just after it grabs the req_lock */ | ||
564 | D_ASSERT(drbd_test_flag(mdev, CREATE_BARRIER) == 0); | ||
565 | |||
566 | req->epoch = mdev->newest_tle->br_number; | ||
567 | |||
568 | /* increment size of current epoch */ | ||
569 | mdev->newest_tle->n_writes++; | ||
570 | 575 | ||
571 | /* queue work item to send data */ | 576 | /* queue work item to send data */ |
572 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 577 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
573 | req->rq_state |= RQ_NET_QUEUED; | 578 | mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); |
574 | req->w.cb = w_send_dblock; | 579 | req->w.cb = w_send_dblock; |
575 | drbd_queue_work(&mdev->data.work, &req->w); | 580 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
576 | 581 | ||
577 | /* close the epoch, in case it outgrew the limit */ | 582 | /* close the epoch, in case it outgrew the limit */ |
578 | if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size) | 583 | rcu_read_lock(); |
579 | queue_barrier(mdev); | 584 | nc = rcu_dereference(mdev->tconn->net_conf); |
585 | p = nc->max_epoch_size; | ||
586 | rcu_read_unlock(); | ||
587 | if (mdev->tconn->current_tle_writes >= p) | ||
588 | start_new_tl_epoch(mdev->tconn); | ||
580 | 589 | ||
581 | break; | 590 | break; |
582 | 591 | ||
583 | case queue_for_send_oos: | 592 | case QUEUE_FOR_SEND_OOS: |
584 | req->rq_state |= RQ_NET_QUEUED; | 593 | mod_rq_state(req, m, 0, RQ_NET_QUEUED); |
585 | req->w.cb = w_send_oos; | 594 | req->w.cb = w_send_out_of_sync; |
586 | drbd_queue_work(&mdev->data.work, &req->w); | 595 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
587 | break; | 596 | break; |
588 | 597 | ||
589 | case read_retry_remote_canceled: | 598 | case READ_RETRY_REMOTE_CANCELED: |
590 | case send_canceled: | 599 | case SEND_CANCELED: |
591 | case send_failed: | 600 | case SEND_FAILED: |
592 | /* real cleanup will be done from tl_clear. just update flags | 601 | /* real cleanup will be done from tl_clear. just update flags |
593 | * so it is no longer marked as on the worker queue */ | 602 | * so it is no longer marked as on the worker queue */ |
594 | req->rq_state &= ~RQ_NET_QUEUED; | 603 | mod_rq_state(req, m, RQ_NET_QUEUED, 0); |
595 | /* if we did it right, tl_clear should be scheduled only after | ||
596 | * this, so this should not be necessary! */ | ||
597 | _req_may_be_done_not_susp(req, m); | ||
598 | break; | 604 | break; |
599 | 605 | ||
600 | case handed_over_to_network: | 606 | case HANDED_OVER_TO_NETWORK: |
601 | /* assert something? */ | 607 | /* assert something? */ |
602 | if (bio_data_dir(req->master_bio) == WRITE) | ||
603 | atomic_add(req->size>>9, &mdev->ap_in_flight); | ||
604 | |||
605 | if (bio_data_dir(req->master_bio) == WRITE && | 608 | if (bio_data_dir(req->master_bio) == WRITE && |
606 | mdev->net_conf->wire_protocol == DRBD_PROT_A) { | 609 | !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { |
607 | /* this is what is dangerous about protocol A: | 610 | /* this is what is dangerous about protocol A: |
608 | * pretend it was successfully written on the peer. */ | 611 | * pretend it was successfully written on the peer. */ |
609 | if (req->rq_state & RQ_NET_PENDING) { | 612 | if (req->rq_state & RQ_NET_PENDING) |
610 | dec_ap_pending(mdev); | 613 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); |
611 | req->rq_state &= ~RQ_NET_PENDING; | 614 | /* else: neg-ack was faster... */ |
612 | req->rq_state |= RQ_NET_OK; | ||
613 | } /* else: neg-ack was faster... */ | ||
614 | /* it is still not yet RQ_NET_DONE until the | 615 | /* it is still not yet RQ_NET_DONE until the |
615 | * corresponding epoch barrier got acked as well, | 616 | * corresponding epoch barrier got acked as well, |
616 | * so we know what to dirty on connection loss */ | 617 | * so we know what to dirty on connection loss */ |
617 | } | 618 | } |
618 | req->rq_state &= ~RQ_NET_QUEUED; | 619 | mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); |
619 | req->rq_state |= RQ_NET_SENT; | ||
620 | _req_may_be_done_not_susp(req, m); | ||
621 | break; | 620 | break; |
622 | 621 | ||
623 | case oos_handed_to_network: | 622 | case OOS_HANDED_TO_NETWORK: |
624 | /* Was not set PENDING, no longer QUEUED, so is now DONE | 623 | /* Was not set PENDING, no longer QUEUED, so is now DONE |
625 | * as far as this connection is concerned. */ | 624 | * as far as this connection is concerned. */ |
626 | req->rq_state &= ~RQ_NET_QUEUED; | 625 | mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); |
627 | req->rq_state |= RQ_NET_DONE; | ||
628 | _req_may_be_done_not_susp(req, m); | ||
629 | break; | 626 | break; |
630 | 627 | ||
631 | case connection_lost_while_pending: | 628 | case CONNECTION_LOST_WHILE_PENDING: |
632 | /* transfer log cleanup after connection loss */ | 629 | /* transfer log cleanup after connection loss */ |
633 | /* assert something? */ | 630 | mod_rq_state(req, m, |
634 | if (req->rq_state & RQ_NET_PENDING) | 631 | RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, |
635 | dec_ap_pending(mdev); | 632 | RQ_NET_DONE); |
636 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
637 | req->rq_state |= RQ_NET_DONE; | ||
638 | if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE) | ||
639 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | ||
640 | |||
641 | /* if it is still queued, we may not complete it here. | ||
642 | * it will be canceled soon. */ | ||
643 | if (!(req->rq_state & RQ_NET_QUEUED)) | ||
644 | _req_may_be_done(req, m); /* Allowed while state.susp */ | ||
645 | break; | 633 | break; |
646 | 634 | ||
647 | case conflict_discarded_by_peer: | 635 | case CONFLICT_RESOLVED: |
648 | /* for discarded conflicting writes of multiple primaries, | 636 | /* for superseded conflicting writes of multiple primaries, |
649 | * there is no need to keep anything in the tl, potential | 637 | * there is no need to keep anything in the tl, potential |
650 | * node crashes are covered by the activity log. */ | 638 | * node crashes are covered by the activity log. |
651 | if (what == conflict_discarded_by_peer) | 639 | * |
652 | dev_alert(DEV, "Got DiscardAck packet %llus +%u!" | 640 | * If this request had been marked as RQ_POSTPONED before, |
653 | " DRBD is not a random data generator!\n", | 641 | * it will actually not be completed, but "restarted", |
654 | (unsigned long long)req->sector, req->size); | 642 | * resubmitted from the retry worker context. */ |
655 | req->rq_state |= RQ_NET_DONE; | 643 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
656 | /* fall through */ | 644 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); |
657 | case write_acked_by_peer_and_sis: | 645 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); |
658 | case write_acked_by_peer: | 646 | break; |
659 | if (what == write_acked_by_peer_and_sis) | 647 | |
660 | req->rq_state |= RQ_NET_SIS; | 648 | case WRITE_ACKED_BY_PEER_AND_SIS: |
649 | req->rq_state |= RQ_NET_SIS; | ||
650 | case WRITE_ACKED_BY_PEER: | ||
651 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); | ||
661 | /* protocol C; successfully written on peer. | 652 | /* protocol C; successfully written on peer. |
662 | * Nothing more to do here. | 653 | * Nothing more to do here. |
663 | * We want to keep the tl in place for all protocols, to cater | 654 | * We want to keep the tl in place for all protocols, to cater |
664 | * for volatile write-back caches on lower level devices. */ | 655 | * for volatile write-back caches on lower level devices. */ |
665 | 656 | ||
666 | case recv_acked_by_peer: | 657 | goto ack_common; |
658 | case RECV_ACKED_BY_PEER: | ||
659 | D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK); | ||
667 | /* protocol B; pretends to be successfully written on peer. | 660 | /* protocol B; pretends to be successfully written on peer. |
668 | * see also notes above in handed_over_to_network about | 661 | * see also notes above in HANDED_OVER_TO_NETWORK about |
669 | * protocol != C */ | 662 | * protocol != C */ |
670 | req->rq_state |= RQ_NET_OK; | 663 | ack_common: |
671 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 664 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
672 | dec_ap_pending(mdev); | 665 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); |
673 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | ||
674 | req->rq_state &= ~RQ_NET_PENDING; | ||
675 | _req_may_be_done_not_susp(req, m); | ||
676 | break; | 666 | break; |
677 | 667 | ||
678 | case neg_acked: | 668 | case POSTPONE_WRITE: |
679 | /* assert something? */ | 669 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); |
680 | if (req->rq_state & RQ_NET_PENDING) { | 670 | /* If this node has already detected the write conflict, the |
681 | dec_ap_pending(mdev); | 671 | * worker will be waiting on misc_wait. Wake it up once this |
682 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | 672 | * request has completed locally. |
683 | } | 673 | */ |
684 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | 674 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
675 | req->rq_state |= RQ_POSTPONED; | ||
676 | if (req->i.waiting) | ||
677 | wake_up(&mdev->misc_wait); | ||
678 | /* Do not clear RQ_NET_PENDING. This request will make further | ||
679 | * progress via restart_conflicting_writes() or | ||
680 | * fail_postponed_requests(). Hopefully. */ | ||
681 | break; | ||
685 | 682 | ||
686 | req->rq_state |= RQ_NET_DONE; | 683 | case NEG_ACKED: |
687 | _req_may_be_done_not_susp(req, m); | 684 | mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); |
688 | /* else: done by handed_over_to_network */ | ||
689 | break; | 685 | break; |
690 | 686 | ||
691 | case fail_frozen_disk_io: | 687 | case FAIL_FROZEN_DISK_IO: |
692 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) | 688 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) |
693 | break; | 689 | break; |
694 | 690 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); | |
695 | _req_may_be_done(req, m); /* Allowed while state.susp */ | ||
696 | break; | 691 | break; |
697 | 692 | ||
698 | case restart_frozen_disk_io: | 693 | case RESTART_FROZEN_DISK_IO: |
699 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) | 694 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) |
700 | break; | 695 | break; |
701 | 696 | ||
702 | req->rq_state &= ~RQ_LOCAL_COMPLETED; | 697 | mod_rq_state(req, m, |
698 | RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, | ||
699 | RQ_LOCAL_PENDING); | ||
703 | 700 | ||
704 | rv = MR_READ; | 701 | rv = MR_READ; |
705 | if (bio_data_dir(req->master_bio) == WRITE) | 702 | if (bio_data_dir(req->master_bio) == WRITE) |
706 | rv = MR_WRITE; | 703 | rv = MR_WRITE; |
707 | 704 | ||
708 | get_ldev(mdev); | 705 | get_ldev(mdev); /* always succeeds in this call path */ |
709 | req->w.cb = w_restart_disk_io; | 706 | req->w.cb = w_restart_disk_io; |
710 | drbd_queue_work(&mdev->data.work, &req->w); | 707 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
711 | break; | 708 | break; |
712 | 709 | ||
713 | case resend: | 710 | case RESEND: |
714 | /* Simply complete (local only) READs. */ | 711 | /* Simply complete (local only) READs. */ |
715 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { | 712 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { |
716 | _req_may_be_done(req, m); | 713 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); |
717 | break; | 714 | break; |
718 | } | 715 | } |
719 | 716 | ||
720 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK | 717 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK |
721 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. | 718 | before the connection loss (B&C only); only P_BARRIER_ACK |
722 | Trowing them out of the TL here by pretending we got a BARRIER_ACK | 719 | (or the local completion?) was missing when we suspended. |
723 | We ensure that the peer was not rebooted */ | 720 | Throwing them out of the TL here by pretending we got a BARRIER_ACK. |
721 | During connection handshake, we ensure that the peer was not rebooted. */ | ||
724 | if (!(req->rq_state & RQ_NET_OK)) { | 722 | if (!(req->rq_state & RQ_NET_OK)) { |
723 | /* FIXME could this possibly be a req->w.cb == w_send_out_of_sync? | ||
724 | * in that case we must not set RQ_NET_PENDING. */ | ||
725 | |||
726 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); | ||
725 | if (req->w.cb) { | 727 | if (req->w.cb) { |
726 | drbd_queue_work(&mdev->data.work, &req->w); | 728 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
727 | rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; | 729 | rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; |
728 | } | 730 | } /* else: FIXME can this happen? */ |
729 | break; | 731 | break; |
730 | } | 732 | } |
731 | /* else, fall through to barrier_acked */ | 733 | /* else, fall through to BARRIER_ACKED */ |
732 | 734 | ||
733 | case barrier_acked: | 735 | case BARRIER_ACKED: |
736 | /* barrier ack for READ requests does not make sense */ | ||
734 | if (!(req->rq_state & RQ_WRITE)) | 737 | if (!(req->rq_state & RQ_WRITE)) |
735 | break; | 738 | break; |
736 | 739 | ||
737 | if (req->rq_state & RQ_NET_PENDING) { | 740 | if (req->rq_state & RQ_NET_PENDING) { |
738 | /* barrier came in before all requests have been acked. | 741 | /* barrier came in before all requests were acked. |
739 | * this is bad, because if the connection is lost now, | 742 | * this is bad, because if the connection is lost now, |
740 | * we won't be able to clean them up... */ | 743 | * we won't be able to clean them up... */ |
741 | dev_err(DEV, "FIXME (barrier_acked but pending)\n"); | 744 | dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n"); |
742 | list_move(&req->tl_requests, &mdev->out_of_sequence_requests); | ||
743 | } | 745 | } |
744 | if ((req->rq_state & RQ_NET_MASK) != 0) { | 746 | /* Allowed to complete requests, even while suspended. |
745 | req->rq_state |= RQ_NET_DONE; | 747 | * As this is called for all requests within a matching epoch, |
746 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A) | 748 | * we need to filter, and only set RQ_NET_DONE for those that |
747 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | 749 | * have actually been on the wire. */ |
748 | } | 750 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, |
749 | _req_may_be_done(req, m); /* Allowed while state.susp */ | 751 | (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); |
750 | break; | 752 | break; |
751 | 753 | ||
752 | case data_received: | 754 | case DATA_RECEIVED: |
753 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 755 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
754 | dec_ap_pending(mdev); | 756 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); |
755 | req->rq_state &= ~RQ_NET_PENDING; | ||
756 | req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); | ||
757 | _req_may_be_done_not_susp(req, m); | ||
758 | break; | 757 | break; |
759 | }; | 758 | }; |
760 | 759 | ||
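After this rework, each case in the __req_mod() switch above is expressed as a clear/set pair of rq_state bits handed to mod_rq_state(), which takes or drops the matching references as bits change. The stand-alone sketch below only illustrates that transition-as-bitmask idea; the XRQ_* flags and apply_event() are invented for the example and deliberately leave out the reference accounting.

/* Sketch (not DRBD code): request events as clear/set bit-mask transitions. */
#include <stdio.h>

#define XRQ_NET_PENDING  (1u << 0)
#define XRQ_NET_QUEUED   (1u << 1)
#define XRQ_NET_SENT     (1u << 2)
#define XRQ_NET_OK       (1u << 3)

static unsigned apply_event(unsigned state, unsigned clear, unsigned set)
{
	/* a real implementation would also get/put references for each bit
	 * that changes; only the state transition itself is shown here */
	return (state & ~clear) | set;
}

int main(void)
{
	unsigned s = XRQ_NET_PENDING | XRQ_NET_QUEUED;
	/* HANDED_OVER_TO_NETWORK: no longer queued, now sent */
	s = apply_event(s, XRQ_NET_QUEUED, XRQ_NET_SENT);
	/* WRITE_ACKED_BY_PEER: pending becomes ok */
	s = apply_event(s, XRQ_NET_PENDING, XRQ_NET_OK);
	printf("rq_state = 0x%x\n", s);
	return 0;
}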
@@ -768,75 +767,265 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
768 | * since size may be bigger than BM_BLOCK_SIZE, | 767 | * since size may be bigger than BM_BLOCK_SIZE, |
769 | * we may need to check several bits. | 768 | * we may need to check several bits. |
770 | */ | 769 | */ |
771 | static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) | 770 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) |
772 | { | 771 | { |
773 | unsigned long sbnr, ebnr; | 772 | unsigned long sbnr, ebnr; |
774 | sector_t esector, nr_sectors; | 773 | sector_t esector, nr_sectors; |
775 | 774 | ||
776 | if (mdev->state.disk == D_UP_TO_DATE) | 775 | if (mdev->state.disk == D_UP_TO_DATE) |
777 | return 1; | 776 | return true; |
778 | if (mdev->state.disk >= D_OUTDATED) | 777 | if (mdev->state.disk != D_INCONSISTENT) |
779 | return 0; | 778 | return false; |
780 | if (mdev->state.disk < D_INCONSISTENT) | ||
781 | return 0; | ||
782 | /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ | ||
783 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
784 | esector = sector + (size >> 9) - 1; | 779 | esector = sector + (size >> 9) - 1; |
785 | 780 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | |
786 | D_ASSERT(sector < nr_sectors); | 781 | D_ASSERT(sector < nr_sectors); |
787 | D_ASSERT(esector < nr_sectors); | 782 | D_ASSERT(esector < nr_sectors); |
788 | 783 | ||
789 | sbnr = BM_SECT_TO_BIT(sector); | 784 | sbnr = BM_SECT_TO_BIT(sector); |
790 | ebnr = BM_SECT_TO_BIT(esector); | 785 | ebnr = BM_SECT_TO_BIT(esector); |
791 | 786 | ||
792 | return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); | 787 | return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0; |
788 | } | ||
789 | |||
790 | static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector, | ||
791 | enum drbd_read_balancing rbm) | ||
792 | { | ||
793 | struct backing_dev_info *bdi; | ||
794 | int stripe_shift; | ||
795 | |||
796 | switch (rbm) { | ||
797 | case RB_CONGESTED_REMOTE: | ||
798 | bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info; | ||
799 | return bdi_read_congested(bdi); | ||
800 | case RB_LEAST_PENDING: | ||
801 | return atomic_read(&mdev->local_cnt) > | ||
802 | atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt); | ||
803 | case RB_32K_STRIPING: /* stripe_shift = 15 */ | ||
804 | case RB_64K_STRIPING: | ||
805 | case RB_128K_STRIPING: | ||
806 | case RB_256K_STRIPING: | ||
807 | case RB_512K_STRIPING: | ||
808 | case RB_1M_STRIPING: /* stripe_shift = 20 */ | ||
809 | stripe_shift = (rbm - RB_32K_STRIPING + 15); | ||
810 | return (sector >> (stripe_shift - 9)) & 1; | ||
811 | case RB_ROUND_ROBIN: | ||
812 | return test_and_change_bit(READ_BALANCE_RR, &mdev->flags); | ||
813 | case RB_PREFER_REMOTE: | ||
814 | return true; | ||
815 | case RB_PREFER_LOCAL: | ||
816 | default: | ||
817 | return false; | ||
818 | } | ||
819 | } | ||
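The striping modes above pick local or remote purely from the start sector: stripe_shift runs from 15 (32k stripes) to 20 (1M stripes), and since a sector is 512 bytes (2^9), bit (stripe_shift - 9) of the sector number alternates once per stripe. A minimal user-space sketch of just that arithmetic (the enum values are stand-ins; only their relative order matters, as in the kernel code):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical stand-ins for the kernel enum; only the relative
     * ordering of the striping members matters for the math below. */
    enum rbm { RB_32K_STRIPING = 0, RB_64K_STRIPING, RB_128K_STRIPING,
               RB_256K_STRIPING, RB_512K_STRIPING, RB_1M_STRIPING };

    /* Same arithmetic as the striping cases of remote_due_to_read_balancing():
     * stripe size is 2^(15 + policy offset) bytes; a sector is 512 bytes, so
     * shifting by (stripe_shift - 9) yields the stripe index, and its lowest
     * bit picks local (0) or remote (1). */
    static bool read_goes_remote(uint64_t sector, enum rbm rbm)
    {
        int stripe_shift = rbm - RB_32K_STRIPING + 15;
        return (sector >> (stripe_shift - 9)) & 1;
    }

    int main(void)
    {
        /* With 32k stripes (64 sectors each), sectors 0..63 stay local,
         * 64..127 go remote, and so on. */
        for (uint64_t s = 0; s < 256; s += 64)
            printf("sector %3llu -> %s\n", (unsigned long long)s,
                   read_goes_remote(s, RB_32K_STRIPING) ? "remote" : "local");
        return 0;
    }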
820 | |||
821 | /* | ||
822 | * complete_conflicting_writes - wait for any conflicting write requests | ||
823 | * | ||
824 | * The write_requests tree contains all active write requests which we | ||
825 | * currently know about. Wait for any requests to complete which conflict with | ||
826 | * the new one. | ||
827 | * | ||
828 | * Only way out: remove the conflicting intervals from the tree. | ||
829 | */ | ||
830 | static void complete_conflicting_writes(struct drbd_request *req) | ||
831 | { | ||
832 | DEFINE_WAIT(wait); | ||
833 | struct drbd_conf *mdev = req->w.mdev; | ||
834 | struct drbd_interval *i; | ||
835 | sector_t sector = req->i.sector; | ||
836 | int size = req->i.size; | ||
837 | |||
838 | i = drbd_find_overlap(&mdev->write_requests, sector, size); | ||
839 | if (!i) | ||
840 | return; | ||
841 | |||
842 | for (;;) { | ||
843 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
844 | i = drbd_find_overlap(&mdev->write_requests, sector, size); | ||
845 | if (!i) | ||
846 | break; | ||
847 | /* Indicate to wake up device->misc_wait on progress. */ | ||
848 | i->waiting = true; | ||
849 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
850 | schedule(); | ||
851 | spin_lock_irq(&mdev->tconn->req_lock); | ||
852 | } | ||
853 | finish_wait(&mdev->misc_wait, &wait); | ||
793 | } | 854 | } |
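complete_conflicting_writes() keeps re-querying the write_requests interval tree and sleeps on misc_wait, with the req_lock dropped, until no overlap is left. The overlap test itself is the usual interval check; a small user-space sketch of it (same arithmetic as the overlaps() helper removed from drbd_req.h further below):

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t sector_t;

    /* Two ranges, each a start sector plus a size in bytes, overlap unless
     * one ends at or before the other begins. */
    static bool overlaps(sector_t s1, int l1, sector_t s2, int l2)
    {
        return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
    }

    int main(void)
    {
        /* 4k write at sector 8 vs. 4k write at sector 12: they share
         * sectors 12..15, so the new request has to wait. */
        printf("overlap: %d\n", overlaps(8, 4096, 12, 4096));   /* 1 */
        printf("overlap: %d\n", overlaps(8, 4096, 16, 4096));   /* 0 */
        return 0;
    }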
794 | 855 | ||
856 | /* called within req_lock and rcu_read_lock() */ | ||
795 | static void maybe_pull_ahead(struct drbd_conf *mdev) | 857 | static void maybe_pull_ahead(struct drbd_conf *mdev) |
796 | { | 858 | { |
797 | int congested = 0; | 859 | struct drbd_tconn *tconn = mdev->tconn; |
860 | struct net_conf *nc; | ||
861 | bool congested = false; | ||
862 | enum drbd_on_congestion on_congestion; | ||
863 | |||
864 | nc = rcu_dereference(tconn->net_conf); | ||
865 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; | ||
866 | if (on_congestion == OC_BLOCK || | ||
867 | tconn->agreed_pro_version < 96) | ||
868 | return; | ||
798 | 869 | ||
799 | /* If I don't even have good local storage, we can not reasonably try | 870 | /* If I don't even have good local storage, we can not reasonably try |
800 | * to pull ahead of the peer. We also need the local reference to make | 871 | * to pull ahead of the peer. We also need the local reference to make |
801 | * sure mdev->act_log is there. | 872 | * sure mdev->act_log is there. |
802 | * Note: caller has to make sure that net_conf is there. | ||
803 | */ | 873 | */ |
804 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) | 874 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) |
805 | return; | 875 | return; |
806 | 876 | ||
807 | if (mdev->net_conf->cong_fill && | 877 | if (nc->cong_fill && |
808 | atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { | 878 | atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) { |
809 | dev_info(DEV, "Congestion-fill threshold reached\n"); | 879 | dev_info(DEV, "Congestion-fill threshold reached\n"); |
810 | congested = 1; | 880 | congested = true; |
811 | } | 881 | } |
812 | 882 | ||
813 | if (mdev->act_log->used >= mdev->net_conf->cong_extents) { | 883 | if (mdev->act_log->used >= nc->cong_extents) { |
814 | dev_info(DEV, "Congestion-extents threshold reached\n"); | 884 | dev_info(DEV, "Congestion-extents threshold reached\n"); |
815 | congested = 1; | 885 | congested = true; |
816 | } | 886 | } |
817 | 887 | ||
818 | if (congested) { | 888 | if (congested) { |
819 | queue_barrier(mdev); /* last barrier, after mirrored writes */ | 889 | /* start a new epoch for non-mirrored writes */ |
890 | start_new_tl_epoch(mdev->tconn); | ||
820 | 891 | ||
821 | if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) | 892 | if (on_congestion == OC_PULL_AHEAD) |
822 | _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); | 893 | _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); |
823 | else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ | 894 | else /*nc->on_congestion == OC_DISCONNECT */ |
824 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); | 895 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); |
825 | } | 896 | } |
826 | put_ldev(mdev); | 897 | put_ldev(mdev); |
827 | } | 898 | } |
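maybe_pull_ahead() declares the link congested when either configured threshold is reached: the amount of data in flight hits cong_fill, or the number of active activity-log extents hits cong_extents. A hedged sketch of only that threshold logic (the struct and field names here are illustrative stand-ins, not the kernel's structures):

    #include <stdio.h>
    #include <stdbool.h>

    /* Hypothetical snapshot of the values maybe_pull_ahead() looks at. */
    struct congestion_view {
        unsigned int cong_fill;     /* threshold on data in flight (0 = off) */
        unsigned int cong_extents;  /* threshold on hot AL extents */
        unsigned int ap_in_flight;  /* data currently on the wire */
        unsigned int al_used;       /* activity-log extents currently in use */
    };

    /* Mirrors the two checks above: either threshold being reached marks the
     * peer link as congested, which then triggers the configured
     * on-congestion policy (pull-ahead or disconnect). */
    static bool link_congested(const struct congestion_view *v)
    {
        if (v->cong_fill && v->ap_in_flight >= v->cong_fill)
            return true;
        if (v->al_used >= v->cong_extents)
            return true;
        return false;
    }

    int main(void)
    {
        struct congestion_view v = { .cong_fill = 1024, .cong_extents = 127,
                                     .ap_in_flight = 2048, .al_used = 12 };
        printf("congested: %d\n", link_congested(&v));  /* 1: fill exceeded */
        return 0;
    }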
828 | 899 | ||
829 | static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | 900 | /* If this returns false, and req->private_bio is still set, |
901 | * this should be submitted locally. | ||
902 | * | ||
903 | * If it returns false, but req->private_bio is not set, | ||
904 | * we do not have access to good data :( | ||
905 | * | ||
906 | * Otherwise, this destroys req->private_bio, if any, | ||
907 | * and returns true. | ||
908 | */ | ||
909 | static bool do_remote_read(struct drbd_request *req) | ||
910 | { | ||
911 | struct drbd_conf *mdev = req->w.mdev; | ||
912 | enum drbd_read_balancing rbm; | ||
913 | |||
914 | if (req->private_bio) { | ||
915 | if (!drbd_may_do_local_read(mdev, | ||
916 | req->i.sector, req->i.size)) { | ||
917 | bio_put(req->private_bio); | ||
918 | req->private_bio = NULL; | ||
919 | put_ldev(mdev); | ||
920 | } | ||
921 | } | ||
922 | |||
923 | if (mdev->state.pdsk != D_UP_TO_DATE) | ||
924 | return false; | ||
925 | |||
926 | if (req->private_bio == NULL) | ||
927 | return true; | ||
928 | |||
929 | /* TODO: improve read balancing decisions, take into account drbd | ||
930 | * protocol, pending requests etc. */ | ||
931 | |||
932 | rcu_read_lock(); | ||
933 | rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing; | ||
934 | rcu_read_unlock(); | ||
935 | |||
936 | if (rbm == RB_PREFER_LOCAL && req->private_bio) | ||
937 | return false; /* submit locally */ | ||
938 | |||
939 | if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) { | ||
940 | if (req->private_bio) { | ||
941 | bio_put(req->private_bio); | ||
942 | req->private_bio = NULL; | ||
943 | put_ldev(mdev); | ||
944 | } | ||
945 | return true; | ||
946 | } | ||
947 | |||
948 | return false; | ||
949 | } | ||
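The precedence in do_remote_read() is: a read the local disk cannot serve loses its private bio first; then, without an up-to-date peer disk, the request must stay local (or has no data at all); without a local bio it must go remote; only when both paths remain does the read-balancing policy decide. A compressed user-space sketch of that ordering (plain booleans stand in for the real state and bio checks):

    #include <stdio.h>
    #include <stdbool.h>

    enum read_target { READ_LOCAL, READ_REMOTE, READ_NODATA };

    /* Same precedence as do_remote_read(): local readability first, then the
     * peer disk state, then the balancing policy; "balance_remote" stands in
     * for remote_due_to_read_balancing(). */
    static enum read_target pick_read_target(bool local_readable,
                                             bool peer_up_to_date,
                                             bool balance_remote)
    {
        if (!peer_up_to_date)
            return local_readable ? READ_LOCAL : READ_NODATA;
        if (!local_readable)
            return READ_REMOTE;
        return balance_remote ? READ_REMOTE : READ_LOCAL;
    }

    int main(void)
    {
        printf("%d\n", pick_read_target(true,  true,  false)); /* 0: local  */
        printf("%d\n", pick_read_target(false, true,  false)); /* 1: remote */
        printf("%d\n", pick_read_target(false, false, false)); /* 2: no data */
        return 0;
    }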
950 | |||
951 | /* returns number of connections (== 1, for drbd 8.4) | ||
952 | * expected to actually write this data, | ||
953 | * which does NOT include those that we are L_AHEAD for. */ | ||
954 | static int drbd_process_write_request(struct drbd_request *req) | ||
955 | { | ||
956 | struct drbd_conf *mdev = req->w.mdev; | ||
957 | int remote, send_oos; | ||
958 | |||
959 | rcu_read_lock(); | ||
960 | remote = drbd_should_do_remote(mdev->state); | ||
961 | if (remote) { | ||
962 | maybe_pull_ahead(mdev); | ||
963 | remote = drbd_should_do_remote(mdev->state); | ||
964 | } | ||
965 | send_oos = drbd_should_send_out_of_sync(mdev->state); | ||
966 | rcu_read_unlock(); | ||
967 | |||
968 | /* Need to replicate writes. Unless it is an empty flush, | ||
969 | * which is better mapped to a DRBD P_BARRIER packet, | ||
970 | * also for drbd wire protocol compatibility reasons. | ||
971 | * If this was a flush, just start a new epoch. | ||
972 | * Unless the current epoch was empty anyways, or we are not currently | ||
973 | * replicating, in which case there is no point. */ | ||
974 | if (unlikely(req->i.size == 0)) { | ||
975 | /* The only size==0 bios we expect are empty flushes. */ | ||
976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); | ||
977 | if (remote) | ||
978 | start_new_tl_epoch(mdev->tconn); | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | if (!remote && !send_oos) | ||
983 | return 0; | ||
984 | |||
985 | D_ASSERT(!(remote && send_oos)); | ||
986 | |||
987 | if (remote) { | ||
988 | _req_mod(req, TO_BE_SENT); | ||
989 | _req_mod(req, QUEUE_FOR_NET_WRITE); | ||
990 | } else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size)) | ||
991 | _req_mod(req, QUEUE_FOR_SEND_OOS); | ||
992 | |||
993 | return remote; | ||
994 | } | ||
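So drbd_process_write_request() either queues the write for the peer (TO_BE_SENT, QUEUE_FOR_NET_WRITE), or, while running Ahead of the peer, merely marks the range out of sync and queues a P_OUT_OF_SYNC notice (QUEUE_FOR_SEND_OOS); empty flushes at most start a new epoch. A small sketch of that branch structure (enum and helper names are hypothetical):

    #include <stdio.h>
    #include <stdbool.h>

    enum write_action { WR_NOTHING, WR_NEW_EPOCH_ONLY, WR_REPLICATE, WR_SEND_OOS };

    /* Mirrors the decision above: an empty flush at most closes the current
     * epoch; otherwise the write is either mirrored to the peer or only
     * flagged out-of-sync while the connection is in Ahead mode. */
    static enum write_action classify_write(unsigned int size,
                                            bool remote, bool send_oos)
    {
        if (size == 0)                       /* empty flush -> P_BARRIER only */
            return remote ? WR_NEW_EPOCH_ONLY : WR_NOTHING;
        if (remote)
            return WR_REPLICATE;
        if (send_oos)
            return WR_SEND_OOS;
        return WR_NOTHING;                   /* neither: purely local write */
    }

    int main(void)
    {
        printf("%d\n", classify_write(4096, true,  false)); /* 2: replicate */
        printf("%d\n", classify_write(4096, false, true));  /* 3: send OOS  */
        printf("%d\n", classify_write(0,    true,  false)); /* 1: new epoch */
        return 0;
    }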
995 | |||
996 | static void | ||
997 | drbd_submit_req_private_bio(struct drbd_request *req) | ||
998 | { | ||
999 | struct drbd_conf *mdev = req->w.mdev; | ||
1000 | struct bio *bio = req->private_bio; | ||
1001 | const int rw = bio_rw(bio); | ||
1002 | |||
1003 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1004 | |||
1005 | /* State may have changed since we grabbed our reference on the | ||
1006 | * ->ldev member. Double check, and short-circuit to endio. | ||
1007 | * In case the last activity log transaction failed to get on | ||
1008 | * stable storage, and this is a WRITE, we may not even submit | ||
1009 | * this bio. */ | ||
1010 | if (get_ldev(mdev)) { | ||
1011 | if (drbd_insert_fault(mdev, | ||
1012 | rw == WRITE ? DRBD_FAULT_DT_WR | ||
1013 | : rw == READ ? DRBD_FAULT_DT_RD | ||
1014 | : DRBD_FAULT_DT_RA)) | ||
1015 | bio_endio(bio, -EIO); | ||
1016 | else | ||
1017 | generic_make_request(bio); | ||
1018 | put_ldev(mdev); | ||
1019 | } else | ||
1020 | bio_endio(bio, -EIO); | ||
1021 | } | ||
1022 | |||
1023 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
830 | { | 1024 | { |
831 | const int rw = bio_rw(bio); | 1025 | const int rw = bio_rw(bio); |
832 | const int size = bio->bi_size; | 1026 | struct bio_and_error m = { NULL, }; |
833 | const sector_t sector = bio->bi_sector; | ||
834 | struct drbd_tl_epoch *b = NULL; | ||
835 | struct drbd_request *req; | 1027 | struct drbd_request *req; |
836 | int local, remote, send_oos = 0; | 1028 | bool no_remote = false; |
837 | int err = -EIO; | ||
838 | int ret = 0; | ||
839 | union drbd_state s; | ||
840 | 1029 | ||
841 | /* allocate outside of all locks; */ | 1030 | /* allocate outside of all locks; */ |
842 | req = drbd_req_new(mdev, bio); | 1031 | req = drbd_req_new(mdev, bio); |
@@ -846,55 +1035,14 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
846 | * if user cannot handle io errors, that's not our business. */ | 1035 | * if user cannot handle io errors, that's not our business. */ |
847 | dev_err(DEV, "could not kmalloc() req\n"); | 1036 | dev_err(DEV, "could not kmalloc() req\n"); |
848 | bio_endio(bio, -ENOMEM); | 1037 | bio_endio(bio, -ENOMEM); |
849 | return 0; | 1038 | return; |
850 | } | 1039 | } |
851 | req->start_time = start_time; | 1040 | req->start_time = start_time; |
852 | 1041 | ||
853 | local = get_ldev(mdev); | 1042 | if (!get_ldev(mdev)) { |
854 | if (!local) { | 1043 | bio_put(req->private_bio); |
855 | bio_put(req->private_bio); /* or we get a bio leak */ | ||
856 | req->private_bio = NULL; | 1044 | req->private_bio = NULL; |
857 | } | 1045 | } |
858 | if (rw == WRITE) { | ||
859 | /* Need to replicate writes. Unless it is an empty flush, | ||
860 | * which is better mapped to a DRBD P_BARRIER packet, | ||
861 | * also for drbd wire protocol compatibility reasons. */ | ||
862 | if (unlikely(size == 0)) { | ||
863 | /* The only size==0 bios we expect are empty flushes. */ | ||
864 | D_ASSERT(bio->bi_rw & REQ_FLUSH); | ||
865 | remote = 0; | ||
866 | } else | ||
867 | remote = 1; | ||
868 | } else { | ||
869 | /* READ || READA */ | ||
870 | if (local) { | ||
871 | if (!drbd_may_do_local_read(mdev, sector, size)) { | ||
872 | /* we could kick the syncer to | ||
873 | * sync this extent asap, wait for | ||
874 | * it, then continue locally. | ||
875 | * Or just issue the request remotely. | ||
876 | */ | ||
877 | local = 0; | ||
878 | bio_put(req->private_bio); | ||
879 | req->private_bio = NULL; | ||
880 | put_ldev(mdev); | ||
881 | } | ||
882 | } | ||
883 | remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; | ||
884 | } | ||
885 | |||
886 | /* If we have a disk, but a READA request is mapped to remote, | ||
887 | * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. | ||
888 | * Just fail that READA request right here. | ||
889 | * | ||
890 | * THINK: maybe fail all READA when not local? | ||
891 | * or make this configurable... | ||
892 | * if network is slow, READA won't do any good. | ||
893 | */ | ||
894 | if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { | ||
895 | err = -EWOULDBLOCK; | ||
896 | goto fail_and_free_req; | ||
897 | } | ||
898 | 1046 | ||
899 | /* For WRITES going to the local disk, grab a reference on the target | 1047 | /* For WRITES going to the local disk, grab a reference on the target |
900 | * extent. This waits for any resync activity in the corresponding | 1048 | * extent. This waits for any resync activity in the corresponding |
@@ -903,349 +1051,131 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
903 | * of transactional on-disk meta data updates. | 1051 | * of transactional on-disk meta data updates. |
904 | * Empty flushes don't need to go into the activity log, they can only | 1052 | * Empty flushes don't need to go into the activity log, they can only |
905 | * flush data for pending writes which are already in there. */ | 1053 | * flush data for pending writes which are already in there. */ |
906 | if (rw == WRITE && local && size | 1054 | if (rw == WRITE && req->private_bio && req->i.size |
907 | && !drbd_test_flag(mdev, AL_SUSPENDED)) { | 1055 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { |
908 | req->rq_state |= RQ_IN_ACT_LOG; | 1056 | req->rq_state |= RQ_IN_ACT_LOG; |
909 | drbd_al_begin_io(mdev, sector); | 1057 | drbd_al_begin_io(mdev, &req->i); |
910 | } | 1058 | } |
911 | 1059 | ||
912 | s = mdev->state; | 1060 | spin_lock_irq(&mdev->tconn->req_lock); |
913 | remote = remote && drbd_should_do_remote(s); | 1061 | if (rw == WRITE) { |
914 | send_oos = rw == WRITE && drbd_should_send_oos(s); | 1062 | /* This may temporarily give up the req_lock, |
915 | D_ASSERT(!(remote && send_oos)); | 1063 | * but will re-acquire it before it returns here. |
916 | 1064 | * Needs to be before the check on drbd_suspended() */ | |
917 | if (!(local || remote) && !is_susp(mdev->state)) { | 1065 | complete_conflicting_writes(req); |
918 | if (__ratelimit(&drbd_ratelimit_state)) | ||
919 | dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n", | ||
920 | (unsigned long long)req->sector, req->size >> 9); | ||
921 | goto fail_free_complete; | ||
922 | } | ||
923 | |||
924 | /* For WRITE request, we have to make sure that we have an | ||
925 | * unused_spare_tle, in case we need to start a new epoch. | ||
926 | * I try to be smart and avoid to pre-allocate always "just in case", | ||
927 | * but there is a race between testing the bit and pointer outside the | ||
928 | * spinlock, and grabbing the spinlock. | ||
929 | * if we lost that race, we retry. */ | ||
930 | if (rw == WRITE && (remote || send_oos) && | ||
931 | mdev->unused_spare_tle == NULL && | ||
932 | drbd_test_flag(mdev, CREATE_BARRIER)) { | ||
933 | allocate_barrier: | ||
934 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); | ||
935 | if (!b) { | ||
936 | dev_err(DEV, "Failed to alloc barrier.\n"); | ||
937 | err = -ENOMEM; | ||
938 | goto fail_free_complete; | ||
939 | } | ||
940 | } | 1066 | } |
941 | 1067 | ||
942 | /* GOOD, everything prepared, grab the spin_lock */ | 1068 | /* no more giving up req_lock from now on! */ |
943 | spin_lock_irq(&mdev->req_lock); | ||
944 | |||
945 | if (is_susp(mdev->state)) { | ||
946 | /* If we got suspended, use the retry mechanism of | ||
947 | drbd_make_request() to restart processing of this | ||
948 | bio. In the next call to drbd_make_request | ||
949 | we sleep in inc_ap_bio() */ | ||
950 | ret = 1; | ||
951 | spin_unlock_irq(&mdev->req_lock); | ||
952 | goto fail_free_complete; | ||
953 | } | ||
954 | 1069 | ||
955 | if (remote || send_oos) { | 1070 | if (drbd_suspended(mdev)) { |
956 | remote = drbd_should_do_remote(mdev->state); | 1071 | /* push back and retry: */ |
957 | send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); | 1072 | req->rq_state |= RQ_POSTPONED; |
958 | D_ASSERT(!(remote && send_oos)); | 1073 | if (req->private_bio) { |
959 | 1074 | bio_put(req->private_bio); | |
960 | if (!(remote || send_oos)) | 1075 | req->private_bio = NULL; |
961 | dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); | 1076 | put_ldev(mdev); |
962 | if (!(local || remote)) { | ||
963 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
964 | spin_unlock_irq(&mdev->req_lock); | ||
965 | goto fail_free_complete; | ||
966 | } | 1077 | } |
1078 | goto out; | ||
967 | } | 1079 | } |
968 | 1080 | ||
969 | if (b && mdev->unused_spare_tle == NULL) { | ||
970 | mdev->unused_spare_tle = b; | ||
971 | b = NULL; | ||
972 | } | ||
973 | if (rw == WRITE && (remote || send_oos) && | ||
974 | mdev->unused_spare_tle == NULL && | ||
975 | drbd_test_flag(mdev, CREATE_BARRIER)) { | ||
976 | /* someone closed the current epoch | ||
977 | * while we were grabbing the spinlock */ | ||
978 | spin_unlock_irq(&mdev->req_lock); | ||
979 | goto allocate_barrier; | ||
980 | } | ||
981 | |||
982 | |||
983 | /* Update disk stats */ | 1081 | /* Update disk stats */ |
984 | _drbd_start_io_acct(mdev, req, bio); | 1082 | _drbd_start_io_acct(mdev, req, bio); |
985 | 1083 | ||
986 | /* _maybe_start_new_epoch(mdev); | 1084 | /* We fail READ/READA early, if we can not serve it. |
987 | * If we need to generate a write barrier packet, we have to add the | 1085 | * We must do this before req is registered on any lists. |
988 | * new epoch (barrier) object, and queue the barrier packet for sending, | 1086 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ |
989 | * and queue the req's data after it _within the same lock_, otherwise | 1087 | if (rw != WRITE) { |
990 | * we have race conditions were the reorder domains could be mixed up. | 1088 | if (!do_remote_read(req) && !req->private_bio) |
991 | * | 1089 | goto nodata; |
992 | * Even read requests may start a new epoch and queue the corresponding | ||
993 | * barrier packet. To get the write ordering right, we only have to | ||
994 | * make sure that, if this is a write request and it triggered a | ||
995 | * barrier packet, this request is queued within the same spinlock. */ | ||
996 | if ((remote || send_oos) && mdev->unused_spare_tle && | ||
997 | drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) { | ||
998 | _tl_add_barrier(mdev, mdev->unused_spare_tle); | ||
999 | mdev->unused_spare_tle = NULL; | ||
1000 | } else { | ||
1001 | D_ASSERT(!(remote && rw == WRITE && | ||
1002 | drbd_test_flag(mdev, CREATE_BARRIER))); | ||
1003 | } | 1090 | } |
1004 | 1091 | ||
1005 | /* NOTE | 1092 | /* which transfer log epoch does this belong to? */ |
1006 | * Actually, 'local' may be wrong here already, since we may have failed | 1093 | req->epoch = atomic_read(&mdev->tconn->current_tle_nr); |
1007 | * to write to the meta data, and may become wrong anytime because of | ||
1008 | * local io-error for some other request, which would lead to us | ||
1009 | * "detaching" the local disk. | ||
1010 | * | ||
1011 | * 'remote' may become wrong any time because the network could fail. | ||
1012 | * | ||
1013 | * This is a harmless race condition, though, since it is handled | ||
1014 | * correctly at the appropriate places; so it just defers the failure | ||
1015 | * of the respective operation. | ||
1016 | */ | ||
1017 | |||
1018 | /* mark them early for readability. | ||
1019 | * this just sets some state flags. */ | ||
1020 | if (remote) | ||
1021 | _req_mod(req, to_be_send); | ||
1022 | if (local) | ||
1023 | _req_mod(req, to_be_submitted); | ||
1024 | |||
1025 | /* check this request on the collision detection hash tables. | ||
1026 | * if we have a conflict, just complete it here. | ||
1027 | * THINK do we want to check reads, too? (I don't think so...) */ | ||
1028 | if (rw == WRITE && _req_conflicts(req)) | ||
1029 | goto fail_conflicting; | ||
1030 | 1094 | ||
1031 | /* no point in adding empty flushes to the transfer log, | 1095 | /* no point in adding empty flushes to the transfer log, |
1032 | * they are mapped to drbd barriers already. */ | 1096 | * they are mapped to drbd barriers already. */ |
1033 | if (likely(size!=0)) | 1097 | if (likely(req->i.size!=0)) { |
1034 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | 1098 | if (rw == WRITE) |
1099 | mdev->tconn->current_tle_writes++; | ||
1035 | 1100 | ||
1036 | /* NOTE remote first: to get the concurrent write detection right, | 1101 | list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log); |
1037 | * we must register the request before start of local IO. */ | ||
1038 | if (remote) { | ||
1039 | /* either WRITE and C_CONNECTED, | ||
1040 | * or READ, and no local disk, | ||
1041 | * or READ, but not in sync. | ||
1042 | */ | ||
1043 | _req_mod(req, (rw == WRITE) | ||
1044 | ? queue_for_net_write | ||
1045 | : queue_for_net_read); | ||
1046 | } | 1102 | } |
1047 | if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) | ||
1048 | _req_mod(req, queue_for_send_oos); | ||
1049 | |||
1050 | if (remote && | ||
1051 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) | ||
1052 | maybe_pull_ahead(mdev); | ||
1053 | 1103 | ||
1054 | /* If this was a flush, queue a drbd barrier/start a new epoch. | 1104 | if (rw == WRITE) { |
1055 | * Unless the current epoch was empty anyways, or we are not currently | 1105 | if (!drbd_process_write_request(req)) |
1056 | * replicating, in which case there is no point. */ | 1106 | no_remote = true; |
1057 | if (unlikely(bio->bi_rw & REQ_FLUSH) | 1107 | } else { |
1058 | && mdev->newest_tle->n_writes | 1108 | /* We either have a private_bio, or we can read from remote. |
1059 | && drbd_should_do_remote(mdev->state)) | 1109 | * Otherwise we had done the goto nodata above. */ |
1060 | queue_barrier(mdev); | 1110 | if (req->private_bio == NULL) { |
1061 | 1111 | _req_mod(req, TO_BE_SENT); | |
1062 | spin_unlock_irq(&mdev->req_lock); | 1112 | _req_mod(req, QUEUE_FOR_NET_READ); |
1063 | kfree(b); /* if someone else has beaten us to it... */ | ||
1064 | |||
1065 | if (local) { | ||
1066 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1067 | |||
1068 | /* State may have changed since we grabbed our reference on the | ||
1069 | * mdev->ldev member. Double check, and short-circuit to endio. | ||
1070 | * In case the last activity log transaction failed to get on | ||
1071 | * stable storage, and this is a WRITE, we may not even submit | ||
1072 | * this bio. */ | ||
1073 | if (get_ldev(mdev)) { | ||
1074 | if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR | ||
1075 | : rw == READ ? DRBD_FAULT_DT_RD | ||
1076 | : DRBD_FAULT_DT_RA)) | ||
1077 | bio_endio(req->private_bio, -EIO); | ||
1078 | else | ||
1079 | generic_make_request(req->private_bio); | ||
1080 | put_ldev(mdev); | ||
1081 | } else | 1113 | } else |
1082 | bio_endio(req->private_bio, -EIO); | 1114 | no_remote = true; |
1083 | } | 1115 | } |
1084 | 1116 | ||
1085 | return 0; | 1117 | if (req->private_bio) { |
1086 | 1118 | /* needs to be marked within the same spinlock */ | |
1087 | fail_conflicting: | 1119 | _req_mod(req, TO_BE_SUBMITTED); |
1088 | /* this is a conflicting request. | 1120 | /* but we need to give up the spinlock to submit */ |
1089 | * even though it may have been only _partially_ | 1121 | spin_unlock_irq(&mdev->tconn->req_lock); |
1090 | * overlapping with one of the currently pending requests, | 1122 | drbd_submit_req_private_bio(req); |
1091 | * without even submitting or sending it, we will | 1123 | spin_lock_irq(&mdev->tconn->req_lock); |
1092 | * pretend that it was successfully served right now. | 1124 | } else if (no_remote) { |
1093 | */ | 1125 | nodata: |
1094 | _drbd_end_io_acct(mdev, req); | 1126 | if (__ratelimit(&drbd_ratelimit_state)) |
1095 | spin_unlock_irq(&mdev->req_lock); | 1127 | dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n", |
1096 | if (remote) | 1128 | (unsigned long long)req->i.sector, req->i.size >> 9); |
1097 | dec_ap_pending(mdev); | 1129 | /* A write may have been queued for send_oos, however. |
1098 | /* THINK: do we want to fail it (-EIO), or pretend success? | 1130 | * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ |
1099 | * this pretends success. */ | ||
1100 | err = 0; | ||
1101 | |||
1102 | fail_free_complete: | ||
1103 | if (req->rq_state & RQ_IN_ACT_LOG) | ||
1104 | drbd_al_complete_io(mdev, sector); | ||
1105 | fail_and_free_req: | ||
1106 | if (local) { | ||
1107 | bio_put(req->private_bio); | ||
1108 | req->private_bio = NULL; | ||
1109 | put_ldev(mdev); | ||
1110 | } | 1131 | } |
1111 | if (!ret) | ||
1112 | bio_endio(bio, err); | ||
1113 | |||
1114 | drbd_req_free(req); | ||
1115 | dec_ap_bio(mdev); | ||
1116 | kfree(b); | ||
1117 | |||
1118 | return ret; | ||
1119 | } | ||
1120 | 1132 | ||
1121 | /* helper function for drbd_make_request | 1133 | out: |
1122 | * if we can determine just by the mdev (state) that this request will fail, | 1134 | if (drbd_req_put_completion_ref(req, &m, 1)) |
1123 | * return 1 | 1135 | kref_put(&req->kref, drbd_req_destroy); |
1124 | * otherwise return 0 | 1136 | spin_unlock_irq(&mdev->tconn->req_lock); |
1125 | */ | ||
1126 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | ||
1127 | { | ||
1128 | if (mdev->state.role != R_PRIMARY && | ||
1129 | (!allow_oos || is_write)) { | ||
1130 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1131 | dev_err(DEV, "Process %s[%u] tried to %s; " | ||
1132 | "since we are not in Primary state, " | ||
1133 | "we cannot allow this\n", | ||
1134 | current->comm, current->pid, | ||
1135 | is_write ? "WRITE" : "READ"); | ||
1136 | } | ||
1137 | return 1; | ||
1138 | } | ||
1139 | 1137 | ||
1140 | return 0; | 1138 | if (m.bio) |
1139 | complete_master_bio(mdev, &m); | ||
1140 | return; | ||
1141 | } | 1141 | } |
1142 | 1142 | ||
1143 | void drbd_make_request(struct request_queue *q, struct bio *bio) | 1143 | void drbd_make_request(struct request_queue *q, struct bio *bio) |
1144 | { | 1144 | { |
1145 | unsigned int s_enr, e_enr; | ||
1146 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | 1145 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; |
1147 | unsigned long start_time; | 1146 | unsigned long start_time; |
1148 | 1147 | ||
1149 | if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { | ||
1150 | bio_endio(bio, -EPERM); | ||
1151 | return; | ||
1152 | } | ||
1153 | |||
1154 | start_time = jiffies; | 1148 | start_time = jiffies; |
1155 | 1149 | ||
1156 | /* | 1150 | /* |
1157 | * what we "blindly" assume: | 1151 | * what we "blindly" assume: |
1158 | */ | 1152 | */ |
1159 | D_ASSERT((bio->bi_size & 0x1ff) == 0); | 1153 | D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); |
1160 | |||
1161 | /* to make some things easier, force alignment of requests within the | ||
1162 | * granularity of our hash tables */ | ||
1163 | s_enr = bio->bi_sector >> HT_SHIFT; | ||
1164 | e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; | ||
1165 | |||
1166 | if (likely(s_enr == e_enr)) { | ||
1167 | do { | ||
1168 | inc_ap_bio(mdev, 1); | ||
1169 | } while (drbd_make_request_common(mdev, bio, start_time)); | ||
1170 | return; | ||
1171 | } | ||
1172 | |||
1173 | /* can this bio be split generically? | ||
1174 | * Maybe add our own split-arbitrary-bios function. */ | ||
1175 | if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) { | ||
1176 | /* rather error out here than BUG in bio_split */ | ||
1177 | dev_err(DEV, "bio would need to, but cannot, be split: " | ||
1178 | "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", | ||
1179 | bio->bi_vcnt, bio->bi_idx, bio->bi_size, | ||
1180 | (unsigned long long)bio->bi_sector); | ||
1181 | bio_endio(bio, -EINVAL); | ||
1182 | } else { | ||
1183 | /* This bio crosses some boundary, so we have to split it. */ | ||
1184 | struct bio_pair *bp; | ||
1185 | /* works for the "do not cross hash slot boundaries" case | ||
1186 | * e.g. sector 262269, size 4096 | ||
1187 | * s_enr = 262269 >> 6 = 4097 | ||
1188 | * e_enr = (262269+8-1) >> 6 = 4098 | ||
1189 | * HT_SHIFT = 6 | ||
1190 | * sps = 64, mask = 63 | ||
1191 | * first_sectors = 64 - (262269 & 63) = 3 | ||
1192 | */ | ||
1193 | const sector_t sect = bio->bi_sector; | ||
1194 | const int sps = 1 << HT_SHIFT; /* sectors per slot */ | ||
1195 | const int mask = sps - 1; | ||
1196 | const sector_t first_sectors = sps - (sect & mask); | ||
1197 | bp = bio_split(bio, first_sectors); | ||
1198 | 1154 | ||
1199 | /* we need to get a "reference count" (ap_bio_cnt) | 1155 | inc_ap_bio(mdev); |
1200 | * to avoid races with the disconnect/reconnect/suspend code. | 1156 | __drbd_make_request(mdev, bio, start_time); |
1201 | * In case we need to split the bio here, we need to get three references | ||
1202 | * atomically, otherwise we might deadlock when trying to submit the | ||
1203 | * second one! */ | ||
1204 | inc_ap_bio(mdev, 3); | ||
1205 | |||
1206 | D_ASSERT(e_enr == s_enr + 1); | ||
1207 | |||
1208 | while (drbd_make_request_common(mdev, &bp->bio1, start_time)) | ||
1209 | inc_ap_bio(mdev, 1); | ||
1210 | |||
1211 | while (drbd_make_request_common(mdev, &bp->bio2, start_time)) | ||
1212 | inc_ap_bio(mdev, 1); | ||
1213 | |||
1214 | dec_ap_bio(mdev); | ||
1215 | |||
1216 | bio_pair_release(bp); | ||
1217 | } | ||
1218 | } | 1157 | } |
1219 | 1158 | ||
1220 | /* This is called by bio_add_page(). With this function we reduce | 1159 | /* This is called by bio_add_page(). |
1221 | * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs | 1160 | * |
1222 | * units (was AL_EXTENTs). | 1161 | * q->max_hw_sectors and other global limits are already enforced there. |
1223 | * | 1162 | * |
1224 | * we do the calculation within the lower 32bit of the byte offsets, | 1163 | * We need to call down to our lower level device, |
1225 | * since we don't care for actual offset, but only check whether it | 1164 | * in case it has special restrictions. |
1226 | * would cross "activity log extent" boundaries. | 1165 | * |
1166 | * We also may need to enforce configured max-bio-bvecs limits. | ||
1227 | * | 1167 | * |
1228 | * As long as the BIO is empty we have to allow at least one bvec, | 1168 | * As long as the BIO is empty we have to allow at least one bvec, |
1229 | * regardless of size and offset. so the resulting bio may still | 1169 | * regardless of size and offset, so no need to ask lower levels. |
1230 | * cross extent boundaries. those are dealt with (bio_split) in | ||
1231 | * drbd_make_request. | ||
1232 | */ | 1170 | */ |
1233 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) | 1171 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) |
1234 | { | 1172 | { |
1235 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | 1173 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; |
1236 | unsigned int bio_offset = | ||
1237 | (unsigned int)bvm->bi_sector << 9; /* 32 bit */ | ||
1238 | unsigned int bio_size = bvm->bi_size; | 1174 | unsigned int bio_size = bvm->bi_size; |
1239 | int limit, backing_limit; | 1175 | int limit = DRBD_MAX_BIO_SIZE; |
1240 | 1176 | int backing_limit; | |
1241 | limit = DRBD_MAX_BIO_SIZE | 1177 | |
1242 | - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size); | 1178 | if (bio_size && get_ldev(mdev)) { |
1243 | if (limit < 0) | ||
1244 | limit = 0; | ||
1245 | if (bio_size == 0) { | ||
1246 | if (limit <= bvec->bv_len) | ||
1247 | limit = bvec->bv_len; | ||
1248 | } else if (limit && get_ldev(mdev)) { | ||
1249 | struct request_queue * const b = | 1179 | struct request_queue * const b = |
1250 | mdev->ldev->backing_bdev->bd_disk->queue; | 1180 | mdev->ldev->backing_bdev->bd_disk->queue; |
1251 | if (b->merge_bvec_fn) { | 1181 | if (b->merge_bvec_fn) { |
@@ -1257,24 +1187,38 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct | |||
1257 | return limit; | 1187 | return limit; |
1258 | } | 1188 | } |
1259 | 1189 | ||
1190 | struct drbd_request *find_oldest_request(struct drbd_tconn *tconn) | ||
1191 | { | ||
1192 | /* Walk the transfer log, | ||
1193 | * and find the oldest not yet completed request */ | ||
1194 | struct drbd_request *r; | ||
1195 | list_for_each_entry(r, &tconn->transfer_log, tl_requests) { | ||
1196 | if (atomic_read(&r->completion_ref)) | ||
1197 | return r; | ||
1198 | } | ||
1199 | return NULL; | ||
1200 | } | ||
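Because requests are appended to the tail of the transfer log, find_oldest_request() can simply return the first entry that still holds a completion reference. A tiny array-based sketch of the same scan (struct name and fields are illustrative only):

    #include <stdio.h>
    #include <stddef.h>

    struct fake_req {
        unsigned long start_time;
        int completion_ref;   /* non-zero while the request is still pending */
    };

    /* The log is appended at the tail, so scanning from the head yields the
     * oldest entry that has not completed yet. */
    static const struct fake_req *oldest_pending(const struct fake_req *log, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            if (log[i].completion_ref)
                return &log[i];
        return NULL;
    }

    int main(void)
    {
        struct fake_req log[] = {
            { .start_time = 100, .completion_ref = 0 },  /* already done   */
            { .start_time = 150, .completion_ref = 1 },  /* oldest pending */
            { .start_time = 200, .completion_ref = 1 },
        };
        const struct fake_req *r = oldest_pending(log, 3);
        printf("oldest pending started at %lu\n", r ? r->start_time : 0UL);
        return 0;
    }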
1201 | |||
1260 | void request_timer_fn(unsigned long data) | 1202 | void request_timer_fn(unsigned long data) |
1261 | { | 1203 | { |
1262 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 1204 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
1205 | struct drbd_tconn *tconn = mdev->tconn; | ||
1263 | struct drbd_request *req; /* oldest request */ | 1206 | struct drbd_request *req; /* oldest request */ |
1264 | struct list_head *le; | 1207 | struct net_conf *nc; |
1265 | unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ | 1208 | unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ |
1266 | unsigned long now; | 1209 | unsigned long now; |
1267 | 1210 | ||
1268 | if (get_net_conf(mdev)) { | 1211 | rcu_read_lock(); |
1269 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) | 1212 | nc = rcu_dereference(tconn->net_conf); |
1270 | ent = mdev->net_conf->timeout*HZ/10 | 1213 | if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS) |
1271 | * mdev->net_conf->ko_count; | 1214 | ent = nc->timeout * HZ/10 * nc->ko_count; |
1272 | put_net_conf(mdev); | 1215 | |
1273 | } | ||
1274 | if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ | 1216 | if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ |
1275 | dt = mdev->ldev->dc.disk_timeout * HZ / 10; | 1217 | dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10; |
1276 | put_ldev(mdev); | 1218 | put_ldev(mdev); |
1277 | } | 1219 | } |
1220 | rcu_read_unlock(); | ||
1221 | |||
1278 | et = min_not_zero(dt, ent); | 1222 | et = min_not_zero(dt, ent); |
1279 | 1223 | ||
1280 | if (!et) | 1224 | if (!et) |
@@ -1282,17 +1226,14 @@ void request_timer_fn(unsigned long data) | |||
1282 | 1226 | ||
1283 | now = jiffies; | 1227 | now = jiffies; |
1284 | 1228 | ||
1285 | spin_lock_irq(&mdev->req_lock); | 1229 | spin_lock_irq(&tconn->req_lock); |
1286 | le = &mdev->oldest_tle->requests; | 1230 | req = find_oldest_request(tconn); |
1287 | if (list_empty(le)) { | 1231 | if (!req) { |
1288 | spin_unlock_irq(&mdev->req_lock); | 1232 | spin_unlock_irq(&tconn->req_lock); |
1289 | mod_timer(&mdev->request_timer, now + et); | 1233 | mod_timer(&mdev->request_timer, now + et); |
1290 | return; | 1234 | return; |
1291 | } | 1235 | } |
1292 | 1236 | ||
1293 | le = le->prev; | ||
1294 | req = list_entry(le, struct drbd_request, tl_requests); | ||
1295 | |||
1296 | /* The request is considered timed out, if | 1237 | /* The request is considered timed out, if |
1297 | * - we have some effective timeout from the configuration, | 1238 | * - we have some effective timeout from the configuration, |
1298 | * with above state restrictions applied, | 1239 | * with above state restrictions applied, |
@@ -1311,17 +1252,17 @@ void request_timer_fn(unsigned long data) | |||
1311 | */ | 1252 | */ |
1312 | if (ent && req->rq_state & RQ_NET_PENDING && | 1253 | if (ent && req->rq_state & RQ_NET_PENDING && |
1313 | time_after(now, req->start_time + ent) && | 1254 | time_after(now, req->start_time + ent) && |
1314 | !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { | 1255 | !time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) { |
1315 | dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); | 1256 | dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); |
1316 | _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); | 1257 | _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); |
1317 | } | 1258 | } |
1318 | if (dt && req->rq_state & RQ_LOCAL_PENDING && | 1259 | if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev && |
1319 | time_after(now, req->start_time + dt) && | 1260 | time_after(now, req->start_time + dt) && |
1320 | !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { | 1261 | !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { |
1321 | dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); | 1262 | dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); |
1322 | __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); | 1263 | __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); |
1323 | } | 1264 | } |
1324 | nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; | 1265 | nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; |
1325 | spin_unlock_irq(&mdev->req_lock); | 1266 | spin_unlock_irq(&tconn->req_lock); |
1326 | mod_timer(&mdev->request_timer, nt); | 1267 | mod_timer(&mdev->request_timer, nt); |
1327 | } | 1268 | } |
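The effective timeouts combined here are the network timeout times ko-count and the disk timeout, both configured in tenths of a second (hence the * HZ / 10), joined with min_not_zero(); the timer is then re-armed relative to the oldest pending request. A quick user-space version of the arithmetic (HZ fixed to 1000 purely for this example):

    #include <stdio.h>

    #define HZ 1000  /* assumed tick rate for this example only */

    static unsigned long min_not_zero(unsigned long a, unsigned long b)
    {
        return a == 0 ? b : (b == 0 ? a : (a < b ? a : b));
    }

    int main(void)
    {
        /* drbd.conf-style values: timeout and disk-timeout in tenths of a
         * second, ko-count a plain multiplier. */
        unsigned long timeout = 60, ko_count = 7, disk_timeout = 0;

        unsigned long ent = timeout * HZ / 10 * ko_count;   /* network    */
        unsigned long dt  = disk_timeout * HZ / 10;          /* local disk */
        unsigned long et  = min_not_zero(dt, ent);            /* effective  */

        /* 6 s * 7 = 42 s worth of jiffies before the peer is declared dead. */
        printf("ent=%lu jiffies, dt=%lu, et=%lu\n", ent, dt, et);
        return 0;
    }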
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 3d2111919486..016de6b8bb57 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -77,40 +77,41 @@ | |||
77 | */ | 77 | */ |
78 | 78 | ||
79 | enum drbd_req_event { | 79 | enum drbd_req_event { |
80 | created, | 80 | CREATED, |
81 | to_be_send, | 81 | TO_BE_SENT, |
82 | to_be_submitted, | 82 | TO_BE_SUBMITTED, |
83 | 83 | ||
84 | /* XXX yes, now I am inconsistent... | 84 | /* XXX yes, now I am inconsistent... |
85 | * these are not "events" but "actions" | 85 | * these are not "events" but "actions" |
86 | * oh, well... */ | 86 | * oh, well... */ |
87 | queue_for_net_write, | 87 | QUEUE_FOR_NET_WRITE, |
88 | queue_for_net_read, | 88 | QUEUE_FOR_NET_READ, |
89 | queue_for_send_oos, | 89 | QUEUE_FOR_SEND_OOS, |
90 | 90 | ||
91 | send_canceled, | 91 | SEND_CANCELED, |
92 | send_failed, | 92 | SEND_FAILED, |
93 | handed_over_to_network, | 93 | HANDED_OVER_TO_NETWORK, |
94 | oos_handed_to_network, | 94 | OOS_HANDED_TO_NETWORK, |
95 | connection_lost_while_pending, | 95 | CONNECTION_LOST_WHILE_PENDING, |
96 | read_retry_remote_canceled, | 96 | READ_RETRY_REMOTE_CANCELED, |
97 | recv_acked_by_peer, | 97 | RECV_ACKED_BY_PEER, |
98 | write_acked_by_peer, | 98 | WRITE_ACKED_BY_PEER, |
99 | write_acked_by_peer_and_sis, /* and set_in_sync */ | 99 | WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ |
100 | conflict_discarded_by_peer, | 100 | CONFLICT_RESOLVED, |
101 | neg_acked, | 101 | POSTPONE_WRITE, |
102 | barrier_acked, /* in protocol A and B */ | 102 | NEG_ACKED, |
103 | data_received, /* (remote read) */ | 103 | BARRIER_ACKED, /* in protocol A and B */ |
104 | 104 | DATA_RECEIVED, /* (remote read) */ | |
105 | read_completed_with_error, | 105 | |
106 | read_ahead_completed_with_error, | 106 | READ_COMPLETED_WITH_ERROR, |
107 | write_completed_with_error, | 107 | READ_AHEAD_COMPLETED_WITH_ERROR, |
108 | abort_disk_io, | 108 | WRITE_COMPLETED_WITH_ERROR, |
109 | completed_ok, | 109 | ABORT_DISK_IO, |
110 | resend, | 110 | COMPLETED_OK, |
111 | fail_frozen_disk_io, | 111 | RESEND, |
112 | restart_frozen_disk_io, | 112 | FAIL_FROZEN_DISK_IO, |
113 | nothing, /* for tracing only */ | 113 | RESTART_FROZEN_DISK_IO, |
114 | NOTHING, | ||
114 | }; | 115 | }; |
115 | 116 | ||
116 | /* encoding of request states for now. we don't actually need that many bits. | 117 | /* encoding of request states for now. we don't actually need that many bits. |
@@ -142,8 +143,8 @@ enum drbd_req_state_bits { | |||
142 | * recv_ack (B) or implicit "ack" (A), | 143 | * recv_ack (B) or implicit "ack" (A), |
143 | * still waiting for the barrier ack. | 144 | * still waiting for the barrier ack. |
144 | * master_bio may already be completed and invalidated. | 145 | * master_bio may already be completed and invalidated. |
145 | * 11100: write_acked (C), | 146 | * 11100: write acked (C), |
146 | * data_received (for remote read, any protocol) | 147 | * data received (for remote read, any protocol) |
147 | * or finally the barrier ack has arrived (B,A)... | 148 | * or finally the barrier ack has arrived (B,A)... |
148 | * request can be freed | 149 | * request can be freed |
149 | * 01100: neg-acked (write, protocol C) | 150 | * 01100: neg-acked (write, protocol C) |
@@ -198,6 +199,22 @@ enum drbd_req_state_bits { | |||
198 | 199 | ||
199 | /* Should call drbd_al_complete_io() for this request... */ | 200 | /* Should call drbd_al_complete_io() for this request... */ |
200 | __RQ_IN_ACT_LOG, | 201 | __RQ_IN_ACT_LOG, |
202 | |||
203 | /* The peer has sent a retry ACK */ | ||
204 | __RQ_POSTPONED, | ||
205 | |||
206 | /* would have been completed, | ||
207 | * but was not, because of drbd_suspended() */ | ||
208 | __RQ_COMPLETION_SUSP, | ||
209 | |||
210 | /* We expect a receive ACK (wire proto B) */ | ||
211 | __RQ_EXP_RECEIVE_ACK, | ||
212 | |||
213 | /* We expect a write ACK (wire proto C) */ | ||
214 | __RQ_EXP_WRITE_ACK, | ||
215 | |||
216 | /* waiting for a barrier ack, did an extra kref_get */ | ||
217 | __RQ_EXP_BARR_ACK, | ||
201 | }; | 218 | }; |
202 | 219 | ||
203 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) | 220 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) |
@@ -219,56 +236,16 @@ enum drbd_req_state_bits { | |||
219 | 236 | ||
220 | #define RQ_WRITE (1UL << __RQ_WRITE) | 237 | #define RQ_WRITE (1UL << __RQ_WRITE) |
221 | #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) | 238 | #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) |
239 | #define RQ_POSTPONED (1UL << __RQ_POSTPONED) | ||
240 | #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) | ||
241 | #define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) | ||
242 | #define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) | ||
243 | #define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) | ||
222 | 244 | ||
223 | /* For waking up the frozen transfer log mod_req() has to return if the request | 245 | /* For waking up the frozen transfer log mod_req() has to return if the request |
224 | should be counted in the epoch object*/ | 246 | should be counted in the epoch object*/ |
225 | #define MR_WRITE_SHIFT 0 | 247 | #define MR_WRITE 1 |
226 | #define MR_WRITE (1 << MR_WRITE_SHIFT) | 248 | #define MR_READ 2 |
227 | #define MR_READ_SHIFT 1 | ||
228 | #define MR_READ (1 << MR_READ_SHIFT) | ||
229 | |||
230 | /* epoch entries */ | ||
231 | static inline | ||
232 | struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
233 | { | ||
234 | BUG_ON(mdev->ee_hash_s == 0); | ||
235 | return mdev->ee_hash + | ||
236 | ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); | ||
237 | } | ||
238 | |||
239 | /* transfer log (drbd_request objects) */ | ||
240 | static inline | ||
241 | struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
242 | { | ||
243 | BUG_ON(mdev->tl_hash_s == 0); | ||
244 | return mdev->tl_hash + | ||
245 | ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); | ||
246 | } | ||
247 | |||
248 | /* application reads (drbd_request objects) */ | ||
249 | static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
250 | { | ||
251 | return mdev->app_reads_hash | ||
252 | + ((unsigned int)(sector) % APP_R_HSIZE); | ||
253 | } | ||
254 | |||
255 | /* when we receive the answer for a read request, | ||
256 | * verify that we actually know about it */ | ||
257 | static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, | ||
258 | u64 id, sector_t sector) | ||
259 | { | ||
260 | struct hlist_head *slot = ar_hash_slot(mdev, sector); | ||
261 | struct hlist_node *n; | ||
262 | struct drbd_request *req; | ||
263 | |||
264 | hlist_for_each_entry(req, n, slot, collision) { | ||
265 | if ((unsigned long)req == (unsigned long)id) { | ||
266 | D_ASSERT(req->sector == sector); | ||
267 | return req; | ||
268 | } | ||
269 | } | ||
270 | return NULL; | ||
271 | } | ||
272 | 249 | ||
273 | static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) | 250 | static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) |
274 | { | 251 | { |
@@ -278,41 +255,10 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi | |||
278 | req->private_bio = bio; | 255 | req->private_bio = bio; |
279 | 256 | ||
280 | bio->bi_private = req; | 257 | bio->bi_private = req; |
281 | bio->bi_end_io = drbd_endio_pri; | 258 | bio->bi_end_io = drbd_request_endio; |
282 | bio->bi_next = NULL; | 259 | bio->bi_next = NULL; |
283 | } | 260 | } |
284 | 261 | ||
285 | static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, | ||
286 | struct bio *bio_src) | ||
287 | { | ||
288 | struct drbd_request *req = | ||
289 | mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
290 | if (likely(req)) { | ||
291 | drbd_req_make_private_bio(req, bio_src); | ||
292 | |||
293 | req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; | ||
294 | req->mdev = mdev; | ||
295 | req->master_bio = bio_src; | ||
296 | req->epoch = 0; | ||
297 | req->sector = bio_src->bi_sector; | ||
298 | req->size = bio_src->bi_size; | ||
299 | INIT_HLIST_NODE(&req->collision); | ||
300 | INIT_LIST_HEAD(&req->tl_requests); | ||
301 | INIT_LIST_HEAD(&req->w.list); | ||
302 | } | ||
303 | return req; | ||
304 | } | ||
305 | |||
306 | static inline void drbd_req_free(struct drbd_request *req) | ||
307 | { | ||
308 | mempool_free(req, drbd_request_mempool); | ||
309 | } | ||
310 | |||
311 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
312 | { | ||
313 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
314 | } | ||
315 | |||
316 | /* Short lived temporary struct on the stack. | 262 | /* Short lived temporary struct on the stack. |
317 | * We could squirrel the error to be returned into | 263 | * We could squirrel the error to be returned into |
318 | * bio->bi_size, or similar. But that would be too ugly. */ | 264 | * bio->bi_size, or similar. But that would be too ugly. */ |
@@ -321,6 +267,7 @@ struct bio_and_error { | |||
321 | int error; | 267 | int error; |
322 | }; | 268 | }; |
323 | 269 | ||
270 | extern void drbd_req_destroy(struct kref *kref); | ||
324 | extern void _req_may_be_done(struct drbd_request *req, | 271 | extern void _req_may_be_done(struct drbd_request *req, |
325 | struct bio_and_error *m); | 272 | struct bio_and_error *m); |
326 | extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, | 273 | extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, |
@@ -328,13 +275,17 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
328 | extern void complete_master_bio(struct drbd_conf *mdev, | 275 | extern void complete_master_bio(struct drbd_conf *mdev, |
329 | struct bio_and_error *m); | 276 | struct bio_and_error *m); |
330 | extern void request_timer_fn(unsigned long data); | 277 | extern void request_timer_fn(unsigned long data); |
331 | extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); | 278 | extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); |
279 | extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); | ||
280 | |||
281 | /* this is in drbd_main.c */ | ||
282 | extern void drbd_restart_request(struct drbd_request *req); | ||
332 | 283 | ||
333 | /* use this if you don't want to deal with calling complete_master_bio() | 284 | /* use this if you don't want to deal with calling complete_master_bio() |
334 | * outside the spinlock, e.g. when walking some list on cleanup. */ | 285 | * outside the spinlock, e.g. when walking some list on cleanup. */ |
335 | static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) | 286 | static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) |
336 | { | 287 | { |
337 | struct drbd_conf *mdev = req->mdev; | 288 | struct drbd_conf *mdev = req->w.mdev; |
338 | struct bio_and_error m; | 289 | struct bio_and_error m; |
339 | int rv; | 290 | int rv; |
340 | 291 | ||
@@ -354,13 +305,13 @@ static inline int req_mod(struct drbd_request *req, | |||
354 | enum drbd_req_event what) | 305 | enum drbd_req_event what) |
355 | { | 306 | { |
356 | unsigned long flags; | 307 | unsigned long flags; |
357 | struct drbd_conf *mdev = req->mdev; | 308 | struct drbd_conf *mdev = req->w.mdev; |
358 | struct bio_and_error m; | 309 | struct bio_and_error m; |
359 | int rv; | 310 | int rv; |
360 | 311 | ||
361 | spin_lock_irqsave(&mdev->req_lock, flags); | 312 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
362 | rv = __req_mod(req, what, &m); | 313 | rv = __req_mod(req, what, &m); |
363 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 314 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
364 | 315 | ||
365 | if (m.bio) | 316 | if (m.bio) |
366 | complete_master_bio(mdev, &m); | 317 | complete_master_bio(mdev, &m); |
@@ -368,7 +319,7 @@ static inline int req_mod(struct drbd_request *req, | |||
368 | return rv; | 319 | return rv; |
369 | } | 320 | } |
370 | 321 | ||
371 | static inline bool drbd_should_do_remote(union drbd_state s) | 322 | static inline bool drbd_should_do_remote(union drbd_dev_state s) |
372 | { | 323 | { |
373 | return s.pdsk == D_UP_TO_DATE || | 324 | return s.pdsk == D_UP_TO_DATE || |
374 | (s.pdsk >= D_INCONSISTENT && | 325 | (s.pdsk >= D_INCONSISTENT && |
@@ -378,7 +329,7 @@ static inline bool drbd_should_do_remote(union drbd_state s) | |||
378 | That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* | 329 | That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* |
379 | states. */ | 330 | states. */ |
380 | } | 331 | } |
381 | static inline bool drbd_should_send_oos(union drbd_state s) | 332 | static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) |
382 | { | 333 | { |
383 | return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; | 334 | return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; |
384 | /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary | 335 | /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c new file mode 100644 index 000000000000..69ef35266bac --- /dev/null +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -0,0 +1,1857 @@ | |||
1 | /* | ||
2 | drbd_state.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev | ||
11 | from Logicworks, Inc. for making SDP replication support possible. | ||
12 | |||
13 | drbd is free software; you can redistribute it and/or modify | ||
14 | it under the terms of the GNU General Public License as published by | ||
15 | the Free Software Foundation; either version 2, or (at your option) | ||
16 | any later version. | ||
17 | |||
18 | drbd is distributed in the hope that it will be useful, | ||
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
21 | GNU General Public License for more details. | ||
22 | |||
23 | You should have received a copy of the GNU General Public License | ||
24 | along with drbd; see the file COPYING. If not, write to | ||
25 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | */ | ||
27 | |||
28 | #include <linux/drbd_limits.h> | ||
29 | #include "drbd_int.h" | ||
30 | #include "drbd_req.h" | ||
31 | |||
32 | /* in drbd_main.c */ | ||
33 | extern void tl_abort_disk_io(struct drbd_conf *mdev); | ||
34 | |||
35 | struct after_state_chg_work { | ||
36 | struct drbd_work w; | ||
37 | union drbd_state os; | ||
38 | union drbd_state ns; | ||
39 | enum chg_state_flags flags; | ||
40 | struct completion *done; | ||
41 | }; | ||
42 | |||
43 | enum sanitize_state_warnings { | ||
44 | NO_WARNING, | ||
45 | ABORTED_ONLINE_VERIFY, | ||
46 | ABORTED_RESYNC, | ||
47 | CONNECTION_LOST_NEGOTIATING, | ||
48 | IMPLICITLY_UPGRADED_DISK, | ||
49 | IMPLICITLY_UPGRADED_PDSK, | ||
50 | }; | ||
51 | |||
52 | static int w_after_state_ch(struct drbd_work *w, int unused); | ||
53 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
54 | union drbd_state ns, enum chg_state_flags flags); | ||
55 | static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); | ||
56 | static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_tconn *); | ||
57 | static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); | ||
58 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, | ||
59 | enum sanitize_state_warnings *warn); | ||
60 | |||
61 | static inline bool is_susp(union drbd_state s) | ||
62 | { | ||
63 | return s.susp || s.susp_nod || s.susp_fen; | ||
64 | } | ||
65 | |||
66 | bool conn_all_vols_unconf(struct drbd_tconn *tconn) | ||
67 | { | ||
68 | struct drbd_conf *mdev; | ||
69 | bool rv = true; | ||
70 | int vnr; | ||
71 | |||
72 | rcu_read_lock(); | ||
73 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
74 | if (mdev->state.disk != D_DISKLESS || | ||
75 | mdev->state.conn != C_STANDALONE || | ||
76 | mdev->state.role != R_SECONDARY) { | ||
77 | rv = false; | ||
78 | break; | ||
79 | } | ||
80 | } | ||
81 | rcu_read_unlock(); | ||
82 | |||
83 | return rv; | ||
84 | } | ||
85 | |||
86 | /* Unfortunately the states were not correctly ordered when | ||
87 | they were defined; therefore we cannot use max_t() here. */ | ||
88 | static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) | ||
89 | { | ||
90 | if (role1 == R_PRIMARY || role2 == R_PRIMARY) | ||
91 | return R_PRIMARY; | ||
92 | if (role1 == R_SECONDARY || role2 == R_SECONDARY) | ||
93 | return R_SECONDARY; | ||
94 | return R_UNKNOWN; | ||
95 | } | ||
96 | static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) | ||
97 | { | ||
98 | if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) | ||
99 | return R_UNKNOWN; | ||
100 | if (role1 == R_SECONDARY || role2 == R_SECONDARY) | ||
101 | return R_SECONDARY; | ||
102 | return R_PRIMARY; | ||
103 | } | ||
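max_role()/min_role() exist because the numeric values of enum drbd_role do not follow the Primary > Secondary > Unknown ranking, so a plain max_t() would pick the wrong role. A standalone illustration (the enum values are assumed to match include/linux/drbd.h):

    #include <stdio.h>

    /* Assumed values: the numeric order does not reflect importance, which
     * is why conn_highest_role() cannot simply use max_t(). */
    enum drbd_role { R_UNKNOWN = 0, R_PRIMARY = 1, R_SECONDARY = 2 };

    static enum drbd_role max_role(enum drbd_role a, enum drbd_role b)
    {
        if (a == R_PRIMARY || b == R_PRIMARY)
            return R_PRIMARY;
        if (a == R_SECONDARY || b == R_SECONDARY)
            return R_SECONDARY;
        return R_UNKNOWN;
    }

    int main(void)
    {
        enum drbd_role naive = (R_PRIMARY > R_SECONDARY) ? R_PRIMARY : R_SECONDARY;
        printf("numeric max: %d (Secondary - wrong)\n", naive);
        printf("max_role:    %d (Primary - intended)\n",
               max_role(R_PRIMARY, R_SECONDARY));
        return 0;
    }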
104 | |||
105 | enum drbd_role conn_highest_role(struct drbd_tconn *tconn) | ||
106 | { | ||
107 | enum drbd_role role = R_UNKNOWN; | ||
108 | struct drbd_conf *mdev; | ||
109 | int vnr; | ||
110 | |||
111 | rcu_read_lock(); | ||
112 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
113 | role = max_role(role, mdev->state.role); | ||
114 | rcu_read_unlock(); | ||
115 | |||
116 | return role; | ||
117 | } | ||
118 | |||
119 | enum drbd_role conn_highest_peer(struct drbd_tconn *tconn) | ||
120 | { | ||
121 | enum drbd_role peer = R_UNKNOWN; | ||
122 | struct drbd_conf *mdev; | ||
123 | int vnr; | ||
124 | |||
125 | rcu_read_lock(); | ||
126 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
127 | peer = max_role(peer, mdev->state.peer); | ||
128 | rcu_read_unlock(); | ||
129 | |||
130 | return peer; | ||
131 | } | ||
132 | |||
133 | enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn) | ||
134 | { | ||
135 | enum drbd_disk_state ds = D_DISKLESS; | ||
136 | struct drbd_conf *mdev; | ||
137 | int vnr; | ||
138 | |||
139 | rcu_read_lock(); | ||
140 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
141 | ds = max_t(enum drbd_disk_state, ds, mdev->state.disk); | ||
142 | rcu_read_unlock(); | ||
143 | |||
144 | return ds; | ||
145 | } | ||
146 | |||
147 | enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn) | ||
148 | { | ||
149 | enum drbd_disk_state ds = D_MASK; | ||
150 | struct drbd_conf *mdev; | ||
151 | int vnr; | ||
152 | |||
153 | rcu_read_lock(); | ||
154 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
155 | ds = min_t(enum drbd_disk_state, ds, mdev->state.disk); | ||
156 | rcu_read_unlock(); | ||
157 | |||
158 | return ds; | ||
159 | } | ||
160 | |||
161 | enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn) | ||
162 | { | ||
163 | enum drbd_disk_state ds = D_DISKLESS; | ||
164 | struct drbd_conf *mdev; | ||
165 | int vnr; | ||
166 | |||
167 | rcu_read_lock(); | ||
168 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
169 | ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk); | ||
170 | rcu_read_unlock(); | ||
171 | |||
172 | return ds; | ||
173 | } | ||
174 | |||
175 | enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn) | ||
176 | { | ||
177 | enum drbd_conns conn = C_MASK; | ||
178 | struct drbd_conf *mdev; | ||
179 | int vnr; | ||
180 | |||
181 | rcu_read_lock(); | ||
182 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
183 | conn = min_t(enum drbd_conns, conn, mdev->state.conn); | ||
184 | rcu_read_unlock(); | ||
185 | |||
186 | return conn; | ||
187 | } | ||
188 | |||
189 | static bool no_peer_wf_report_params(struct drbd_tconn *tconn) | ||
190 | { | ||
191 | struct drbd_conf *mdev; | ||
192 | int vnr; | ||
193 | bool rv = true; | ||
194 | |||
195 | rcu_read_lock(); | ||
196 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
197 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
198 | rv = false; | ||
199 | break; | ||
200 | } | ||
201 | rcu_read_unlock(); | ||
202 | |||
203 | return rv; | ||
204 | } | ||
205 | |||
206 | |||
207 | /** | ||
208 | * cl_wide_st_chg() - true if the state change is a cluster wide one | ||
209 | * @mdev: DRBD device. | ||
210 | * @os: old (current) state. | ||
211 | * @ns: new (wanted) state. | ||
212 | */ | ||
213 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
214 | union drbd_state os, union drbd_state ns) | ||
215 | { | ||
216 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
217 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
218 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
219 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
220 | (os.disk != D_FAILED && ns.disk == D_FAILED))) || | ||
221 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
222 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || | ||
223 | (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); | ||
224 | } | ||
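
For orientation (an illustrative sketch, not part of the patch): the most common cluster-wide change this predicate catches is a promotion to Primary while connected, which is why drbd_req_state() consults the peer first in that case.

    /* sketch: assume os.conn >= C_CONNECTED */
    union drbd_state os = drbd_read_state(mdev);
    union drbd_state ns = os;
    ns.role = R_PRIMARY;
    /* cl_wide_st_chg(mdev, os, ns) is true here, so the request
     * is sent to the peer before it is committed locally */
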
225 | |||
226 | static union drbd_state | ||
227 | apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) | ||
228 | { | ||
229 | union drbd_state ns; | ||
230 | ns.i = (os.i & ~mask.i) | val.i; | ||
231 | return ns; | ||
232 | } | ||
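
As a sketch of the mask/val convention used throughout this file (assuming the usual union drbd_state bitfield layout): only the bits selected by the mask are replaced, everything else in the old state is carried over.

    /* sketch, not from the patch: request only a connection-state change */
    union drbd_state mask = { { .conn = C_MASK } };          /* which bits to touch */
    union drbd_state val  = { { .conn = C_DISCONNECTING } }; /* their new value     */
    /* apply_mask_val(os, mask, val) == (os.i & ~mask.i) | val.i,
     * so role, disk, pdsk and the susp flags of os stay unchanged */
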
233 | |||
234 | enum drbd_state_rv | ||
235 | drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
236 | union drbd_state mask, union drbd_state val) | ||
237 | { | ||
238 | unsigned long flags; | ||
239 | union drbd_state ns; | ||
240 | enum drbd_state_rv rv; | ||
241 | |||
242 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
243 | ns = apply_mask_val(drbd_read_state(mdev), mask, val); | ||
244 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
245 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
246 | |||
247 | return rv; | ||
248 | } | ||
249 | |||
250 | /** | ||
251 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
252 | * @mdev: DRBD device. | ||
253 | * @mask: mask of state bits to change. | ||
254 | * @val: value of new state bits. | ||
255 | */ | ||
256 | void drbd_force_state(struct drbd_conf *mdev, | ||
257 | union drbd_state mask, union drbd_state val) | ||
258 | { | ||
259 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
260 | } | ||
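
A hedged usage example (the NS() helper that builds the matching mask/val pair is assumed to come from drbd_int.h, as in the callers later in this file):

    /* sketch: impose a detected disk failure with CS_HARD,
     * bypassing the soft-transition checks */
    drbd_force_state(mdev, NS(disk, D_FAILED));
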
261 | |||
262 | static enum drbd_state_rv | ||
263 | _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, | ||
264 | union drbd_state val) | ||
265 | { | ||
266 | union drbd_state os, ns; | ||
267 | unsigned long flags; | ||
268 | enum drbd_state_rv rv; | ||
269 | |||
270 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
271 | return SS_CW_SUCCESS; | ||
272 | |||
273 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
274 | return SS_CW_FAILED_BY_PEER; | ||
275 | |||
276 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
277 | os = drbd_read_state(mdev); | ||
278 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
279 | rv = is_valid_transition(os, ns); | ||
280 | if (rv >= SS_SUCCESS) | ||
281 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
282 | |||
283 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
284 | rv = SS_CW_NO_NEED; | ||
285 | if (rv == SS_UNKNOWN_ERROR) { | ||
286 | rv = is_valid_state(mdev, ns); | ||
287 | if (rv >= SS_SUCCESS) { | ||
288 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
289 | if (rv >= SS_SUCCESS) | ||
290 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
291 | } | ||
292 | } | ||
293 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
294 | |||
295 | return rv; | ||
296 | } | ||
297 | |||
298 | /** | ||
299 | * drbd_req_state() - Perform a possibly cluster-wide state change | ||
300 | * @mdev: DRBD device. | ||
301 | * @mask: mask of state bits to change. | ||
302 | * @val: value of new state bits. | ||
303 | * @f: flags | ||
304 | * | ||
305 | * Should not be called directly, use drbd_request_state() or | ||
306 | * _drbd_request_state(). | ||
307 | */ | ||
308 | static enum drbd_state_rv | ||
309 | drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, | ||
310 | union drbd_state val, enum chg_state_flags f) | ||
311 | { | ||
312 | struct completion done; | ||
313 | unsigned long flags; | ||
314 | union drbd_state os, ns; | ||
315 | enum drbd_state_rv rv; | ||
316 | |||
317 | init_completion(&done); | ||
318 | |||
319 | if (f & CS_SERIALIZE) | ||
320 | mutex_lock(mdev->state_mutex); | ||
321 | |||
322 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
323 | os = drbd_read_state(mdev); | ||
324 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
325 | rv = is_valid_transition(os, ns); | ||
326 | if (rv < SS_SUCCESS) { | ||
327 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
328 | goto abort; | ||
329 | } | ||
330 | |||
331 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
332 | rv = is_valid_state(mdev, ns); | ||
333 | if (rv == SS_SUCCESS) | ||
334 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
335 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
336 | |||
337 | if (rv < SS_SUCCESS) { | ||
338 | if (f & CS_VERBOSE) | ||
339 | print_st_err(mdev, os, ns, rv); | ||
340 | goto abort; | ||
341 | } | ||
342 | |||
343 | if (drbd_send_state_req(mdev, mask, val)) { | ||
344 | rv = SS_CW_FAILED_BY_PEER; | ||
345 | if (f & CS_VERBOSE) | ||
346 | print_st_err(mdev, os, ns, rv); | ||
347 | goto abort; | ||
348 | } | ||
349 | |||
350 | wait_event(mdev->state_wait, | ||
351 | (rv = _req_st_cond(mdev, mask, val))); | ||
352 | |||
353 | if (rv < SS_SUCCESS) { | ||
354 | if (f & CS_VERBOSE) | ||
355 | print_st_err(mdev, os, ns, rv); | ||
356 | goto abort; | ||
357 | } | ||
358 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
359 | ns = apply_mask_val(drbd_read_state(mdev), mask, val); | ||
360 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
361 | } else { | ||
362 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
363 | } | ||
364 | |||
365 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
366 | |||
367 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
368 | D_ASSERT(current != mdev->tconn->worker.task); | ||
369 | wait_for_completion(&done); | ||
370 | } | ||
371 | |||
372 | abort: | ||
373 | if (f & CS_SERIALIZE) | ||
374 | mutex_unlock(mdev->state_mutex); | ||
375 | |||
376 | return rv; | ||
377 | } | ||
378 | |||
379 | /** | ||
380 | * _drbd_request_state() - Request a state change (with flags) | ||
381 | * @mdev: DRBD device. | ||
382 | * @mask: mask of state bits to change. | ||
383 | * @val: value of new state bits. | ||
384 | * @f: flags | ||
385 | * | ||
386 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
387 | * flag, or when logging of failed state change requests is not desired. | ||
388 | */ | ||
389 | enum drbd_state_rv | ||
390 | _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
391 | union drbd_state val, enum chg_state_flags f) | ||
392 | { | ||
393 | enum drbd_state_rv rv; | ||
394 | |||
395 | wait_event(mdev->state_wait, | ||
396 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
397 | |||
398 | return rv; | ||
399 | } | ||
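
For illustration only (a sketch, not taken from the patch), a caller that wants the change serialized and wants to wait for the after-state work could combine the flags seen above:

    /* sketch: request to become sync target, wait for completion */
    enum drbd_state_rv rv =
        _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T),
                            CS_VERBOSE | CS_SERIALIZE | CS_WAIT_COMPLETE);
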
400 | |||
401 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
402 | { | ||
403 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", | ||
404 | name, | ||
405 | drbd_conn_str(ns.conn), | ||
406 | drbd_role_str(ns.role), | ||
407 | drbd_role_str(ns.peer), | ||
408 | drbd_disk_str(ns.disk), | ||
409 | drbd_disk_str(ns.pdsk), | ||
410 | is_susp(ns) ? 's' : 'r', | ||
411 | ns.aftr_isp ? 'a' : '-', | ||
412 | ns.peer_isp ? 'p' : '-', | ||
413 | ns.user_isp ? 'u' : '-', | ||
414 | ns.susp_fen ? 'F' : '-', | ||
415 | ns.susp_nod ? 'N' : '-' | ||
416 | ); | ||
417 | } | ||
418 | |||
419 | void print_st_err(struct drbd_conf *mdev, union drbd_state os, | ||
420 | union drbd_state ns, enum drbd_state_rv err) | ||
421 | { | ||
422 | if (err == SS_IN_TRANSIENT_STATE) | ||
423 | return; | ||
424 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
425 | print_st(mdev, " state", os); | ||
426 | print_st(mdev, "wanted", ns); | ||
427 | } | ||
428 | |||
429 | static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, | ||
430 | enum chg_state_flags flags) | ||
431 | { | ||
432 | char *pbp; | ||
433 | pbp = pb; | ||
434 | *pbp = 0; | ||
435 | |||
436 | if (ns.role != os.role && flags & CS_DC_ROLE) | ||
437 | pbp += sprintf(pbp, "role( %s -> %s ) ", | ||
438 | drbd_role_str(os.role), | ||
439 | drbd_role_str(ns.role)); | ||
440 | if (ns.peer != os.peer && flags & CS_DC_PEER) | ||
441 | pbp += sprintf(pbp, "peer( %s -> %s ) ", | ||
442 | drbd_role_str(os.peer), | ||
443 | drbd_role_str(ns.peer)); | ||
444 | if (ns.conn != os.conn && flags & CS_DC_CONN) | ||
445 | pbp += sprintf(pbp, "conn( %s -> %s ) ", | ||
446 | drbd_conn_str(os.conn), | ||
447 | drbd_conn_str(ns.conn)); | ||
448 | if (ns.disk != os.disk && flags & CS_DC_DISK) | ||
449 | pbp += sprintf(pbp, "disk( %s -> %s ) ", | ||
450 | drbd_disk_str(os.disk), | ||
451 | drbd_disk_str(ns.disk)); | ||
452 | if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) | ||
453 | pbp += sprintf(pbp, "pdsk( %s -> %s ) ", | ||
454 | drbd_disk_str(os.pdsk), | ||
455 | drbd_disk_str(ns.pdsk)); | ||
456 | |||
457 | return pbp - pb; | ||
458 | } | ||
459 | |||
460 | static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, | ||
461 | enum chg_state_flags flags) | ||
462 | { | ||
463 | char pb[300]; | ||
464 | char *pbp = pb; | ||
465 | |||
466 | pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); | ||
467 | |||
468 | if (ns.aftr_isp != os.aftr_isp) | ||
469 | pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", | ||
470 | os.aftr_isp, | ||
471 | ns.aftr_isp); | ||
472 | if (ns.peer_isp != os.peer_isp) | ||
473 | pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", | ||
474 | os.peer_isp, | ||
475 | ns.peer_isp); | ||
476 | if (ns.user_isp != os.user_isp) | ||
477 | pbp += sprintf(pbp, "user_isp( %d -> %d ) ", | ||
478 | os.user_isp, | ||
479 | ns.user_isp); | ||
480 | |||
481 | if (pbp != pb) | ||
482 | dev_info(DEV, "%s\n", pb); | ||
483 | } | ||
484 | |||
485 | static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns, | ||
486 | enum chg_state_flags flags) | ||
487 | { | ||
488 | char pb[300]; | ||
489 | char *pbp = pb; | ||
490 | |||
491 | pbp += print_state_change(pbp, os, ns, flags); | ||
492 | |||
493 | if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) | ||
494 | pbp += sprintf(pbp, "susp( %d -> %d ) ", | ||
495 | is_susp(os), | ||
496 | is_susp(ns)); | ||
497 | |||
498 | if (pbp != pb) | ||
499 | conn_info(tconn, "%s\n", pb); | ||
500 | } | ||
501 | |||
502 | |||
503 | /** | ||
504 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
505 | * @mdev: DRBD device. | ||
506 | * @ns: State to consider. | ||
507 | */ | ||
508 | static enum drbd_state_rv | ||
509 | is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
510 | { | ||
511 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
512 | |||
513 | enum drbd_fencing_p fp; | ||
514 | enum drbd_state_rv rv = SS_SUCCESS; | ||
515 | struct net_conf *nc; | ||
516 | |||
517 | rcu_read_lock(); | ||
518 | fp = FP_DONT_CARE; | ||
519 | if (get_ldev(mdev)) { | ||
520 | fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
521 | put_ldev(mdev); | ||
522 | } | ||
523 | |||
524 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
525 | if (nc) { | ||
526 | if (!nc->two_primaries && ns.role == R_PRIMARY) { | ||
527 | if (ns.peer == R_PRIMARY) | ||
528 | rv = SS_TWO_PRIMARIES; | ||
529 | else if (conn_highest_peer(mdev->tconn) == R_PRIMARY) | ||
530 | rv = SS_O_VOL_PEER_PRI; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | if (rv <= 0) | ||
535 | /* already found a reason to abort */; | ||
536 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
537 | rv = SS_DEVICE_IN_USE; | ||
538 | |||
539 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
540 | rv = SS_NO_UP_TO_DATE_DISK; | ||
541 | |||
542 | else if (fp >= FP_RESOURCE && | ||
543 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
544 | rv = SS_PRIMARY_NOP; | ||
545 | |||
546 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
547 | rv = SS_NO_UP_TO_DATE_DISK; | ||
548 | |||
549 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
550 | rv = SS_NO_LOCAL_DISK; | ||
551 | |||
552 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
553 | rv = SS_NO_REMOTE_DISK; | ||
554 | |||
555 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
556 | rv = SS_NO_UP_TO_DATE_DISK; | ||
557 | |||
558 | else if ((ns.conn == C_CONNECTED || | ||
559 | ns.conn == C_WF_BITMAP_S || | ||
560 | ns.conn == C_SYNC_SOURCE || | ||
561 | ns.conn == C_PAUSED_SYNC_S) && | ||
562 | ns.disk == D_OUTDATED) | ||
563 | rv = SS_CONNECTED_OUTDATES; | ||
564 | |||
565 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
566 | (nc->verify_alg[0] == 0)) | ||
567 | rv = SS_NO_VERIFY_ALG; | ||
568 | |||
569 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
570 | mdev->tconn->agreed_pro_version < 88) | ||
571 | rv = SS_NOT_SUPPORTED; | ||
572 | |||
573 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | ||
574 | rv = SS_CONNECTED_OUTDATES; | ||
575 | |||
576 | rcu_read_unlock(); | ||
577 | |||
578 | return rv; | ||
579 | } | ||
580 | |||
581 | /** | ||
582 | * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible | ||
583 | * This function limits state transitions that may be declined by DRBD, i.e. | ||
584 | * user requests (aka soft transitions). | ||
585 | * @mdev: DRBD device. | ||
586 | * @ns: new state. | ||
587 | * @os: old state. | ||
588 | */ | ||
589 | static enum drbd_state_rv | ||
590 | is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_tconn *tconn) | ||
591 | { | ||
592 | enum drbd_state_rv rv = SS_SUCCESS; | ||
593 | |||
594 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
595 | os.conn > C_CONNECTED) | ||
596 | rv = SS_RESYNC_RUNNING; | ||
597 | |||
598 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
599 | rv = SS_ALREADY_STANDALONE; | ||
600 | |||
601 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
602 | rv = SS_IS_DISKLESS; | ||
603 | |||
604 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
605 | rv = SS_NO_NET_CONFIG; | ||
606 | |||
607 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
608 | rv = SS_LOWER_THAN_OUTDATED; | ||
609 | |||
610 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
611 | rv = SS_IN_TRANSIENT_STATE; | ||
612 | |||
613 | /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
614 | rv = SS_IN_TRANSIENT_STATE; */ | ||
615 | |||
616 | /* While establishing a connection only allow cstate to change. | ||
617 | Delay/refuse role changes, detach/attach, etc... */ | ||
618 | if (test_bit(STATE_SENT, &tconn->flags) && | ||
619 | !(os.conn == C_WF_REPORT_PARAMS || | ||
620 | (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) | ||
621 | rv = SS_IN_TRANSIENT_STATE; | ||
622 | |||
623 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
624 | rv = SS_NEED_CONNECTION; | ||
625 | |||
626 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
627 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
628 | rv = SS_RESYNC_RUNNING; | ||
629 | |||
630 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
631 | os.conn < C_CONNECTED) | ||
632 | rv = SS_NEED_CONNECTION; | ||
633 | |||
634 | if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) | ||
635 | && os.conn < C_WF_REPORT_PARAMS) | ||
636 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | ||
637 | |||
638 | return rv; | ||
639 | } | ||
640 | |||
641 | static enum drbd_state_rv | ||
642 | is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) | ||
643 | { | ||
644 | /* no change -> nothing to do, at least for the connection part */ | ||
645 | if (oc == nc) | ||
646 | return SS_NOTHING_TO_DO; | ||
647 | |||
648 | /* disconnect of an unconfigured connection does not make sense */ | ||
649 | if (oc == C_STANDALONE && nc == C_DISCONNECTING) | ||
650 | return SS_ALREADY_STANDALONE; | ||
651 | |||
652 | /* from C_STANDALONE, we start with C_UNCONNECTED */ | ||
653 | if (oc == C_STANDALONE && nc != C_UNCONNECTED) | ||
654 | return SS_NEED_CONNECTION; | ||
655 | |||
656 | /* When establishing a connection we need to go through WF_REPORT_PARAMS! | ||
657 | Necessary to do the right thing upon invalidate-remote on a disconnected resource */ | ||
658 | if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) | ||
659 | return SS_NEED_CONNECTION; | ||
660 | |||
661 | /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ | ||
662 | if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) | ||
663 | return SS_IN_TRANSIENT_STATE; | ||
664 | |||
665 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
666 | if (oc == C_DISCONNECTING && nc != C_STANDALONE) | ||
667 | return SS_IN_TRANSIENT_STATE; | ||
668 | |||
669 | return SS_SUCCESS; | ||
670 | } | ||
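
For example (sketch, not part of the patch), a request to jump from StandAlone straight to an established connection is refused; the resource has to be configured and pass through C_UNCONNECTED / C_WF_REPORT_PARAMS first:

    /* sketch */
    enum drbd_state_rv rv = is_valid_conn_transition(C_STANDALONE, C_CONNECTED);
    /* rv == SS_NEED_CONNECTION */
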
671 | |||
672 | |||
673 | /** | ||
674 | * is_valid_transition() - Returns an SS_ error code if the state transition is not possible | ||
675 | * This limits hard state transitions. Hard state transitions are facts that are | ||
676 | * imposed on DRBD by the environment. E.g. disk broke or network broke down. | ||
677 | * But those hard state transitions are still not allowed to do everything. | ||
678 | * @ns: new state. | ||
679 | * @os: old state. | ||
680 | */ | ||
681 | static enum drbd_state_rv | ||
682 | is_valid_transition(union drbd_state os, union drbd_state ns) | ||
683 | { | ||
684 | enum drbd_state_rv rv; | ||
685 | |||
686 | rv = is_valid_conn_transition(os.conn, ns.conn); | ||
687 | |||
688 | /* we cannot fail (again) if we already detached */ | ||
689 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
690 | rv = SS_IS_DISKLESS; | ||
691 | |||
692 | return rv; | ||
693 | } | ||
694 | |||
695 | static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) | ||
696 | { | ||
697 | static const char *msg_table[] = { | ||
698 | [NO_WARNING] = "", | ||
699 | [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", | ||
700 | [ABORTED_RESYNC] = "Resync aborted.", | ||
701 | [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", | ||
702 | [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", | ||
703 | [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", | ||
704 | }; | ||
705 | |||
706 | if (warn != NO_WARNING) | ||
707 | dev_warn(DEV, "%s\n", msg_table[warn]); | ||
708 | } | ||
709 | |||
710 | /** | ||
711 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
712 | * @mdev: DRBD device. | ||
713 | * @os: old state. | ||
714 | * @ns: new state. | ||
715 | * @warn_sync_abort: | ||
716 | * | ||
717 | * When we lose the connection, we have to set the state of the peer's disk (pdsk) | ||
718 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
719 | */ | ||
720 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, | ||
721 | enum sanitize_state_warnings *warn) | ||
722 | { | ||
723 | enum drbd_fencing_p fp; | ||
724 | enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; | ||
725 | |||
726 | if (warn) | ||
727 | *warn = NO_WARNING; | ||
728 | |||
729 | fp = FP_DONT_CARE; | ||
730 | if (get_ldev(mdev)) { | ||
731 | rcu_read_lock(); | ||
732 | fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
733 | rcu_read_unlock(); | ||
734 | put_ldev(mdev); | ||
735 | } | ||
736 | |||
737 | /* Implications from connection to peer and peer_isp */ | ||
738 | if (ns.conn < C_CONNECTED) { | ||
739 | ns.peer_isp = 0; | ||
740 | ns.peer = R_UNKNOWN; | ||
741 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
742 | ns.pdsk = D_UNKNOWN; | ||
743 | } | ||
744 | |||
745 | /* Clear the aftr_isp when becoming unconfigured */ | ||
746 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
747 | ns.aftr_isp = 0; | ||
748 | |||
749 | /* An implication of the disk states onto the connection state */ | ||
750 | /* Abort resync if a disk fails/detaches */ | ||
751 | if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
752 | if (warn) | ||
753 | *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? | ||
754 | ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; | ||
755 | ns.conn = C_CONNECTED; | ||
756 | } | ||
757 | |||
758 | /* Connection breaks down before we finished "Negotiating" */ | ||
759 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
760 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
761 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
762 | ns.disk = mdev->new_state_tmp.disk; | ||
763 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
764 | } else { | ||
765 | if (warn) | ||
766 | *warn = CONNECTION_LOST_NEGOTIATING; | ||
767 | ns.disk = D_DISKLESS; | ||
768 | ns.pdsk = D_UNKNOWN; | ||
769 | } | ||
770 | put_ldev(mdev); | ||
771 | } | ||
772 | |||
773 | /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ | ||
774 | if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { | ||
775 | if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) | ||
776 | ns.disk = D_UP_TO_DATE; | ||
777 | if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) | ||
778 | ns.pdsk = D_UP_TO_DATE; | ||
779 | } | ||
780 | |||
781 | /* Implications of the connection state on the disk states */ | ||
782 | disk_min = D_DISKLESS; | ||
783 | disk_max = D_UP_TO_DATE; | ||
784 | pdsk_min = D_INCONSISTENT; | ||
785 | pdsk_max = D_UNKNOWN; | ||
786 | switch ((enum drbd_conns)ns.conn) { | ||
787 | case C_WF_BITMAP_T: | ||
788 | case C_PAUSED_SYNC_T: | ||
789 | case C_STARTING_SYNC_T: | ||
790 | case C_WF_SYNC_UUID: | ||
791 | case C_BEHIND: | ||
792 | disk_min = D_INCONSISTENT; | ||
793 | disk_max = D_OUTDATED; | ||
794 | pdsk_min = D_UP_TO_DATE; | ||
795 | pdsk_max = D_UP_TO_DATE; | ||
796 | break; | ||
797 | case C_VERIFY_S: | ||
798 | case C_VERIFY_T: | ||
799 | disk_min = D_UP_TO_DATE; | ||
800 | disk_max = D_UP_TO_DATE; | ||
801 | pdsk_min = D_UP_TO_DATE; | ||
802 | pdsk_max = D_UP_TO_DATE; | ||
803 | break; | ||
804 | case C_CONNECTED: | ||
805 | disk_min = D_DISKLESS; | ||
806 | disk_max = D_UP_TO_DATE; | ||
807 | pdsk_min = D_DISKLESS; | ||
808 | pdsk_max = D_UP_TO_DATE; | ||
809 | break; | ||
810 | case C_WF_BITMAP_S: | ||
811 | case C_PAUSED_SYNC_S: | ||
812 | case C_STARTING_SYNC_S: | ||
813 | case C_AHEAD: | ||
814 | disk_min = D_UP_TO_DATE; | ||
815 | disk_max = D_UP_TO_DATE; | ||
816 | pdsk_min = D_INCONSISTENT; | ||
817 | pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ | ||
818 | break; | ||
819 | case C_SYNC_TARGET: | ||
820 | disk_min = D_INCONSISTENT; | ||
821 | disk_max = D_INCONSISTENT; | ||
822 | pdsk_min = D_UP_TO_DATE; | ||
823 | pdsk_max = D_UP_TO_DATE; | ||
824 | break; | ||
825 | case C_SYNC_SOURCE: | ||
826 | disk_min = D_UP_TO_DATE; | ||
827 | disk_max = D_UP_TO_DATE; | ||
828 | pdsk_min = D_INCONSISTENT; | ||
829 | pdsk_max = D_INCONSISTENT; | ||
830 | break; | ||
831 | case C_STANDALONE: | ||
832 | case C_DISCONNECTING: | ||
833 | case C_UNCONNECTED: | ||
834 | case C_TIMEOUT: | ||
835 | case C_BROKEN_PIPE: | ||
836 | case C_NETWORK_FAILURE: | ||
837 | case C_PROTOCOL_ERROR: | ||
838 | case C_TEAR_DOWN: | ||
839 | case C_WF_CONNECTION: | ||
840 | case C_WF_REPORT_PARAMS: | ||
841 | case C_MASK: | ||
842 | break; | ||
843 | } | ||
844 | if (ns.disk > disk_max) | ||
845 | ns.disk = disk_max; | ||
846 | |||
847 | if (ns.disk < disk_min) { | ||
848 | if (warn) | ||
849 | *warn = IMPLICITLY_UPGRADED_DISK; | ||
850 | ns.disk = disk_min; | ||
851 | } | ||
852 | if (ns.pdsk > pdsk_max) | ||
853 | ns.pdsk = pdsk_max; | ||
854 | |||
855 | if (ns.pdsk < pdsk_min) { | ||
856 | if (warn) | ||
857 | *warn = IMPLICITLY_UPGRADED_PDSK; | ||
858 | ns.pdsk = pdsk_min; | ||
859 | } | ||
860 | |||
861 | if (fp == FP_STONITH && | ||
862 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) | ||
863 | ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ | ||
864 | |||
865 | if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO && | ||
866 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
867 | ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */ | ||
868 | |||
869 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
870 | if (ns.conn == C_SYNC_SOURCE) | ||
871 | ns.conn = C_PAUSED_SYNC_S; | ||
872 | if (ns.conn == C_SYNC_TARGET) | ||
873 | ns.conn = C_PAUSED_SYNC_T; | ||
874 | } else { | ||
875 | if (ns.conn == C_PAUSED_SYNC_S) | ||
876 | ns.conn = C_SYNC_SOURCE; | ||
877 | if (ns.conn == C_PAUSED_SYNC_T) | ||
878 | ns.conn = C_SYNC_TARGET; | ||
879 | } | ||
880 | |||
881 | return ns; | ||
882 | } | ||
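
To make the "connection loss implies unknown peer disk" rule concrete, a sketch (assuming the device was Connected with an UpToDate peer disk before; not part of the patch):

    /* sketch: say ns.conn == C_CONNECTED, ns.pdsk == D_UP_TO_DATE on entry */
    union drbd_state ns = drbd_read_state(mdev);
    ns.conn = C_NETWORK_FAILURE;
    ns = sanitize_state(mdev, ns, NULL);
    /* ns.peer is now R_UNKNOWN and ns.pdsk is D_UNKNOWN:
     * once the link is gone, nothing is known about the peer */
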
883 | |||
884 | void drbd_resume_al(struct drbd_conf *mdev) | ||
885 | { | ||
886 | if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) | ||
887 | dev_info(DEV, "Resumed AL updates\n"); | ||
888 | } | ||
889 | |||
890 | /* helper for __drbd_set_state */ | ||
891 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
892 | { | ||
893 | if (mdev->tconn->agreed_pro_version < 90) | ||
894 | mdev->ov_start_sector = 0; | ||
895 | mdev->rs_total = drbd_bm_bits(mdev); | ||
896 | mdev->ov_position = 0; | ||
897 | if (cs == C_VERIFY_T) { | ||
898 | /* starting online verify from an arbitrary position | ||
899 | * does not fit well into the existing protocol. | ||
900 | * on C_VERIFY_T, we initialize ov_left and friends | ||
901 | * implicitly in receive_DataRequest once the | ||
902 | * first P_OV_REQUEST is received */ | ||
903 | mdev->ov_start_sector = ~(sector_t)0; | ||
904 | } else { | ||
905 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
906 | if (bit >= mdev->rs_total) { | ||
907 | mdev->ov_start_sector = | ||
908 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
909 | mdev->rs_total = 1; | ||
910 | } else | ||
911 | mdev->rs_total -= bit; | ||
912 | mdev->ov_position = mdev->ov_start_sector; | ||
913 | } | ||
914 | mdev->ov_left = mdev->rs_total; | ||
915 | } | ||
916 | |||
917 | /** | ||
918 | * __drbd_set_state() - Set a new DRBD state | ||
919 | * @mdev: DRBD device. | ||
920 | * @ns: new state. | ||
921 | * @flags: Flags | ||
922 | * @done: Optional completion, that will get completed after the after_state_ch() finished | ||
923 | * | ||
924 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
925 | */ | ||
926 | enum drbd_state_rv | ||
927 | __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | ||
928 | enum chg_state_flags flags, struct completion *done) | ||
929 | { | ||
930 | union drbd_state os; | ||
931 | enum drbd_state_rv rv = SS_SUCCESS; | ||
932 | enum sanitize_state_warnings ssw; | ||
933 | struct after_state_chg_work *ascw; | ||
934 | |||
935 | os = drbd_read_state(mdev); | ||
936 | |||
937 | ns = sanitize_state(mdev, ns, &ssw); | ||
938 | if (ns.i == os.i) | ||
939 | return SS_NOTHING_TO_DO; | ||
940 | |||
941 | rv = is_valid_transition(os, ns); | ||
942 | if (rv < SS_SUCCESS) | ||
943 | return rv; | ||
944 | |||
945 | if (!(flags & CS_HARD)) { | ||
946 | /* pre-state-change checks ; only look at ns */ | ||
947 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
948 | |||
949 | rv = is_valid_state(mdev, ns); | ||
950 | if (rv < SS_SUCCESS) { | ||
951 | /* If the old state was illegal as well, then let | ||
952 | this happen...*/ | ||
953 | |||
954 | if (is_valid_state(mdev, os) == rv) | ||
955 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
956 | } else | ||
957 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
958 | } | ||
959 | |||
960 | if (rv < SS_SUCCESS) { | ||
961 | if (flags & CS_VERBOSE) | ||
962 | print_st_err(mdev, os, ns, rv); | ||
963 | return rv; | ||
964 | } | ||
965 | |||
966 | print_sanitize_warnings(mdev, ssw); | ||
967 | |||
968 | drbd_pr_state_change(mdev, os, ns, flags); | ||
969 | |||
970 | /* Display changes to the susp* flags that were caused by the call to | ||
971 | sanitize_state(). Only display it here if we were not called from | ||
972 | _conn_request_state() */ | ||
973 | if (!(flags & CS_DC_SUSP)) | ||
974 | conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP); | ||
975 | |||
976 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference | ||
977 | * on the ldev here, to be sure the transition -> D_DISKLESS resp. | ||
978 | * drbd_ldev_destroy() won't happen before our corresponding | ||
979 | * after_state_ch works run, where we put_ldev again. */ | ||
980 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
981 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
982 | atomic_inc(&mdev->local_cnt); | ||
983 | |||
984 | mdev->state.i = ns.i; | ||
985 | mdev->tconn->susp = ns.susp; | ||
986 | mdev->tconn->susp_nod = ns.susp_nod; | ||
987 | mdev->tconn->susp_fen = ns.susp_fen; | ||
988 | |||
989 | if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) | ||
990 | drbd_print_uuids(mdev, "attached to UUIDs"); | ||
991 | |||
992 | /* Wake up role changes, that were delayed because of connection establishing */ | ||
993 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && | ||
994 | no_peer_wf_report_params(mdev->tconn)) | ||
995 | clear_bit(STATE_SENT, &mdev->tconn->flags); | ||
996 | |||
997 | wake_up(&mdev->misc_wait); | ||
998 | wake_up(&mdev->state_wait); | ||
999 | wake_up(&mdev->tconn->ping_wait); | ||
1000 | |||
1001 | /* Aborted verify run, or we reached the stop sector. | ||
1002 | * Log the last position, unless end-of-device. */ | ||
1003 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1004 | ns.conn <= C_CONNECTED) { | ||
1005 | mdev->ov_start_sector = | ||
1006 | BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); | ||
1007 | if (mdev->ov_left) | ||
1008 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1009 | (unsigned long long)mdev->ov_start_sector); | ||
1010 | } | ||
1011 | |||
1012 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1013 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1014 | dev_info(DEV, "Syncer continues.\n"); | ||
1015 | mdev->rs_paused += (long)jiffies | ||
1016 | -(long)mdev->rs_mark_time[mdev->rs_last_mark]; | ||
1017 | if (ns.conn == C_SYNC_TARGET) | ||
1018 | mod_timer(&mdev->resync_timer, jiffies); | ||
1019 | } | ||
1020 | |||
1021 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1022 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1023 | dev_info(DEV, "Resync suspended\n"); | ||
1024 | mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; | ||
1025 | } | ||
1026 | |||
1027 | if (os.conn == C_CONNECTED && | ||
1028 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1029 | unsigned long now = jiffies; | ||
1030 | int i; | ||
1031 | |||
1032 | set_ov_position(mdev, ns.conn); | ||
1033 | mdev->rs_start = now; | ||
1034 | mdev->rs_last_events = 0; | ||
1035 | mdev->rs_last_sect_ev = 0; | ||
1036 | mdev->ov_last_oos_size = 0; | ||
1037 | mdev->ov_last_oos_start = 0; | ||
1038 | |||
1039 | for (i = 0; i < DRBD_SYNC_MARKS; i++) { | ||
1040 | mdev->rs_mark_left[i] = mdev->ov_left; | ||
1041 | mdev->rs_mark_time[i] = now; | ||
1042 | } | ||
1043 | |||
1044 | drbd_rs_controller_reset(mdev); | ||
1045 | |||
1046 | if (ns.conn == C_VERIFY_S) { | ||
1047 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1048 | (unsigned long long)mdev->ov_position); | ||
1049 | mod_timer(&mdev->resync_timer, jiffies); | ||
1050 | } | ||
1051 | } | ||
1052 | |||
1053 | if (get_ldev(mdev)) { | ||
1054 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1055 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1056 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1057 | |||
1058 | mdf &= ~MDF_AL_CLEAN; | ||
1059 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1060 | mdf |= MDF_CRASHED_PRIMARY; | ||
1061 | if (mdev->state.role == R_PRIMARY || | ||
1062 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1063 | mdf |= MDF_PRIMARY_IND; | ||
1064 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1065 | mdf |= MDF_CONNECTED_IND; | ||
1066 | if (mdev->state.disk > D_INCONSISTENT) | ||
1067 | mdf |= MDF_CONSISTENT; | ||
1068 | if (mdev->state.disk > D_OUTDATED) | ||
1069 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1070 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1071 | mdf |= MDF_PEER_OUT_DATED; | ||
1072 | if (mdf != mdev->ldev->md.flags) { | ||
1073 | mdev->ldev->md.flags = mdf; | ||
1074 | drbd_md_mark_dirty(mdev); | ||
1075 | } | ||
1076 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1077 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1078 | put_ldev(mdev); | ||
1079 | } | ||
1080 | |||
1081 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1082 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1083 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1084 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1085 | |||
1086 | /* Receiver should clean up itself */ | ||
1087 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1088 | drbd_thread_stop_nowait(&mdev->tconn->receiver); | ||
1089 | |||
1090 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1091 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1092 | drbd_thread_stop_nowait(&mdev->tconn->receiver); | ||
1093 | |||
1094 | /* Upon network failure, we need to restart the receiver. */ | ||
1095 | if (os.conn > C_WF_CONNECTION && | ||
1096 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1097 | drbd_thread_restart_nowait(&mdev->tconn->receiver); | ||
1098 | |||
1099 | /* Resume AL writing if we get a connection */ | ||
1100 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1101 | drbd_resume_al(mdev); | ||
1102 | |||
1103 | /* remember last attach time so request_timer_fn() won't | ||
1104 | * kill newly established sessions while we are still trying to thaw | ||
1105 | * previously frozen IO */ | ||
1106 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1107 | ns.disk > D_NEGOTIATING) | ||
1108 | mdev->last_reattach_jif = jiffies; | ||
1109 | |||
1110 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1111 | if (ascw) { | ||
1112 | ascw->os = os; | ||
1113 | ascw->ns = ns; | ||
1114 | ascw->flags = flags; | ||
1115 | ascw->w.cb = w_after_state_ch; | ||
1116 | ascw->w.mdev = mdev; | ||
1117 | ascw->done = done; | ||
1118 | drbd_queue_work(&mdev->tconn->sender_work, &ascw->w); | ||
1119 | } else { | ||
1120 | dev_err(DEV, "Could not kmalloc an ascw\n"); | ||
1121 | } | ||
1122 | |||
1123 | return rv; | ||
1124 | } | ||
1125 | |||
1126 | static int w_after_state_ch(struct drbd_work *w, int unused) | ||
1127 | { | ||
1128 | struct after_state_chg_work *ascw = | ||
1129 | container_of(w, struct after_state_chg_work, w); | ||
1130 | struct drbd_conf *mdev = w->mdev; | ||
1131 | |||
1132 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1133 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1134 | D_ASSERT(ascw->done != NULL); | ||
1135 | complete(ascw->done); | ||
1136 | } | ||
1137 | kfree(ascw); | ||
1138 | |||
1139 | return 0; | ||
1140 | } | ||
1141 | |||
1142 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1143 | { | ||
1144 | if (rv) { | ||
1145 | dev_err(DEV, "Writing the bitmap failed, not starting resync.\n"); | ||
1146 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1147 | return; | ||
1148 | } | ||
1149 | |||
1150 | switch (mdev->state.conn) { | ||
1151 | case C_STARTING_SYNC_T: | ||
1152 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1153 | break; | ||
1154 | case C_STARTING_SYNC_S: | ||
1155 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1156 | break; | ||
1157 | } | ||
1158 | } | ||
1159 | |||
1160 | int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1161 | int (*io_fn)(struct drbd_conf *), | ||
1162 | char *why, enum bm_flag flags) | ||
1163 | { | ||
1164 | int rv; | ||
1165 | |||
1166 | D_ASSERT(current == mdev->tconn->worker.task); | ||
1167 | |||
1168 | /* open coded non-blocking drbd_suspend_io(mdev); */ | ||
1169 | set_bit(SUSPEND_IO, &mdev->flags); | ||
1170 | |||
1171 | drbd_bm_lock(mdev, why, flags); | ||
1172 | rv = io_fn(mdev); | ||
1173 | drbd_bm_unlock(mdev); | ||
1174 | |||
1175 | drbd_resume_io(mdev); | ||
1176 | |||
1177 | return rv; | ||
1178 | } | ||
1179 | |||
1180 | /** | ||
1181 | * after_state_ch() - Perform after state change actions that may sleep | ||
1182 | * @mdev: DRBD device. | ||
1183 | * @os: old state. | ||
1184 | * @ns: new state. | ||
1185 | * @flags: Flags | ||
1186 | */ | ||
1187 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1188 | union drbd_state ns, enum chg_state_flags flags) | ||
1189 | { | ||
1190 | struct sib_info sib; | ||
1191 | |||
1192 | sib.sib_reason = SIB_STATE_CHANGE; | ||
1193 | sib.os = os; | ||
1194 | sib.ns = ns; | ||
1195 | |||
1196 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1197 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1198 | if (mdev->p_uuid) | ||
1199 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1200 | } | ||
1201 | |||
1202 | /* Inform userspace about the change... */ | ||
1203 | drbd_bcast_event(mdev, &sib); | ||
1204 | |||
1205 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1206 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1207 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1208 | |||
1209 | /* Here we have the actions that are performed after a | ||
1210 | state change. This function might sleep */ | ||
1211 | |||
1212 | if (ns.susp_nod) { | ||
1213 | struct drbd_tconn *tconn = mdev->tconn; | ||
1214 | enum drbd_req_event what = NOTHING; | ||
1215 | |||
1216 | spin_lock_irq(&tconn->req_lock); | ||
1217 | if (os.conn < C_CONNECTED && conn_lowest_conn(tconn) >= C_CONNECTED) | ||
1218 | what = RESEND; | ||
1219 | |||
1220 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1221 | conn_lowest_disk(tconn) > D_NEGOTIATING) | ||
1222 | what = RESTART_FROZEN_DISK_IO; | ||
1223 | |||
1224 | if (tconn->susp_nod && what != NOTHING) { | ||
1225 | _tl_restart(tconn, what); | ||
1226 | _conn_request_state(tconn, | ||
1227 | (union drbd_state) { { .susp_nod = 1 } }, | ||
1228 | (union drbd_state) { { .susp_nod = 0 } }, | ||
1229 | CS_VERBOSE); | ||
1230 | } | ||
1231 | spin_unlock_irq(&tconn->req_lock); | ||
1232 | } | ||
1233 | |||
1234 | if (ns.susp_fen) { | ||
1235 | struct drbd_tconn *tconn = mdev->tconn; | ||
1236 | |||
1237 | spin_lock_irq(&tconn->req_lock); | ||
1238 | if (tconn->susp_fen && conn_lowest_conn(tconn) >= C_CONNECTED) { | ||
1239 | /* case2: The connection was established again: */ | ||
1240 | struct drbd_conf *odev; | ||
1241 | int vnr; | ||
1242 | |||
1243 | rcu_read_lock(); | ||
1244 | idr_for_each_entry(&tconn->volumes, odev, vnr) | ||
1245 | clear_bit(NEW_CUR_UUID, &odev->flags); | ||
1246 | rcu_read_unlock(); | ||
1247 | _tl_restart(tconn, RESEND); | ||
1248 | _conn_request_state(tconn, | ||
1249 | (union drbd_state) { { .susp_fen = 1 } }, | ||
1250 | (union drbd_state) { { .susp_fen = 0 } }, | ||
1251 | CS_VERBOSE); | ||
1252 | } | ||
1253 | spin_unlock_irq(&tconn->req_lock); | ||
1254 | } | ||
1255 | |||
1256 | /* Became sync source. With protocol >= 96, we still need to send out | ||
1257 | * the sync uuid now. Need to do that before any drbd_send_state, or | ||
1258 | * the other side may go "paused sync" before receiving the sync uuids, | ||
1259 | * which is unexpected. */ | ||
1260 | if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && | ||
1261 | (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && | ||
1262 | mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) { | ||
1263 | drbd_gen_and_send_sync_uuid(mdev); | ||
1264 | put_ldev(mdev); | ||
1265 | } | ||
1266 | |||
1267 | /* Do not change the order of the if above and the two below... */ | ||
1268 | if (os.pdsk == D_DISKLESS && | ||
1269 | ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ | ||
1270 | /* we probably will start a resync soon. | ||
1271 | * make sure those things are properly reset. */ | ||
1272 | mdev->rs_total = 0; | ||
1273 | mdev->rs_failed = 0; | ||
1274 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1275 | drbd_rs_cancel_all(mdev); | ||
1276 | |||
1277 | drbd_send_uuids(mdev); | ||
1278 | drbd_send_state(mdev, ns); | ||
1279 | } | ||
1280 | /* No point in queuing send_bitmap if we don't have a connection | ||
1281 | * anymore, so check also the _current_ state, not only the new state | ||
1282 | * at the time this work was queued. */ | ||
1283 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && | ||
1284 | mdev->state.conn == C_WF_BITMAP_S) | ||
1285 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, | ||
1286 | "send_bitmap (WFBitMapS)", | ||
1287 | BM_LOCKED_TEST_ALLOWED); | ||
1288 | |||
1289 | /* Lost contact to peer's copy of the data */ | ||
1290 | if ((os.pdsk >= D_INCONSISTENT && | ||
1291 | os.pdsk != D_UNKNOWN && | ||
1292 | os.pdsk != D_OUTDATED) | ||
1293 | && (ns.pdsk < D_INCONSISTENT || | ||
1294 | ns.pdsk == D_UNKNOWN || | ||
1295 | ns.pdsk == D_OUTDATED)) { | ||
1296 | if (get_ldev(mdev)) { | ||
1297 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1298 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1299 | if (drbd_suspended(mdev)) { | ||
1300 | set_bit(NEW_CUR_UUID, &mdev->flags); | ||
1301 | } else { | ||
1302 | drbd_uuid_new_current(mdev); | ||
1303 | drbd_send_uuids(mdev); | ||
1304 | } | ||
1305 | } | ||
1306 | put_ldev(mdev); | ||
1307 | } | ||
1308 | } | ||
1309 | |||
1310 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1311 | if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && | ||
1312 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1313 | drbd_uuid_new_current(mdev); | ||
1314 | drbd_send_uuids(mdev); | ||
1315 | } | ||
1316 | /* D_DISKLESS Peer becomes secondary */ | ||
1317 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1318 | /* We may still be Primary ourselves. | ||
1319 | * No harm done if the bitmap still changes, | ||
1320 | * redirtied pages will follow later. */ | ||
1321 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1322 | "demote diskless peer", BM_LOCKED_SET_ALLOWED); | ||
1323 | put_ldev(mdev); | ||
1324 | } | ||
1325 | |||
1326 | /* Write out all changed bits on demote. | ||
1327 | * Though, no need to do that just yet | ||
1328 | * if there is a resync going on still */ | ||
1329 | if (os.role == R_PRIMARY && ns.role == R_SECONDARY && | ||
1330 | mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1331 | /* No changes to the bitmap expected this time, so assert that, | ||
1332 | * even though no harm was done if it did change. */ | ||
1333 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1334 | "demote", BM_LOCKED_TEST_ALLOWED); | ||
1335 | put_ldev(mdev); | ||
1336 | } | ||
1337 | |||
1338 | /* Last part of the attaching process ... */ | ||
1339 | if (ns.conn >= C_CONNECTED && | ||
1340 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1341 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ | ||
1342 | drbd_send_uuids(mdev); | ||
1343 | drbd_send_state(mdev, ns); | ||
1344 | } | ||
1345 | |||
1346 | /* We want to pause/continue resync, tell peer. */ | ||
1347 | if (ns.conn >= C_CONNECTED && | ||
1348 | ((os.aftr_isp != ns.aftr_isp) || | ||
1349 | (os.user_isp != ns.user_isp))) | ||
1350 | drbd_send_state(mdev, ns); | ||
1351 | |||
1352 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1353 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1354 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1355 | suspend_other_sg(mdev); | ||
1356 | |||
1357 | /* Make sure the peer gets informed about eventual state | ||
1358 | changes (ISP bits) while we were in WFReportParams. */ | ||
1359 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1360 | drbd_send_state(mdev, ns); | ||
1361 | |||
1362 | if (os.conn != C_AHEAD && ns.conn == C_AHEAD) | ||
1363 | drbd_send_state(mdev, ns); | ||
1364 | |||
1365 | /* We are in the process of starting a full sync... */ | ||
1366 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1367 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1368 | /* no other bitmap changes expected during this phase */ | ||
1369 | drbd_queue_bitmap_io(mdev, | ||
1370 | &drbd_bmio_set_n_write, &abw_start_sync, | ||
1371 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | ||
1372 | |||
1373 | /* We are invalidating ourselves... */ | ||
1374 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1375 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1376 | /* other bitmap operation expected during this phase */ | ||
1377 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1378 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1379 | |||
1380 | /* first half of local IO error, failure to attach, | ||
1381 | * or administrative detach */ | ||
1382 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | ||
1383 | enum drbd_io_error_p eh = EP_PASS_ON; | ||
1384 | int was_io_error = 0; | ||
1385 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1386 | * our cleanup here with the transition to D_DISKLESS. | ||
1387 | * But it is still not safe to dereference ldev here, since | ||
1388 | * we might come from a failed Attach before ldev was set. */ | ||
1389 | if (mdev->ldev) { | ||
1390 | rcu_read_lock(); | ||
1391 | eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; | ||
1392 | rcu_read_unlock(); | ||
1393 | |||
1394 | was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1395 | |||
1396 | if (was_io_error && eh == EP_CALL_HELPER) | ||
1397 | drbd_khelper(mdev, "local-io-error"); | ||
1398 | |||
1399 | /* Immediately allow completion of all application IO, | ||
1400 | * that waits for completion from the local disk, | ||
1401 | * if this was a force-detach due to disk_timeout | ||
1402 | * or administrator request (drbdsetup detach --force). | ||
1403 | * Do NOT abort otherwise. | ||
1404 | * Aborting local requests may cause serious problems, | ||
1405 | * if requests are completed to upper layers already, | ||
1406 | * and then later the already submitted local bio completes. | ||
1407 | * This can cause DMA into former bio pages that meanwhile | ||
1408 | * have been re-used for other things. | ||
1409 | * So aborting local requests may cause crashes, | ||
1410 | * or even worse, silent data corruption. | ||
1411 | */ | ||
1412 | if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) | ||
1413 | tl_abort_disk_io(mdev); | ||
1414 | |||
1415 | /* current state still has to be D_FAILED, | ||
1416 | * there is only one way out: to D_DISKLESS, | ||
1417 | * and that may only happen after our put_ldev below. */ | ||
1418 | if (mdev->state.disk != D_FAILED) | ||
1419 | dev_err(DEV, | ||
1420 | "ASSERT FAILED: disk is %s during detach\n", | ||
1421 | drbd_disk_str(mdev->state.disk)); | ||
1422 | |||
1423 | if (ns.conn >= C_CONNECTED) | ||
1424 | drbd_send_state(mdev, ns); | ||
1425 | |||
1426 | drbd_rs_cancel_all(mdev); | ||
1427 | |||
1428 | /* In case we want to get something to stable storage still, | ||
1429 | * this may be the last chance. | ||
1430 | * Following put_ldev may transition to D_DISKLESS. */ | ||
1431 | drbd_md_sync(mdev); | ||
1432 | } | ||
1433 | put_ldev(mdev); | ||
1434 | } | ||
1435 | |||
1436 | /* second half of local IO error, failure to attach, | ||
1437 | * or administrative detach, | ||
1438 | * after local_cnt references have reached zero again */ | ||
1439 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1440 | /* We must still be diskless, | ||
1441 | * re-attach has to be serialized with this! */ | ||
1442 | if (mdev->state.disk != D_DISKLESS) | ||
1443 | dev_err(DEV, | ||
1444 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1445 | drbd_disk_str(mdev->state.disk)); | ||
1446 | |||
1447 | if (ns.conn >= C_CONNECTED) | ||
1448 | drbd_send_state(mdev, ns); | ||
1449 | /* corresponding get_ldev in __drbd_set_state | ||
1450 | * this may finally trigger drbd_ldev_destroy. */ | ||
1451 | put_ldev(mdev); | ||
1452 | } | ||
1453 | |||
1454 | /* Notify peer that I had a local IO error and did not detach. */ | ||
1455 | if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) | ||
1456 | drbd_send_state(mdev, ns); | ||
1457 | |||
1458 | /* Disks got bigger while they were detached */ | ||
1459 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1460 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1461 | if (ns.conn == C_CONNECTED) | ||
1462 | resync_after_online_grow(mdev); | ||
1463 | } | ||
1464 | |||
1465 | /* A resync finished or aborted, wake paused devices... */ | ||
1466 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1467 | (os.peer_isp && !ns.peer_isp) || | ||
1468 | (os.user_isp && !ns.user_isp)) | ||
1469 | resume_next_sg(mdev); | ||
1470 | |||
1471 | /* sync target done with resync. Explicitly notify peer, even though | ||
1472 | * it should (at least for non-empty resyncs) already know itself. */ | ||
1473 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1474 | drbd_send_state(mdev, ns); | ||
1475 | |||
1476 | /* Verify finished, or reached stop sector. Peer did not know about | ||
1477 | * the stop sector, and we may even have changed the stop sector during | ||
1478 | * verify to interrupt/stop early. Send the new state. */ | ||
1479 | if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED | ||
1480 | && verify_can_do_stop_sector(mdev)) | ||
1481 | drbd_send_state(mdev, ns); | ||
1482 | |||
1483 | /* This triggers bitmap writeout of potentially still unwritten pages | ||
1484 | * if the resync finished cleanly, or aborted because of peer disk | ||
1485 | * failure, or because of connection loss. | ||
1486 | * For resync aborted because of local disk failure, we cannot do | ||
1487 | * any bitmap writeout anymore. | ||
1488 | * No harm done if some bits change during this phase. | ||
1489 | */ | ||
1490 | if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1491 | drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, | ||
1492 | "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); | ||
1493 | put_ldev(mdev); | ||
1494 | } | ||
1495 | |||
1496 | if (ns.disk == D_DISKLESS && | ||
1497 | ns.conn == C_STANDALONE && | ||
1498 | ns.role == R_SECONDARY) { | ||
1499 | if (os.aftr_isp != ns.aftr_isp) | ||
1500 | resume_next_sg(mdev); | ||
1501 | } | ||
1502 | |||
1503 | drbd_md_sync(mdev); | ||
1504 | } | ||
1505 | |||
1506 | struct after_conn_state_chg_work { | ||
1507 | struct drbd_work w; | ||
1508 | enum drbd_conns oc; | ||
1509 | union drbd_state ns_min; | ||
1510 | union drbd_state ns_max; /* new, max state, over all mdevs */ | ||
1511 | enum chg_state_flags flags; | ||
1512 | }; | ||
1513 | |||
1514 | static int w_after_conn_state_ch(struct drbd_work *w, int unused) | ||
1515 | { | ||
1516 | struct after_conn_state_chg_work *acscw = | ||
1517 | container_of(w, struct after_conn_state_chg_work, w); | ||
1518 | struct drbd_tconn *tconn = w->tconn; | ||
1519 | enum drbd_conns oc = acscw->oc; | ||
1520 | union drbd_state ns_max = acscw->ns_max; | ||
1521 | struct drbd_conf *mdev; | ||
1522 | int vnr; | ||
1523 | |||
1524 | kfree(acscw); | ||
1525 | |||
1526 | /* Upon network configuration, we need to start the receiver */ | ||
1527 | if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) | ||
1528 | drbd_thread_start(&tconn->receiver); | ||
1529 | |||
1530 | if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { | ||
1531 | struct net_conf *old_conf; | ||
1532 | |||
1533 | mutex_lock(&tconn->conf_update); | ||
1534 | old_conf = tconn->net_conf; | ||
1535 | tconn->my_addr_len = 0; | ||
1536 | tconn->peer_addr_len = 0; | ||
1537 | rcu_assign_pointer(tconn->net_conf, NULL); | ||
1538 | conn_free_crypto(tconn); | ||
1539 | mutex_unlock(&tconn->conf_update); | ||
1540 | |||
1541 | synchronize_rcu(); | ||
1542 | kfree(old_conf); | ||
1543 | } | ||
1544 | |||
1545 | if (ns_max.susp_fen) { | ||
1546 | /* case1: The outdate peer handler is successful: */ | ||
1547 | if (ns_max.pdsk <= D_OUTDATED) { | ||
1548 | rcu_read_lock(); | ||
1549 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1550 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | ||
1551 | drbd_uuid_new_current(mdev); | ||
1552 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1553 | } | ||
1554 | } | ||
1555 | rcu_read_unlock(); | ||
1556 | spin_lock_irq(&tconn->req_lock); | ||
1557 | _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); | ||
1558 | _conn_request_state(tconn, | ||
1559 | (union drbd_state) { { .susp_fen = 1 } }, | ||
1560 | (union drbd_state) { { .susp_fen = 0 } }, | ||
1561 | CS_VERBOSE); | ||
1562 | spin_unlock_irq(&tconn->req_lock); | ||
1563 | } | ||
1564 | } | ||
1565 | kref_put(&tconn->kref, &conn_destroy); | ||
1566 | |||
1567 | conn_md_sync(tconn); | ||
1568 | |||
1569 | return 0; | ||
1570 | } | ||
1571 | |||
1572 | void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf) | ||
1573 | { | ||
1574 | enum chg_state_flags flags = ~0; | ||
1575 | struct drbd_conf *mdev; | ||
1576 | int vnr, first_vol = 1; | ||
1577 | union drbd_dev_state os, cs = { | ||
1578 | { .role = R_SECONDARY, | ||
1579 | .peer = R_UNKNOWN, | ||
1580 | .conn = tconn->cstate, | ||
1581 | .disk = D_DISKLESS, | ||
1582 | .pdsk = D_UNKNOWN, | ||
1583 | } }; | ||
1584 | |||
1585 | rcu_read_lock(); | ||
1586 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1587 | os = mdev->state; | ||
1588 | |||
1589 | if (first_vol) { | ||
1590 | cs = os; | ||
1591 | first_vol = 0; | ||
1592 | continue; | ||
1593 | } | ||
1594 | |||
1595 | if (cs.role != os.role) | ||
1596 | flags &= ~CS_DC_ROLE; | ||
1597 | |||
1598 | if (cs.peer != os.peer) | ||
1599 | flags &= ~CS_DC_PEER; | ||
1600 | |||
1601 | if (cs.conn != os.conn) | ||
1602 | flags &= ~CS_DC_CONN; | ||
1603 | |||
1604 | if (cs.disk != os.disk) | ||
1605 | flags &= ~CS_DC_DISK; | ||
1606 | |||
1607 | if (cs.pdsk != os.pdsk) | ||
1608 | flags &= ~CS_DC_PDSK; | ||
1609 | } | ||
1610 | rcu_read_unlock(); | ||
1611 | |||
1612 | *pf |= CS_DC_MASK; | ||
1613 | *pf &= flags; | ||
1614 | (*pcs).i = cs.i; | ||
1615 | } | ||
1616 | |||
1617 | static enum drbd_state_rv | ||
1618 | conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1619 | enum chg_state_flags flags) | ||
1620 | { | ||
1621 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1622 | union drbd_state ns, os; | ||
1623 | struct drbd_conf *mdev; | ||
1624 | int vnr; | ||
1625 | |||
1626 | rcu_read_lock(); | ||
1627 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1628 | os = drbd_read_state(mdev); | ||
1629 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
1630 | |||
1631 | if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) | ||
1632 | ns.disk = os.disk; | ||
1633 | |||
1634 | if (ns.i == os.i) | ||
1635 | continue; | ||
1636 | |||
1637 | rv = is_valid_transition(os, ns); | ||
1638 | if (rv < SS_SUCCESS) | ||
1639 | break; | ||
1640 | |||
1641 | if (!(flags & CS_HARD)) { | ||
1642 | rv = is_valid_state(mdev, ns); | ||
1643 | if (rv < SS_SUCCESS) { | ||
1644 | if (is_valid_state(mdev, os) == rv) | ||
1645 | rv = is_valid_soft_transition(os, ns, tconn); | ||
1646 | } else | ||
1647 | rv = is_valid_soft_transition(os, ns, tconn); | ||
1648 | } | ||
1649 | if (rv < SS_SUCCESS) | ||
1650 | break; | ||
1651 | } | ||
1652 | rcu_read_unlock(); | ||
1653 | |||
1654 | if (rv < SS_SUCCESS && flags & CS_VERBOSE) | ||
1655 | print_st_err(mdev, os, ns, rv); | ||
1656 | |||
1657 | return rv; | ||
1658 | } | ||
1659 | |||
1660 | void | ||
1661 | conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1662 | union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) | ||
1663 | { | ||
1664 | union drbd_state ns, os, ns_max = { }; | ||
1665 | union drbd_state ns_min = { | ||
1666 | { .role = R_MASK, | ||
1667 | .peer = R_MASK, | ||
1668 | .conn = val.conn, | ||
1669 | .disk = D_MASK, | ||
1670 | .pdsk = D_MASK | ||
1671 | } }; | ||
1672 | struct drbd_conf *mdev; | ||
1673 | enum drbd_state_rv rv; | ||
1674 | int vnr, number_of_volumes = 0; | ||
1675 | |||
1676 | if (mask.conn == C_MASK) { | ||
1677 | /* remember last connect time so request_timer_fn() won't | ||
1678 | * kill newly established sessions while we are still trying to thaw | ||
1679 | * previously frozen IO */ | ||
1680 | if (tconn->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) | ||
1681 | tconn->last_reconnect_jif = jiffies; | ||
1682 | |||
1683 | tconn->cstate = val.conn; | ||
1684 | } | ||
1685 | |||
1686 | rcu_read_lock(); | ||
1687 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1688 | number_of_volumes++; | ||
1689 | os = drbd_read_state(mdev); | ||
1690 | ns = apply_mask_val(os, mask, val); | ||
1691 | ns = sanitize_state(mdev, ns, NULL); | ||
1692 | |||
1693 | if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) | ||
1694 | ns.disk = os.disk; | ||
1695 | |||
1696 | rv = __drbd_set_state(mdev, ns, flags, NULL); | ||
1697 | if (rv < SS_SUCCESS) | ||
1698 | BUG(); | ||
1699 | |||
1700 | ns.i = mdev->state.i; | ||
1701 | ns_max.role = max_role(ns.role, ns_max.role); | ||
1702 | ns_max.peer = max_role(ns.peer, ns_max.peer); | ||
1703 | ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); | ||
1704 | ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); | ||
1705 | ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); | ||
1706 | |||
1707 | ns_min.role = min_role(ns.role, ns_min.role); | ||
1708 | ns_min.peer = min_role(ns.peer, ns_min.peer); | ||
1709 | ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); | ||
1710 | ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); | ||
1711 | ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); | ||
1712 | } | ||
1713 | rcu_read_unlock(); | ||
1714 | |||
1715 | if (number_of_volumes == 0) { | ||
1716 | ns_min = ns_max = (union drbd_state) { { | ||
1717 | .role = R_SECONDARY, | ||
1718 | .peer = R_UNKNOWN, | ||
1719 | .conn = val.conn, | ||
1720 | .disk = D_DISKLESS, | ||
1721 | .pdsk = D_UNKNOWN | ||
1722 | } }; | ||
1723 | } | ||
1724 | |||
1725 | ns_min.susp = ns_max.susp = tconn->susp; | ||
1726 | ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod; | ||
1727 | ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen; | ||
1728 | |||
1729 | *pns_min = ns_min; | ||
1730 | *pns_max = ns_max; | ||
1731 | } | ||
1732 | |||
1733 | static enum drbd_state_rv | ||
1734 | _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) | ||
1735 | { | ||
1736 | enum drbd_state_rv rv; | ||
1737 | |||
1738 | if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags)) | ||
1739 | return SS_CW_SUCCESS; | ||
1740 | |||
1741 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) | ||
1742 | return SS_CW_FAILED_BY_PEER; | ||
1743 | |||
1744 | rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; | ||
1745 | |||
1746 | if (rv == SS_UNKNOWN_ERROR) | ||
1747 | rv = conn_is_valid_transition(tconn, mask, val, 0); | ||
1748 | |||
1749 | if (rv == SS_SUCCESS) | ||
1750 | rv = SS_UNKNOWN_ERROR; /* continue waiting, otherwise fail. */ | ||
1751 | |||
1752 | return rv; | ||
1753 | } | ||
1754 | |||
1755 | enum drbd_state_rv | ||
1756 | _conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1757 | enum chg_state_flags flags) | ||
1758 | { | ||
1759 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1760 | struct after_conn_state_chg_work *acscw; | ||
1761 | enum drbd_conns oc = tconn->cstate; | ||
1762 | union drbd_state ns_max, ns_min, os; | ||
1763 | bool have_mutex = false; | ||
1764 | |||
1765 | if (mask.conn) { | ||
1766 | rv = is_valid_conn_transition(oc, val.conn); | ||
1767 | if (rv < SS_SUCCESS) | ||
1768 | goto abort; | ||
1769 | } | ||
1770 | |||
1771 | rv = conn_is_valid_transition(tconn, mask, val, flags); | ||
1772 | if (rv < SS_SUCCESS) | ||
1773 | goto abort; | ||
1774 | |||
1775 | if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && | ||
1776 | !(flags & (CS_LOCAL_ONLY | CS_HARD))) { | ||
1777 | |||
1778 | /* This will be a cluster-wide state change. | ||
1779 | * Need to give up the spinlock, grab the mutex, | ||
1780 | * then send the state change request, ... */ | ||
1781 | spin_unlock_irq(&tconn->req_lock); | ||
1782 | mutex_lock(&tconn->cstate_mutex); | ||
1783 | have_mutex = true; | ||
1784 | |||
1785 | set_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1786 | if (conn_send_state_req(tconn, mask, val)) { | ||
1787 | /* sending failed. */ | ||
1788 | clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1789 | rv = SS_CW_FAILED_BY_PEER; | ||
1790 | /* need to re-acquire the spin lock, though */ | ||
1791 | goto abort_unlocked; | ||
1792 | } | ||
1793 | |||
1794 | if (val.conn == C_DISCONNECTING) | ||
1795 | set_bit(DISCONNECT_SENT, &tconn->flags); | ||
1796 | |||
1797 | /* ... and re-acquire the spinlock. | ||
1798 | * If _conn_rq_cond() returned >= SS_SUCCESS, we must call | ||
1799 | * conn_set_state() within the same spinlock. */ | ||
1800 | spin_lock_irq(&tconn->req_lock); | ||
1801 | wait_event_lock_irq(tconn->ping_wait, | ||
1802 | (rv = _conn_rq_cond(tconn, mask, val)), | ||
1803 | tconn->req_lock, | ||
1804 | ); | ||
1805 | clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1806 | if (rv < SS_SUCCESS) | ||
1807 | goto abort; | ||
1808 | } | ||
1809 | |||
1810 | conn_old_common_state(tconn, &os, &flags); | ||
1811 | flags |= CS_DC_SUSP; | ||
1812 | conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags); | ||
1813 | conn_pr_state_change(tconn, os, ns_max, flags); | ||
1814 | |||
1815 | acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); | ||
1816 | if (acscw) { | ||
1817 | acscw->oc = os.conn; | ||
1818 | acscw->ns_min = ns_min; | ||
1819 | acscw->ns_max = ns_max; | ||
1820 | acscw->flags = flags; | ||
1821 | acscw->w.cb = w_after_conn_state_ch; | ||
1822 | kref_get(&tconn->kref); | ||
1823 | acscw->w.tconn = tconn; | ||
1824 | drbd_queue_work(&tconn->sender_work, &acscw->w); | ||
1825 | } else { | ||
1826 | conn_err(tconn, "Could not kmalloc an acscw\n"); | ||
1827 | } | ||
1828 | |||
1829 | abort: | ||
1830 | if (have_mutex) { | ||
1831 | /* mutex_unlock() "... must not be used in interrupt context.", | ||
1832 | * so give up the spinlock, then re-acquire it */ | ||
1833 | spin_unlock_irq(&tconn->req_lock); | ||
1834 | abort_unlocked: | ||
1835 | mutex_unlock(&tconn->cstate_mutex); | ||
1836 | spin_lock_irq(&tconn->req_lock); | ||
1837 | } | ||
1838 | if (rv < SS_SUCCESS && flags & CS_VERBOSE) { | ||
1839 | conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv)); | ||
1840 | conn_err(tconn, " mask = 0x%x val = 0x%x\n", mask.i, val.i); | ||
1841 | conn_err(tconn, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); | ||
1842 | } | ||
1843 | return rv; | ||
1844 | } | ||
1845 | |||
1846 | enum drbd_state_rv | ||
1847 | conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1848 | enum chg_state_flags flags) | ||
1849 | { | ||
1850 | enum drbd_state_rv rv; | ||
1851 | |||
1852 | spin_lock_irq(&tconn->req_lock); | ||
1853 | rv = _conn_request_state(tconn, mask, val, flags); | ||
1854 | spin_unlock_irq(&tconn->req_lock); | ||
1855 | |||
1856 | return rv; | ||
1857 | } | ||
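
Side note (not part of the patch): a minimal caller-side sketch of how the connection-wide machinery above is typically driven. The function name is hypothetical; the mask/val pair and flags mirror what _conn_request_state() expects, and conn_request_state() takes req_lock itself, so the caller must not hold it.

        /* Illustrative only: ask for C_DISCONNECTING on the whole connection.
         * mask selects the field to change, val carries the new value. */
        static void example_force_disconnect(struct drbd_tconn *tconn)
        {
                enum drbd_state_rv rv;

                rv = conn_request_state(tconn,
                                (union drbd_state) { { .conn = C_MASK } },
                                (union drbd_state) { { .conn = C_DISCONNECTING } },
                                CS_HARD);
                if (rv < SS_SUCCESS)
                        conn_err(tconn, "example: state change refused (%d)\n", rv);
        }
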
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h new file mode 100644 index 000000000000..a3c361bbc4b6 --- /dev/null +++ b/drivers/block/drbd/drbd_state.h | |||
@@ -0,0 +1,161 @@ | |||
1 | #ifndef DRBD_STATE_H | ||
2 | #define DRBD_STATE_H | ||
3 | |||
4 | struct drbd_conf; | ||
5 | struct drbd_tconn; | ||
6 | |||
7 | /** | ||
8 | * DOC: DRBD State macros | ||
9 | * | ||
10 | * These macros are used to express state changes in easily readable form. | ||
11 | * | ||
12 | * The NS macros expand to a mask and a value that can be bit-or'ed onto the | ||
13 | * current state as soon as the spinlock (req_lock) has been taken. | ||
14 | * | ||
15 | * The _NS macros are used for state functions that get called with the | ||
16 | * spinlock held. These macros expand directly to the new state value. | ||
17 | * | ||
18 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
19 | * to express state changes that affect more than one aspect of the state. | ||
20 | * | ||
21 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
22 | * means that the network connection was established and that the peer | ||
23 | * is in secondary role. | ||
24 | */ | ||
25 | #define role_MASK R_MASK | ||
26 | #define peer_MASK R_MASK | ||
27 | #define disk_MASK D_MASK | ||
28 | #define pdsk_MASK D_MASK | ||
29 | #define conn_MASK C_MASK | ||
30 | #define susp_MASK 1 | ||
31 | #define user_isp_MASK 1 | ||
32 | #define aftr_isp_MASK 1 | ||
33 | #define susp_nod_MASK 1 | ||
34 | #define susp_fen_MASK 1 | ||
35 | |||
36 | #define NS(T, S) \ | ||
37 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
38 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
39 | #define NS2(T1, S1, T2, S2) \ | ||
40 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
41 | mask.T2 = T2##_MASK; mask; }), \ | ||
42 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
43 | val.T2 = (S2); val; }) | ||
44 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
45 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
46 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
47 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
48 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
49 | |||
50 | #define _NS(D, T, S) \ | ||
51 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) | ||
52 | #define _NS2(D, T1, S1, T2, S2) \ | ||
53 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ | ||
54 | __ns.T2 = (S2); __ns; }) | ||
55 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
56 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ | ||
57 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
58 | |||
59 | enum chg_state_flags { | ||
60 | CS_HARD = 1 << 0, | ||
61 | CS_VERBOSE = 1 << 1, | ||
62 | CS_WAIT_COMPLETE = 1 << 2, | ||
63 | CS_SERIALIZE = 1 << 3, | ||
64 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
65 | CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ | ||
66 | CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ | ||
67 | CS_DC_PEER = 1 << 6, | ||
68 | CS_DC_CONN = 1 << 7, | ||
69 | CS_DC_DISK = 1 << 8, | ||
70 | CS_DC_PDSK = 1 << 9, | ||
71 | CS_DC_SUSP = 1 << 10, | ||
72 | CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, | ||
73 | CS_IGN_OUTD_FAIL = 1 << 11, | ||
74 | }; | ||
75 | |||
76 | /* drbd_dev_state and drbd_state are different types. This is to stress the | ||
77 | small difference. There is no suspended flag (.susp), and no suspended | ||
78 | while fence handler runs flag (susp_fen). */ | ||
79 | union drbd_dev_state { | ||
80 | struct { | ||
81 | #if defined(__LITTLE_ENDIAN_BITFIELD) | ||
82 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
83 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
84 | unsigned conn:5 ; /* 17/32 cstates */ | ||
85 | unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
86 | unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
87 | unsigned _unused:1 ; | ||
88 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
89 | unsigned peer_isp:1 ; | ||
90 | unsigned user_isp:1 ; | ||
91 | unsigned _pad:11; /* 0 unused */ | ||
92 | #elif defined(__BIG_ENDIAN_BITFIELD) | ||
93 | unsigned _pad:11; | ||
94 | unsigned user_isp:1 ; | ||
95 | unsigned peer_isp:1 ; | ||
96 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
97 | unsigned _unused:1 ; | ||
98 | unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
99 | unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
100 | unsigned conn:5 ; /* 17/32 cstates */ | ||
101 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
102 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
103 | #else | ||
104 | # error "this endianness is not supported" | ||
105 | #endif | ||
106 | }; | ||
107 | unsigned int i; | ||
108 | }; | ||
109 | |||
110 | extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, | ||
111 | enum chg_state_flags f, | ||
112 | union drbd_state mask, | ||
113 | union drbd_state val); | ||
114 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
115 | union drbd_state); | ||
116 | extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, | ||
117 | union drbd_state, | ||
118 | union drbd_state, | ||
119 | enum chg_state_flags); | ||
120 | extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
121 | enum chg_state_flags, | ||
122 | struct completion *done); | ||
123 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
124 | union drbd_state, int); | ||
125 | |||
126 | enum drbd_state_rv | ||
127 | _conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
128 | enum chg_state_flags flags); | ||
129 | |||
130 | enum drbd_state_rv | ||
131 | conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
132 | enum chg_state_flags flags); | ||
133 | |||
134 | extern void drbd_resume_al(struct drbd_conf *mdev); | ||
135 | extern bool conn_all_vols_unconf(struct drbd_tconn *tconn); | ||
136 | |||
137 | /** | ||
138 | * drbd_request_state() - Request a state change | ||
139 | * @mdev: DRBD device. | ||
140 | * @mask: mask of state bits to change. | ||
141 | * @val: value of new state bits. | ||
142 | * | ||
143 | * This is the most graceful way of requesting a state change. It is quite | ||
144 | * verbose in case the state change is not possible, and all such | ||
145 | * state changes are globally serialized. | ||
146 | */ | ||
147 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
148 | union drbd_state mask, | ||
149 | union drbd_state val) | ||
150 | { | ||
151 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | ||
152 | } | ||
153 | |||
154 | enum drbd_role conn_highest_role(struct drbd_tconn *tconn); | ||
155 | enum drbd_role conn_highest_peer(struct drbd_tconn *tconn); | ||
156 | enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn); | ||
157 | enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn); | ||
158 | enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn); | ||
159 | enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn); | ||
160 | |||
161 | #endif | ||
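
A hedged usage sketch for the macros and helpers declared above (the helper name is made up; the called function and state values are those declared in this header and in drbd.h): NS(T, S) expands to the (mask, val) pair expected by the request-style functions, while _NS(D, T, S) expands to (device, new state) for the set-style functions that already hold the spinlock.

        /* Illustrative only: gracefully detach the local disk of one device.
         * NS(disk, D_FAILED) expands to
         *   mask = { .disk = D_MASK },  val = { .disk = D_FAILED }  */
        static inline int example_detach(struct drbd_conf *mdev)
        {
                return drbd_request_state(mdev, NS(disk, D_FAILED));
        }
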
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index c44a2a602772..9a664bd27404 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { | |||
89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | 89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", |
90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | 90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", |
91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | 91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", |
92 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", | ||
92 | }; | 93 | }; |
93 | 94 | ||
94 | const char *drbd_conn_str(enum drbd_conns s) | 95 | const char *drbd_conn_str(enum drbd_conns s) |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 7cd32e73b016..424dc7bdf9b7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -38,16 +38,13 @@ | |||
38 | #include "drbd_int.h" | 38 | #include "drbd_int.h" |
39 | #include "drbd_req.h" | 39 | #include "drbd_req.h" |
40 | 40 | ||
41 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); | 41 | static int w_make_ov_request(struct drbd_work *w, int cancel); |
42 | static int w_make_resync_request(struct drbd_conf *mdev, | ||
43 | struct drbd_work *w, int cancel); | ||
44 | |||
45 | 42 | ||
46 | 43 | ||
47 | /* endio handlers: | 44 | /* endio handlers: |
48 | * drbd_md_io_complete (defined here) | 45 | * drbd_md_io_complete (defined here) |
49 | * drbd_endio_pri (defined here) | 46 | * drbd_request_endio (defined here) |
50 | * drbd_endio_sec (defined here) | 47 | * drbd_peer_request_endio (defined here) |
51 | * bm_async_io_complete (defined in drbd_bitmap.c) | 48 | * bm_async_io_complete (defined in drbd_bitmap.c) |
52 | * | 49 | * |
53 | * For all these callbacks, note the following: | 50 | * For all these callbacks, note the following: |
@@ -60,7 +57,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
60 | 57 | ||
61 | /* About the global_state_lock | 58 | /* About the global_state_lock |
62 | Each state transition on an device holds a read lock. In case we have | 59 | Each state transition on an device holds a read lock. In case we have |
63 | to evaluate the sync after dependencies, we grab a write lock, because | 60 | to evaluate the resync after dependencies, we grab a write lock, because |
64 | we need stable states on all devices for that. */ | 61 | we need stable states on all devices for that. */ |
65 | rwlock_t global_state_lock; | 62 | rwlock_t global_state_lock; |
66 | 63 | ||
@@ -98,97 +95,93 @@ void drbd_md_io_complete(struct bio *bio, int error) | |||
98 | /* reads on behalf of the partner, | 95 | /* reads on behalf of the partner, |
99 | * "submitted" by the receiver | 96 | * "submitted" by the receiver |
100 | */ | 97 | */ |
101 | void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) | 98 | void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) |
102 | { | 99 | { |
103 | unsigned long flags = 0; | 100 | unsigned long flags = 0; |
104 | struct drbd_conf *mdev = e->mdev; | 101 | struct drbd_conf *mdev = peer_req->w.mdev; |
105 | |||
106 | D_ASSERT(e->block_id != ID_VACANT); | ||
107 | 102 | ||
108 | spin_lock_irqsave(&mdev->req_lock, flags); | 103 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
109 | mdev->read_cnt += e->size >> 9; | 104 | mdev->read_cnt += peer_req->i.size >> 9; |
110 | list_del(&e->w.list); | 105 | list_del(&peer_req->w.list); |
111 | if (list_empty(&mdev->read_ee)) | 106 | if (list_empty(&mdev->read_ee)) |
112 | wake_up(&mdev->ee_wait); | 107 | wake_up(&mdev->ee_wait); |
113 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | 108 | if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) |
114 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); | 109 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); |
115 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 110 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
116 | 111 | ||
117 | drbd_queue_work(&mdev->data.work, &e->w); | 112 | drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w); |
118 | put_ldev(mdev); | 113 | put_ldev(mdev); |
119 | } | 114 | } |
120 | 115 | ||
121 | /* writes on behalf of the partner, or resync writes, | 116 | /* writes on behalf of the partner, or resync writes, |
122 | * "submitted" by the receiver, final stage. */ | 117 | * "submitted" by the receiver, final stage. */ |
123 | static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) | 118 | static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) |
124 | { | 119 | { |
125 | unsigned long flags = 0; | 120 | unsigned long flags = 0; |
126 | struct drbd_conf *mdev = e->mdev; | 121 | struct drbd_conf *mdev = peer_req->w.mdev; |
127 | sector_t e_sector; | 122 | struct drbd_interval i; |
128 | int do_wake; | 123 | int do_wake; |
129 | int is_syncer_req; | 124 | u64 block_id; |
130 | int do_al_complete_io; | 125 | int do_al_complete_io; |
131 | 126 | ||
132 | D_ASSERT(e->block_id != ID_VACANT); | 127 | /* after we moved peer_req to done_ee, |
133 | |||
134 | /* after we moved e to done_ee, | ||
135 | * we may no longer access it, | 128 | * we may no longer access it, |
136 | * it may be freed/reused already! | 129 | * it may be freed/reused already! |
137 | * (as soon as we release the req_lock) */ | 130 | * (as soon as we release the req_lock) */ |
138 | e_sector = e->sector; | 131 | i = peer_req->i; |
139 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | 132 | do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; |
140 | is_syncer_req = is_syncer_block_id(e->block_id); | 133 | block_id = peer_req->block_id; |
141 | 134 | ||
142 | spin_lock_irqsave(&mdev->req_lock, flags); | 135 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
143 | mdev->writ_cnt += e->size >> 9; | 136 | mdev->writ_cnt += peer_req->i.size >> 9; |
144 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | 137 | list_move_tail(&peer_req->w.list, &mdev->done_ee); |
145 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
146 | 138 | ||
147 | /* No hlist_del_init(&e->collision) here, we did not send the Ack yet, | 139 | /* |
148 | * neither did we wake possibly waiting conflicting requests. | 140 | * Do not remove from the write_requests tree here: we did not send the |
149 | * done from "drbd_process_done_ee" within the appropriate w.cb | 141 | * Ack yet and did not wake possibly waiting conflicting requests. |
150 | * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ | 142 | * Removed from the tree from "drbd_process_done_ee" within the |
143 | * appropriate w.cb (e_end_block/e_end_resync_block) or from | ||
144 | * _drbd_clear_done_ee. | ||
145 | */ | ||
151 | 146 | ||
152 | do_wake = is_syncer_req | 147 | do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee); |
153 | ? list_empty(&mdev->sync_ee) | ||
154 | : list_empty(&mdev->active_ee); | ||
155 | 148 | ||
156 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | 149 | if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) |
157 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); | 150 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); |
158 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 151 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
159 | 152 | ||
160 | if (is_syncer_req) | 153 | if (block_id == ID_SYNCER) |
161 | drbd_rs_complete_io(mdev, e_sector); | 154 | drbd_rs_complete_io(mdev, i.sector); |
162 | 155 | ||
163 | if (do_wake) | 156 | if (do_wake) |
164 | wake_up(&mdev->ee_wait); | 157 | wake_up(&mdev->ee_wait); |
165 | 158 | ||
166 | if (do_al_complete_io) | 159 | if (do_al_complete_io) |
167 | drbd_al_complete_io(mdev, e_sector); | 160 | drbd_al_complete_io(mdev, &i); |
168 | 161 | ||
169 | wake_asender(mdev); | 162 | wake_asender(mdev->tconn); |
170 | put_ldev(mdev); | 163 | put_ldev(mdev); |
171 | } | 164 | } |
172 | 165 | ||
173 | /* writes on behalf of the partner, or resync writes, | 166 | /* writes on behalf of the partner, or resync writes, |
174 | * "submitted" by the receiver. | 167 | * "submitted" by the receiver. |
175 | */ | 168 | */ |
176 | void drbd_endio_sec(struct bio *bio, int error) | 169 | void drbd_peer_request_endio(struct bio *bio, int error) |
177 | { | 170 | { |
178 | struct drbd_epoch_entry *e = bio->bi_private; | 171 | struct drbd_peer_request *peer_req = bio->bi_private; |
179 | struct drbd_conf *mdev = e->mdev; | 172 | struct drbd_conf *mdev = peer_req->w.mdev; |
180 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | 173 | int uptodate = bio_flagged(bio, BIO_UPTODATE); |
181 | int is_write = bio_data_dir(bio) == WRITE; | 174 | int is_write = bio_data_dir(bio) == WRITE; |
182 | 175 | ||
183 | if (error && __ratelimit(&drbd_ratelimit_state)) | 176 | if (error && __ratelimit(&drbd_ratelimit_state)) |
184 | dev_warn(DEV, "%s: error=%d s=%llus\n", | 177 | dev_warn(DEV, "%s: error=%d s=%llus\n", |
185 | is_write ? "write" : "read", error, | 178 | is_write ? "write" : "read", error, |
186 | (unsigned long long)e->sector); | 179 | (unsigned long long)peer_req->i.sector); |
187 | if (!error && !uptodate) { | 180 | if (!error && !uptodate) { |
188 | if (__ratelimit(&drbd_ratelimit_state)) | 181 | if (__ratelimit(&drbd_ratelimit_state)) |
189 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", | 182 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", |
190 | is_write ? "write" : "read", | 183 | is_write ? "write" : "read", |
191 | (unsigned long long)e->sector); | 184 | (unsigned long long)peer_req->i.sector); |
192 | /* strange behavior of some lower level drivers... | 185 | /* strange behavior of some lower level drivers... |
193 | * fail the request by clearing the uptodate flag, | 186 | * fail the request by clearing the uptodate flag, |
194 | * but do not return any error?! */ | 187 | * but do not return any error?! */ |
@@ -196,24 +189,24 @@ void drbd_endio_sec(struct bio *bio, int error) | |||
196 | } | 189 | } |
197 | 190 | ||
198 | if (error) | 191 | if (error) |
199 | set_bit(__EE_WAS_ERROR, &e->flags); | 192 | set_bit(__EE_WAS_ERROR, &peer_req->flags); |
200 | 193 | ||
201 | bio_put(bio); /* no need for the bio anymore */ | 194 | bio_put(bio); /* no need for the bio anymore */ |
202 | if (atomic_dec_and_test(&e->pending_bios)) { | 195 | if (atomic_dec_and_test(&peer_req->pending_bios)) { |
203 | if (is_write) | 196 | if (is_write) |
204 | drbd_endio_write_sec_final(e); | 197 | drbd_endio_write_sec_final(peer_req); |
205 | else | 198 | else |
206 | drbd_endio_read_sec_final(e); | 199 | drbd_endio_read_sec_final(peer_req); |
207 | } | 200 | } |
208 | } | 201 | } |
209 | 202 | ||
210 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | 203 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request |
211 | */ | 204 | */ |
212 | void drbd_endio_pri(struct bio *bio, int error) | 205 | void drbd_request_endio(struct bio *bio, int error) |
213 | { | 206 | { |
214 | unsigned long flags; | 207 | unsigned long flags; |
215 | struct drbd_request *req = bio->bi_private; | 208 | struct drbd_request *req = bio->bi_private; |
216 | struct drbd_conf *mdev = req->mdev; | 209 | struct drbd_conf *mdev = req->w.mdev; |
217 | struct bio_and_error m; | 210 | struct bio_and_error m; |
218 | enum drbd_req_event what; | 211 | enum drbd_req_event what; |
219 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | 212 | int uptodate = bio_flagged(bio, BIO_UPTODATE); |
@@ -227,6 +220,7 @@ void drbd_endio_pri(struct bio *bio, int error) | |||
227 | error = -EIO; | 220 | error = -EIO; |
228 | } | 221 | } |
229 | 222 | ||
223 | |||
230 | /* If this request was aborted locally before, | 224 | /* If this request was aborted locally before, |
231 | * but now was completed "successfully", | 225 | * but now was completed "successfully", |
232 | * chances are that this caused arbitrary data corruption. | 226 | * chances are that this caused arbitrary data corruption. |
@@ -266,50 +260,32 @@ void drbd_endio_pri(struct bio *bio, int error) | |||
266 | /* to avoid recursion in __req_mod */ | 260 | /* to avoid recursion in __req_mod */ |
267 | if (unlikely(error)) { | 261 | if (unlikely(error)) { |
268 | what = (bio_data_dir(bio) == WRITE) | 262 | what = (bio_data_dir(bio) == WRITE) |
269 | ? write_completed_with_error | 263 | ? WRITE_COMPLETED_WITH_ERROR |
270 | : (bio_rw(bio) == READ) | 264 | : (bio_rw(bio) == READ) |
271 | ? read_completed_with_error | 265 | ? READ_COMPLETED_WITH_ERROR |
272 | : read_ahead_completed_with_error; | 266 | : READ_AHEAD_COMPLETED_WITH_ERROR; |
273 | } else | 267 | } else |
274 | what = completed_ok; | 268 | what = COMPLETED_OK; |
275 | 269 | ||
276 | bio_put(req->private_bio); | 270 | bio_put(req->private_bio); |
277 | req->private_bio = ERR_PTR(error); | 271 | req->private_bio = ERR_PTR(error); |
278 | 272 | ||
279 | /* not req_mod(), we need irqsave here! */ | 273 | /* not req_mod(), we need irqsave here! */ |
280 | spin_lock_irqsave(&mdev->req_lock, flags); | 274 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
281 | __req_mod(req, what, &m); | 275 | __req_mod(req, what, &m); |
282 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 276 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
283 | put_ldev(mdev); | 277 | put_ldev(mdev); |
284 | 278 | ||
285 | if (m.bio) | 279 | if (m.bio) |
286 | complete_master_bio(mdev, &m); | 280 | complete_master_bio(mdev, &m); |
287 | } | 281 | } |
288 | 282 | ||
289 | int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 283 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, |
290 | { | 284 | struct drbd_peer_request *peer_req, void *digest) |
291 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
292 | |||
293 | /* We should not detach for read io-error, | ||
294 | * but try to WRITE the P_DATA_REPLY to the failed location, | ||
295 | * to give the disk the chance to relocate that block */ | ||
296 | |||
297 | spin_lock_irq(&mdev->req_lock); | ||
298 | if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { | ||
299 | _req_mod(req, read_retry_remote_canceled); | ||
300 | spin_unlock_irq(&mdev->req_lock); | ||
301 | return 1; | ||
302 | } | ||
303 | spin_unlock_irq(&mdev->req_lock); | ||
304 | |||
305 | return w_send_read_req(mdev, w, 0); | ||
306 | } | ||
307 | |||
308 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) | ||
309 | { | 285 | { |
310 | struct hash_desc desc; | 286 | struct hash_desc desc; |
311 | struct scatterlist sg; | 287 | struct scatterlist sg; |
312 | struct page *page = e->pages; | 288 | struct page *page = peer_req->pages; |
313 | struct page *tmp; | 289 | struct page *tmp; |
314 | unsigned len; | 290 | unsigned len; |
315 | 291 | ||
@@ -326,7 +302,7 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_e | |||
326 | page = tmp; | 302 | page = tmp; |
327 | } | 303 | } |
328 | /* and now the last, possibly only partially used page */ | 304 | /* and now the last, possibly only partially used page */ |
329 | len = e->size & (PAGE_SIZE - 1); | 305 | len = peer_req->i.size & (PAGE_SIZE - 1); |
330 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); | 306 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); |
331 | crypto_hash_update(&desc, &sg, sg.length); | 307 | crypto_hash_update(&desc, &sg, sg.length); |
332 | crypto_hash_final(&desc, digest); | 308 | crypto_hash_final(&desc, digest); |
@@ -352,59 +328,58 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * | |||
352 | crypto_hash_final(&desc, digest); | 328 | crypto_hash_final(&desc, digest); |
353 | } | 329 | } |
354 | 330 | ||
355 | /* TODO merge common code with w_e_end_ov_req */ | 331 | /* MAYBE merge common code with w_e_end_ov_req */ |
356 | int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 332 | static int w_e_send_csum(struct drbd_work *w, int cancel) |
357 | { | 333 | { |
358 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 334 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
335 | struct drbd_conf *mdev = w->mdev; | ||
359 | int digest_size; | 336 | int digest_size; |
360 | void *digest; | 337 | void *digest; |
361 | int ok = 1; | 338 | int err = 0; |
362 | |||
363 | D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); | ||
364 | 339 | ||
365 | if (unlikely(cancel)) | 340 | if (unlikely(cancel)) |
366 | goto out; | 341 | goto out; |
367 | 342 | ||
368 | if (likely((e->flags & EE_WAS_ERROR) != 0)) | 343 | if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) |
369 | goto out; | 344 | goto out; |
370 | 345 | ||
371 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 346 | digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); |
372 | digest = kmalloc(digest_size, GFP_NOIO); | 347 | digest = kmalloc(digest_size, GFP_NOIO); |
373 | if (digest) { | 348 | if (digest) { |
374 | sector_t sector = e->sector; | 349 | sector_t sector = peer_req->i.sector; |
375 | unsigned int size = e->size; | 350 | unsigned int size = peer_req->i.size; |
376 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); | 351 | drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); |
377 | /* Free e and pages before send. | 352 | /* Free peer_req and pages before send. |
378 | * In case we block on congestion, we could otherwise run into | 353 | * In case we block on congestion, we could otherwise run into |
379 | * some distributed deadlock, if the other side blocks on | 354 | * some distributed deadlock, if the other side blocks on |
380 | * congestion as well, because our receiver blocks in | 355 | * congestion as well, because our receiver blocks in |
381 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 356 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
382 | drbd_free_ee(mdev, e); | 357 | drbd_free_peer_req(mdev, peer_req); |
383 | e = NULL; | 358 | peer_req = NULL; |
384 | inc_rs_pending(mdev); | 359 | inc_rs_pending(mdev); |
385 | ok = drbd_send_drequest_csum(mdev, sector, size, | 360 | err = drbd_send_drequest_csum(mdev, sector, size, |
386 | digest, digest_size, | 361 | digest, digest_size, |
387 | P_CSUM_RS_REQUEST); | 362 | P_CSUM_RS_REQUEST); |
388 | kfree(digest); | 363 | kfree(digest); |
389 | } else { | 364 | } else { |
390 | dev_err(DEV, "kmalloc() of digest failed.\n"); | 365 | dev_err(DEV, "kmalloc() of digest failed.\n"); |
391 | ok = 0; | 366 | err = -ENOMEM; |
392 | } | 367 | } |
393 | 368 | ||
394 | out: | 369 | out: |
395 | if (e) | 370 | if (peer_req) |
396 | drbd_free_ee(mdev, e); | 371 | drbd_free_peer_req(mdev, peer_req); |
397 | 372 | ||
398 | if (unlikely(!ok)) | 373 | if (unlikely(err)) |
399 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); | 374 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); |
400 | return ok; | 375 | return err; |
401 | } | 376 | } |
402 | 377 | ||
403 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 378 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
404 | 379 | ||
405 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | 380 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) |
406 | { | 381 | { |
407 | struct drbd_epoch_entry *e; | 382 | struct drbd_peer_request *peer_req; |
408 | 383 | ||
409 | if (!get_ldev(mdev)) | 384 | if (!get_ldev(mdev)) |
410 | return -EIO; | 385 | return -EIO; |
@@ -414,45 +389,47 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | |||
414 | 389 | ||
415 | /* GFP_TRY, because if there is no memory available right now, this may | 390 | /* GFP_TRY, because if there is no memory available right now, this may |
416 | * be rescheduled for later. It is "only" background resync, after all. */ | 391 | * be rescheduled for later. It is "only" background resync, after all. */ |
417 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | 392 | peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector, |
418 | if (!e) | 393 | size, GFP_TRY); |
394 | if (!peer_req) | ||
419 | goto defer; | 395 | goto defer; |
420 | 396 | ||
421 | e->w.cb = w_e_send_csum; | 397 | peer_req->w.cb = w_e_send_csum; |
422 | spin_lock_irq(&mdev->req_lock); | 398 | spin_lock_irq(&mdev->tconn->req_lock); |
423 | list_add(&e->w.list, &mdev->read_ee); | 399 | list_add(&peer_req->w.list, &mdev->read_ee); |
424 | spin_unlock_irq(&mdev->req_lock); | 400 | spin_unlock_irq(&mdev->tconn->req_lock); |
425 | 401 | ||
426 | atomic_add(size >> 9, &mdev->rs_sect_ev); | 402 | atomic_add(size >> 9, &mdev->rs_sect_ev); |
427 | if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) | 403 | if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0) |
428 | return 0; | 404 | return 0; |
429 | 405 | ||
430 | /* If it failed because of ENOMEM, retry should help. If it failed | 406 | /* If it failed because of ENOMEM, retry should help. If it failed |
431 | * because bio_add_page failed (probably broken lower level driver), | 407 | * because bio_add_page failed (probably broken lower level driver), |
432 | * retry may or may not help. | 408 | * retry may or may not help. |
433 | * If it does not, you may need to force disconnect. */ | 409 | * If it does not, you may need to force disconnect. */ |
434 | spin_lock_irq(&mdev->req_lock); | 410 | spin_lock_irq(&mdev->tconn->req_lock); |
435 | list_del(&e->w.list); | 411 | list_del(&peer_req->w.list); |
436 | spin_unlock_irq(&mdev->req_lock); | 412 | spin_unlock_irq(&mdev->tconn->req_lock); |
437 | 413 | ||
438 | drbd_free_ee(mdev, e); | 414 | drbd_free_peer_req(mdev, peer_req); |
439 | defer: | 415 | defer: |
440 | put_ldev(mdev); | 416 | put_ldev(mdev); |
441 | return -EAGAIN; | 417 | return -EAGAIN; |
442 | } | 418 | } |
443 | 419 | ||
444 | int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 420 | int w_resync_timer(struct drbd_work *w, int cancel) |
445 | { | 421 | { |
422 | struct drbd_conf *mdev = w->mdev; | ||
446 | switch (mdev->state.conn) { | 423 | switch (mdev->state.conn) { |
447 | case C_VERIFY_S: | 424 | case C_VERIFY_S: |
448 | w_make_ov_request(mdev, w, cancel); | 425 | w_make_ov_request(w, cancel); |
449 | break; | 426 | break; |
450 | case C_SYNC_TARGET: | 427 | case C_SYNC_TARGET: |
451 | w_make_resync_request(mdev, w, cancel); | 428 | w_make_resync_request(w, cancel); |
452 | break; | 429 | break; |
453 | } | 430 | } |
454 | 431 | ||
455 | return 1; | 432 | return 0; |
456 | } | 433 | } |
457 | 434 | ||
458 | void resync_timer_fn(unsigned long data) | 435 | void resync_timer_fn(unsigned long data) |
@@ -460,7 +437,7 @@ void resync_timer_fn(unsigned long data) | |||
460 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 437 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
461 | 438 | ||
462 | if (list_empty(&mdev->resync_work.list)) | 439 | if (list_empty(&mdev->resync_work.list)) |
463 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | 440 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work); |
464 | } | 441 | } |
465 | 442 | ||
466 | static void fifo_set(struct fifo_buffer *fb, int value) | 443 | static void fifo_set(struct fifo_buffer *fb, int value) |
@@ -492,8 +469,24 @@ static void fifo_add_val(struct fifo_buffer *fb, int value) | |||
492 | fb->values[i] += value; | 469 | fb->values[i] += value; |
493 | } | 470 | } |
494 | 471 | ||
472 | struct fifo_buffer *fifo_alloc(int fifo_size) | ||
473 | { | ||
474 | struct fifo_buffer *fb; | ||
475 | |||
476 | fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); | ||
477 | if (!fb) | ||
478 | return NULL; | ||
479 | |||
480 | fb->head_index = 0; | ||
481 | fb->size = fifo_size; | ||
482 | fb->total = 0; | ||
483 | |||
484 | return fb; | ||
485 | } | ||
486 | |||
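
For context, a hedged sketch of how the plan fifo allocated by fifo_alloc() is expected to be sized and published elsewhere in the driver; the sizing expression matches the comment in drbd_rs_controller() below, but the surrounding code is an assumption, not part of this hunk.

        /* Illustrative only: one fifo slot per controller turn of SLEEP_TIME,
         * covering c_plan_ahead (given in 0.1 second units). */
        struct fifo_buffer *plan;
        int fifo_size = (disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;

        plan = fifo_alloc(fifo_size);
        if (!plan)
                return -ENOMEM;
        rcu_assign_pointer(mdev->rs_plan_s, plan);
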
495 | static int drbd_rs_controller(struct drbd_conf *mdev) | 487 | static int drbd_rs_controller(struct drbd_conf *mdev) |
496 | { | 488 | { |
489 | struct disk_conf *dc; | ||
497 | unsigned int sect_in; /* Number of sectors that came in since the last turn */ | 490 | unsigned int sect_in; /* Number of sectors that came in since the last turn */ |
498 | unsigned int want; /* The number of sectors we want in the proxy */ | 491 | unsigned int want; /* The number of sectors we want in the proxy */ |
499 | int req_sect; /* Number of sectors to request in this turn */ | 492 | int req_sect; /* Number of sectors to request in this turn */ |
@@ -502,38 +495,39 @@ static int drbd_rs_controller(struct drbd_conf *mdev) | |||
502 | int steps; /* Number of time steps to plan ahead */ | 495 | int steps; /* Number of time steps to plan ahead */ |
503 | int curr_corr; | 496 | int curr_corr; |
504 | int max_sect; | 497 | int max_sect; |
498 | struct fifo_buffer *plan; | ||
505 | 499 | ||
506 | sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ | 500 | sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ |
507 | mdev->rs_in_flight -= sect_in; | 501 | mdev->rs_in_flight -= sect_in; |
508 | 502 | ||
509 | spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ | 503 | dc = rcu_dereference(mdev->ldev->disk_conf); |
504 | plan = rcu_dereference(mdev->rs_plan_s); | ||
510 | 505 | ||
511 | steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ | 506 | steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ |
512 | 507 | ||
513 | if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ | 508 | if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ |
514 | want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; | 509 | want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; |
515 | } else { /* normal path */ | 510 | } else { /* normal path */ |
516 | want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target : | 511 | want = dc->c_fill_target ? dc->c_fill_target : |
517 | sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); | 512 | sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); |
518 | } | 513 | } |
519 | 514 | ||
520 | correction = want - mdev->rs_in_flight - mdev->rs_planed; | 515 | correction = want - mdev->rs_in_flight - plan->total; |
521 | 516 | ||
522 | /* Plan ahead */ | 517 | /* Plan ahead */ |
523 | cps = correction / steps; | 518 | cps = correction / steps; |
524 | fifo_add_val(&mdev->rs_plan_s, cps); | 519 | fifo_add_val(plan, cps); |
525 | mdev->rs_planed += cps * steps; | 520 | plan->total += cps * steps; |
526 | 521 | ||
527 | /* What we do in this step */ | 522 | /* What we do in this step */ |
528 | curr_corr = fifo_push(&mdev->rs_plan_s, 0); | 523 | curr_corr = fifo_push(plan, 0); |
529 | spin_unlock(&mdev->peer_seq_lock); | 524 | plan->total -= curr_corr; |
530 | mdev->rs_planed -= curr_corr; | ||
531 | 525 | ||
532 | req_sect = sect_in + curr_corr; | 526 | req_sect = sect_in + curr_corr; |
533 | if (req_sect < 0) | 527 | if (req_sect < 0) |
534 | req_sect = 0; | 528 | req_sect = 0; |
535 | 529 | ||
536 | max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; | 530 | max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; |
537 | if (req_sect > max_sect) | 531 | if (req_sect > max_sect) |
538 | req_sect = max_sect; | 532 | req_sect = max_sect; |
539 | 533 | ||
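
To make the controller arithmetic concrete, here is one turn with made-up numbers (SLEEP_TIME assumed to be HZ/10, i.e. 100 ms; the real tunables come from disk_conf):

        /* Worked example for drbd_rs_controller(), illustrative numbers only:
         *   sect_in    = 2000  sectors acknowledged since the last turn
         *   steps      = 10    plan slots (c_plan_ahead worth of turns)
         *   want       = 2000  (c_fill_target == 0, c_delay_target == 1, so
         *                       want = sect_in * c_delay_target * HZ / (SLEEP_TIME * 10))
         *   in flight  = 1600, plan->total = 0
         *   correction = 2000 - 1600 - 0 = 400
         *   cps        = 400 / 10 = 40, added to each of the 10 plan slots
         *   curr_corr  = value popped from the plan for this turn
         *   req_sect   = sect_in + curr_corr, clamped to
         *                max_sect = c_max_rate * 2 * SLEEP_TIME / HZ
         *                (c_max_rate is in KiB/s; *2 gives sectors/s,
         *                 *SLEEP_TIME/HZ gives sectors per turn).
         */
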
@@ -549,22 +543,25 @@ static int drbd_rs_controller(struct drbd_conf *mdev) | |||
549 | static int drbd_rs_number_requests(struct drbd_conf *mdev) | 543 | static int drbd_rs_number_requests(struct drbd_conf *mdev) |
550 | { | 544 | { |
551 | int number; | 545 | int number; |
552 | if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ | 546 | |
547 | rcu_read_lock(); | ||
548 | if (rcu_dereference(mdev->rs_plan_s)->size) { | ||
553 | number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); | 549 | number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); |
554 | mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; | 550 | mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; |
555 | } else { | 551 | } else { |
556 | mdev->c_sync_rate = mdev->sync_conf.rate; | 552 | mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate; |
557 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); | 553 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); |
558 | } | 554 | } |
555 | rcu_read_unlock(); | ||
559 | 556 | ||
560 | /* ignore the amount of pending requests, the resync controller should | 557 | /* ignore the amount of pending requests, the resync controller should |
561 | * throttle down to incoming reply rate soon enough anyways. */ | 558 | * throttle down to incoming reply rate soon enough anyways. */ |
562 | return number; | 559 | return number; |
563 | } | 560 | } |
564 | 561 | ||
565 | static int w_make_resync_request(struct drbd_conf *mdev, | 562 | int w_make_resync_request(struct drbd_work *w, int cancel) |
566 | struct drbd_work *w, int cancel) | ||
567 | { | 563 | { |
564 | struct drbd_conf *mdev = w->mdev; | ||
568 | unsigned long bit; | 565 | unsigned long bit; |
569 | sector_t sector; | 566 | sector_t sector; |
570 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 567 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
@@ -574,12 +571,12 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
574 | int i = 0; | 571 | int i = 0; |
575 | 572 | ||
576 | if (unlikely(cancel)) | 573 | if (unlikely(cancel)) |
577 | return 1; | 574 | return 0; |
578 | 575 | ||
579 | if (mdev->rs_total == 0) { | 576 | if (mdev->rs_total == 0) { |
580 | /* empty resync? */ | 577 | /* empty resync? */ |
581 | drbd_resync_finished(mdev); | 578 | drbd_resync_finished(mdev); |
582 | return 1; | 579 | return 0; |
583 | } | 580 | } |
584 | 581 | ||
585 | if (!get_ldev(mdev)) { | 582 | if (!get_ldev(mdev)) { |
@@ -588,7 +585,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
588 | to continue resync with a broken disk makes no sense at | 585 | to continue resync with a broken disk makes no sense at |
589 | all */ | 586 | all */ |
590 | dev_err(DEV, "Disk broke down during resync!\n"); | 587 | dev_err(DEV, "Disk broke down during resync!\n"); |
591 | return 1; | 588 | return 0; |
592 | } | 589 | } |
593 | 590 | ||
594 | max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; | 591 | max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; |
@@ -598,15 +595,15 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
598 | 595 | ||
599 | for (i = 0; i < number; i++) { | 596 | for (i = 0; i < number; i++) { |
600 | /* Stop generating RS requests, when half of the send buffer is filled */ | 597 | /* Stop generating RS requests, when half of the send buffer is filled */ |
601 | mutex_lock(&mdev->data.mutex); | 598 | mutex_lock(&mdev->tconn->data.mutex); |
602 | if (mdev->data.socket) { | 599 | if (mdev->tconn->data.socket) { |
603 | queued = mdev->data.socket->sk->sk_wmem_queued; | 600 | queued = mdev->tconn->data.socket->sk->sk_wmem_queued; |
604 | sndbuf = mdev->data.socket->sk->sk_sndbuf; | 601 | sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf; |
605 | } else { | 602 | } else { |
606 | queued = 1; | 603 | queued = 1; |
607 | sndbuf = 0; | 604 | sndbuf = 0; |
608 | } | 605 | } |
609 | mutex_unlock(&mdev->data.mutex); | 606 | mutex_unlock(&mdev->tconn->data.mutex); |
610 | if (queued > sndbuf / 2) | 607 | if (queued > sndbuf / 2) |
611 | goto requeue; | 608 | goto requeue; |
612 | 609 | ||
@@ -617,7 +614,7 @@ next_sector: | |||
617 | if (bit == DRBD_END_OF_BITMAP) { | 614 | if (bit == DRBD_END_OF_BITMAP) { |
618 | mdev->bm_resync_fo = drbd_bm_bits(mdev); | 615 | mdev->bm_resync_fo = drbd_bm_bits(mdev); |
619 | put_ldev(mdev); | 616 | put_ldev(mdev); |
620 | return 1; | 617 | return 0; |
621 | } | 618 | } |
622 | 619 | ||
623 | sector = BM_BIT_TO_SECT(bit); | 620 | sector = BM_BIT_TO_SECT(bit); |
@@ -676,11 +673,11 @@ next_sector: | |||
676 | /* adjust very last sectors, in case we are oddly sized */ | 673 | /* adjust very last sectors, in case we are oddly sized */ |
677 | if (sector + (size>>9) > capacity) | 674 | if (sector + (size>>9) > capacity) |
678 | size = (capacity-sector)<<9; | 675 | size = (capacity-sector)<<9; |
679 | if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { | 676 | if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) { |
680 | switch (read_for_csum(mdev, sector, size)) { | 677 | switch (read_for_csum(mdev, sector, size)) { |
681 | case -EIO: /* Disk failure */ | 678 | case -EIO: /* Disk failure */ |
682 | put_ldev(mdev); | 679 | put_ldev(mdev); |
683 | return 0; | 680 | return -EIO; |
684 | case -EAGAIN: /* allocation failed, or ldev busy */ | 681 | case -EAGAIN: /* allocation failed, or ldev busy */ |
685 | drbd_rs_complete_io(mdev, sector); | 682 | drbd_rs_complete_io(mdev, sector); |
686 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 683 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
@@ -693,13 +690,16 @@ next_sector: | |||
693 | BUG(); | 690 | BUG(); |
694 | } | 691 | } |
695 | } else { | 692 | } else { |
693 | int err; | ||
694 | |||
696 | inc_rs_pending(mdev); | 695 | inc_rs_pending(mdev); |
697 | if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, | 696 | err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST, |
698 | sector, size, ID_SYNCER)) { | 697 | sector, size, ID_SYNCER); |
698 | if (err) { | ||
699 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); | 699 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); |
700 | dec_rs_pending(mdev); | 700 | dec_rs_pending(mdev); |
701 | put_ldev(mdev); | 701 | put_ldev(mdev); |
702 | return 0; | 702 | return err; |
703 | } | 703 | } |
704 | } | 704 | } |
705 | } | 705 | } |
@@ -712,18 +712,19 @@ next_sector: | |||
712 | * until then resync "work" is "inactive" ... | 712 | * until then resync "work" is "inactive" ... |
713 | */ | 713 | */ |
714 | put_ldev(mdev); | 714 | put_ldev(mdev); |
715 | return 1; | 715 | return 0; |
716 | } | 716 | } |
717 | 717 | ||
718 | requeue: | 718 | requeue: |
719 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); | 719 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); |
720 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | 720 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); |
721 | put_ldev(mdev); | 721 | put_ldev(mdev); |
722 | return 1; | 722 | return 0; |
723 | } | 723 | } |
724 | 724 | ||
725 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 725 | static int w_make_ov_request(struct drbd_work *w, int cancel) |
726 | { | 726 | { |
727 | struct drbd_conf *mdev = w->mdev; | ||
727 | int number, i, size; | 728 | int number, i, size; |
728 | sector_t sector; | 729 | sector_t sector; |
729 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 730 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
@@ -743,7 +744,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
743 | * w_e_end_ov_reply(). | 744 | * w_e_end_ov_reply(). |
744 | * We need to send at least one request out. */ | 745 | * We need to send at least one request out. */ |
745 | stop_sector_reached = i > 0 | 746 | stop_sector_reached = i > 0 |
746 | && mdev->agreed_pro_version >= 97 | 747 | && verify_can_do_stop_sector(mdev) |
747 | && sector >= mdev->ov_stop_sector; | 748 | && sector >= mdev->ov_stop_sector; |
748 | if (stop_sector_reached) | 749 | if (stop_sector_reached) |
749 | break; | 750 | break; |
@@ -760,7 +761,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
760 | size = (capacity-sector)<<9; | 761 | size = (capacity-sector)<<9; |
761 | 762 | ||
762 | inc_rs_pending(mdev); | 763 | inc_rs_pending(mdev); |
763 | if (!drbd_send_ov_request(mdev, sector, size)) { | 764 | if (drbd_send_ov_request(mdev, sector, size)) { |
764 | dec_rs_pending(mdev); | 765 | dec_rs_pending(mdev); |
765 | return 0; | 766 | return 0; |
766 | } | 767 | } |
@@ -775,52 +776,34 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
775 | return 1; | 776 | return 1; |
776 | } | 777 | } |
777 | 778 | ||
778 | 779 | int w_ov_finished(struct drbd_work *w, int cancel) | |
779 | void start_resync_timer_fn(unsigned long data) | ||
780 | { | ||
781 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
782 | |||
783 | drbd_queue_work(&mdev->data.work, &mdev->start_resync_work); | ||
784 | } | ||
785 | |||
786 | int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
787 | { | ||
788 | if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { | ||
789 | dev_warn(DEV, "w_start_resync later...\n"); | ||
790 | mdev->start_resync_timer.expires = jiffies + HZ/10; | ||
791 | add_timer(&mdev->start_resync_timer); | ||
792 | return 1; | ||
793 | } | ||
794 | |||
795 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
796 | drbd_clear_flag(mdev, AHEAD_TO_SYNC_SOURCE); | ||
797 | return 1; | ||
798 | } | ||
799 | |||
800 | int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
801 | { | 780 | { |
781 | struct drbd_conf *mdev = w->mdev; | ||
802 | kfree(w); | 782 | kfree(w); |
803 | ov_oos_print(mdev); | 783 | ov_out_of_sync_print(mdev); |
804 | drbd_resync_finished(mdev); | 784 | drbd_resync_finished(mdev); |
805 | 785 | ||
806 | return 1; | 786 | return 0; |
807 | } | 787 | } |
808 | 788 | ||
809 | static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 789 | static int w_resync_finished(struct drbd_work *w, int cancel) |
810 | { | 790 | { |
791 | struct drbd_conf *mdev = w->mdev; | ||
811 | kfree(w); | 792 | kfree(w); |
812 | 793 | ||
813 | drbd_resync_finished(mdev); | 794 | drbd_resync_finished(mdev); |
814 | 795 | ||
815 | return 1; | 796 | return 0; |
816 | } | 797 | } |
817 | 798 | ||
818 | static void ping_peer(struct drbd_conf *mdev) | 799 | static void ping_peer(struct drbd_conf *mdev) |
819 | { | 800 | { |
820 | drbd_clear_flag(mdev, GOT_PING_ACK); | 801 | struct drbd_tconn *tconn = mdev->tconn; |
821 | request_ping(mdev); | 802 | |
822 | wait_event(mdev->misc_wait, | 803 | clear_bit(GOT_PING_ACK, &tconn->flags); |
823 | drbd_test_flag(mdev, GOT_PING_ACK) || mdev->state.conn < C_CONNECTED); | 804 | request_ping(tconn); |
805 | wait_event(tconn->ping_wait, | ||
806 | test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED); | ||
824 | } | 807 | } |
825 | 808 | ||
826 | int drbd_resync_finished(struct drbd_conf *mdev) | 809 | int drbd_resync_finished(struct drbd_conf *mdev) |
@@ -845,7 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
845 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); | 828 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); |
846 | if (w) { | 829 | if (w) { |
847 | w->cb = w_resync_finished; | 830 | w->cb = w_resync_finished; |
848 | drbd_queue_work(&mdev->data.work, w); | 831 | w->mdev = mdev; |
832 | drbd_queue_work(&mdev->tconn->sender_work, w); | ||
849 | return 1; | 833 | return 1; |
850 | } | 834 | } |
851 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); | 835 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); |
@@ -868,8 +852,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
868 | 852 | ||
869 | ping_peer(mdev); | 853 | ping_peer(mdev); |
870 | 854 | ||
871 | spin_lock_irq(&mdev->req_lock); | 855 | spin_lock_irq(&mdev->tconn->req_lock); |
872 | os = mdev->state; | 856 | os = drbd_read_state(mdev); |
873 | 857 | ||
874 | verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); | 858 | verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); |
875 | 859 | ||
@@ -899,7 +883,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
899 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) | 883 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) |
900 | khelper_cmd = "after-resync-target"; | 884 | khelper_cmd = "after-resync-target"; |
901 | 885 | ||
902 | if (mdev->csums_tfm && mdev->rs_total) { | 886 | if (mdev->tconn->csums_tfm && mdev->rs_total) { |
903 | const unsigned long s = mdev->rs_same_csum; | 887 | const unsigned long s = mdev->rs_same_csum; |
904 | const unsigned long t = mdev->rs_total; | 888 | const unsigned long t = mdev->rs_total; |
905 | const int ratio = | 889 | const int ratio = |
@@ -957,7 +941,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
957 | 941 | ||
958 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 942 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
959 | out_unlock: | 943 | out_unlock: |
960 | spin_unlock_irq(&mdev->req_lock); | 944 | spin_unlock_irq(&mdev->tconn->req_lock); |
961 | put_ldev(mdev); | 945 | put_ldev(mdev); |
962 | out: | 946 | out: |
963 | mdev->rs_total = 0; | 947 | mdev->rs_total = 0; |
@@ -977,19 +961,19 @@ out: | |||
977 | } | 961 | } |
978 | 962 | ||
979 | /* helper */ | 963 | /* helper */ |
980 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 964 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) |
981 | { | 965 | { |
982 | if (drbd_ee_has_active_page(e)) { | 966 | if (drbd_peer_req_has_active_page(peer_req)) { |
983 | /* This might happen if sendpage() has not finished */ | 967 | /* This might happen if sendpage() has not finished */ |
984 | int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT; | 968 | int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; |
985 | atomic_add(i, &mdev->pp_in_use_by_net); | 969 | atomic_add(i, &mdev->pp_in_use_by_net); |
986 | atomic_sub(i, &mdev->pp_in_use); | 970 | atomic_sub(i, &mdev->pp_in_use); |
987 | spin_lock_irq(&mdev->req_lock); | 971 | spin_lock_irq(&mdev->tconn->req_lock); |
988 | list_add_tail(&e->w.list, &mdev->net_ee); | 972 | list_add_tail(&peer_req->w.list, &mdev->net_ee); |
989 | spin_unlock_irq(&mdev->req_lock); | 973 | spin_unlock_irq(&mdev->tconn->req_lock); |
990 | wake_up(&drbd_pp_wait); | 974 | wake_up(&drbd_pp_wait); |
991 | } else | 975 | } else |
992 | drbd_free_ee(mdev, e); | 976 | drbd_free_peer_req(mdev, peer_req); |
993 | } | 977 | } |
994 | 978 | ||
995 | /** | 979 | /** |
@@ -998,174 +982,177 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent | |||
998 | * @w: work object. | 982 | * @w: work object. |
999 | * @cancel: The connection will be closed anyways | 983 | * @cancel: The connection will be closed anyways |
1000 | */ | 984 | */ |
1001 | int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 985 | int w_e_end_data_req(struct drbd_work *w, int cancel) |
1002 | { | 986 | { |
1003 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 987 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1004 | int ok; | 988 | struct drbd_conf *mdev = w->mdev; |
989 | int err; | ||
1005 | 990 | ||
1006 | if (unlikely(cancel)) { | 991 | if (unlikely(cancel)) { |
1007 | drbd_free_ee(mdev, e); | 992 | drbd_free_peer_req(mdev, peer_req); |
1008 | dec_unacked(mdev); | 993 | dec_unacked(mdev); |
1009 | return 1; | 994 | return 0; |
1010 | } | 995 | } |
1011 | 996 | ||
1012 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 997 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1013 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | 998 | err = drbd_send_block(mdev, P_DATA_REPLY, peer_req); |
1014 | } else { | 999 | } else { |
1015 | if (__ratelimit(&drbd_ratelimit_state)) | 1000 | if (__ratelimit(&drbd_ratelimit_state)) |
1016 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", | 1001 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", |
1017 | (unsigned long long)e->sector); | 1002 | (unsigned long long)peer_req->i.sector); |
1018 | 1003 | ||
1019 | ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); | 1004 | err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req); |
1020 | } | 1005 | } |
1021 | 1006 | ||
1022 | dec_unacked(mdev); | 1007 | dec_unacked(mdev); |
1023 | 1008 | ||
1024 | move_to_net_ee_or_free(mdev, e); | 1009 | move_to_net_ee_or_free(mdev, peer_req); |
1025 | 1010 | ||
1026 | if (unlikely(!ok)) | 1011 | if (unlikely(err)) |
1027 | dev_err(DEV, "drbd_send_block() failed\n"); | 1012 | dev_err(DEV, "drbd_send_block() failed\n"); |
1028 | return ok; | 1013 | return err; |
1029 | } | 1014 | } |
1030 | 1015 | ||
1031 | /** | 1016 | /** |
1032 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS | 1017 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST |
1033 | * @mdev: DRBD device. | 1018 | * @mdev: DRBD device. |
1034 | * @w: work object. | 1019 | * @w: work object. |
1035 | * @cancel: The connection will be closed anyways | 1020 | * @cancel: The connection will be closed anyways |
1036 | */ | 1021 | */ |
1037 | int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1022 | int w_e_end_rsdata_req(struct drbd_work *w, int cancel) |
1038 | { | 1023 | { |
1039 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1024 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1040 | int ok; | 1025 | struct drbd_conf *mdev = w->mdev; |
1026 | int err; | ||
1041 | 1027 | ||
1042 | if (unlikely(cancel)) { | 1028 | if (unlikely(cancel)) { |
1043 | drbd_free_ee(mdev, e); | 1029 | drbd_free_peer_req(mdev, peer_req); |
1044 | dec_unacked(mdev); | 1030 | dec_unacked(mdev); |
1045 | return 1; | 1031 | return 0; |
1046 | } | 1032 | } |
1047 | 1033 | ||
1048 | if (get_ldev_if_state(mdev, D_FAILED)) { | 1034 | if (get_ldev_if_state(mdev, D_FAILED)) { |
1049 | drbd_rs_complete_io(mdev, e->sector); | 1035 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1050 | put_ldev(mdev); | 1036 | put_ldev(mdev); |
1051 | } | 1037 | } |
1052 | 1038 | ||
1053 | if (mdev->state.conn == C_AHEAD) { | 1039 | if (mdev->state.conn == C_AHEAD) { |
1054 | ok = drbd_send_ack(mdev, P_RS_CANCEL, e); | 1040 | err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req); |
1055 | } else if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1041 | } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1056 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | 1042 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { |
1057 | inc_rs_pending(mdev); | 1043 | inc_rs_pending(mdev); |
1058 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 1044 | err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); |
1059 | } else { | 1045 | } else { |
1060 | if (__ratelimit(&drbd_ratelimit_state)) | 1046 | if (__ratelimit(&drbd_ratelimit_state)) |
1061 | dev_err(DEV, "Not sending RSDataReply, " | 1047 | dev_err(DEV, "Not sending RSDataReply, " |
1062 | "partner DISKLESS!\n"); | 1048 | "partner DISKLESS!\n"); |
1063 | ok = 1; | 1049 | err = 0; |
1064 | } | 1050 | } |
1065 | } else { | 1051 | } else { |
1066 | if (__ratelimit(&drbd_ratelimit_state)) | 1052 | if (__ratelimit(&drbd_ratelimit_state)) |
1067 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", | 1053 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", |
1068 | (unsigned long long)e->sector); | 1054 | (unsigned long long)peer_req->i.sector); |
1069 | 1055 | ||
1070 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | 1056 | err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); |
1071 | 1057 | ||
1072 | /* update resync data with failure */ | 1058 | /* update resync data with failure */ |
1073 | drbd_rs_failed_io(mdev, e->sector, e->size); | 1059 | drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size); |
1074 | } | 1060 | } |
1075 | 1061 | ||
1076 | dec_unacked(mdev); | 1062 | dec_unacked(mdev); |
1077 | 1063 | ||
1078 | move_to_net_ee_or_free(mdev, e); | 1064 | move_to_net_ee_or_free(mdev, peer_req); |
1079 | 1065 | ||
1080 | if (unlikely(!ok)) | 1066 | if (unlikely(err)) |
1081 | dev_err(DEV, "drbd_send_block() failed\n"); | 1067 | dev_err(DEV, "drbd_send_block() failed\n"); |
1082 | return ok; | 1068 | return err; |
1083 | } | 1069 | } |
1084 | 1070 | ||
1085 | int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1071 | int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) |
1086 | { | 1072 | { |
1087 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1073 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1074 | struct drbd_conf *mdev = w->mdev; | ||
1088 | struct digest_info *di; | 1075 | struct digest_info *di; |
1089 | int digest_size; | 1076 | int digest_size; |
1090 | void *digest = NULL; | 1077 | void *digest = NULL; |
1091 | int ok, eq = 0; | 1078 | int err, eq = 0; |
1092 | 1079 | ||
1093 | if (unlikely(cancel)) { | 1080 | if (unlikely(cancel)) { |
1094 | drbd_free_ee(mdev, e); | 1081 | drbd_free_peer_req(mdev, peer_req); |
1095 | dec_unacked(mdev); | 1082 | dec_unacked(mdev); |
1096 | return 1; | 1083 | return 0; |
1097 | } | 1084 | } |
1098 | 1085 | ||
1099 | if (get_ldev(mdev)) { | 1086 | if (get_ldev(mdev)) { |
1100 | drbd_rs_complete_io(mdev, e->sector); | 1087 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1101 | put_ldev(mdev); | 1088 | put_ldev(mdev); |
1102 | } | 1089 | } |
1103 | 1090 | ||
1104 | di = e->digest; | 1091 | di = peer_req->digest; |
1105 | 1092 | ||
1106 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1093 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1107 | /* quick hack to try to avoid a race against reconfiguration. | 1094 | /* quick hack to try to avoid a race against reconfiguration. |
1108 | * a real fix would be much more involved, | 1095 | * a real fix would be much more involved, |
1109 | * introducing more locking mechanisms */ | 1096 | * introducing more locking mechanisms */ |
1110 | if (mdev->csums_tfm) { | 1097 | if (mdev->tconn->csums_tfm) { |
1111 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 1098 | digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); |
1112 | D_ASSERT(digest_size == di->digest_size); | 1099 | D_ASSERT(digest_size == di->digest_size); |
1113 | digest = kmalloc(digest_size, GFP_NOIO); | 1100 | digest = kmalloc(digest_size, GFP_NOIO); |
1114 | } | 1101 | } |
1115 | if (digest) { | 1102 | if (digest) { |
1116 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); | 1103 | drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); |
1117 | eq = !memcmp(digest, di->digest, digest_size); | 1104 | eq = !memcmp(digest, di->digest, digest_size); |
1118 | kfree(digest); | 1105 | kfree(digest); |
1119 | } | 1106 | } |
1120 | 1107 | ||
1121 | if (eq) { | 1108 | if (eq) { |
1122 | drbd_set_in_sync(mdev, e->sector, e->size); | 1109 | drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size); |
1123 | /* rs_same_csums unit is BM_BLOCK_SIZE */ | 1110 | /* rs_same_csums unit is BM_BLOCK_SIZE */ |
1124 | mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT; | 1111 | mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; |
1125 | ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); | 1112 | err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req); |
1126 | } else { | 1113 | } else { |
1127 | inc_rs_pending(mdev); | 1114 | inc_rs_pending(mdev); |
1128 | e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ | 1115 | peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ |
1129 | e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */ | 1116 | peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ |
1130 | kfree(di); | 1117 | kfree(di); |
1131 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 1118 | err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); |
1132 | } | 1119 | } |
1133 | } else { | 1120 | } else { |
1134 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | 1121 | err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); |
1135 | if (__ratelimit(&drbd_ratelimit_state)) | 1122 | if (__ratelimit(&drbd_ratelimit_state)) |
1136 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | 1123 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); |
1137 | } | 1124 | } |
1138 | 1125 | ||
1139 | dec_unacked(mdev); | 1126 | dec_unacked(mdev); |
1140 | move_to_net_ee_or_free(mdev, e); | 1127 | move_to_net_ee_or_free(mdev, peer_req); |
1141 | 1128 | ||
1142 | if (unlikely(!ok)) | 1129 | if (unlikely(err)) |
1143 | dev_err(DEV, "drbd_send_block/ack() failed\n"); | 1130 | dev_err(DEV, "drbd_send_block/ack() failed\n"); |
1144 | return ok; | 1131 | return err; |
1145 | } | 1132 | } |
1146 | 1133 | ||
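As a side note on the checksum-based resync path above: only when the locally computed digest differs from the one the peer sent does the full block go back over the wire; otherwise a P_RS_IS_IN_SYNC ack is enough. A minimal userspace sketch of that decision (illustrative names only, not part of this commit):

#include <string.h>

enum csum_rs_reply { RS_IS_IN_SYNC, RS_SEND_DATA };

/* Compare the digest computed over our copy of the block with the
 * digest received from the peer.  Equal digests mean the block is
 * already in sync and only an ack needs to go back; otherwise the
 * full data block must be replied. */
static enum csum_rs_reply csum_rs_decide(const void *local_digest,
					 const void *peer_digest,
					 size_t digest_size)
{
	return memcmp(local_digest, peer_digest, digest_size) == 0
		? RS_IS_IN_SYNC : RS_SEND_DATA;
}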
1147 | /* TODO merge common code with w_e_send_csum */ | 1134 | int w_e_end_ov_req(struct drbd_work *w, int cancel) |
1148 | int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1149 | { | 1135 | { |
1150 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1136 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1151 | sector_t sector = e->sector; | 1137 | struct drbd_conf *mdev = w->mdev; |
1152 | unsigned int size = e->size; | 1138 | sector_t sector = peer_req->i.sector; |
1139 | unsigned int size = peer_req->i.size; | ||
1153 | int digest_size; | 1140 | int digest_size; |
1154 | void *digest; | 1141 | void *digest; |
1155 | int ok = 1; | 1142 | int err = 0; |
1156 | 1143 | ||
1157 | if (unlikely(cancel)) | 1144 | if (unlikely(cancel)) |
1158 | goto out; | 1145 | goto out; |
1159 | 1146 | ||
1160 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1147 | digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); |
1161 | digest = kmalloc(digest_size, GFP_NOIO); | 1148 | digest = kmalloc(digest_size, GFP_NOIO); |
1162 | if (!digest) { | 1149 | if (!digest) { |
1163 | ok = 0; /* terminate the connection in case the allocation failed */ | 1150 | err = 1; /* terminate the connection in case the allocation failed */ |
1164 | goto out; | 1151 | goto out; |
1165 | } | 1152 | } |
1166 | 1153 | ||
1167 | if (likely(!(e->flags & EE_WAS_ERROR))) | 1154 | if (likely(!(peer_req->flags & EE_WAS_ERROR))) |
1168 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); | 1155 | drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); |
1169 | else | 1156 | else |
1170 | memset(digest, 0, digest_size); | 1157 | memset(digest, 0, digest_size); |
1171 | 1158 | ||
@@ -1173,25 +1160,23 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1173 | * In case we block on congestion, we could otherwise run into | 1160 | * In case we block on congestion, we could otherwise run into |
1174 | * some distributed deadlock, if the other side blocks on | 1161 | * some distributed deadlock, if the other side blocks on |
1175 | * congestion as well, because our receiver blocks in | 1162 | * congestion as well, because our receiver blocks in |
1176 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 1163 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
1177 | drbd_free_ee(mdev, e); | 1164 | drbd_free_peer_req(mdev, peer_req); |
1178 | e = NULL; | 1165 | peer_req = NULL; |
1179 | inc_rs_pending(mdev); | 1166 | inc_rs_pending(mdev); |
1180 | ok = drbd_send_drequest_csum(mdev, sector, size, | 1167 | err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY); |
1181 | digest, digest_size, | 1168 | if (err) |
1182 | P_OV_REPLY); | ||
1183 | if (!ok) | ||
1184 | dec_rs_pending(mdev); | 1169 | dec_rs_pending(mdev); |
1185 | kfree(digest); | 1170 | kfree(digest); |
1186 | 1171 | ||
1187 | out: | 1172 | out: |
1188 | if (e) | 1173 | if (peer_req) |
1189 | drbd_free_ee(mdev, e); | 1174 | drbd_free_peer_req(mdev, peer_req); |
1190 | dec_unacked(mdev); | 1175 | dec_unacked(mdev); |
1191 | return ok; | 1176 | return err; |
1192 | } | 1177 | } |
1193 | 1178 | ||
1194 | void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | 1179 | void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size) |
1195 | { | 1180 | { |
1196 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { | 1181 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { |
1197 | mdev->ov_last_oos_size += size>>9; | 1182 | mdev->ov_last_oos_size += size>>9; |
@@ -1202,37 +1187,38 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | |||
1202 | drbd_set_out_of_sync(mdev, sector, size); | 1187 | drbd_set_out_of_sync(mdev, sector, size); |
1203 | } | 1188 | } |
1204 | 1189 | ||
1205 | int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1190 | int w_e_end_ov_reply(struct drbd_work *w, int cancel) |
1206 | { | 1191 | { |
1207 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1192 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1193 | struct drbd_conf *mdev = w->mdev; | ||
1208 | struct digest_info *di; | 1194 | struct digest_info *di; |
1209 | void *digest; | 1195 | void *digest; |
1210 | sector_t sector = e->sector; | 1196 | sector_t sector = peer_req->i.sector; |
1211 | unsigned int size = e->size; | 1197 | unsigned int size = peer_req->i.size; |
1212 | int digest_size; | 1198 | int digest_size; |
1213 | int ok, eq = 0; | 1199 | int err, eq = 0; |
1214 | bool stop_sector_reached = false; | 1200 | bool stop_sector_reached = false; |
1215 | 1201 | ||
1216 | if (unlikely(cancel)) { | 1202 | if (unlikely(cancel)) { |
1217 | drbd_free_ee(mdev, e); | 1203 | drbd_free_peer_req(mdev, peer_req); |
1218 | dec_unacked(mdev); | 1204 | dec_unacked(mdev); |
1219 | return 1; | 1205 | return 0; |
1220 | } | 1206 | } |
1221 | 1207 | ||
1222 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all | 1208 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all |
1223 | * the resync lru has been cleaned up already */ | 1209 | * the resync lru has been cleaned up already */ |
1224 | if (get_ldev(mdev)) { | 1210 | if (get_ldev(mdev)) { |
1225 | drbd_rs_complete_io(mdev, e->sector); | 1211 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1226 | put_ldev(mdev); | 1212 | put_ldev(mdev); |
1227 | } | 1213 | } |
1228 | 1214 | ||
1229 | di = e->digest; | 1215 | di = peer_req->digest; |
1230 | 1216 | ||
1231 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1217 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1232 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1218 | digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); |
1233 | digest = kmalloc(digest_size, GFP_NOIO); | 1219 | digest = kmalloc(digest_size, GFP_NOIO); |
1234 | if (digest) { | 1220 | if (digest) { |
1235 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); | 1221 | drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); |
1236 | 1222 | ||
1237 | D_ASSERT(digest_size == di->digest_size); | 1223 | D_ASSERT(digest_size == di->digest_size); |
1238 | eq = !memcmp(digest, di->digest, digest_size); | 1224 | eq = !memcmp(digest, di->digest, digest_size); |
@@ -1240,19 +1226,19 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1240 | } | 1226 | } |
1241 | } | 1227 | } |
1242 | 1228 | ||
1243 | /* Free e and pages before send. | 1229 | /* Free peer_req and pages before send. |
1244 | * In case we block on congestion, we could otherwise run into | 1230 | * In case we block on congestion, we could otherwise run into |
1245 | * some distributed deadlock, if the other side blocks on | 1231 | * some distributed deadlock, if the other side blocks on |
1246 | * congestion as well, because our receiver blocks in | 1232 | * congestion as well, because our receiver blocks in |
1247 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 1233 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
1248 | drbd_free_ee(mdev, e); | 1234 | drbd_free_peer_req(mdev, peer_req); |
1249 | if (!eq) | 1235 | if (!eq) |
1250 | drbd_ov_oos_found(mdev, sector, size); | 1236 | drbd_ov_out_of_sync_found(mdev, sector, size); |
1251 | else | 1237 | else |
1252 | ov_oos_print(mdev); | 1238 | ov_out_of_sync_print(mdev); |
1253 | 1239 | ||
1254 | ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, | 1240 | err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, |
1255 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); | 1241 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); |
1256 | 1242 | ||
1257 | dec_unacked(mdev); | 1243 | dec_unacked(mdev); |
1258 | 1244 | ||
@@ -1262,76 +1248,102 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1262 | if ((mdev->ov_left & 0x200) == 0x200) | 1248 | if ((mdev->ov_left & 0x200) == 0x200) |
1263 | drbd_advance_rs_marks(mdev, mdev->ov_left); | 1249 | drbd_advance_rs_marks(mdev, mdev->ov_left); |
1264 | 1250 | ||
1265 | stop_sector_reached = mdev->agreed_pro_version >= 97 && | 1251 | stop_sector_reached = verify_can_do_stop_sector(mdev) && |
1266 | (sector + (size>>9)) >= mdev->ov_stop_sector; | 1252 | (sector + (size>>9)) >= mdev->ov_stop_sector; |
1267 | 1253 | ||
1268 | if (mdev->ov_left == 0 || stop_sector_reached) { | 1254 | if (mdev->ov_left == 0 || stop_sector_reached) { |
1269 | ov_oos_print(mdev); | 1255 | ov_out_of_sync_print(mdev); |
1270 | drbd_resync_finished(mdev); | 1256 | drbd_resync_finished(mdev); |
1271 | } | 1257 | } |
1272 | 1258 | ||
1273 | return ok; | 1259 | return err; |
1274 | } | 1260 | } |
1275 | 1261 | ||
1276 | int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1262 | int w_prev_work_done(struct drbd_work *w, int cancel) |
1277 | { | 1263 | { |
1278 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); | 1264 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); |
1265 | |||
1279 | complete(&b->done); | 1266 | complete(&b->done); |
1280 | return 1; | 1267 | return 0; |
1281 | } | 1268 | } |
1282 | 1269 | ||
1283 | int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1270 | /* FIXME |
1271 | * We need to track the number of pending barrier acks, | ||
1272 | * and to be able to wait for them. | ||
1273 | * See also comment in drbd_adm_attach before drbd_suspend_io. | ||
1274 | */ | ||
1275 | int drbd_send_barrier(struct drbd_tconn *tconn) | ||
1284 | { | 1276 | { |
1285 | struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); | 1277 | struct p_barrier *p; |
1286 | struct p_barrier *p = &mdev->data.sbuf.barrier; | 1278 | struct drbd_socket *sock; |
1287 | int ok = 1; | ||
1288 | |||
1289 | /* really avoid racing with tl_clear. w.cb may have been referenced | ||
1290 | * just before it was reassigned and re-queued, so double check that. | ||
1291 | * actually, this race was harmless, since we only try to send the | ||
1292 | * barrier packet here, and otherwise do nothing with the object. | ||
1293 | * but compare with the head of w_clear_epoch */ | ||
1294 | spin_lock_irq(&mdev->req_lock); | ||
1295 | if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) | ||
1296 | cancel = 1; | ||
1297 | spin_unlock_irq(&mdev->req_lock); | ||
1298 | if (cancel) | ||
1299 | return 1; | ||
1300 | 1279 | ||
1301 | if (!drbd_get_data_sock(mdev)) | 1280 | sock = &tconn->data; |
1302 | return 0; | 1281 | p = conn_prepare_command(tconn, sock); |
1303 | p->barrier = b->br_number; | 1282 | if (!p) |
1304 | /* inc_ap_pending was done where this was queued. | 1283 | return -EIO; |
1305 | * dec_ap_pending will be done in got_BarrierAck | 1284 | p->barrier = tconn->send.current_epoch_nr; |
1306 | * or (on connection loss) in w_clear_epoch. */ | 1285 | p->pad = 0; |
1307 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, | 1286 | tconn->send.current_epoch_writes = 0; |
1308 | (struct p_header80 *)p, sizeof(*p), 0); | 1287 | |
1309 | drbd_put_data_sock(mdev); | 1288 | return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0); |
1310 | |||
1311 | return ok; | ||
1312 | } | 1289 | } |
1313 | 1290 | ||
1314 | int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1291 | int w_send_write_hint(struct drbd_work *w, int cancel) |
1315 | { | 1292 | { |
1293 | struct drbd_conf *mdev = w->mdev; | ||
1294 | struct drbd_socket *sock; | ||
1295 | |||
1316 | if (cancel) | 1296 | if (cancel) |
1317 | return 1; | 1297 | return 0; |
1318 | return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); | 1298 | sock = &mdev->tconn->data; |
1299 | if (!drbd_prepare_command(mdev, sock)) | ||
1300 | return -EIO; | ||
1301 | return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0); | ||
1302 | } | ||
1303 | |||
1304 | static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch) | ||
1305 | { | ||
1306 | if (!tconn->send.seen_any_write_yet) { | ||
1307 | tconn->send.seen_any_write_yet = true; | ||
1308 | tconn->send.current_epoch_nr = epoch; | ||
1309 | tconn->send.current_epoch_writes = 0; | ||
1310 | } | ||
1311 | } | ||
1312 | |||
1313 | static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch) | ||
1314 | { | ||
1315 | /* nothing to close yet; re_init_if_first_write() handles the very first write */ | ||
1316 | if (!tconn->send.seen_any_write_yet) | ||
1317 | return; | ||
1318 | if (tconn->send.current_epoch_nr != epoch) { | ||
1319 | if (tconn->send.current_epoch_writes) | ||
1320 | drbd_send_barrier(tconn); | ||
1321 | tconn->send.current_epoch_nr = epoch; | ||
1322 | } | ||
1319 | } | 1323 | } |
1320 | 1324 | ||
1321 | int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1325 | int w_send_out_of_sync(struct drbd_work *w, int cancel) |
1322 | { | 1326 | { |
1323 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1327 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1324 | int ok; | 1328 | struct drbd_conf *mdev = w->mdev; |
1329 | struct drbd_tconn *tconn = mdev->tconn; | ||
1330 | int err; | ||
1325 | 1331 | ||
1326 | if (unlikely(cancel)) { | 1332 | if (unlikely(cancel)) { |
1327 | req_mod(req, send_canceled); | 1333 | req_mod(req, SEND_CANCELED); |
1328 | return 1; | 1334 | return 0; |
1329 | } | 1335 | } |
1330 | 1336 | ||
1331 | ok = drbd_send_oos(mdev, req); | 1337 | /* this time, no tconn->send.current_epoch_writes++; |
1332 | req_mod(req, oos_handed_to_network); | 1338 | * If it was sent, it was the closing barrier for the last |
1339 | * replicated epoch, before we went into AHEAD mode. | ||
1340 | * No more barriers will be sent, until we leave AHEAD mode again. */ | ||
1341 | maybe_send_barrier(tconn, req->epoch); | ||
1342 | |||
1343 | err = drbd_send_out_of_sync(mdev, req); | ||
1344 | req_mod(req, OOS_HANDED_TO_NETWORK); | ||
1333 | 1345 | ||
1334 | return ok; | 1346 | return err; |
1335 | } | 1347 | } |
1336 | 1348 | ||
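To make the epoch/barrier bookkeeping introduced here easier to follow, here is a small standalone sketch (hypothetical names, not part of this commit): a barrier is emitted only when the epoch number advances and the epoch being closed actually carried writes, which is exactly why the AHEAD path above skips current_epoch_writes++.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the tconn->send state used above. */
struct send_state {
	bool seen_any_write_yet;
	unsigned int current_epoch_nr;
	unsigned int current_epoch_writes;
};

static void send_barrier(struct send_state *s)
{
	printf("barrier closes epoch %u\n", s->current_epoch_nr);
	s->current_epoch_writes = 0;
}

/* Close the previous epoch only if it saw writes, then switch to the
 * new epoch number. */
static void maybe_send_barrier(struct send_state *s, unsigned int epoch)
{
	if (!s->seen_any_write_yet)
		return;
	if (s->current_epoch_nr != epoch) {
		if (s->current_epoch_writes)
			send_barrier(s);
		s->current_epoch_nr = epoch;
	}
}

static void queue_write(struct send_state *s, unsigned int epoch)
{
	if (!s->seen_any_write_yet) {	/* mirrors re_init_if_first_write() */
		s->seen_any_write_yet = true;
		s->current_epoch_nr = epoch;
		s->current_epoch_writes = 0;
	}
	maybe_send_barrier(s, epoch);
	s->current_epoch_writes++;
	printf("write in epoch %u\n", epoch);
}

int main(void)
{
	struct send_state s = { false, 0, 0 };

	queue_write(&s, 1);
	queue_write(&s, 1);
	queue_write(&s, 2);		/* closes epoch 1 with a barrier */
	maybe_send_barrier(&s, 3);	/* epoch 2 had a write: barrier */
	maybe_send_barrier(&s, 3);	/* already at epoch 3: nothing to do */
	return 0;
}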
1337 | /** | 1349 | /** |
@@ -1340,20 +1352,26 @@ int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1340 | * @w: work object. | 1352 | * @w: work object. |
1341 | * @cancel: The connection will be closed anyways | 1353 | * @cancel: The connection will be closed anyways |
1342 | */ | 1354 | */ |
1343 | int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1355 | int w_send_dblock(struct drbd_work *w, int cancel) |
1344 | { | 1356 | { |
1345 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1357 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1346 | int ok; | 1358 | struct drbd_conf *mdev = w->mdev; |
1359 | struct drbd_tconn *tconn = mdev->tconn; | ||
1360 | int err; | ||
1347 | 1361 | ||
1348 | if (unlikely(cancel)) { | 1362 | if (unlikely(cancel)) { |
1349 | req_mod(req, send_canceled); | 1363 | req_mod(req, SEND_CANCELED); |
1350 | return 1; | 1364 | return 0; |
1351 | } | 1365 | } |
1352 | 1366 | ||
1353 | ok = drbd_send_dblock(mdev, req); | 1367 | re_init_if_first_write(tconn, req->epoch); |
1354 | req_mod(req, ok ? handed_over_to_network : send_failed); | 1368 | maybe_send_barrier(tconn, req->epoch); |
1369 | tconn->send.current_epoch_writes++; | ||
1370 | |||
1371 | err = drbd_send_dblock(mdev, req); | ||
1372 | req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); | ||
1355 | 1373 | ||
1356 | return ok; | 1374 | return err; |
1357 | } | 1375 | } |
1358 | 1376 | ||
1359 | /** | 1377 | /** |
@@ -1362,57 +1380,61 @@ int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1362 | * @w: work object. | 1380 | * @w: work object. |
1363 | * @cancel: The connection will be closed anyways | 1381 | * @cancel: The connection will be closed anyways |
1364 | */ | 1382 | */ |
1365 | int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1383 | int w_send_read_req(struct drbd_work *w, int cancel) |
1366 | { | 1384 | { |
1367 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1385 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1368 | int ok; | 1386 | struct drbd_conf *mdev = w->mdev; |
1387 | struct drbd_tconn *tconn = mdev->tconn; | ||
1388 | int err; | ||
1369 | 1389 | ||
1370 | if (unlikely(cancel)) { | 1390 | if (unlikely(cancel)) { |
1371 | req_mod(req, send_canceled); | 1391 | req_mod(req, SEND_CANCELED); |
1372 | return 1; | 1392 | return 0; |
1373 | } | 1393 | } |
1374 | 1394 | ||
1375 | ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, | 1395 | /* Even read requests may close a write epoch, |
1376 | (unsigned long)req); | 1396 | * if there was any yet. */ |
1397 | maybe_send_barrier(tconn, req->epoch); | ||
1377 | 1398 | ||
1378 | if (!ok) { | 1399 | err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size, |
1379 | /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); | 1400 | (unsigned long)req); |
1380 | * so this is probably redundant */ | ||
1381 | if (mdev->state.conn >= C_CONNECTED) | ||
1382 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
1383 | } | ||
1384 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1385 | 1401 | ||
1386 | return ok; | 1402 | req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); |
1403 | |||
1404 | return err; | ||
1387 | } | 1405 | } |
1388 | 1406 | ||
1389 | int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1407 | int w_restart_disk_io(struct drbd_work *w, int cancel) |
1390 | { | 1408 | { |
1391 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1409 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1410 | struct drbd_conf *mdev = w->mdev; | ||
1392 | 1411 | ||
1393 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) | 1412 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) |
1394 | drbd_al_begin_io(mdev, req->sector); | 1413 | drbd_al_begin_io(mdev, &req->i); |
1395 | /* Calling drbd_al_begin_io() out of the worker might deadlocks | ||
1396 | theoretically. Practically it can not deadlock, since this is | ||
1397 | only used when unfreezing IOs. All the extents of the requests | ||
1398 | that made it into the TL are already active */ | ||
1399 | 1414 | ||
1400 | drbd_req_make_private_bio(req, req->master_bio); | 1415 | drbd_req_make_private_bio(req, req->master_bio); |
1401 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | 1416 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; |
1402 | generic_make_request(req->private_bio); | 1417 | generic_make_request(req->private_bio); |
1403 | 1418 | ||
1404 | return 1; | 1419 | return 0; |
1405 | } | 1420 | } |
1406 | 1421 | ||
1407 | static int _drbd_may_sync_now(struct drbd_conf *mdev) | 1422 | static int _drbd_may_sync_now(struct drbd_conf *mdev) |
1408 | { | 1423 | { |
1409 | struct drbd_conf *odev = mdev; | 1424 | struct drbd_conf *odev = mdev; |
1425 | int resync_after; | ||
1410 | 1426 | ||
1411 | while (1) { | 1427 | while (1) { |
1412 | if (odev->sync_conf.after == -1) | 1428 | if (!odev->ldev) |
1429 | return 1; | ||
1430 | rcu_read_lock(); | ||
1431 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | ||
1432 | rcu_read_unlock(); | ||
1433 | if (resync_after == -1) | ||
1434 | return 1; | ||
1435 | odev = minor_to_mdev(resync_after); | ||
1436 | if (!expect(odev)) | ||
1413 | return 1; | 1437 | return 1; |
1414 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1415 | ERR_IF(!odev) return 1; | ||
1416 | if ((odev->state.conn >= C_SYNC_SOURCE && | 1438 | if ((odev->state.conn >= C_SYNC_SOURCE && |
1417 | odev->state.conn <= C_PAUSED_SYNC_T) || | 1439 | odev->state.conn <= C_PAUSED_SYNC_T) || |
1418 | odev->state.aftr_isp || odev->state.peer_isp || | 1440 | odev->state.aftr_isp || odev->state.peer_isp || |
@@ -1432,16 +1454,15 @@ static int _drbd_pause_after(struct drbd_conf *mdev) | |||
1432 | struct drbd_conf *odev; | 1454 | struct drbd_conf *odev; |
1433 | int i, rv = 0; | 1455 | int i, rv = 0; |
1434 | 1456 | ||
1435 | for (i = 0; i < minor_count; i++) { | 1457 | rcu_read_lock(); |
1436 | odev = minor_to_mdev(i); | 1458 | idr_for_each_entry(&minors, odev, i) { |
1437 | if (!odev) | ||
1438 | continue; | ||
1439 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | 1459 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) |
1440 | continue; | 1460 | continue; |
1441 | if (!_drbd_may_sync_now(odev)) | 1461 | if (!_drbd_may_sync_now(odev)) |
1442 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) | 1462 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) |
1443 | != SS_NOTHING_TO_DO); | 1463 | != SS_NOTHING_TO_DO); |
1444 | } | 1464 | } |
1465 | rcu_read_unlock(); | ||
1445 | 1466 | ||
1446 | return rv; | 1467 | return rv; |
1447 | } | 1468 | } |
@@ -1457,10 +1478,8 @@ static int _drbd_resume_next(struct drbd_conf *mdev) | |||
1457 | struct drbd_conf *odev; | 1478 | struct drbd_conf *odev; |
1458 | int i, rv = 0; | 1479 | int i, rv = 0; |
1459 | 1480 | ||
1460 | for (i = 0; i < minor_count; i++) { | 1481 | rcu_read_lock(); |
1461 | odev = minor_to_mdev(i); | 1482 | idr_for_each_entry(&minors, odev, i) { |
1462 | if (!odev) | ||
1463 | continue; | ||
1464 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | 1483 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) |
1465 | continue; | 1484 | continue; |
1466 | if (odev->state.aftr_isp) { | 1485 | if (odev->state.aftr_isp) { |
@@ -1470,6 +1489,7 @@ static int _drbd_resume_next(struct drbd_conf *mdev) | |||
1470 | != SS_NOTHING_TO_DO) ; | 1489 | != SS_NOTHING_TO_DO) ; |
1471 | } | 1490 | } |
1472 | } | 1491 | } |
1492 | rcu_read_unlock(); | ||
1473 | return rv; | 1493 | return rv; |
1474 | } | 1494 | } |
1475 | 1495 | ||
@@ -1487,57 +1507,86 @@ void suspend_other_sg(struct drbd_conf *mdev) | |||
1487 | write_unlock_irq(&global_state_lock); | 1507 | write_unlock_irq(&global_state_lock); |
1488 | } | 1508 | } |
1489 | 1509 | ||
1490 | static int sync_after_error(struct drbd_conf *mdev, int o_minor) | 1510 | /* caller must hold global_state_lock */ |
1511 | enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | ||
1491 | { | 1512 | { |
1492 | struct drbd_conf *odev; | 1513 | struct drbd_conf *odev; |
1514 | int resync_after; | ||
1493 | 1515 | ||
1494 | if (o_minor == -1) | 1516 | if (o_minor == -1) |
1495 | return NO_ERROR; | 1517 | return NO_ERROR; |
1496 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | 1518 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) |
1497 | return ERR_SYNC_AFTER; | 1519 | return ERR_RESYNC_AFTER; |
1498 | 1520 | ||
1499 | /* check for loops */ | 1521 | /* check for loops */ |
1500 | odev = minor_to_mdev(o_minor); | 1522 | odev = minor_to_mdev(o_minor); |
1501 | while (1) { | 1523 | while (1) { |
1502 | if (odev == mdev) | 1524 | if (odev == mdev) |
1503 | return ERR_SYNC_AFTER_CYCLE; | 1525 | return ERR_RESYNC_AFTER_CYCLE; |
1504 | 1526 | ||
1527 | rcu_read_lock(); | ||
1528 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | ||
1529 | rcu_read_unlock(); | ||
1505 | /* dependency chain ends here, no cycles. */ | 1530 | /* dependency chain ends here, no cycles. */ |
1506 | if (odev->sync_conf.after == -1) | 1531 | if (resync_after == -1) |
1507 | return NO_ERROR; | 1532 | return NO_ERROR; |
1508 | 1533 | ||
1509 | /* follow the dependency chain */ | 1534 | /* follow the dependency chain */ |
1510 | odev = minor_to_mdev(odev->sync_conf.after); | 1535 | odev = minor_to_mdev(resync_after); |
1511 | } | 1536 | } |
1512 | } | 1537 | } |
1513 | 1538 | ||
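The dependency walk in drbd_resync_after_valid() terminates either at a device whose resync-after setting is -1 (no cycle) or back at the starting device (cycle). A simplified userspace sketch of the same check, with minor_to_mdev() and the RCU-protected disk_conf replaced by a plain array (illustrative only):

#include <stdio.h>

#define N_MINORS 4

/* resync_after[i] == -1 means minor i depends on no other device;
 * otherwise it names the minor it must wait for.  Hypothetical data. */
static int resync_after[N_MINORS] = { -1, 0, 1, 2 };

/* Returns 0 if "minor resyncs after o_minor" keeps the chain acyclic,
 * -1 if it would close a cycle back to "minor". */
static int resync_after_valid(int minor, int o_minor)
{
	int odev;

	if (o_minor == -1)
		return 0;
	for (odev = o_minor; ; odev = resync_after[odev]) {
		if (odev == minor)
			return -1;		/* dependency cycle */
		if (resync_after[odev] == -1)
			return 0;		/* chain ends, no cycle */
	}
}

int main(void)
{
	printf("3 after 2: %s\n", resync_after_valid(3, 2) ? "cycle" : "ok");
	printf("0 after 3: %s\n", resync_after_valid(0, 3) ? "cycle" : "ok");
	return 0;
}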
1514 | int drbd_alter_sa(struct drbd_conf *mdev, int na) | 1539 | /* caller must hold global_state_lock */ |
1540 | void drbd_resync_after_changed(struct drbd_conf *mdev) | ||
1515 | { | 1541 | { |
1516 | int changes; | 1542 | int changes; |
1517 | int retcode; | ||
1518 | 1543 | ||
1519 | write_lock_irq(&global_state_lock); | 1544 | do { |
1520 | retcode = sync_after_error(mdev, na); | 1545 | changes = _drbd_pause_after(mdev); |
1521 | if (retcode == NO_ERROR) { | 1546 | changes |= _drbd_resume_next(mdev); |
1522 | mdev->sync_conf.after = na; | 1547 | } while (changes); |
1523 | do { | ||
1524 | changes = _drbd_pause_after(mdev); | ||
1525 | changes |= _drbd_resume_next(mdev); | ||
1526 | } while (changes); | ||
1527 | } | ||
1528 | write_unlock_irq(&global_state_lock); | ||
1529 | return retcode; | ||
1530 | } | 1548 | } |
1531 | 1549 | ||
1532 | void drbd_rs_controller_reset(struct drbd_conf *mdev) | 1550 | void drbd_rs_controller_reset(struct drbd_conf *mdev) |
1533 | { | 1551 | { |
1552 | struct fifo_buffer *plan; | ||
1553 | |||
1534 | atomic_set(&mdev->rs_sect_in, 0); | 1554 | atomic_set(&mdev->rs_sect_in, 0); |
1535 | atomic_set(&mdev->rs_sect_ev, 0); | 1555 | atomic_set(&mdev->rs_sect_ev, 0); |
1536 | mdev->rs_in_flight = 0; | 1556 | mdev->rs_in_flight = 0; |
1537 | mdev->rs_planed = 0; | 1557 | |
1538 | spin_lock(&mdev->peer_seq_lock); | 1558 | /* Updating the RCU protected object in place is necessary since |
1539 | fifo_set(&mdev->rs_plan_s, 0); | 1559 | this function gets called from atomic context. |
1540 | spin_unlock(&mdev->peer_seq_lock); | 1560 | It is valid since all other updates also lead to an completely |
1561 | empty fifo */ | ||
1562 | rcu_read_lock(); | ||
1563 | plan = rcu_dereference(mdev->rs_plan_s); | ||
1564 | plan->total = 0; | ||
1565 | fifo_set(plan, 0); | ||
1566 | rcu_read_unlock(); | ||
1567 | } | ||
1568 | |||
1569 | void start_resync_timer_fn(unsigned long data) | ||
1570 | { | ||
1571 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
1572 | |||
1573 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work); | ||
1574 | } | ||
1575 | |||
1576 | int w_start_resync(struct drbd_work *w, int cancel) | ||
1577 | { | ||
1578 | struct drbd_conf *mdev = w->mdev; | ||
1579 | |||
1580 | if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { | ||
1581 | dev_warn(DEV, "w_start_resync later...\n"); | ||
1582 | mdev->start_resync_timer.expires = jiffies + HZ/10; | ||
1583 | add_timer(&mdev->start_resync_timer); | ||
1584 | return 0; | ||
1585 | } | ||
1586 | |||
1587 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1588 | clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); | ||
1589 | return 0; | ||
1541 | } | 1590 | } |
1542 | 1591 | ||
1543 | /** | 1592 | /** |
@@ -1558,43 +1607,58 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1558 | return; | 1607 | return; |
1559 | } | 1608 | } |
1560 | 1609 | ||
1561 | if (side == C_SYNC_TARGET) { | 1610 | if (!test_bit(B_RS_H_DONE, &mdev->flags)) { |
1562 | /* Since application IO was locked out during C_WF_BITMAP_T and | 1611 | if (side == C_SYNC_TARGET) { |
1563 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET | 1612 | /* Since application IO was locked out during C_WF_BITMAP_T and |
1564 | we check that we might make the data inconsistent. */ | 1613 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET |
1565 | r = drbd_khelper(mdev, "before-resync-target"); | 1614 | we check that we might make the data inconsistent. */ |
1566 | r = (r >> 8) & 0xff; | 1615 | r = drbd_khelper(mdev, "before-resync-target"); |
1567 | if (r > 0) { | 1616 | r = (r >> 8) & 0xff; |
1568 | dev_info(DEV, "before-resync-target handler returned %d, " | 1617 | if (r > 0) { |
1569 | "dropping connection.\n", r); | 1618 | dev_info(DEV, "before-resync-target handler returned %d, " |
1570 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1571 | return; | ||
1572 | } | ||
1573 | } else /* C_SYNC_SOURCE */ { | ||
1574 | r = drbd_khelper(mdev, "before-resync-source"); | ||
1575 | r = (r >> 8) & 0xff; | ||
1576 | if (r > 0) { | ||
1577 | if (r == 3) { | ||
1578 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1579 | "ignoring. Old userland tools?", r); | ||
1580 | } else { | ||
1581 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1582 | "dropping connection.\n", r); | 1619 | "dropping connection.\n", r); |
1583 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 1620 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
1584 | return; | 1621 | return; |
1585 | } | 1622 | } |
1623 | } else /* C_SYNC_SOURCE */ { | ||
1624 | r = drbd_khelper(mdev, "before-resync-source"); | ||
1625 | r = (r >> 8) & 0xff; | ||
1626 | if (r > 0) { | ||
1627 | if (r == 3) { | ||
1628 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1629 | "ignoring. Old userland tools?", r); | ||
1630 | } else { | ||
1631 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1632 | "dropping connection.\n", r); | ||
1633 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
1634 | return; | ||
1635 | } | ||
1636 | } | ||
1586 | } | 1637 | } |
1587 | } | 1638 | } |
1588 | 1639 | ||
1589 | drbd_state_lock(mdev); | 1640 | if (current == mdev->tconn->worker.task) { |
1641 | /* The worker should not sleep waiting for state_mutex, | ||
1642 | that can take long */ | ||
1643 | if (!mutex_trylock(mdev->state_mutex)) { | ||
1644 | set_bit(B_RS_H_DONE, &mdev->flags); | ||
1645 | mdev->start_resync_timer.expires = jiffies + HZ/5; | ||
1646 | add_timer(&mdev->start_resync_timer); | ||
1647 | return; | ||
1648 | } | ||
1649 | } else { | ||
1650 | mutex_lock(mdev->state_mutex); | ||
1651 | } | ||
1652 | clear_bit(B_RS_H_DONE, &mdev->flags); | ||
1653 | |||
1590 | write_lock_irq(&global_state_lock); | 1654 | write_lock_irq(&global_state_lock); |
1591 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | 1655 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { |
1592 | write_unlock_irq(&global_state_lock); | 1656 | write_unlock_irq(&global_state_lock); |
1593 | drbd_state_unlock(mdev); | 1657 | mutex_unlock(mdev->state_mutex); |
1594 | return; | 1658 | return; |
1595 | } | 1659 | } |
1596 | 1660 | ||
1597 | ns.i = mdev->state.i; | 1661 | ns = drbd_read_state(mdev); |
1598 | 1662 | ||
1599 | ns.aftr_isp = !_drbd_may_sync_now(mdev); | 1663 | ns.aftr_isp = !_drbd_may_sync_now(mdev); |
1600 | 1664 | ||
@@ -1606,7 +1670,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1606 | ns.pdsk = D_INCONSISTENT; | 1670 | ns.pdsk = D_INCONSISTENT; |
1607 | 1671 | ||
1608 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 1672 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
1609 | ns = mdev->state; | 1673 | ns = drbd_read_state(mdev); |
1610 | 1674 | ||
1611 | if (ns.conn < C_CONNECTED) | 1675 | if (ns.conn < C_CONNECTED) |
1612 | r = SS_UNKNOWN_ERROR; | 1676 | r = SS_UNKNOWN_ERROR; |
@@ -1632,6 +1696,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1632 | write_unlock_irq(&global_state_lock); | 1696 | write_unlock_irq(&global_state_lock); |
1633 | 1697 | ||
1634 | if (r == SS_SUCCESS) { | 1698 | if (r == SS_SUCCESS) { |
1699 | /* reset rs_last_bcast when a resync or verify is started, | ||
1700 | * to deal with potential jiffies wrap. */ | ||
1701 | mdev->rs_last_bcast = jiffies - HZ; | ||
1702 | |||
1635 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", | 1703 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", |
1636 | drbd_conn_str(ns.conn), | 1704 | drbd_conn_str(ns.conn), |
1637 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | 1705 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), |
@@ -1646,10 +1714,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1646 | * drbd_resync_finished from here in that case. | 1714 | * drbd_resync_finished from here in that case. |
1647 | * We drbd_gen_and_send_sync_uuid here for protocol < 96, | 1715 | * We drbd_gen_and_send_sync_uuid here for protocol < 96, |
1648 | * and from after_state_ch otherwise. */ | 1716 | * and from after_state_ch otherwise. */ |
1649 | if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96) | 1717 | if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96) |
1650 | drbd_gen_and_send_sync_uuid(mdev); | 1718 | drbd_gen_and_send_sync_uuid(mdev); |
1651 | 1719 | ||
1652 | if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { | 1720 | if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) { |
1653 | /* This still has a race (about when exactly the peers | 1721 | /* This still has a race (about when exactly the peers |
1654 | * detect connection loss) that can lead to a full sync | 1722 | * detect connection loss) that can lead to a full sync |
1655 | * on next handshake. In 8.3.9 we fixed this with explicit | 1723 | * on next handshake. In 8.3.9 we fixed this with explicit |
@@ -1660,10 +1728,16 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1660 | * detect connection loss, then waiting for a ping | 1728 | * detect connection loss, then waiting for a ping |
1661 | * response (implicit in drbd_resync_finished) reduces | 1729 | * response (implicit in drbd_resync_finished) reduces |
1662 | * the race considerably, but does not solve it. */ | 1730 | * the race considerably, but does not solve it. */ |
1663 | if (side == C_SYNC_SOURCE) | 1731 | if (side == C_SYNC_SOURCE) { |
1664 | schedule_timeout_interruptible( | 1732 | struct net_conf *nc; |
1665 | mdev->net_conf->ping_int * HZ + | 1733 | int timeo; |
1666 | mdev->net_conf->ping_timeo*HZ/9); | 1734 | |
1735 | rcu_read_lock(); | ||
1736 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
1737 | timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; | ||
1738 | rcu_read_unlock(); | ||
1739 | schedule_timeout_interruptible(timeo); | ||
1740 | } | ||
1667 | drbd_resync_finished(mdev); | 1741 | drbd_resync_finished(mdev); |
1668 | } | 1742 | } |
1669 | 1743 | ||
@@ -1678,114 +1752,180 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1678 | drbd_md_sync(mdev); | 1752 | drbd_md_sync(mdev); |
1679 | } | 1753 | } |
1680 | put_ldev(mdev); | 1754 | put_ldev(mdev); |
1681 | drbd_state_unlock(mdev); | 1755 | mutex_unlock(mdev->state_mutex); |
1682 | } | 1756 | } |
1683 | 1757 | ||
1684 | int drbd_worker(struct drbd_thread *thi) | 1758 | /* If the resource already closed the current epoch, but we did not |
1759 | * (because we have not yet seen new requests), we should send the | ||
1760 | * corresponding barrier now. Must be checked within the same spinlock | ||
1761 | * that is used to check for new requests. */ | ||
1762 | bool need_to_send_barrier(struct drbd_tconn *connection) | ||
1685 | { | 1763 | { |
1686 | struct drbd_conf *mdev = thi->mdev; | 1764 | if (!connection->send.seen_any_write_yet) |
1687 | struct drbd_work *w = NULL; | 1765 | return false; |
1688 | LIST_HEAD(work_list); | 1766 | |
1689 | int intr = 0, i; | 1767 | /* Skip barriers that do not contain any writes. |
1768 | * This may happen during AHEAD mode. */ | ||
1769 | if (!connection->send.current_epoch_writes) | ||
1770 | return false; | ||
1771 | |||
1772 | /* ->req_lock is held when requests are queued on | ||
1773 | * connection->sender_work, and put into ->transfer_log. | ||
1774 | * It is also held when ->current_tle_nr is increased. | ||
1775 | * So either there are already new requests queued, | ||
1776 | * and corresponding barriers will be sent there. | ||
1777 | * Or nothing new is queued yet, so the difference will be 1. | ||
1778 | */ | ||
1779 | if (atomic_read(&connection->current_tle_nr) != | ||
1780 | connection->send.current_epoch_nr + 1) | ||
1781 | return false; | ||
1782 | |||
1783 | return true; | ||
1784 | } | ||
1690 | 1785 | ||
1691 | sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); | 1786 | bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) |
1787 | { | ||
1788 | spin_lock_irq(&queue->q_lock); | ||
1789 | list_splice_init(&queue->q, work_list); | ||
1790 | spin_unlock_irq(&queue->q_lock); | ||
1791 | return !list_empty(work_list); | ||
1792 | } | ||
1793 | |||
1794 | bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list) | ||
1795 | { | ||
1796 | spin_lock_irq(&queue->q_lock); | ||
1797 | if (!list_empty(&queue->q)) | ||
1798 | list_move(queue->q.next, work_list); | ||
1799 | spin_unlock_irq(&queue->q_lock); | ||
1800 | return !list_empty(work_list); | ||
1801 | } | ||
1692 | 1802 | ||
1693 | while (get_t_state(thi) == Running) { | 1803 | void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list) |
1694 | drbd_thread_current_set_cpu(mdev); | 1804 | { |
1805 | DEFINE_WAIT(wait); | ||
1806 | struct net_conf *nc; | ||
1807 | int uncork, cork; | ||
1695 | 1808 | ||
1696 | if (down_trylock(&mdev->data.work.s)) { | 1809 | dequeue_work_item(&connection->sender_work, work_list); |
1697 | mutex_lock(&mdev->data.mutex); | 1810 | if (!list_empty(work_list)) |
1698 | if (mdev->data.socket && !mdev->net_conf->no_cork) | 1811 | return; |
1699 | drbd_tcp_uncork(mdev->data.socket); | ||
1700 | mutex_unlock(&mdev->data.mutex); | ||
1701 | 1812 | ||
1702 | intr = down_interruptible(&mdev->data.work.s); | 1813 | /* Still nothing to do? |
1814 | * Maybe we still need to close the current epoch, | ||
1815 | * even if no new requests are queued yet. | ||
1816 | * | ||
1817 | * Also, poke TCP, just in case. | ||
1818 | * Then wait for new work (or signal). */ | ||
1819 | rcu_read_lock(); | ||
1820 | nc = rcu_dereference(connection->net_conf); | ||
1821 | uncork = nc ? nc->tcp_cork : 0; | ||
1822 | rcu_read_unlock(); | ||
1823 | if (uncork) { | ||
1824 | mutex_lock(&connection->data.mutex); | ||
1825 | if (connection->data.socket) | ||
1826 | drbd_tcp_uncork(connection->data.socket); | ||
1827 | mutex_unlock(&connection->data.mutex); | ||
1828 | } | ||
1703 | 1829 | ||
1704 | mutex_lock(&mdev->data.mutex); | 1830 | for (;;) { |
1705 | if (mdev->data.socket && !mdev->net_conf->no_cork) | 1831 | int send_barrier; |
1706 | drbd_tcp_cork(mdev->data.socket); | 1832 | prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); |
1707 | mutex_unlock(&mdev->data.mutex); | 1833 | spin_lock_irq(&connection->req_lock); |
1834 | spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ | ||
1835 | /* dequeue single item only, | ||
1836 | * we still use drbd_queue_work_front() in some places */ | ||
1837 | if (!list_empty(&connection->sender_work.q)) | ||
1838 | list_move(connection->sender_work.q.next, work_list); | ||
1839 | spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ | ||
1840 | if (!list_empty(work_list) || signal_pending(current)) { | ||
1841 | spin_unlock_irq(&connection->req_lock); | ||
1842 | break; | ||
1843 | } | ||
1844 | send_barrier = need_to_send_barrier(connection); | ||
1845 | spin_unlock_irq(&connection->req_lock); | ||
1846 | if (send_barrier) { | ||
1847 | drbd_send_barrier(connection); | ||
1848 | connection->send.current_epoch_nr++; | ||
1708 | } | 1849 | } |
1850 | schedule(); | ||
1851 | /* may be woken up for other things but new work, too, | ||
1852 | * e.g. if the current epoch got closed. | ||
1853 | * In which case we send the barrier above. */ | ||
1854 | } | ||
1855 | finish_wait(&connection->sender_work.q_wait, &wait); | ||
1856 | |||
1857 | /* someone may have changed the config while we have been waiting above. */ | ||
1858 | rcu_read_lock(); | ||
1859 | nc = rcu_dereference(connection->net_conf); | ||
1860 | cork = nc ? nc->tcp_cork : 0; | ||
1861 | rcu_read_unlock(); | ||
1862 | mutex_lock(&connection->data.mutex); | ||
1863 | if (connection->data.socket) { | ||
1864 | if (cork) | ||
1865 | drbd_tcp_cork(connection->data.socket); | ||
1866 | else if (!uncork) | ||
1867 | drbd_tcp_uncork(connection->data.socket); | ||
1868 | } | ||
1869 | mutex_unlock(&connection->data.mutex); | ||
1870 | } | ||
1871 | |||
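The cork/uncork handling in wait_for_work() follows a common batching pattern: keep TCP_CORK set while a batch of packets is being serialized, drop it before going idle so queued bytes are flushed. A hedged userspace sketch of that pattern (plain BSD sockets, names are illustrative, not part of this commit):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Toggle TCP_CORK: corking batches small writes into full segments,
 * uncorking flushes whatever is still queued in the kernel. */
static int set_cork(int fd, int on)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}

/* Sketch of the send-loop discipline used above: cork while one batch
 * of work items is being sent, uncork before waiting for more work. */
void send_batch(int fd, const void *buf[], const size_t len[], int n)
{
	int i;

	set_cork(fd, 1);
	for (i = 0; i < n; i++)
		send(fd, buf[i], len[i], 0);
	set_cork(fd, 0);
}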
1872 | int drbd_worker(struct drbd_thread *thi) | ||
1873 | { | ||
1874 | struct drbd_tconn *tconn = thi->tconn; | ||
1875 | struct drbd_work *w = NULL; | ||
1876 | struct drbd_conf *mdev; | ||
1877 | LIST_HEAD(work_list); | ||
1878 | int vnr; | ||
1879 | |||
1880 | while (get_t_state(thi) == RUNNING) { | ||
1881 | drbd_thread_current_set_cpu(thi); | ||
1709 | 1882 | ||
1710 | if (intr) { | 1883 | /* as long as we use drbd_queue_work_front(), |
1711 | D_ASSERT(intr == -EINTR); | 1884 | * we may only dequeue single work items here, not batches. */ |
1885 | if (list_empty(&work_list)) | ||
1886 | wait_for_work(tconn, &work_list); | ||
1887 | |||
1888 | if (signal_pending(current)) { | ||
1712 | flush_signals(current); | 1889 | flush_signals(current); |
1713 | ERR_IF (get_t_state(thi) == Running) | 1890 | if (get_t_state(thi) == RUNNING) { |
1891 | conn_warn(tconn, "Worker got an unexpected signal\n"); | ||
1714 | continue; | 1892 | continue; |
1893 | } | ||
1715 | break; | 1894 | break; |
1716 | } | 1895 | } |
1717 | 1896 | ||
1718 | if (get_t_state(thi) != Running) | 1897 | if (get_t_state(thi) != RUNNING) |
1719 | break; | 1898 | break; |
1720 | /* With this break, we have done a down() but not consumed | 1899 | |
1721 | the entry from the list. The cleanup code takes care of | 1900 | while (!list_empty(&work_list)) { |
1722 | this... */ | 1901 | w = list_first_entry(&work_list, struct drbd_work, list); |
1723 | 1902 | list_del_init(&w->list); | |
1724 | w = NULL; | 1903 | if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0) |
1725 | spin_lock_irq(&mdev->data.work.q_lock); | 1904 | continue; |
1726 | ERR_IF(list_empty(&mdev->data.work.q)) { | 1905 | if (tconn->cstate >= C_WF_REPORT_PARAMS) |
1727 | /* something terribly wrong in our logic. | 1906 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
1728 | * we were able to down() the semaphore, | ||
1729 | * but the list is empty... doh. | ||
1730 | * | ||
1731 | * what is the best thing to do now? | ||
1732 | * try again from scratch, restarting the receiver, | ||
1733 | * asender, whatnot? could break even more ugly, | ||
1734 | * e.g. when we are primary, but no good local data. | ||
1735 | * | ||
1736 | * I'll try to get away just starting over this loop. | ||
1737 | */ | ||
1738 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1739 | continue; | ||
1740 | } | ||
1741 | w = list_entry(mdev->data.work.q.next, struct drbd_work, list); | ||
1742 | list_del_init(&w->list); | ||
1743 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1744 | |||
1745 | if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { | ||
1746 | /* dev_warn(DEV, "worker: a callback failed! \n"); */ | ||
1747 | if (mdev->state.conn >= C_CONNECTED) | ||
1748 | drbd_force_state(mdev, | ||
1749 | NS(conn, C_NETWORK_FAILURE)); | ||
1750 | } | 1907 | } |
1751 | } | 1908 | } |
1752 | D_ASSERT(drbd_test_flag(mdev, DEVICE_DYING)); | ||
1753 | D_ASSERT(drbd_test_flag(mdev, CONFIG_PENDING)); | ||
1754 | |||
1755 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1756 | i = 0; | ||
1757 | while (!list_empty(&mdev->data.work.q)) { | ||
1758 | list_splice_init(&mdev->data.work.q, &work_list); | ||
1759 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1760 | 1909 | ||
1910 | do { | ||
1761 | while (!list_empty(&work_list)) { | 1911 | while (!list_empty(&work_list)) { |
1762 | w = list_entry(work_list.next, struct drbd_work, list); | 1912 | w = list_first_entry(&work_list, struct drbd_work, list); |
1763 | list_del_init(&w->list); | 1913 | list_del_init(&w->list); |
1764 | w->cb(mdev, w, 1); | 1914 | w->cb(w, 1); |
1765 | i++; /* dead debugging code */ | ||
1766 | } | 1915 | } |
1767 | 1916 | dequeue_work_batch(&tconn->sender_work, &work_list); | |
1768 | spin_lock_irq(&mdev->data.work.q_lock); | 1917 | } while (!list_empty(&work_list)); |
1918 | |||
1919 | rcu_read_lock(); | ||
1920 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1921 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1922 | kref_get(&mdev->kref); | ||
1923 | rcu_read_unlock(); | ||
1924 | drbd_mdev_cleanup(mdev); | ||
1925 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1926 | rcu_read_lock(); | ||
1769 | } | 1927 | } |
1770 | sema_init(&mdev->data.work.s, 0); | 1928 | rcu_read_unlock(); |
1771 | /* DANGEROUS race: if someone did queue his work within the spinlock, | ||
1772 | * but up() ed outside the spinlock, we could get an up() on the | ||
1773 | * semaphore without corresponding list entry. | ||
1774 | * So don't do that. | ||
1775 | */ | ||
1776 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1777 | |||
1778 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1779 | /* _drbd_set_state only uses stop_nowait. | ||
1780 | * wait here for the Exiting receiver. */ | ||
1781 | drbd_thread_stop(&mdev->receiver); | ||
1782 | drbd_mdev_cleanup(mdev); | ||
1783 | |||
1784 | dev_info(DEV, "worker terminated\n"); | ||
1785 | |||
1786 | drbd_clear_flag(mdev, DEVICE_DYING); | ||
1787 | drbd_clear_flag(mdev, CONFIG_PENDING); | ||
1788 | wake_up(&mdev->state_wait); | ||
1789 | 1929 | ||
1790 | return 0; | 1930 | return 0; |
1791 | } | 1931 | } |
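As an aside, the reworked worker loop above drains its queue in batches: entries are spliced onto a private list while the queue lock is held, and the callbacks then run without the lock. The sketch below shows only that splice-then-run structure; the type and helper names are hypothetical, not the actual drbd_work/dequeue_work_batch definitions.

/* minimal kernel-style sketch of the batch-dequeue pattern */
#include <linux/list.h>
#include <linux/spinlock.h>

struct work_item {
	struct list_head list;
	int (*cb)(struct work_item *w, int cancel);
};

struct work_queue {
	spinlock_t lock;
	struct list_head q;
};

/* move everything queued so far onto a caller-owned list */
static void dequeue_batch(struct work_queue *wq, struct list_head *batch)
{
	spin_lock_irq(&wq->lock);
	list_splice_init(&wq->q, batch);
	spin_unlock_irq(&wq->lock);
}

/* run the callbacks without holding the queue lock */
static void run_batch(struct list_head *batch, int cancel)
{
	while (!list_empty(batch)) {
		struct work_item *w =
			list_first_entry(batch, struct work_item, list);
		list_del_init(&w->list);
		w->cb(w, cancel);
	}
}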
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index 151f1a37478f..328f18e4b4ee 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/ctype.h> | 4 | #include <linux/ctype.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include "drbd_int.h" | ||
6 | 7 | ||
7 | /* see get_sb_bdev and bd_claim */ | 8 | /* see get_sb_bdev and bd_claim */ |
8 | extern char *drbd_sec_holder; | 9 | extern char *drbd_sec_holder; |
@@ -20,8 +21,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | |||
20 | 21 | ||
21 | /* bi_end_io handlers */ | 22 | /* bi_end_io handlers */ |
22 | extern void drbd_md_io_complete(struct bio *bio, int error); | 23 | extern void drbd_md_io_complete(struct bio *bio, int error); |
23 | extern void drbd_endio_sec(struct bio *bio, int error); | 24 | extern void drbd_peer_request_endio(struct bio *bio, int error); |
24 | extern void drbd_endio_pri(struct bio *bio, int error); | 25 | extern void drbd_request_endio(struct bio *bio, int error); |
25 | 26 | ||
26 | /* | 27 | /* |
27 | * used to submit our private bio | 28 | * used to submit our private bio |
@@ -45,12 +46,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev, | |||
45 | generic_make_request(bio); | 46 | generic_make_request(bio); |
46 | } | 47 | } |
47 | 48 | ||
48 | static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) | ||
49 | { | ||
50 | return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) | ||
51 | == CRYPTO_ALG_TYPE_HASH; | ||
52 | } | ||
53 | |||
54 | #ifndef __CHECKER__ | 49 | #ifndef __CHECKER__ |
55 | # undef __cond_lock | 50 | # undef __cond_lock |
56 | # define __cond_lock(x,c) (c) | 51 | # define __cond_lock(x,c) (c) |
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 94f58a102bbb..0c5a18ec322c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h | |||
@@ -51,12 +51,11 @@ | |||
51 | 51 | ||
52 | #endif | 52 | #endif |
53 | 53 | ||
54 | |||
55 | extern const char *drbd_buildtag(void); | 54 | extern const char *drbd_buildtag(void); |
56 | #define REL_VERSION "8.3.14" | 55 | #define REL_VERSION "8.4.2" |
57 | #define API_VERSION 88 | 56 | #define API_VERSION 1 |
58 | #define PRO_VERSION_MIN 86 | 57 | #define PRO_VERSION_MIN 86 |
59 | #define PRO_VERSION_MAX 97 | 58 | #define PRO_VERSION_MAX 101 |
60 | 59 | ||
61 | 60 | ||
62 | enum drbd_io_error_p { | 61 | enum drbd_io_error_p { |
@@ -66,7 +65,8 @@ enum drbd_io_error_p { | |||
66 | }; | 65 | }; |
67 | 66 | ||
68 | enum drbd_fencing_p { | 67 | enum drbd_fencing_p { |
69 | FP_DONT_CARE, | 68 | FP_NOT_AVAIL = -1, /* Not a policy */ |
69 | FP_DONT_CARE = 0, | ||
70 | FP_RESOURCE, | 70 | FP_RESOURCE, |
71 | FP_STONITH | 71 | FP_STONITH |
72 | }; | 72 | }; |
@@ -102,6 +102,20 @@ enum drbd_on_congestion { | |||
102 | OC_DISCONNECT, | 102 | OC_DISCONNECT, |
103 | }; | 103 | }; |
104 | 104 | ||
105 | enum drbd_read_balancing { | ||
106 | RB_PREFER_LOCAL, | ||
107 | RB_PREFER_REMOTE, | ||
108 | RB_ROUND_ROBIN, | ||
109 | RB_LEAST_PENDING, | ||
110 | RB_CONGESTED_REMOTE, | ||
111 | RB_32K_STRIPING, | ||
112 | RB_64K_STRIPING, | ||
113 | RB_128K_STRIPING, | ||
114 | RB_256K_STRIPING, | ||
115 | RB_512K_STRIPING, | ||
116 | RB_1M_STRIPING, | ||
117 | }; | ||
118 | |||
105 | /* KEEP the order, do not delete or insert. Only append. */ | 119 | /* KEEP the order, do not delete or insert. Only append. */ |
106 | enum drbd_ret_code { | 120 | enum drbd_ret_code { |
107 | ERR_CODE_BASE = 100, | 121 | ERR_CODE_BASE = 100, |
@@ -122,7 +136,7 @@ enum drbd_ret_code { | |||
122 | ERR_AUTH_ALG = 120, | 136 | ERR_AUTH_ALG = 120, |
123 | ERR_AUTH_ALG_ND = 121, | 137 | ERR_AUTH_ALG_ND = 121, |
124 | ERR_NOMEM = 122, | 138 | ERR_NOMEM = 122, |
125 | ERR_DISCARD = 123, | 139 | ERR_DISCARD_IMPOSSIBLE = 123, |
126 | ERR_DISK_CONFIGURED = 124, | 140 | ERR_DISK_CONFIGURED = 124, |
127 | ERR_NET_CONFIGURED = 125, | 141 | ERR_NET_CONFIGURED = 125, |
128 | ERR_MANDATORY_TAG = 126, | 142 | ERR_MANDATORY_TAG = 126, |
@@ -130,8 +144,8 @@ enum drbd_ret_code { | |||
130 | ERR_INTR = 129, /* EINTR */ | 144 | ERR_INTR = 129, /* EINTR */ |
131 | ERR_RESIZE_RESYNC = 130, | 145 | ERR_RESIZE_RESYNC = 130, |
132 | ERR_NO_PRIMARY = 131, | 146 | ERR_NO_PRIMARY = 131, |
133 | ERR_SYNC_AFTER = 132, | 147 | ERR_RESYNC_AFTER = 132, |
134 | ERR_SYNC_AFTER_CYCLE = 133, | 148 | ERR_RESYNC_AFTER_CYCLE = 133, |
135 | ERR_PAUSE_IS_SET = 134, | 149 | ERR_PAUSE_IS_SET = 134, |
136 | ERR_PAUSE_IS_CLEAR = 135, | 150 | ERR_PAUSE_IS_CLEAR = 135, |
137 | ERR_PACKET_NR = 137, | 151 | ERR_PACKET_NR = 137, |
@@ -155,6 +169,14 @@ enum drbd_ret_code { | |||
155 | ERR_CONG_NOT_PROTO_A = 155, | 169 | ERR_CONG_NOT_PROTO_A = 155, |
156 | ERR_PIC_AFTER_DEP = 156, | 170 | ERR_PIC_AFTER_DEP = 156, |
157 | ERR_PIC_PEER_DEP = 157, | 171 | ERR_PIC_PEER_DEP = 157, |
172 | ERR_RES_NOT_KNOWN = 158, | ||
173 | ERR_RES_IN_USE = 159, | ||
174 | ERR_MINOR_CONFIGURED = 160, | ||
175 | ERR_MINOR_EXISTS = 161, | ||
176 | ERR_INVALID_REQUEST = 162, | ||
177 | ERR_NEED_APV_100 = 163, | ||
178 | ERR_NEED_ALLOW_TWO_PRI = 164, | ||
179 | ERR_MD_UNCLEAN = 165, | ||
158 | 180 | ||
159 | /* insert new ones above this line */ | 181 | /* insert new ones above this line */ |
160 | AFTER_LAST_ERR_CODE | 182 | AFTER_LAST_ERR_CODE |
@@ -296,7 +318,8 @@ enum drbd_state_rv { | |||
296 | SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ | 318 | SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ |
297 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ | 319 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ |
298 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ | 320 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ |
299 | SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ | 321 | SS_O_VOL_PEER_PRI = -20, |
322 | SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ | ||
300 | }; | 323 | }; |
301 | 324 | ||
302 | /* from drbd_strings.c */ | 325 | /* from drbd_strings.c */ |
@@ -313,7 +336,9 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv); | |||
313 | #define MDF_FULL_SYNC (1 << 3) | 336 | #define MDF_FULL_SYNC (1 << 3) |
314 | #define MDF_WAS_UP_TO_DATE (1 << 4) | 337 | #define MDF_WAS_UP_TO_DATE (1 << 4) |
315 | #define MDF_PEER_OUT_DATED (1 << 5) | 338 | #define MDF_PEER_OUT_DATED (1 << 5) |
316 | #define MDF_CRASHED_PRIMARY (1 << 6) | 339 | #define MDF_CRASHED_PRIMARY (1 << 6) |
340 | #define MDF_AL_CLEAN (1 << 7) | ||
341 | #define MDF_AL_DISABLED (1 << 8) | ||
317 | 342 | ||
318 | enum drbd_uuid_index { | 343 | enum drbd_uuid_index { |
319 | UI_CURRENT, | 344 | UI_CURRENT, |
@@ -333,37 +358,23 @@ enum drbd_timeout_flag { | |||
333 | 358 | ||
334 | #define UUID_JUST_CREATED ((__u64)4) | 359 | #define UUID_JUST_CREATED ((__u64)4) |
335 | 360 | ||
361 | /* magic numbers used in meta data and network packets */ | ||
336 | #define DRBD_MAGIC 0x83740267 | 362 | #define DRBD_MAGIC 0x83740267 |
337 | #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) | ||
338 | #define DRBD_MAGIC_BIG 0x835a | 363 | #define DRBD_MAGIC_BIG 0x835a |
339 | #define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) | 364 | #define DRBD_MAGIC_100 0x8620ec20 |
365 | |||
366 | #define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3) | ||
367 | #define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4) | ||
368 | #define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5) | ||
369 | |||
370 | |||
371 | /* how I came up with this magic? | ||
372 | * base64 decode "actlog==" ;) */ | ||
373 | #define DRBD_AL_MAGIC 0x69cb65a2 | ||
340 | 374 | ||
341 | /* these are of type "int" */ | 375 | /* these are of type "int" */ |
342 | #define DRBD_MD_INDEX_INTERNAL -1 | 376 | #define DRBD_MD_INDEX_INTERNAL -1 |
343 | #define DRBD_MD_INDEX_FLEX_EXT -2 | 377 | #define DRBD_MD_INDEX_FLEX_EXT -2 |
344 | #define DRBD_MD_INDEX_FLEX_INT -3 | 378 | #define DRBD_MD_INDEX_FLEX_INT -3 |
345 | 379 | ||
346 | /* Start of the new netlink/connector stuff */ | ||
347 | |||
348 | #define DRBD_NL_CREATE_DEVICE 0x01 | ||
349 | #define DRBD_NL_SET_DEFAULTS 0x02 | ||
350 | |||
351 | |||
352 | /* For searching a vacant cn_idx value */ | ||
353 | #define CN_IDX_STEP 6977 | ||
354 | |||
355 | struct drbd_nl_cfg_req { | ||
356 | int packet_type; | ||
357 | unsigned int drbd_minor; | ||
358 | int flags; | ||
359 | unsigned short tag_list[]; | ||
360 | }; | ||
361 | |||
362 | struct drbd_nl_cfg_reply { | ||
363 | int packet_type; | ||
364 | unsigned int minor; | ||
365 | int ret_code; /* enum ret_code or set_st_err_t */ | ||
366 | unsigned short tag_list[]; /* only used with get_* calls */ | ||
367 | }; | ||
368 | |||
369 | #endif | 380 | #endif |
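The "base64 decode 'actlog==' ;)" remark above checks out: the base64 alphabet maps 'a','c','t','l','o','g' to the 6-bit values 26, 28, 45, 37, 40, 32, and packing those bits and keeping the top 32 reproduces DRBD_AL_MAGIC. A tiny illustrative userspace program verifying it:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	static const uint32_t sextets[6] = { 26, 28, 45, 37, 40, 32 };
	uint64_t acc = 0;
	int i;

	for (i = 0; i < 6; i++)
		acc = (acc << 6) | sextets[i];      /* 36 bits total */

	assert((uint32_t)(acc >> 4) == 0x69cb65a2); /* == DRBD_AL_MAGIC */
	return 0;
}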
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h new file mode 100644 index 000000000000..d0d8fac8a6e4 --- /dev/null +++ b/include/linux/drbd_genl.h | |||
@@ -0,0 +1,378 @@ | |||
1 | /* | ||
2 | * General overview: | ||
3 | * full generic netlink message: | ||
4 | * |nlmsghdr|genlmsghdr|<payload> | ||
5 | * | ||
6 | * payload: | ||
7 | * |optional fixed size family header|<sequence of netlink attributes> | ||
8 | * | ||
9 | * sequence of netlink attributes: | ||
10 | * I chose to have all "top level" attributes NLA_NESTED, | ||
11 | * corresponding to some real struct. | ||
12 | * So we have a sequence of |tla, len|<nested nla sequence> | ||
13 | * | ||
14 | * nested nla sequence: | ||
15 | * may be empty, or contain a sequence of netlink attributes | ||
16 | * representing the struct fields. | ||
17 | * | ||
18 | * The tag number of any field (regardless of containing struct) | ||
19 | * will be available as T_ ## field_name, | ||
20 | * so you cannot have the same field name in two different structs. | ||
21 | * | ||
22 | * The tag numbers themselves are per struct, though, | ||
23 | * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type, | ||
24 | * which we won't use here). | ||
25 | * The tag numbers are used as index in the respective nla_policy array. | ||
26 | * | ||
27 | * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy | ||
28 | * genl_magic_struct.h | ||
29 | * generates the struct declaration, | ||
30 | * generates an entry in the tla enum, | ||
31 | * genl_magic_func.h | ||
32 | * generates an entry in the static tla policy | ||
33 | * with .type = NLA_NESTED | ||
34 | * generates the static <struct_name>_nl_policy definition, | ||
35 | * and static conversion functions | ||
36 | * | ||
37 | * genl_magic_func.h | ||
38 | * | ||
39 | * GENL_mc_group(group) | ||
40 | * genl_magic_struct.h | ||
41 | * does nothing | ||
42 | * genl_magic_func.h | ||
43 | * defines and registers the mcast group, | ||
44 | * and provides a send helper | ||
45 | * | ||
46 | * GENL_notification(op_name, op_num, mcast_group, tla list) | ||
47 | * These are notifications to userspace. | ||
48 | * | ||
49 | * genl_magic_struct.h | ||
50 | * generates an entry in the genl_ops enum, | ||
51 | * genl_magic_func.h | ||
52 | * does nothing | ||
53 | * | ||
54 | * mcast group: the name of the mcast group this notification should be | ||
55 | * expected on | ||
56 | * tla list: the list of expected top level attributes, | ||
57 | * for documentation and sanity checking. | ||
58 | * | ||
59 | * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations" | ||
60 | * These are requests from userspace. | ||
61 | * | ||
62 | * _op and _notification share the same "number space", | ||
63 | * op_nr will be assigned to "genlmsghdr->cmd" | ||
64 | * | ||
65 | * genl_magic_struct.h | ||
66 | * generates an entry in the genl_ops enum, | ||
67 | * genl_magic_func.h | ||
68 | * generates an entry in the static genl_ops array, | ||
69 | * and static register/unregister functions to | ||
70 | * genl_register_family_with_ops(). | ||
71 | * | ||
72 | * flags and handler: | ||
73 | * GENL_op_init( .doit = x, .dumpit = y, .flags = something) | ||
74 | * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM | ||
75 | * tla list: the list of expected top level attributes, | ||
76 | * for documentation and sanity checking. | ||
77 | */ | ||
78 | |||
79 | /* | ||
80 | * STRUCTS | ||
81 | */ | ||
82 | |||
83 | /* this is sent kernel -> userland on various error conditions, and contains | ||
84 | * informational textual info, which is supposedly human readable. | ||
85 | * The computer relevant return code is in the drbd_genlmsghdr. | ||
86 | */ | ||
87 | GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, | ||
88 | /* "arbitrary" size strings, nla_policy.len = 0 */ | ||
89 | __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) | ||
90 | ) | ||
91 | |||
92 | /* Configuration requests typically need a context to operate on. | ||
93 | * Possible keys are device minor (fits in the drbd_genlmsghdr), | ||
94 | * the replication link (aka connection) name, | ||
95 | * and/or the replication group (aka resource) name, | ||
96 | * and the volume id within the resource. */ | ||
97 | GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, | ||
98 | __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) | ||
99 | __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) | ||
100 | __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) | ||
101 | __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) | ||
102 | ) | ||
103 | |||
104 | GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, | ||
105 | __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128) | ||
106 | __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128) | ||
107 | __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) | ||
108 | |||
109 | /* use the resize command to try and change the disk_size */ | ||
110 | __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) | ||
111 | /* we could change the max_bio_bvecs, | ||
112 | * but it won't propagate through the stack */ | ||
113 | __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) | ||
114 | |||
115 | __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) | ||
116 | __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) | ||
117 | |||
118 | __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) | ||
119 | __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) | ||
120 | __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) | ||
121 | __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) | ||
122 | __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) | ||
123 | __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) | ||
124 | __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) | ||
125 | __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) | ||
126 | |||
127 | __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) | ||
128 | __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) | ||
129 | __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) | ||
130 | __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) | ||
131 | __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) | ||
132 | __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) | ||
133 | /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ | ||
134 | __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) | ||
135 | ) | ||
136 | |||
137 | GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, | ||
138 | __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32) | ||
139 | __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) | ||
140 | ) | ||
141 | |||
142 | GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, | ||
143 | __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, | ||
144 | shared_secret, SHARED_SECRET_MAX) | ||
145 | __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) | ||
146 | __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) | ||
147 | __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) | ||
148 | __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) | ||
149 | __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) | ||
150 | __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) | ||
151 | __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) | ||
152 | __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) | ||
153 | __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) | ||
154 | __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) | ||
155 | __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) | ||
156 | __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) | ||
157 | __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) | ||
158 | __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) | ||
159 | __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) | ||
160 | __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) | ||
161 | __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) | ||
162 | __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) | ||
163 | __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) | ||
164 | __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) | ||
165 | __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) | ||
166 | __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) | ||
167 | __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) | ||
168 | __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) | ||
169 | __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) | ||
170 | __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) | ||
171 | __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) | ||
172 | __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) | ||
173 | /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ | ||
174 | ) | ||
175 | |||
176 | GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, | ||
177 | __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) | ||
178 | ) | ||
179 | |||
180 | GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, | ||
181 | __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) | ||
182 | __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) | ||
183 | __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) | ||
184 | ) | ||
185 | |||
186 | GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, | ||
187 | /* the reason of the broadcast, | ||
188 | * if this is an event triggered broadcast. */ | ||
189 | __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) | ||
190 | __u32_field(2, DRBD_F_REQUIRED, current_state) | ||
191 | __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) | ||
192 | __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) | ||
193 | |||
194 | /* These are for broadcast from after state change work. | ||
195 | * prev_state and new_state are from the moment the state change took | ||
196 | * place, new_state is not necessarily the same as current_state, | ||
197 | * there may have been more state changes since, which will be | ||
198 | * broadcast soon, in their respective after state change work. */ | ||
199 | __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) | ||
200 | __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) | ||
201 | |||
202 | /* if we have a local disk: */ | ||
203 | __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) | ||
204 | __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) | ||
205 | __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) | ||
206 | __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) | ||
207 | /* and in case resync or online verify is active */ | ||
208 | __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) | ||
209 | __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) | ||
210 | |||
211 | /* for pre and post notifications of helper execution */ | ||
212 | __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) | ||
213 | __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) | ||
214 | |||
215 | __u64_field(15, 0, send_cnt) | ||
216 | __u64_field(16, 0, recv_cnt) | ||
217 | __u64_field(17, 0, read_cnt) | ||
218 | __u64_field(18, 0, writ_cnt) | ||
219 | __u64_field(19, 0, al_writ_cnt) | ||
220 | __u64_field(20, 0, bm_writ_cnt) | ||
221 | __u32_field(21, 0, ap_bio_cnt) | ||
222 | __u32_field(22, 0, ap_pending_cnt) | ||
223 | __u32_field(23, 0, rs_pending_cnt) | ||
224 | ) | ||
225 | |||
226 | GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, | ||
227 | __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) | ||
228 | __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) | ||
229 | ) | ||
230 | |||
231 | GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, | ||
232 | __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) | ||
233 | ) | ||
234 | |||
235 | GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, | ||
236 | __u32_field(1, DRBD_F_REQUIRED, timeout_type) | ||
237 | ) | ||
238 | |||
239 | GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, | ||
240 | __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) | ||
241 | ) | ||
242 | |||
243 | GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, | ||
244 | __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) | ||
245 | ) | ||
246 | |||
247 | /* | ||
248 | * Notifications and commands (genlmsghdr->cmd) | ||
249 | */ | ||
250 | GENL_mc_group(events) | ||
251 | |||
252 | /* kernel -> userspace announcement of changes */ | ||
253 | GENL_notification( | ||
254 | DRBD_EVENT, 1, events, | ||
255 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
256 | GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) | ||
257 | GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) | ||
258 | GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) | ||
259 | GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) | ||
260 | ) | ||
261 | |||
262 | /* query kernel for specific or all info */ | ||
263 | GENL_op( | ||
264 | DRBD_ADM_GET_STATUS, 2, | ||
265 | GENL_op_init( | ||
266 | .doit = drbd_adm_get_status, | ||
267 | .dumpit = drbd_adm_get_status_all, | ||
268 | /* anyone may ask for the status, | ||
269 | * it is broadcasted anyways */ | ||
270 | ), | ||
271 | /* To select the object .doit. | ||
272 | * Or a subset of objects in .dumpit. */ | ||
273 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) | ||
274 | ) | ||
275 | |||
276 | /* add DRBD minor devices as volumes to resources */ | ||
277 | GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor), | ||
278 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
279 | GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor), | ||
280 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
281 | |||
282 | /* add or delete resources */ | ||
283 | GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource), | ||
284 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
285 | GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), | ||
286 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
287 | |||
288 | GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, | ||
289 | GENL_doit(drbd_adm_resource_opts), | ||
290 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
291 | GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) | ||
292 | ) | ||
293 | |||
294 | GENL_op( | ||
295 | DRBD_ADM_CONNECT, 10, | ||
296 | GENL_doit(drbd_adm_connect), | ||
297 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
298 | GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) | ||
299 | ) | ||
300 | |||
301 | GENL_op( | ||
302 | DRBD_ADM_CHG_NET_OPTS, 29, | ||
303 | GENL_doit(drbd_adm_net_opts), | ||
304 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
305 | GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) | ||
306 | ) | ||
307 | |||
308 | GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect), | ||
309 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
310 | |||
311 | GENL_op(DRBD_ADM_ATTACH, 12, | ||
312 | GENL_doit(drbd_adm_attach), | ||
313 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
314 | GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED) | ||
315 | ) | ||
316 | |||
317 | GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28, | ||
318 | GENL_doit(drbd_adm_disk_opts), | ||
319 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
320 | GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED) | ||
321 | ) | ||
322 | |||
323 | GENL_op( | ||
324 | DRBD_ADM_RESIZE, 13, | ||
325 | GENL_doit(drbd_adm_resize), | ||
326 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
327 | GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) | ||
328 | ) | ||
329 | |||
330 | GENL_op( | ||
331 | DRBD_ADM_PRIMARY, 14, | ||
332 | GENL_doit(drbd_adm_set_role), | ||
333 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
334 | GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) | ||
335 | ) | ||
336 | |||
337 | GENL_op( | ||
338 | DRBD_ADM_SECONDARY, 15, | ||
339 | GENL_doit(drbd_adm_set_role), | ||
340 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
341 | GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) | ||
342 | ) | ||
343 | |||
344 | GENL_op( | ||
345 | DRBD_ADM_NEW_C_UUID, 16, | ||
346 | GENL_doit(drbd_adm_new_c_uuid), | ||
347 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
348 | GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) | ||
349 | ) | ||
350 | |||
351 | GENL_op( | ||
352 | DRBD_ADM_START_OV, 17, | ||
353 | GENL_doit(drbd_adm_start_ov), | ||
354 | GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) | ||
355 | ) | ||
356 | |||
357 | GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), | ||
358 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
359 | GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) | ||
360 | |||
361 | GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), | ||
362 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
363 | GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer), | ||
364 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
365 | GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync), | ||
366 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
367 | GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync), | ||
368 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
369 | GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io), | ||
370 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
371 | GENL_op(DRBD_ADM_RESUME_IO, 24, GENL_doit(drbd_adm_resume_io), | ||
372 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
373 | GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate), | ||
374 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
375 | GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), | ||
376 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
377 | GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), | ||
378 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
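To make the "General overview" comment at the top of this new header more concrete: each GENL_struct() line is expanded twice, once into a parameter struct and once into a netlink attribute policy. The sketch below takes just the first field of DRBD_NLA_CFG_CONTEXT; the struct-generating pass (genl_magic_struct.h, not part of this diff) is assumed to behave roughly as shown, while the policy shape matches the __field expansion in genl_magic_func.h added further below. Names marked as sketch are hypothetical.

#include <net/netlink.h>

/* first pass, assumed output for __u32_field(1, ..., ctx_volume) */
struct drbd_cfg_context_sketch {        /* hypothetical name */
	__u32 ctx_volume;
	/* remaining fields elided */
};

/* second pass: per-struct attribute policy, indexed by attribute number */
static struct nla_policy drbd_cfg_context_sketch_nl_policy[] = {
	[1] = { .type = NLA_U32 },
	/* remaining attributes elided */
};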
diff --git a/include/linux/drbd_genl_api.h b/include/linux/drbd_genl_api.h new file mode 100644 index 000000000000..9ef50d51e34e --- /dev/null +++ b/include/linux/drbd_genl_api.h | |||
@@ -0,0 +1,55 @@ | |||
1 | #ifndef DRBD_GENL_STRUCT_H | ||
2 | #define DRBD_GENL_STRUCT_H | ||
3 | |||
4 | /** | ||
5 | * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests | ||
6 | * @minor: | ||
7 | * For admin requests (user -> kernel): which minor device to operate on. | ||
8 | * For (unicast) replies or informational (broadcast) messages | ||
9 | * (kernel -> user): which minor device the information is about. | ||
10 | * If we do not operate on minors, but on connections or resources, | ||
11 | * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT | ||
12 | * is used instead. | ||
13 | * @flags: possible operation modifiers (relevant only for user->kernel): | ||
14 | * DRBD_GENL_F_SET_DEFAULTS | ||
15 | * @volume: | ||
16 | * When creating a new minor (adding it to a resource), the resource needs | ||
17 | * to know which volume number within the resource this is supposed to be. | ||
18 | * The volume number corresponds to the same volume number on the remote side, | ||
19 | * whereas the minor number on the remote side may be different | ||
20 | * (union with flags). | ||
21 | * @ret_code: kernel->userland unicast cfg reply return code (union with flags); | ||
22 | */ | ||
23 | struct drbd_genlmsghdr { | ||
24 | __u32 minor; | ||
25 | union { | ||
26 | __u32 flags; | ||
27 | __s32 ret_code; | ||
28 | }; | ||
29 | }; | ||
30 | |||
31 | /* To be used in drbd_genlmsghdr.flags */ | ||
32 | enum { | ||
33 | DRBD_GENL_F_SET_DEFAULTS = 1, | ||
34 | }; | ||
35 | |||
36 | enum drbd_state_info_bcast_reason { | ||
37 | SIB_GET_STATUS_REPLY = 1, | ||
38 | SIB_STATE_CHANGE = 2, | ||
39 | SIB_HELPER_PRE = 3, | ||
40 | SIB_HELPER_POST = 4, | ||
41 | SIB_SYNC_PROGRESS = 5, | ||
42 | }; | ||
43 | |||
44 | /* hack around predefined gcc/cpp "linux=1", | ||
45 | * we cannot possibly include <1/drbd_genl.h> */ | ||
46 | #undef linux | ||
47 | |||
48 | #include <linux/drbd.h> | ||
49 | #define GENL_MAGIC_VERSION API_VERSION | ||
50 | #define GENL_MAGIC_FAMILY drbd | ||
51 | #define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr) | ||
52 | #define GENL_MAGIC_INCLUDE_FILE <linux/drbd_genl.h> | ||
53 | #include <linux/genl_magic_struct.h> | ||
54 | |||
55 | #endif | ||
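The drbd_genlmsghdr documented above reuses the same 32-bit slot for request flags (user -> kernel) and for the reply's return code (kernel -> user). A small illustrative snippet, with the definitions copied from the header so it stands alone and all netlink plumbing omitted:

#include <linux/types.h>

struct drbd_genlmsghdr {
	__u32 minor;
	union {
		__u32 flags;
		__s32 ret_code;
	};
};

enum { DRBD_GENL_F_SET_DEFAULTS = 1 };

/* user -> kernel: address a minor, request "set defaults" semantics */
static void fill_request_hdr(struct drbd_genlmsghdr *hdr, __u32 minor)
{
	hdr->minor = minor;     /* (~0U) when not addressing a minor */
	hdr->flags = DRBD_GENL_F_SET_DEFAULTS;
}

/* kernel -> user: the same storage carries the reply's return code
 * (an enum drbd_ret_code value, see drbd.h above) */
static __s32 reply_code(const struct drbd_genlmsghdr *hdr)
{
	return hdr->ret_code;
}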
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index fb670bf603f7..1fa19c5f5e64 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h | |||
@@ -16,29 +16,37 @@ | |||
16 | #define DEBUG_RANGE_CHECK 0 | 16 | #define DEBUG_RANGE_CHECK 0 |
17 | 17 | ||
18 | #define DRBD_MINOR_COUNT_MIN 1 | 18 | #define DRBD_MINOR_COUNT_MIN 1 |
19 | #define DRBD_MINOR_COUNT_MAX 256 | 19 | #define DRBD_MINOR_COUNT_MAX 255 |
20 | #define DRBD_MINOR_COUNT_DEF 32 | 20 | #define DRBD_MINOR_COUNT_DEF 32 |
21 | #define DRBD_MINOR_COUNT_SCALE '1' | ||
22 | |||
23 | #define DRBD_VOLUME_MAX 65535 | ||
21 | 24 | ||
22 | #define DRBD_DIALOG_REFRESH_MIN 0 | 25 | #define DRBD_DIALOG_REFRESH_MIN 0 |
23 | #define DRBD_DIALOG_REFRESH_MAX 600 | 26 | #define DRBD_DIALOG_REFRESH_MAX 600 |
27 | #define DRBD_DIALOG_REFRESH_SCALE '1' | ||
24 | 28 | ||
25 | /* valid port number */ | 29 | /* valid port number */ |
26 | #define DRBD_PORT_MIN 1 | 30 | #define DRBD_PORT_MIN 1 |
27 | #define DRBD_PORT_MAX 0xffff | 31 | #define DRBD_PORT_MAX 0xffff |
32 | #define DRBD_PORT_SCALE '1' | ||
28 | 33 | ||
29 | /* startup { */ | 34 | /* startup { */ |
30 | /* if you want more than 3.4 days, disable */ | 35 | /* if you want more than 3.4 days, disable */ |
31 | #define DRBD_WFC_TIMEOUT_MIN 0 | 36 | #define DRBD_WFC_TIMEOUT_MIN 0 |
32 | #define DRBD_WFC_TIMEOUT_MAX 300000 | 37 | #define DRBD_WFC_TIMEOUT_MAX 300000 |
33 | #define DRBD_WFC_TIMEOUT_DEF 0 | 38 | #define DRBD_WFC_TIMEOUT_DEF 0 |
39 | #define DRBD_WFC_TIMEOUT_SCALE '1' | ||
34 | 40 | ||
35 | #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 | 41 | #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 |
36 | #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 | 42 | #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 |
37 | #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 | 43 | #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 |
44 | #define DRBD_DEGR_WFC_TIMEOUT_SCALE '1' | ||
38 | 45 | ||
39 | #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 | 46 | #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 |
40 | #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 | 47 | #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 |
41 | #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 | 48 | #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 |
49 | #define DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1' | ||
42 | /* }*/ | 50 | /* }*/ |
43 | 51 | ||
44 | /* net { */ | 52 | /* net { */ |
@@ -47,75 +55,91 @@ | |||
47 | #define DRBD_TIMEOUT_MIN 1 | 55 | #define DRBD_TIMEOUT_MIN 1 |
48 | #define DRBD_TIMEOUT_MAX 600 | 56 | #define DRBD_TIMEOUT_MAX 600 |
49 | #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ | 57 | #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ |
58 | #define DRBD_TIMEOUT_SCALE '1' | ||
50 | 59 | ||
51 | /* If backing disk takes longer than disk_timeout, mark the disk as failed */ | 60 | /* If backing disk takes longer than disk_timeout, mark the disk as failed */ |
52 | #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ | 61 | #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ |
53 | #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ | 62 | #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ |
54 | #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ | 63 | #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ |
64 | #define DRBD_DISK_TIMEOUT_SCALE '1' | ||
55 | 65 | ||
56 | /* active connection retries when C_WF_CONNECTION */ | 66 | /* active connection retries when C_WF_CONNECTION */ |
57 | #define DRBD_CONNECT_INT_MIN 1 | 67 | #define DRBD_CONNECT_INT_MIN 1 |
58 | #define DRBD_CONNECT_INT_MAX 120 | 68 | #define DRBD_CONNECT_INT_MAX 120 |
59 | #define DRBD_CONNECT_INT_DEF 10 /* seconds */ | 69 | #define DRBD_CONNECT_INT_DEF 10 /* seconds */ |
70 | #define DRBD_CONNECT_INT_SCALE '1' | ||
60 | 71 | ||
61 | /* keep-alive probes when idle */ | 72 | /* keep-alive probes when idle */ |
62 | #define DRBD_PING_INT_MIN 1 | 73 | #define DRBD_PING_INT_MIN 1 |
63 | #define DRBD_PING_INT_MAX 120 | 74 | #define DRBD_PING_INT_MAX 120 |
64 | #define DRBD_PING_INT_DEF 10 | 75 | #define DRBD_PING_INT_DEF 10 |
76 | #define DRBD_PING_INT_SCALE '1' | ||
65 | 77 | ||
66 | /* timeout for the ping packets.*/ | 78 | /* timeout for the ping packets.*/ |
67 | #define DRBD_PING_TIMEO_MIN 1 | 79 | #define DRBD_PING_TIMEO_MIN 1 |
68 | #define DRBD_PING_TIMEO_MAX 300 | 80 | #define DRBD_PING_TIMEO_MAX 300 |
69 | #define DRBD_PING_TIMEO_DEF 5 | 81 | #define DRBD_PING_TIMEO_DEF 5 |
82 | #define DRBD_PING_TIMEO_SCALE '1' | ||
70 | 83 | ||
71 | /* max number of write requests between write barriers */ | 84 | /* max number of write requests between write barriers */ |
72 | #define DRBD_MAX_EPOCH_SIZE_MIN 1 | 85 | #define DRBD_MAX_EPOCH_SIZE_MIN 1 |
73 | #define DRBD_MAX_EPOCH_SIZE_MAX 20000 | 86 | #define DRBD_MAX_EPOCH_SIZE_MAX 20000 |
74 | #define DRBD_MAX_EPOCH_SIZE_DEF 2048 | 87 | #define DRBD_MAX_EPOCH_SIZE_DEF 2048 |
88 | #define DRBD_MAX_EPOCH_SIZE_SCALE '1' | ||
75 | 89 | ||
76 | /* I don't think that a tcp send buffer of more than 10M is useful */ | 90 | /* I don't think that a tcp send buffer of more than 10M is useful */ |
77 | #define DRBD_SNDBUF_SIZE_MIN 0 | 91 | #define DRBD_SNDBUF_SIZE_MIN 0 |
78 | #define DRBD_SNDBUF_SIZE_MAX (10<<20) | 92 | #define DRBD_SNDBUF_SIZE_MAX (10<<20) |
79 | #define DRBD_SNDBUF_SIZE_DEF 0 | 93 | #define DRBD_SNDBUF_SIZE_DEF 0 |
94 | #define DRBD_SNDBUF_SIZE_SCALE '1' | ||
80 | 95 | ||
81 | #define DRBD_RCVBUF_SIZE_MIN 0 | 96 | #define DRBD_RCVBUF_SIZE_MIN 0 |
82 | #define DRBD_RCVBUF_SIZE_MAX (10<<20) | 97 | #define DRBD_RCVBUF_SIZE_MAX (10<<20) |
83 | #define DRBD_RCVBUF_SIZE_DEF 0 | 98 | #define DRBD_RCVBUF_SIZE_DEF 0 |
99 | #define DRBD_RCVBUF_SIZE_SCALE '1' | ||
84 | 100 | ||
85 | /* @4k PageSize -> 128kB - 512MB */ | 101 | /* @4k PageSize -> 128kB - 512MB */ |
86 | #define DRBD_MAX_BUFFERS_MIN 32 | 102 | #define DRBD_MAX_BUFFERS_MIN 32 |
87 | #define DRBD_MAX_BUFFERS_MAX 131072 | 103 | #define DRBD_MAX_BUFFERS_MAX 131072 |
88 | #define DRBD_MAX_BUFFERS_DEF 2048 | 104 | #define DRBD_MAX_BUFFERS_DEF 2048 |
105 | #define DRBD_MAX_BUFFERS_SCALE '1' | ||
89 | 106 | ||
90 | /* @4k PageSize -> 4kB - 512MB */ | 107 | /* @4k PageSize -> 4kB - 512MB */ |
91 | #define DRBD_UNPLUG_WATERMARK_MIN 1 | 108 | #define DRBD_UNPLUG_WATERMARK_MIN 1 |
92 | #define DRBD_UNPLUG_WATERMARK_MAX 131072 | 109 | #define DRBD_UNPLUG_WATERMARK_MAX 131072 |
93 | #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) | 110 | #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) |
111 | #define DRBD_UNPLUG_WATERMARK_SCALE '1' | ||
94 | 112 | ||
95 | /* 0 is disabled. | 113 | /* 0 is disabled. |
96 | * 200 should be more than enough even for very short timeouts */ | 114 | * 200 should be more than enough even for very short timeouts */ |
97 | #define DRBD_KO_COUNT_MIN 0 | 115 | #define DRBD_KO_COUNT_MIN 0 |
98 | #define DRBD_KO_COUNT_MAX 200 | 116 | #define DRBD_KO_COUNT_MAX 200 |
99 | #define DRBD_KO_COUNT_DEF 0 | 117 | #define DRBD_KO_COUNT_DEF 7 |
118 | #define DRBD_KO_COUNT_SCALE '1' | ||
100 | /* } */ | 119 | /* } */ |
101 | 120 | ||
102 | /* syncer { */ | 121 | /* syncer { */ |
103 | /* FIXME allow rate to be zero? */ | 122 | /* FIXME allow rate to be zero? */ |
104 | #define DRBD_RATE_MIN 1 | 123 | #define DRBD_RESYNC_RATE_MIN 1 |
105 | /* channel bonding 10 GbE, or other hardware */ | 124 | /* channel bonding 10 GbE, or other hardware */ |
106 | #define DRBD_RATE_MAX (4 << 20) | 125 | #define DRBD_RESYNC_RATE_MAX (4 << 20) |
107 | #define DRBD_RATE_DEF 250 /* kb/second */ | 126 | #define DRBD_RESYNC_RATE_DEF 250 |
127 | #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ | ||
108 | 128 | ||
109 | /* less than 7 would hit performance unnecessarily. | 129 | /* less than 7 would hit performance unnecessarily. |
110 | * 3833 is the largest prime that still does fit | 130 | * 919 slots of context information per transaction,
111 | * into 64 sectors of activity log */ | 131 | * 32k activity log, 4k transaction size, |
132 | * one transaction in flight: | ||
133 | * 919 * 7 = 6433 */ | ||
112 | #define DRBD_AL_EXTENTS_MIN 7 | 134 | #define DRBD_AL_EXTENTS_MIN 7 |
113 | #define DRBD_AL_EXTENTS_MAX 3833 | 135 | #define DRBD_AL_EXTENTS_MAX 6433 |
114 | #define DRBD_AL_EXTENTS_DEF 127 | 136 | #define DRBD_AL_EXTENTS_DEF 1237 |
137 | #define DRBD_AL_EXTENTS_SCALE '1' | ||
115 | 138 | ||
116 | #define DRBD_AFTER_MIN -1 | 139 | #define DRBD_MINOR_NUMBER_MIN -1 |
117 | #define DRBD_AFTER_MAX 255 | 140 | #define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1) |
118 | #define DRBD_AFTER_DEF -1 | 141 | #define DRBD_MINOR_NUMBER_DEF -1 |
142 | #define DRBD_MINOR_NUMBER_SCALE '1' | ||
119 | 143 | ||
120 | /* } */ | 144 | /* } */ |
121 | 145 | ||
@@ -124,11 +148,12 @@ | |||
124 | * the upper limit with 64bit kernel, enough ram and flexible meta data | 148 | * the upper limit with 64bit kernel, enough ram and flexible meta data |
125 | * is 1 PiB, currently. */ | 149 | * is 1 PiB, currently. */ |
126 | /* DRBD_MAX_SECTORS */ | 150 | /* DRBD_MAX_SECTORS */ |
127 | #define DRBD_DISK_SIZE_SECT_MIN 0 | 151 | #define DRBD_DISK_SIZE_MIN 0 |
128 | #define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40)) | 152 | #define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40)) |
129 | #define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ | 153 | #define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */ |
154 | #define DRBD_DISK_SIZE_SCALE 's' /* sectors */ | ||
130 | 155 | ||
131 | #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON | 156 | #define DRBD_ON_IO_ERROR_DEF EP_DETACH |
132 | #define DRBD_FENCING_DEF FP_DONT_CARE | 157 | #define DRBD_FENCING_DEF FP_DONT_CARE |
133 | #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT | 158 | #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT |
134 | #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT | 159 | #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT |
@@ -136,38 +161,59 @@ | |||
136 | #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT | 161 | #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT |
137 | #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR | 162 | #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR |
138 | #define DRBD_ON_CONGESTION_DEF OC_BLOCK | 163 | #define DRBD_ON_CONGESTION_DEF OC_BLOCK |
164 | #define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL | ||
139 | 165 | ||
140 | #define DRBD_MAX_BIO_BVECS_MIN 0 | 166 | #define DRBD_MAX_BIO_BVECS_MIN 0 |
141 | #define DRBD_MAX_BIO_BVECS_MAX 128 | 167 | #define DRBD_MAX_BIO_BVECS_MAX 128 |
142 | #define DRBD_MAX_BIO_BVECS_DEF 0 | 168 | #define DRBD_MAX_BIO_BVECS_DEF 0 |
169 | #define DRBD_MAX_BIO_BVECS_SCALE '1' | ||
143 | 170 | ||
144 | #define DRBD_C_PLAN_AHEAD_MIN 0 | 171 | #define DRBD_C_PLAN_AHEAD_MIN 0 |
145 | #define DRBD_C_PLAN_AHEAD_MAX 300 | 172 | #define DRBD_C_PLAN_AHEAD_MAX 300 |
146 | #define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ | 173 | #define DRBD_C_PLAN_AHEAD_DEF 20 |
174 | #define DRBD_C_PLAN_AHEAD_SCALE '1' | ||
147 | 175 | ||
148 | #define DRBD_C_DELAY_TARGET_MIN 1 | 176 | #define DRBD_C_DELAY_TARGET_MIN 1 |
149 | #define DRBD_C_DELAY_TARGET_MAX 100 | 177 | #define DRBD_C_DELAY_TARGET_MAX 100 |
150 | #define DRBD_C_DELAY_TARGET_DEF 10 | 178 | #define DRBD_C_DELAY_TARGET_DEF 10 |
179 | #define DRBD_C_DELAY_TARGET_SCALE '1' | ||
151 | 180 | ||
152 | #define DRBD_C_FILL_TARGET_MIN 0 | 181 | #define DRBD_C_FILL_TARGET_MIN 0 |
153 | #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ | 182 | #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ |
154 | #define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ | 183 | #define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */ |
184 | #define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */ | ||
155 | 185 | ||
156 | #define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ | 186 | #define DRBD_C_MAX_RATE_MIN 250 |
157 | #define DRBD_C_MAX_RATE_MAX (4 << 20) | 187 | #define DRBD_C_MAX_RATE_MAX (4 << 20) |
158 | #define DRBD_C_MAX_RATE_DEF 102400 | 188 | #define DRBD_C_MAX_RATE_DEF 102400 |
189 | #define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */ | ||
159 | 190 | ||
160 | #define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ | 191 | #define DRBD_C_MIN_RATE_MIN 0 |
161 | #define DRBD_C_MIN_RATE_MAX (4 << 20) | 192 | #define DRBD_C_MIN_RATE_MAX (4 << 20) |
162 | #define DRBD_C_MIN_RATE_DEF 4096 | 193 | #define DRBD_C_MIN_RATE_DEF 250 |
194 | #define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */ | ||
163 | 195 | ||
164 | #define DRBD_CONG_FILL_MIN 0 | 196 | #define DRBD_CONG_FILL_MIN 0 |
165 | #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ | 197 | #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ |
166 | #define DRBD_CONG_FILL_DEF 0 | 198 | #define DRBD_CONG_FILL_DEF 0 |
199 | #define DRBD_CONG_FILL_SCALE 's' /* sectors */ | ||
167 | 200 | ||
168 | #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN | 201 | #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN |
169 | #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX | 202 | #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX |
170 | #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF | 203 | #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF |
204 | #define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE | ||
205 | |||
206 | #define DRBD_PROTOCOL_DEF DRBD_PROT_C | ||
207 | |||
208 | #define DRBD_DISK_BARRIER_DEF 0 | ||
209 | #define DRBD_DISK_FLUSHES_DEF 1 | ||
210 | #define DRBD_DISK_DRAIN_DEF 1 | ||
211 | #define DRBD_MD_FLUSHES_DEF 1 | ||
212 | #define DRBD_TCP_CORK_DEF 1 | ||
213 | #define DRBD_AL_UPDATES_DEF 1 | ||
214 | |||
215 | #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 | ||
216 | #define DRBD_ALWAYS_ASBP_DEF 0 | ||
217 | #define DRBD_USE_RLE_DEF 1 | ||
171 | 218 | ||
172 | #undef RANGE | ||
173 | #endif | 219 | #endif |
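The new *_SCALE characters introduced throughout drbd_limits.h tag each limit with its unit. A hypothetical helper (not part of the driver) showing how a tool might interpret them, assuming '1' is a plain count, 'k' kilobytes (taken here as KiB) and 's' 512-byte sectors:

#include <stdint.h>

static uint64_t scaled_to_bytes(uint64_t value, char scale)
{
	switch (scale) {
	case 'k': return value * 1024;  /* e.g. DRBD_RESYNC_RATE_SCALE */
	case 's': return value * 512;   /* e.g. DRBD_DISK_SIZE_SCALE   */
	case '1':
	default:  return value;         /* plain number, no unit       */
	}
}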
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h deleted file mode 100644 index f6a576df19e0..000000000000 --- a/include/linux/drbd_nl.h +++ /dev/null | |||
@@ -1,164 +0,0 @@ | |||
1 | /* | ||
2 | PAKET( name, | ||
3 | TYPE ( pn, pr, member ) | ||
4 | ... | ||
5 | ) | ||
6 | |||
7 | You may never reissue one of the pn arguments | ||
8 | */ | ||
9 | |||
10 | #if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) | ||
11 | #error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" | ||
12 | #endif | ||
13 | |||
14 | NL_PACKET(primary, 1, | ||
15 | NL_BIT( 1, T_MAY_IGNORE, primary_force) | ||
16 | ) | ||
17 | |||
18 | NL_PACKET(secondary, 2, ) | ||
19 | |||
20 | NL_PACKET(disk_conf, 3, | ||
21 | NL_INT64( 2, T_MAY_IGNORE, disk_size) | ||
22 | NL_STRING( 3, T_MANDATORY, backing_dev, 128) | ||
23 | NL_STRING( 4, T_MANDATORY, meta_dev, 128) | ||
24 | NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) | ||
25 | NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) | ||
26 | NL_INTEGER( 7, T_MAY_IGNORE, fencing) | ||
27 | NL_BIT( 37, T_MAY_IGNORE, use_bmbv) | ||
28 | NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) | ||
29 | NL_BIT( 54, T_MAY_IGNORE, no_md_flush) | ||
30 | /* 55 max_bio_size was available in 8.2.6rc2 */ | ||
31 | NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) | ||
32 | NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) | ||
33 | NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) | ||
34 | NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) | ||
35 | ) | ||
36 | |||
37 | NL_PACKET(detach, 4, | ||
38 | NL_BIT( 88, T_MANDATORY, detach_force) | ||
39 | ) | ||
40 | |||
41 | NL_PACKET(net_conf, 5, | ||
42 | NL_STRING( 8, T_MANDATORY, my_addr, 128) | ||
43 | NL_STRING( 9, T_MANDATORY, peer_addr, 128) | ||
44 | NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) | ||
45 | NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) | ||
46 | NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) | ||
47 | NL_INTEGER( 14, T_MAY_IGNORE, timeout) | ||
48 | NL_INTEGER( 15, T_MANDATORY, wire_protocol) | ||
49 | NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) | ||
50 | NL_INTEGER( 17, T_MAY_IGNORE, ping_int) | ||
51 | NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) | ||
52 | NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) | ||
53 | NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) | ||
54 | NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) | ||
55 | NL_INTEGER( 22, T_MAY_IGNORE, ko_count) | ||
56 | NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) | ||
57 | NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) | ||
58 | NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) | ||
59 | NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) | ||
60 | NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) | ||
61 | NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) | ||
62 | NL_INTEGER( 81, T_MAY_IGNORE, on_congestion) | ||
63 | NL_INTEGER( 82, T_MAY_IGNORE, cong_fill) | ||
64 | NL_INTEGER( 83, T_MAY_IGNORE, cong_extents) | ||
65 | /* 59 addr_family was available in GIT, never released */ | ||
66 | NL_BIT( 60, T_MANDATORY, mind_af) | ||
67 | NL_BIT( 27, T_MAY_IGNORE, want_lose) | ||
68 | NL_BIT( 28, T_MAY_IGNORE, two_primaries) | ||
69 | NL_BIT( 41, T_MAY_IGNORE, always_asbp) | ||
70 | NL_BIT( 61, T_MAY_IGNORE, no_cork) | ||
71 | NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) | ||
72 | NL_BIT( 70, T_MANDATORY, dry_run) | ||
73 | ) | ||
74 | |||
75 | NL_PACKET(disconnect, 6, | ||
76 | NL_BIT( 84, T_MAY_IGNORE, force) | ||
77 | ) | ||
78 | |||
79 | NL_PACKET(resize, 7, | ||
80 | NL_INT64( 29, T_MAY_IGNORE, resize_size) | ||
81 | NL_BIT( 68, T_MAY_IGNORE, resize_force) | ||
82 | NL_BIT( 69, T_MANDATORY, no_resync) | ||
83 | ) | ||
84 | |||
85 | NL_PACKET(syncer_conf, 8, | ||
86 | NL_INTEGER( 30, T_MAY_IGNORE, rate) | ||
87 | NL_INTEGER( 31, T_MAY_IGNORE, after) | ||
88 | NL_INTEGER( 32, T_MAY_IGNORE, al_extents) | ||
89 | /* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) | ||
90 | * NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) | ||
91 | * NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) | ||
92 | * NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) | ||
93 | * feature will be reimplemented differently with 8.3.9 */ | ||
94 | NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) | ||
95 | NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) | ||
96 | NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) | ||
97 | NL_BIT( 65, T_MAY_IGNORE, use_rle) | ||
98 | NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) | ||
99 | NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) | ||
100 | NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) | ||
101 | NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) | ||
102 | NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) | ||
103 | NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) | ||
104 | ) | ||
105 | |||
106 | NL_PACKET(invalidate, 9, ) | ||
107 | NL_PACKET(invalidate_peer, 10, ) | ||
108 | NL_PACKET(pause_sync, 11, ) | ||
109 | NL_PACKET(resume_sync, 12, ) | ||
110 | NL_PACKET(suspend_io, 13, ) | ||
111 | NL_PACKET(resume_io, 14, ) | ||
112 | NL_PACKET(outdate, 15, ) | ||
113 | NL_PACKET(get_config, 16, ) | ||
114 | NL_PACKET(get_state, 17, | ||
115 | NL_INTEGER( 33, T_MAY_IGNORE, state_i) | ||
116 | ) | ||
117 | |||
118 | NL_PACKET(get_uuids, 18, | ||
119 | NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) | ||
120 | NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) | ||
121 | ) | ||
122 | |||
123 | NL_PACKET(get_timeout_flag, 19, | ||
124 | NL_BIT( 36, T_MAY_IGNORE, use_degraded) | ||
125 | ) | ||
126 | |||
127 | NL_PACKET(call_helper, 20, | ||
128 | NL_STRING( 38, T_MAY_IGNORE, helper, 32) | ||
129 | ) | ||
130 | |||
131 | /* Tag nr 42 already allocated in drbd-8.1 development. */ | ||
132 | |||
133 | NL_PACKET(sync_progress, 23, | ||
134 | NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) | ||
135 | ) | ||
136 | |||
137 | NL_PACKET(dump_ee, 24, | ||
138 | NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) | ||
139 | NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) | ||
140 | NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) | ||
141 | NL_INT64( 48, T_MAY_IGNORE, ee_sector) | ||
142 | NL_INT64( 49, T_MAY_IGNORE, ee_block_id) | ||
143 | NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) | ||
144 | ) | ||
145 | |||
146 | NL_PACKET(start_ov, 25, | ||
147 | NL_INT64( 66, T_MAY_IGNORE, start_sector) | ||
148 | NL_INT64( 90, T_MANDATORY, stop_sector) | ||
149 | ) | ||
150 | |||
151 | NL_PACKET(new_c_uuid, 26, | ||
152 | NL_BIT( 63, T_MANDATORY, clear_bm) | ||
153 | ) | ||
154 | |||
155 | #ifdef NL_RESPONSE | ||
156 | NL_RESPONSE(return_code_only, 27) | ||
157 | #endif | ||
158 | |||
159 | #undef NL_PACKET | ||
160 | #undef NL_INTEGER | ||
161 | #undef NL_INT64 | ||
162 | #undef NL_BIT | ||
163 | #undef NL_STRING | ||
164 | #undef NL_RESPONSE | ||
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h deleted file mode 100644 index 82de1f9e48b1..000000000000 --- a/include/linux/drbd_tag_magic.h +++ /dev/null | |||
@@ -1,84 +0,0 @@ | |||
1 | #ifndef DRBD_TAG_MAGIC_H | ||
2 | #define DRBD_TAG_MAGIC_H | ||
3 | |||
4 | #define TT_END 0 | ||
5 | #define TT_REMOVED 0xE000 | ||
6 | |||
7 | /* declare packet_type enums */ | ||
8 | enum packet_types { | ||
9 | #define NL_PACKET(name, number, fields) P_ ## name = number, | ||
10 | #define NL_RESPONSE(name, number) P_ ## name = number, | ||
11 | #define NL_INTEGER(pn, pr, member) | ||
12 | #define NL_INT64(pn, pr, member) | ||
13 | #define NL_BIT(pn, pr, member) | ||
14 | #define NL_STRING(pn, pr, member, len) | ||
15 | #include <linux/drbd_nl.h> | ||
16 | P_nl_after_last_packet, | ||
17 | }; | ||
18 | |||
19 | /* These struct are used to deduce the size of the tag lists: */ | ||
20 | #define NL_PACKET(name, number, fields) \ | ||
21 | struct name ## _tag_len_struct { fields }; | ||
22 | #define NL_INTEGER(pn, pr, member) \ | ||
23 | int member; int tag_and_len ## member; | ||
24 | #define NL_INT64(pn, pr, member) \ | ||
25 | __u64 member; int tag_and_len ## member; | ||
26 | #define NL_BIT(pn, pr, member) \ | ||
27 | unsigned char member:1; int tag_and_len ## member; | ||
28 | #define NL_STRING(pn, pr, member, len) \ | ||
29 | unsigned char member[len]; int member ## _len; \ | ||
30 | int tag_and_len ## member; | ||
31 | #include <linux/drbd_nl.h> | ||
32 | |||
33 | /* declare tag-list-sizes */ | ||
34 | static const int tag_list_sizes[] = { | ||
35 | #define NL_PACKET(name, number, fields) 2 fields , | ||
36 | #define NL_INTEGER(pn, pr, member) + 4 + 4 | ||
37 | #define NL_INT64(pn, pr, member) + 4 + 8 | ||
38 | #define NL_BIT(pn, pr, member) + 4 + 1 | ||
39 | #define NL_STRING(pn, pr, member, len) + 4 + (len) | ||
40 | #include <linux/drbd_nl.h> | ||
41 | }; | ||
42 | |||
43 | /* The two highest bits are used for the tag type */ | ||
44 | #define TT_MASK 0xC000 | ||
45 | #define TT_INTEGER 0x0000 | ||
46 | #define TT_INT64 0x4000 | ||
47 | #define TT_BIT 0x8000 | ||
48 | #define TT_STRING 0xC000 | ||
49 | /* The next bit indicates if processing of the tag is mandatory */ | ||
50 | #define T_MANDATORY 0x2000 | ||
51 | #define T_MAY_IGNORE 0x0000 | ||
52 | #define TN_MASK 0x1fff | ||
53 | /* The remaining 13 bits are used to enumerate the tags */ | ||
54 | |||
55 | #define tag_type(T) ((T) & TT_MASK) | ||
56 | #define tag_number(T) ((T) & TN_MASK) | ||
57 | |||
58 | /* declare tag enums */ | ||
59 | #define NL_PACKET(name, number, fields) fields | ||
60 | enum drbd_tags { | ||
61 | #define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , | ||
62 | #define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , | ||
63 | #define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , | ||
64 | #define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , | ||
65 | #include <linux/drbd_nl.h> | ||
66 | }; | ||
67 | |||
68 | struct tag { | ||
69 | const char *name; | ||
70 | int type_n_flags; | ||
71 | int max_len; | ||
72 | }; | ||
73 | |||
74 | /* declare tag names */ | ||
75 | #define NL_PACKET(name, number, fields) fields | ||
76 | static const struct tag tag_descriptions[] = { | ||
77 | #define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, | ||
78 | #define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, | ||
79 | #define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, | ||
80 | #define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, | ||
81 | #include <linux/drbd_nl.h> | ||
82 | }; | ||
83 | |||
84 | #endif | ||
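For reference, the tag encoding removed with drbd_tag_magic.h packed the type into the two high bits, the "mandatory" flag into bit 13, and the tag number into the low 13 bits. A worked example (illustrative only, constants copied from the deleted header):

#include <assert.h>

#define TT_MASK     0xC000
#define TT_INTEGER  0x0000
#define T_MANDATORY 0x2000
#define TN_MASK     0x1fff

int main(void)
{
	/* NL_INTEGER(5, T_MANDATORY, meta_dev_idx) from the old drbd_nl.h
	 * produced T_meta_dev_idx = 5 | TT_INTEGER | T_MANDATORY */
	unsigned int tag = 5 | TT_INTEGER | T_MANDATORY;

	assert((tag & TT_MASK) == TT_INTEGER);  /* tag_type(tag)   */
	assert((tag & TN_MASK) == 5);           /* tag_number(tag) */
	return 0;
}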
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h new file mode 100644 index 000000000000..023bc346b877 --- /dev/null +++ b/include/linux/genl_magic_func.h | |||
@@ -0,0 +1,422 @@ | |||
1 | #ifndef GENL_MAGIC_FUNC_H | ||
2 | #define GENL_MAGIC_FUNC_H | ||
3 | |||
4 | #include <linux/genl_magic_struct.h> | ||
5 | |||
6 | /* | ||
7 | * Magic: declare tla policy {{{1 | ||
8 | * Magic: declare nested policies | ||
9 | * {{{2 | ||
10 | */ | ||
11 | #undef GENL_mc_group | ||
12 | #define GENL_mc_group(group) | ||
13 | |||
14 | #undef GENL_notification | ||
15 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) | ||
16 | |||
17 | #undef GENL_op | ||
18 | #define GENL_op(op_name, op_num, handler, tla_list) | ||
19 | |||
20 | #undef GENL_struct | ||
21 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
22 | [tag_name] = { .type = NLA_NESTED }, | ||
23 | |||
24 | static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = { | ||
25 | #include GENL_MAGIC_INCLUDE_FILE | ||
26 | }; | ||
27 | |||
28 | #undef GENL_struct | ||
29 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
30 | static struct nla_policy s_name ## _nl_policy[] __read_mostly = \ | ||
31 | { s_fields }; | ||
32 | |||
33 | #undef __field | ||
34 | #define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \ | ||
35 | __put, __is_signed) \ | ||
36 | [attr_nr] = { .type = nla_type }, | ||
37 | |||
38 | #undef __array | ||
39 | #define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \ | ||
40 | __get, __put, __is_signed) \ | ||
41 | [attr_nr] = { .type = nla_type, \ | ||
42 | .len = maxlen - (nla_type == NLA_NUL_STRING) }, | ||
43 | |||
44 | #include GENL_MAGIC_INCLUDE_FILE | ||
45 | |||
46 | #ifndef __KERNEL__ | ||
47 | #ifndef pr_info | ||
48 | #define pr_info(args...) fprintf(stderr, args); | ||
49 | #endif | ||
50 | #endif | ||
51 | |||
52 | #ifdef GENL_MAGIC_DEBUG | ||
53 | static void dprint_field(const char *dir, int nla_type, | ||
54 | const char *name, void *valp) | ||
55 | { | ||
56 | __u64 val = valp ? *(__u32 *)valp : 1; | ||
57 | switch (nla_type) { | ||
58 | case NLA_U8: val = (__u8)val; | ||
59 | case NLA_U16: val = (__u16)val; | ||
60 | case NLA_U32: val = (__u32)val; | ||
61 | pr_info("%s attr %s: %d 0x%08x\n", dir, | ||
62 | name, (int)val, (unsigned)val); | ||
63 | break; | ||
64 | case NLA_U64: | ||
65 | val = *(__u64*)valp; | ||
66 | pr_info("%s attr %s: %lld 0x%08llx\n", dir, | ||
67 | name, (long long)val, (unsigned long long)val); | ||
68 | break; | ||
69 | case NLA_FLAG: | ||
70 | if (val) | ||
71 | pr_info("%s attr %s: set\n", dir, name); | ||
72 | break; | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static void dprint_array(const char *dir, int nla_type, | ||
77 | const char *name, const char *val, unsigned len) | ||
78 | { | ||
79 | switch (nla_type) { | ||
80 | case NLA_NUL_STRING: | ||
81 | if (len && val[len-1] == '\0') | ||
82 | len--; | ||
83 | pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val); | ||
84 | break; | ||
85 | default: | ||
86 | /* we can always show 4 byte, | ||
87 | * thats what nlattr are aligned to. */ | ||
88 | pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n", | ||
89 | dir, name, len, val[0], val[1], val[2], val[3]); | ||
90 | } | ||
91 | } | ||
92 | |||
93 | #define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b); | ||
94 | |||
95 | /* Name is a member field name of the struct s. | ||
96 | * If s is NULL (only parsing, no copy requested in *_from_attrs()), | ||
97 | * nla is supposed to point to the attribute containing the information | ||
98 | * corresponding to that struct member. */ | ||
99 | #define DPRINT_FIELD(dir, nla_type, name, s, nla) \ | ||
100 | do { \ | ||
101 | if (s) \ | ||
102 | dprint_field(dir, nla_type, #name, &s->name); \ | ||
103 | else if (nla) \ | ||
104 | dprint_field(dir, nla_type, #name, \ | ||
105 | (nla_type == NLA_FLAG) ? NULL \ | ||
106 | : nla_data(nla)); \ | ||
107 | } while (0) | ||
108 | |||
109 | #define DPRINT_ARRAY(dir, nla_type, name, s, nla) \ | ||
110 | do { \ | ||
111 | if (s) \ | ||
112 | dprint_array(dir, nla_type, #name, \ | ||
113 | s->name, s->name ## _len); \ | ||
114 | else if (nla) \ | ||
115 | dprint_array(dir, nla_type, #name, \ | ||
116 | nla_data(nla), nla_len(nla)); \ | ||
117 | } while (0) | ||
118 | #else | ||
119 | #define DPRINT_TLA(a, op, b) do {} while (0) | ||
120 | #define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0) | ||
121 | #define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0) | ||
122 | #endif | ||
123 | |||
124 | /* | ||
125 | * Magic: provide conversion functions {{{1 | ||
126 | * populate struct from attribute table: | ||
127 | * {{{2 | ||
128 | */ | ||
129 | |||
130 | /* processing of generic netlink messages is serialized. | ||
131 | * use one static buffer for parsing of nested attributes */ | ||
132 | static struct nlattr *nested_attr_tb[128]; | ||
133 | |||
134 | #ifndef BUILD_BUG_ON | ||
135 | /* Force a compilation error if condition is true */ | ||
136 | #define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) | ||
137 | /* Force a compilation error if condition is true, but also produce a | ||
138 | result (of value 0 and type size_t), so the expression can be used | ||
139 | e.g. in a structure initializer (or where-ever else comma expressions | ||
140 | aren't permitted). */ | ||
141 | #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) | ||
142 | #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) | ||
143 | #endif | ||
144 | |||
145 | #undef GENL_struct | ||
146 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
147 | /* *_from_attrs functions are static, but potentially unused */ \ | ||
148 | static int __ ## s_name ## _from_attrs(struct s_name *s, \ | ||
149 | struct genl_info *info, bool exclude_invariants) \ | ||
150 | { \ | ||
151 | const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \ | ||
152 | struct nlattr *tla = info->attrs[tag_number]; \ | ||
153 | struct nlattr **ntb = nested_attr_tb; \ | ||
154 | struct nlattr *nla; \ | ||
155 | int err; \ | ||
156 | BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \ | ||
157 | if (!tla) \ | ||
158 | return -ENOMSG; \ | ||
159 | DPRINT_TLA(#s_name, "<=-", #tag_name); \ | ||
160 | err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ | ||
161 | if (err) \ | ||
162 | return err; \ | ||
163 | \ | ||
164 | s_fields \ | ||
165 | return 0; \ | ||
166 | } __attribute__((unused)) \ | ||
167 | static int s_name ## _from_attrs(struct s_name *s, \ | ||
168 | struct genl_info *info) \ | ||
169 | { \ | ||
170 | return __ ## s_name ## _from_attrs(s, info, false); \ | ||
171 | } __attribute__((unused)) \ | ||
172 | static int s_name ## _from_attrs_for_change(struct s_name *s, \ | ||
173 | struct genl_info *info) \ | ||
174 | { \ | ||
175 | return __ ## s_name ## _from_attrs(s, info, true); \ | ||
176 | } __attribute__((unused)) \ | ||
177 | |||
178 | #define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \ | ||
179 | nla = ntb[attr_nr]; \ | ||
180 | if (nla) { \ | ||
181 | if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ | ||
182 | pr_info("<< must not change invariant attr: %s\n", #name); \ | ||
183 | return -EEXIST; \ | ||
184 | } \ | ||
185 | assignment; \ | ||
186 | } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ | ||
187 | /* attribute missing from payload, */ \ | ||
188 | /* which was expected */ \ | ||
189 | } else if ((attr_flag) & DRBD_F_REQUIRED) { \ | ||
190 | pr_info("<< missing attr: %s\n", #name); \ | ||
191 | return -ENOMSG; \ | ||
192 | } | ||
193 | |||
194 | #undef __field | ||
195 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
196 | __is_signed) \ | ||
197 | __assign(attr_nr, attr_flag, name, nla_type, type, \ | ||
198 | if (s) \ | ||
199 | s->name = __get(nla); \ | ||
200 | DPRINT_FIELD("<<", nla_type, name, s, nla)) | ||
201 | |||
202 | /* validate_nla() already checked nla_len <= maxlen appropriately. */ | ||
203 | #undef __array | ||
204 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
205 | __get, __put, __is_signed) \ | ||
206 | __assign(attr_nr, attr_flag, name, nla_type, type, \ | ||
207 | if (s) \ | ||
208 | s->name ## _len = \ | ||
209 | __get(s->name, nla, maxlen); \ | ||
210 | DPRINT_ARRAY("<<", nla_type, name, s, nla)) | ||
211 | |||
212 | #include GENL_MAGIC_INCLUDE_FILE | ||
213 | |||
214 | #undef GENL_struct | ||
215 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) | ||
216 | |||
217 | /* | ||
218 | * Magic: define op number to op name mapping {{{1 | ||
219 | * {{{2 | ||
220 | */ | ||
221 | const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) | ||
222 | { | ||
223 | switch (cmd) { | ||
224 | #undef GENL_op | ||
225 | #define GENL_op(op_name, op_num, handler, tla_list) \ | ||
226 | case op_num: return #op_name; | ||
227 | #include GENL_MAGIC_INCLUDE_FILE | ||
228 | default: | ||
229 | return "unknown"; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | #ifdef __KERNEL__ | ||
234 | #include <linux/stringify.h> | ||
235 | /* | ||
236 | * Magic: define genl_ops {{{1 | ||
237 | * {{{2 | ||
238 | */ | ||
239 | |||
240 | #undef GENL_op | ||
241 | #define GENL_op(op_name, op_num, handler, tla_list) \ | ||
242 | { \ | ||
243 | handler \ | ||
244 | .cmd = op_name, \ | ||
245 | .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ | ||
246 | }, | ||
247 | |||
248 | #define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) | ||
249 | static struct genl_ops ZZZ_genl_ops[] __read_mostly = { | ||
250 | #include GENL_MAGIC_INCLUDE_FILE | ||
251 | }; | ||
252 | |||
253 | #undef GENL_op | ||
254 | #define GENL_op(op_name, op_num, handler, tla_list) | ||
255 | |||
256 | /* | ||
257 | * Define the genl_family, multicast groups, {{{1 | ||
258 | * and provide register/unregister functions. | ||
259 | * {{{2 | ||
260 | */ | ||
261 | #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) | ||
262 | static struct genl_family ZZZ_genl_family __read_mostly = { | ||
263 | .id = GENL_ID_GENERATE, | ||
264 | .name = __stringify(GENL_MAGIC_FAMILY), | ||
265 | .version = GENL_MAGIC_VERSION, | ||
266 | #ifdef GENL_MAGIC_FAMILY_HDRSZ | ||
267 | .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), | ||
268 | #endif | ||
269 | .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, | ||
270 | }; | ||
271 | |||
272 | /* | ||
273 | * Magic: define multicast groups | ||
274 | * Magic: define multicast group registration helper | ||
275 | */ | ||
276 | #undef GENL_mc_group | ||
277 | #define GENL_mc_group(group) \ | ||
278 | static struct genl_multicast_group \ | ||
279 | CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \ | ||
280 | .name = #group, \ | ||
281 | }; \ | ||
282 | static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ | ||
283 | struct sk_buff *skb, gfp_t flags) \ | ||
284 | { \ | ||
285 | unsigned int group_id = \ | ||
286 | CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \ | ||
287 | if (!group_id) \ | ||
288 | return -EINVAL; \ | ||
289 | return genlmsg_multicast(skb, 0, group_id, flags); \ | ||
290 | } | ||
291 | |||
292 | #include GENL_MAGIC_INCLUDE_FILE | ||
293 | |||
294 | int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) | ||
295 | { | ||
296 | int err = genl_register_family_with_ops(&ZZZ_genl_family, | ||
297 | ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops)); | ||
298 | if (err) | ||
299 | return err; | ||
300 | #undef GENL_mc_group | ||
301 | #define GENL_mc_group(group) \ | ||
302 | err = genl_register_mc_group(&ZZZ_genl_family, \ | ||
303 | &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \ | ||
304 | if (err) \ | ||
305 | goto fail; \ | ||
306 | else \ | ||
307 | pr_info("%s: mcg %s: %u\n", #group, \ | ||
308 | __stringify(GENL_MAGIC_FAMILY), \ | ||
309 | CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id); | ||
310 | |||
311 | #include GENL_MAGIC_INCLUDE_FILE | ||
312 | |||
313 | #undef GENL_mc_group | ||
314 | #define GENL_mc_group(group) | ||
315 | return 0; | ||
316 | fail: | ||
317 | genl_unregister_family(&ZZZ_genl_family); | ||
318 | return err; | ||
319 | } | ||
320 | |||
321 | void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) | ||
322 | { | ||
323 | genl_unregister_family(&ZZZ_genl_family); | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * Magic: provide conversion functions {{{1 | ||
328 | * populate skb from struct. | ||
329 | * {{{2 | ||
330 | */ | ||
331 | |||
332 | #undef GENL_op | ||
333 | #define GENL_op(op_name, op_num, handler, tla_list) | ||
334 | |||
335 | #undef GENL_struct | ||
336 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
337 | static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \ | ||
338 | const bool exclude_sensitive) \ | ||
339 | { \ | ||
340 | struct nlattr *tla = nla_nest_start(skb, tag_number); \ | ||
341 | if (!tla) \ | ||
342 | goto nla_put_failure; \ | ||
343 | DPRINT_TLA(#s_name, "-=>", #tag_name); \ | ||
344 | s_fields \ | ||
345 | nla_nest_end(skb, tla); \ | ||
346 | return 0; \ | ||
347 | \ | ||
348 | nla_put_failure: \ | ||
349 | if (tla) \ | ||
350 | nla_nest_cancel(skb, tla); \ | ||
351 | return -EMSGSIZE; \ | ||
352 | } \ | ||
353 | static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \ | ||
354 | struct s_name *s) \ | ||
355 | { \ | ||
356 | return s_name ## _to_skb(skb, s, 0); \ | ||
357 | } \ | ||
358 | static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \ | ||
359 | struct s_name *s) \ | ||
360 | { \ | ||
361 | return s_name ## _to_skb(skb, s, 1); \ | ||
362 | } | ||
363 | |||
364 | |||
365 | #undef __field | ||
366 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
367 | __is_signed) \ | ||
368 | if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ | ||
369 | DPRINT_FIELD(">>", nla_type, name, s, NULL); \ | ||
370 | if (__put(skb, attr_nr, s->name)) \ | ||
371 | goto nla_put_failure; \ | ||
372 | } | ||
373 | |||
374 | #undef __array | ||
375 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
376 | __get, __put, __is_signed) \ | ||
377 | if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ | ||
378 | DPRINT_ARRAY(">>",nla_type, name, s, NULL); \ | ||
379 | if (__put(skb, attr_nr, min_t(int, maxlen, \ | ||
380 | s->name ## _len + (nla_type == NLA_NUL_STRING)),\ | ||
381 | s->name)) \ | ||
382 | goto nla_put_failure; \ | ||
383 | } | ||
384 | |||
385 | #include GENL_MAGIC_INCLUDE_FILE | ||
386 | |||
387 | |||
388 | /* Functions for initializing structs to default values. */ | ||
389 | |||
390 | #undef __field | ||
391 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
392 | __is_signed) | ||
393 | #undef __array | ||
394 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
395 | __get, __put, __is_signed) | ||
396 | #undef __u32_field_def | ||
397 | #define __u32_field_def(attr_nr, attr_flag, name, default) \ | ||
398 | x->name = default; | ||
399 | #undef __s32_field_def | ||
400 | #define __s32_field_def(attr_nr, attr_flag, name, default) \ | ||
401 | x->name = default; | ||
402 | #undef __flg_field_def | ||
403 | #define __flg_field_def(attr_nr, attr_flag, name, default) \ | ||
404 | x->name = default; | ||
405 | #undef __str_field_def | ||
406 | #define __str_field_def(attr_nr, attr_flag, name, maxlen) \ | ||
407 | memset(x->name, 0, sizeof(x->name)); \ | ||
408 | x->name ## _len = 0; | ||
409 | #undef GENL_struct | ||
410 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
411 | static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \ | ||
412 | static void set_ ## s_name ## _defaults(struct s_name *x) { \ | ||
413 | s_fields \ | ||
414 | } | ||
415 | |||
416 | #include GENL_MAGIC_INCLUDE_FILE | ||
417 | |||
418 | #endif /* __KERNEL__ */ | ||
419 | |||
420 | /* }}}1 */ | ||
421 | #endif /* GENL_MAGIC_FUNC_H */ | ||
422 | /* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ | ||
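
For orientation, a minimal sketch of how the multi-include magic above is consumed; the file name, family name, attribute numbers and field names below are invented for illustration (DRBD's own definitions file plays this role in the rest of the series):

/* example_genl_api.h -- hypothetical definitions file */
GENL_struct(EX_ATTR_CONF, 1, example_conf,
	__u32_field(1, DRBD_F_REQUIRED, timeout)
	__str_field(2, 0, name, 32)
)

/* the consumer sets the magic parameters and includes the generator: */
#define GENL_MAGIC_FAMILY	example
#define GENL_MAGIC_VERSION	1
#define GENL_MAGIC_INCLUDE_FILE	"example_genl_api.h"
#include <linux/genl_magic_func.h>

/* by re-including example_genl_api.h with changing macro definitions,
 * this roughly expands to:
 *
 *	struct example_conf { __u32 timeout; char name[32]; __u32 name_len; };
 *	static struct nla_policy example_conf_nl_policy[];
 *	static int example_conf_from_attrs(struct example_conf *s,
 *					   struct genl_info *info);
 *	static int example_conf_to_skb(struct sk_buff *skb, struct example_conf *s,
 *				       const bool exclude_sensitive);
 *	int example_genl_register(void);
 *	void example_genl_unregister(void);
 */
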
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h new file mode 100644 index 000000000000..eecd19b37001 --- /dev/null +++ b/include/linux/genl_magic_struct.h | |||
@@ -0,0 +1,277 @@ | |||
1 | #ifndef GENL_MAGIC_STRUCT_H | ||
2 | #define GENL_MAGIC_STRUCT_H | ||
3 | |||
4 | #ifndef GENL_MAGIC_FAMILY | ||
5 | # error "you need to define GENL_MAGIC_FAMILY before inclusion" | ||
6 | #endif | ||
7 | |||
8 | #ifndef GENL_MAGIC_VERSION | ||
9 | # error "you need to define GENL_MAGIC_VERSION before inclusion" | ||
10 | #endif | ||
11 | |||
12 | #ifndef GENL_MAGIC_INCLUDE_FILE | ||
13 | # error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion" | ||
14 | #endif | ||
15 | |||
16 | #include <linux/genetlink.h> | ||
17 | #include <linux/types.h> | ||
18 | |||
19 | #define CONCAT__(a,b) a ## b | ||
20 | #define CONCAT_(a,b) CONCAT__(a,b) | ||
21 | |||
22 | extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void); | ||
23 | extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void); | ||
24 | |||
25 | /* | ||
26 | * Extension of genl attribute validation policies {{{2 | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not | ||
31 | * know about. This flag can be set in nlattr->nla_type to indicate that this | ||
32 | * attribute must not be ignored. | ||
33 | * | ||
34 | * We check and remove this flag in drbd_nla_check_mandatory() before | ||
35 | * validating the attribute types and lengths via nla_parse_nested(). | ||
36 | */ | ||
37 | #define DRBD_GENLA_F_MANDATORY (1 << 14) | ||
38 | |||
39 | /* | ||
40 | * Flags specific to drbd and not visible at the netlink layer, used in | ||
41 | * <struct>_from_attrs and <struct>_to_skb: | ||
42 | * | ||
43 | * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is | ||
44 | * invalid. | ||
45 | * | ||
46 | * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be | ||
47 | * included in unprivileged get requests or broadcasts. | ||
48 | * | ||
49 | * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but | ||
50 | * cannot subsequently be changed. | ||
51 | */ | ||
52 | #define DRBD_F_REQUIRED (1 << 0) | ||
53 | #define DRBD_F_SENSITIVE (1 << 1) | ||
54 | #define DRBD_F_INVARIANT (1 << 2) | ||
55 | |||
56 | #define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) | ||
57 | |||
58 | /* }}}1 | ||
59 | * MAGIC | ||
60 | * multi-include macro expansion magic starts here | ||
61 | */ | ||
62 | |||
63 | /* MAGIC helpers {{{2 */ | ||
64 | |||
65 | /* possible field types */ | ||
66 | #define __flg_field(attr_nr, attr_flag, name) \ | ||
67 | __field(attr_nr, attr_flag, name, NLA_U8, char, \ | ||
68 | nla_get_u8, nla_put_u8, false) | ||
69 | #define __u8_field(attr_nr, attr_flag, name) \ | ||
70 | __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \ | ||
71 | nla_get_u8, nla_put_u8, false) | ||
72 | #define __u16_field(attr_nr, attr_flag, name) \ | ||
73 | __field(attr_nr, attr_flag, name, NLA_U16, __u16, \ | ||
74 | nla_get_u16, nla_put_u16, false) | ||
75 | #define __u32_field(attr_nr, attr_flag, name) \ | ||
76 | __field(attr_nr, attr_flag, name, NLA_U32, __u32, \ | ||
77 | nla_get_u32, nla_put_u32, false) | ||
78 | #define __s32_field(attr_nr, attr_flag, name) \ | ||
79 | __field(attr_nr, attr_flag, name, NLA_U32, __s32, \ | ||
80 | nla_get_u32, nla_put_u32, true) | ||
81 | #define __u64_field(attr_nr, attr_flag, name) \ | ||
82 | __field(attr_nr, attr_flag, name, NLA_U64, __u64, \ | ||
83 | nla_get_u64, nla_put_u64, false) | ||
84 | #define __str_field(attr_nr, attr_flag, name, maxlen) \ | ||
85 | __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ | ||
86 | nla_strlcpy, nla_put, false) | ||
87 | #define __bin_field(attr_nr, attr_flag, name, maxlen) \ | ||
88 | __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ | ||
89 | nla_memcpy, nla_put, false) | ||
90 | |||
91 | /* fields with default values */ | ||
92 | #define __flg_field_def(attr_nr, attr_flag, name, default) \ | ||
93 | __flg_field(attr_nr, attr_flag, name) | ||
94 | #define __u32_field_def(attr_nr, attr_flag, name, default) \ | ||
95 | __u32_field(attr_nr, attr_flag, name) | ||
96 | #define __s32_field_def(attr_nr, attr_flag, name, default) \ | ||
97 | __s32_field(attr_nr, attr_flag, name) | ||
98 | #define __str_field_def(attr_nr, attr_flag, name, maxlen) \ | ||
99 | __str_field(attr_nr, attr_flag, name, maxlen) | ||
100 | |||
101 | #define GENL_op_init(args...) args | ||
102 | #define GENL_doit(handler) \ | ||
103 | .doit = handler, \ | ||
104 | .flags = GENL_ADMIN_PERM, | ||
105 | #define GENL_dumpit(handler) \ | ||
106 | .dumpit = handler, \ | ||
107 | .flags = GENL_ADMIN_PERM, | ||
108 | |||
109 | /* }}}1 | ||
110 | * Magic: define the enum symbols for genl_ops | ||
111 | * Magic: define the enum symbols for top level attributes | ||
112 | * Magic: define the enum symbols for nested attributes | ||
113 | * {{{2 | ||
114 | */ | ||
115 | |||
116 | #undef GENL_struct | ||
117 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) | ||
118 | |||
119 | #undef GENL_mc_group | ||
120 | #define GENL_mc_group(group) | ||
121 | |||
122 | #undef GENL_notification | ||
123 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) \ | ||
124 | op_name = op_num, | ||
125 | |||
126 | #undef GENL_op | ||
127 | #define GENL_op(op_name, op_num, handler, tla_list) \ | ||
128 | op_name = op_num, | ||
129 | |||
130 | enum { | ||
131 | #include GENL_MAGIC_INCLUDE_FILE | ||
132 | }; | ||
133 | |||
134 | #undef GENL_notification | ||
135 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) | ||
136 | |||
137 | #undef GENL_op | ||
138 | #define GENL_op(op_name, op_num, handler, attr_list) | ||
139 | |||
140 | #undef GENL_struct | ||
141 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
142 | tag_name = tag_number, | ||
143 | |||
144 | enum { | ||
145 | #include GENL_MAGIC_INCLUDE_FILE | ||
146 | }; | ||
147 | |||
148 | #undef GENL_struct | ||
149 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
150 | enum { \ | ||
151 | s_fields \ | ||
152 | }; | ||
153 | |||
154 | #undef __field | ||
155 | #define __field(attr_nr, attr_flag, name, nla_type, type, \ | ||
156 | __get, __put, __is_signed) \ | ||
157 | T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), | ||
158 | |||
159 | #undef __array | ||
160 | #define __array(attr_nr, attr_flag, name, nla_type, type, \ | ||
161 | maxlen, __get, __put, __is_signed) \ | ||
162 | T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), | ||
163 | |||
164 | #include GENL_MAGIC_INCLUDE_FILE | ||
165 | |||
166 | /* }}}1 | ||
167 | * Magic: compile time assert unique numbers for operations | ||
168 | * Magic: -"- unique numbers for top level attributes | ||
169 | * Magic: -"- unique numbers for nested attributes | ||
170 | * {{{2 | ||
171 | */ | ||
172 | |||
173 | #undef GENL_struct | ||
174 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) | ||
175 | |||
176 | #undef GENL_op | ||
177 | #define GENL_op(op_name, op_num, handler, attr_list) \ | ||
178 | case op_name: | ||
179 | |||
180 | #undef GENL_notification | ||
181 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) \ | ||
182 | case op_name: | ||
183 | |||
184 | static inline void ct_assert_unique_operations(void) | ||
185 | { | ||
186 | switch (0) { | ||
187 | #include GENL_MAGIC_INCLUDE_FILE | ||
188 | ; | ||
189 | } | ||
190 | } | ||
191 | |||
192 | #undef GENL_op | ||
193 | #define GENL_op(op_name, op_num, handler, attr_list) | ||
194 | |||
195 | #undef GENL_notification | ||
196 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) | ||
197 | |||
198 | #undef GENL_struct | ||
199 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
200 | case tag_number: | ||
201 | |||
202 | static inline void ct_assert_unique_top_level_attributes(void) | ||
203 | { | ||
204 | switch (0) { | ||
205 | #include GENL_MAGIC_INCLUDE_FILE | ||
206 | ; | ||
207 | } | ||
208 | } | ||
209 | |||
210 | #undef GENL_struct | ||
211 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
212 | static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ | ||
213 | { \ | ||
214 | switch (0) { \ | ||
215 | s_fields \ | ||
216 | ; \ | ||
217 | } \ | ||
218 | } | ||
219 | |||
220 | #undef __field | ||
221 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
222 | __is_signed) \ | ||
223 | case attr_nr: | ||
224 | |||
225 | #undef __array | ||
226 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
227 | __get, __put, __is_signed) \ | ||
228 | case attr_nr: | ||
229 | |||
230 | #include GENL_MAGIC_INCLUDE_FILE | ||
231 | |||
232 | /* }}}1 | ||
233 | * Magic: declare structs | ||
234 | * struct <name> { | ||
235 | * fields | ||
236 | * }; | ||
237 | * {{{2 | ||
238 | */ | ||
239 | |||
240 | #undef GENL_struct | ||
241 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
242 | struct s_name { s_fields }; | ||
243 | |||
244 | #undef __field | ||
245 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
246 | __is_signed) \ | ||
247 | type name; | ||
248 | |||
249 | #undef __array | ||
250 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
251 | __get, __put, __is_signed) \ | ||
252 | type name[maxlen]; \ | ||
253 | __u32 name ## _len; | ||
254 | |||
255 | #include GENL_MAGIC_INCLUDE_FILE | ||
256 | |||
257 | #undef GENL_struct | ||
258 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
259 | enum { \ | ||
260 | s_fields \ | ||
261 | }; | ||
262 | |||
263 | #undef __field | ||
264 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
265 | is_signed) \ | ||
266 | F_ ## name ## _IS_SIGNED = is_signed, | ||
267 | |||
268 | #undef __array | ||
269 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
270 | __get, __put, is_signed) \ | ||
271 | F_ ## name ## _IS_SIGNED = is_signed, | ||
272 | |||
273 | #include GENL_MAGIC_INCLUDE_FILE | ||
274 | |||
275 | /* }}}1 */ | ||
276 | #endif /* GENL_MAGIC_STRUCT_H */ | ||
277 | /* vim: set foldmethod=marker nofoldenable : */ | ||
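
A short sketch of how the extra bit defined above travels in nlattr->nla_type; the two helper functions are invented for illustration, only DRBD_GENLA_F_MANDATORY and __nla_type() come from this header:

static inline __u16 example_encode_type(__u16 attr_nr, bool mandatory)
{
	/* a sender may OR DRBD_GENLA_F_MANDATORY into nlattr->nla_type to
	 * say "do not silently ignore this attribute" */
	return attr_nr | (mandatory ? DRBD_GENLA_F_MANDATORY : 0);
}

static inline __u16 example_decode_type(__u16 nla_type)
{
	/* the receiver strips that flag (and the generic NLA_F_* bits)
	 * again before indexing into the nla_policy tables */
	return __nla_type(nla_type);
}
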
diff --git a/include/linux/idr.h b/include/linux/idr.h index 87259a44c251..de7e190f1af4 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h | |||
@@ -152,4 +152,15 @@ void ida_simple_remove(struct ida *ida, unsigned int id); | |||
152 | 152 | ||
153 | void __init idr_init_cache(void); | 153 | void __init idr_init_cache(void); |
154 | 154 | ||
155 | /** | ||
156 | * idr_for_each_entry - iterate over an idr's elements of a given type | ||
157 | * @idp: idr handle | ||
158 | * @entry: the type * to use as cursor | ||
159 | * @id: id entry's key | ||
160 | */ | ||
161 | #define idr_for_each_entry(idp, entry, id) \ | ||
162 | for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ | ||
163 | entry != NULL; \ | ||
164 | ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) | ||
165 | |||
155 | #endif /* __IDR_H__ */ | 166 | #endif /* __IDR_H__ */ |
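
A usage sketch for the new iterator; the struct, the idr instance and the call site are invented for illustration:

#include <linux/idr.h>
#include <linux/printk.h>

struct example_device {
	int minor;
};

static DEFINE_IDR(example_devices);

static void example_print_all(void)
{
	struct example_device *dev;
	int id;

	/* visits all allocated ids in ascending order; the loop ends when
	 * idr_get_next() finds no further entry */
	idr_for_each_entry(&example_devices, dev, id)
		pr_info("minor %d is registered under id %d\n", dev->minor, id);
}
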
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 7a71ffad037c..cbafae40c649 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h | |||
@@ -166,9 +166,11 @@ struct lc_element { | |||
166 | /* if we want to track a larger set of objects, | 166 | /* if we want to track a larger set of objects, |
167 | * it needs to become arch independent u64 */ | 167 | * it needs to become arch independent u64 */
168 | unsigned lc_number; | 168 | unsigned lc_number; |
169 | |||
170 | /* special label when on free list */ | 169 | /* special label when on free list */ |
171 | #define LC_FREE (~0U) | 170 | #define LC_FREE (~0U) |
171 | |||
172 | /* for pending changes */ | ||
173 | unsigned lc_new_number; | ||
172 | }; | 174 | }; |
173 | 175 | ||
174 | struct lru_cache { | 176 | struct lru_cache { |
@@ -176,6 +178,7 @@ struct lru_cache { | |||
176 | struct list_head lru; | 178 | struct list_head lru; |
177 | struct list_head free; | 179 | struct list_head free; |
178 | struct list_head in_use; | 180 | struct list_head in_use; |
181 | struct list_head to_be_changed; | ||
179 | 182 | ||
180 | /* the pre-created kmem cache to allocate the objects from */ | 183 | /* the pre-created kmem cache to allocate the objects from */ |
181 | struct kmem_cache *lc_cache; | 184 | struct kmem_cache *lc_cache; |
@@ -186,7 +189,7 @@ struct lru_cache { | |||
186 | size_t element_off; | 189 | size_t element_off; |
187 | 190 | ||
188 | /* number of elements (indices) */ | 191 | /* number of elements (indices) */ |
189 | unsigned int nr_elements; | 192 | unsigned int nr_elements; |
190 | /* Arbitrary limit on maximum tracked objects. Practical limit is much | 193 | /* Arbitrary limit on maximum tracked objects. Practical limit is much |
191 | * lower due to allocation failures, probably. For typical use cases, | 194 | * lower due to allocation failures, probably. For typical use cases, |
192 | * nr_elements should be a few thousand at most. | 195 | * nr_elements should be a few thousand at most. |
@@ -194,18 +197,19 @@ struct lru_cache { | |||
194 | * 8 high bits of .lc_index to be overloaded with flags in the future. */ | 197 | * 8 high bits of .lc_index to be overloaded with flags in the future. */ |
195 | #define LC_MAX_ACTIVE (1<<24) | 198 | #define LC_MAX_ACTIVE (1<<24) |
196 | 199 | ||
200 | /* allow to accumulate a few (index:label) changes, | ||
201 | * but no more than max_pending_changes */ | ||
202 | unsigned int max_pending_changes; | ||
203 | /* number of elements currently on to_be_changed list */ | ||
204 | unsigned int pending_changes; | ||
205 | |||
197 | /* statistics */ | 206 | /* statistics */ |
198 | unsigned used; /* number of lelements currently on in_use list */ | 207 | unsigned used; /* number of elements currently on in_use list */ |
199 | unsigned long hits, misses, starving, dirty, changed; | 208 | unsigned long hits, misses, starving, locked, changed; |
200 | 209 | ||
201 | /* see below: flag-bits for lru_cache */ | 210 | /* see below: flag-bits for lru_cache */ |
202 | unsigned long flags; | 211 | unsigned long flags; |
203 | 212 | ||
204 | /* when changing the label of an index element */ | ||
205 | unsigned int new_number; | ||
206 | |||
207 | /* for paranoia when changing the label of an index element */ | ||
208 | struct lc_element *changing_element; | ||
209 | 213 | ||
210 | void *lc_private; | 214 | void *lc_private; |
211 | const char *name; | 215 | const char *name; |
@@ -221,10 +225,15 @@ enum { | |||
221 | /* debugging aid, to catch concurrent access early. | 225 | /* debugging aid, to catch concurrent access early. |
222 | * user needs to guarantee exclusive access by proper locking! */ | 226 | * user needs to guarantee exclusive access by proper locking! */ |
223 | __LC_PARANOIA, | 227 | __LC_PARANOIA, |
224 | /* if we need to change the set, but currently there is a changing | 228 | |
225 | * transaction pending, we are "dirty", and must deferr further | 229 | /* annotate that the set is "dirty", possibly accumulating further |
226 | * changing requests */ | 230 | * changes, until a transaction is finally triggered */ |
227 | __LC_DIRTY, | 231 | __LC_DIRTY, |
232 | |||
233 | /* Locked, no further changes allowed. | ||
234 | * Also used to serialize changing transactions. */ | ||
235 | __LC_LOCKED, | ||
236 | |||
228 | /* if we need to change the set, but currently there is no free nor | 237 | /* if we need to change the set, but currently there is no free nor |
229 | * unused element available, we are "starving", and must not give out | 238 | * unused element available, we are "starving", and must not give out |
230 | * further references, to guarantee that eventually some refcnt will | 239 | * further references, to guarantee that eventually some refcnt will |
@@ -236,9 +245,11 @@ enum { | |||
236 | }; | 245 | }; |
237 | #define LC_PARANOIA (1<<__LC_PARANOIA) | 246 | #define LC_PARANOIA (1<<__LC_PARANOIA) |
238 | #define LC_DIRTY (1<<__LC_DIRTY) | 247 | #define LC_DIRTY (1<<__LC_DIRTY) |
248 | #define LC_LOCKED (1<<__LC_LOCKED) | ||
239 | #define LC_STARVING (1<<__LC_STARVING) | 249 | #define LC_STARVING (1<<__LC_STARVING) |
240 | 250 | ||
241 | extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | 251 | extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, |
252 | unsigned max_pending_changes, | ||
242 | unsigned e_count, size_t e_size, size_t e_off); | 253 | unsigned e_count, size_t e_size, size_t e_off); |
243 | extern void lc_reset(struct lru_cache *lc); | 254 | extern void lc_reset(struct lru_cache *lc); |
244 | extern void lc_destroy(struct lru_cache *lc); | 255 | extern void lc_destroy(struct lru_cache *lc); |
@@ -249,7 +260,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); | |||
249 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); | 260 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); |
250 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); | 261 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); |
251 | extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); | 262 | extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); |
252 | extern void lc_changed(struct lru_cache *lc, struct lc_element *e); | 263 | extern void lc_committed(struct lru_cache *lc); |
253 | 264 | ||
254 | struct seq_file; | 265 | struct seq_file; |
255 | extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); | 266 | extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); |
@@ -258,32 +269,40 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char | |||
258 | void (*detail) (struct seq_file *, struct lc_element *)); | 269 | void (*detail) (struct seq_file *, struct lc_element *)); |
259 | 270 | ||
260 | /** | 271 | /** |
261 | * lc_try_lock - can be used to stop lc_get() from changing the tracked set | 272 | * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set |
262 | * @lc: the lru cache to operate on | 273 | * @lc: the lru cache to operate on |
263 | * | 274 | * |
264 | * Note that the reference counts and order on the active and lru lists may | 275 | * Allows (expects) the set to be "dirty". Note that the reference counts and |
265 | * still change. Returns true if we acquired the lock. | 276 | * order on the active and lru lists may still change. Used to serialize |
277 | * changing transactions. Returns true if we acquired the lock. | ||
266 | */ | 278 | */ |
267 | static inline int lc_try_lock(struct lru_cache *lc) | 279 | static inline int lc_try_lock_for_transaction(struct lru_cache *lc) |
268 | { | 280 | { |
269 | return !test_and_set_bit(__LC_DIRTY, &lc->flags); | 281 | return !test_and_set_bit(__LC_LOCKED, &lc->flags); |
270 | } | 282 | } |
271 | 283 | ||
272 | /** | 284 | /** |
285 | * lc_try_lock - variant to stop lc_get() from changing the tracked set | ||
286 | * @lc: the lru cache to operate on | ||
287 | * | ||
288 | * Note that the reference counts and order on the active and lru lists may | ||
289 | * still change. Only works on a "clean" set. Returns true if we acquired the | ||
290 | * lock, which means there are no pending changes, and any further attempt to | ||
291 | * change the set will not succeed until the next lc_unlock(). | ||
292 | */ | ||
293 | extern int lc_try_lock(struct lru_cache *lc); | ||
294 | |||
295 | /** | ||
273 | * lc_unlock - unlock @lc, allow lc_get() to change the set again | 296 | * lc_unlock - unlock @lc, allow lc_get() to change the set again |
274 | * @lc: the lru cache to operate on | 297 | * @lc: the lru cache to operate on |
275 | */ | 298 | */ |
276 | static inline void lc_unlock(struct lru_cache *lc) | 299 | static inline void lc_unlock(struct lru_cache *lc) |
277 | { | 300 | { |
278 | clear_bit(__LC_DIRTY, &lc->flags); | 301 | clear_bit(__LC_DIRTY, &lc->flags); |
279 | smp_mb__after_clear_bit(); | 302 | clear_bit_unlock(__LC_LOCKED, &lc->flags); |
280 | } | 303 | } |
281 | 304 | ||
282 | static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) | 305 | extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); |
283 | { | ||
284 | struct lc_element *e = lc_find(lc, enr); | ||
285 | return e && e->refcnt; | ||
286 | } | ||
287 | 306 | ||
288 | #define lc_entry(ptr, type, member) \ | 307 | #define lc_entry(ptr, type, member) \ |
289 | container_of(ptr, type, member) | 308 | container_of(ptr, type, member) |
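
A simplified sketch of the change/commit cycle implied by the reworked interface; the function and its control flow are invented for illustration. Real users (DRBD's activity log) take their own spinlock around lc_get(), accumulate up to max_pending_changes labels, and write them out in a single on-disk transaction:

static bool example_activate(struct lru_cache *lc, unsigned int enr)
{
	struct lc_element *e;

	/* may start a pending change; NULL means starving, locked, or
	 * max_pending_changes already accumulated -- retry later */
	e = lc_get(lc, enr);
	if (!e)
		return false;

	if (e->lc_number != enr && lc_try_lock_for_transaction(lc)) {
		/* write the accumulated (index:label) changes to stable
		 * storage here, then mark them committed and unlock */
		lc_committed(lc);
		lc_unlock(lc);
	}

	/* the caller later drops the reference with lc_put(lc, e) */
	return true;
}
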
diff --git a/lib/lru_cache.c b/lib/lru_cache.c index a07e7268d7ed..d71d89498943 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c | |||
@@ -44,8 +44,8 @@ MODULE_LICENSE("GPL"); | |||
44 | } while (0) | 44 | } while (0) |
45 | 45 | ||
46 | #define RETURN(x...) do { \ | 46 | #define RETURN(x...) do { \ |
47 | clear_bit(__LC_PARANOIA, &lc->flags); \ | 47 | clear_bit_unlock(__LC_PARANOIA, &lc->flags); \ |
48 | smp_mb__after_clear_bit(); return x ; } while (0) | 48 | return x ; } while (0) |
49 | 49 | ||
50 | /* BUG() if e is not one of the elements tracked by lc */ | 50 | /* BUG() if e is not one of the elements tracked by lc */ |
51 | #define PARANOIA_LC_ELEMENT(lc, e) do { \ | 51 | #define PARANOIA_LC_ELEMENT(lc, e) do { \ |
@@ -55,9 +55,40 @@ MODULE_LICENSE("GPL"); | |||
55 | BUG_ON(i >= lc_->nr_elements); \ | 55 | BUG_ON(i >= lc_->nr_elements); \ |
56 | BUG_ON(lc_->lc_element[i] != e_); } while (0) | 56 | BUG_ON(lc_->lc_element[i] != e_); } while (0) |
57 | 57 | ||
58 | |||
59 | /* We need to atomically | ||
60 | * - try to grab the lock (set LC_LOCKED) | ||
61 | * - only if there is no pending transaction | ||
62 | * (neither LC_DIRTY nor LC_STARVING is set) | ||
63 | * Because of PARANOIA_ENTRY() above abusing lc->flags as well, | ||
64 | * it is not sufficient to just say | ||
65 | * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED); | ||
66 | */ | ||
67 | int lc_try_lock(struct lru_cache *lc) | ||
68 | { | ||
69 | unsigned long val; | ||
70 | do { | ||
71 | val = cmpxchg(&lc->flags, 0, LC_LOCKED); | ||
72 | } while (unlikely (val == LC_PARANOIA)); | ||
73 | /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */ | ||
74 | return 0 == val; | ||
75 | #if 0 | ||
76 | /* Alternative approach, spin in case someone enters or leaves a | ||
77 | * PARANOIA_ENTRY()/RETURN() section. */ | ||
78 | unsigned long old, new, val; | ||
79 | do { | ||
80 | old = lc->flags & LC_PARANOIA; | ||
81 | new = old | LC_LOCKED; | ||
82 | val = cmpxchg(&lc->flags, old, new); | ||
83 | } while (unlikely (val == (old ^ LC_PARANOIA))); | ||
84 | return old == val; | ||
85 | #endif | ||
86 | } | ||
87 | |||
58 | /** | 88 | /** |
59 | * lc_create - prepares to track objects in an active set | 89 | * lc_create - prepares to track objects in an active set |
60 | * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details | 90 | * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details |
91 | * @max_pending_changes: maximum changes to accumulate until a transaction is required | ||
61 | * @e_count: number of elements allowed to be active simultaneously | 92 | * @e_count: number of elements allowed to be active simultaneously |
62 | * @e_size: size of the tracked objects | 93 | * @e_size: size of the tracked objects |
63 | * @e_off: offset to the &struct lc_element member in a tracked object | 94 | * @e_off: offset to the &struct lc_element member in a tracked object |
@@ -66,6 +97,7 @@ MODULE_LICENSE("GPL"); | |||
66 | * or NULL on (allocation) failure. | 97 | * or NULL on (allocation) failure. |
67 | */ | 98 | */ |
68 | struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | 99 | struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, |
100 | unsigned max_pending_changes, | ||
69 | unsigned e_count, size_t e_size, size_t e_off) | 101 | unsigned e_count, size_t e_size, size_t e_off) |
70 | { | 102 | { |
71 | struct hlist_head *slot = NULL; | 103 | struct hlist_head *slot = NULL; |
@@ -98,12 +130,13 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | |||
98 | INIT_LIST_HEAD(&lc->in_use); | 130 | INIT_LIST_HEAD(&lc->in_use); |
99 | INIT_LIST_HEAD(&lc->lru); | 131 | INIT_LIST_HEAD(&lc->lru); |
100 | INIT_LIST_HEAD(&lc->free); | 132 | INIT_LIST_HEAD(&lc->free); |
133 | INIT_LIST_HEAD(&lc->to_be_changed); | ||
101 | 134 | ||
102 | lc->name = name; | 135 | lc->name = name; |
103 | lc->element_size = e_size; | 136 | lc->element_size = e_size; |
104 | lc->element_off = e_off; | 137 | lc->element_off = e_off; |
105 | lc->nr_elements = e_count; | 138 | lc->nr_elements = e_count; |
106 | lc->new_number = LC_FREE; | 139 | lc->max_pending_changes = max_pending_changes; |
107 | lc->lc_cache = cache; | 140 | lc->lc_cache = cache; |
108 | lc->lc_element = element; | 141 | lc->lc_element = element; |
109 | lc->lc_slot = slot; | 142 | lc->lc_slot = slot; |
@@ -117,6 +150,7 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | |||
117 | e = p + e_off; | 150 | e = p + e_off; |
118 | e->lc_index = i; | 151 | e->lc_index = i; |
119 | e->lc_number = LC_FREE; | 152 | e->lc_number = LC_FREE; |
153 | e->lc_new_number = LC_FREE; | ||
120 | list_add(&e->list, &lc->free); | 154 | list_add(&e->list, &lc->free); |
121 | element[i] = e; | 155 | element[i] = e; |
122 | } | 156 | } |
@@ -175,15 +209,15 @@ void lc_reset(struct lru_cache *lc) | |||
175 | INIT_LIST_HEAD(&lc->in_use); | 209 | INIT_LIST_HEAD(&lc->in_use); |
176 | INIT_LIST_HEAD(&lc->lru); | 210 | INIT_LIST_HEAD(&lc->lru); |
177 | INIT_LIST_HEAD(&lc->free); | 211 | INIT_LIST_HEAD(&lc->free); |
212 | INIT_LIST_HEAD(&lc->to_be_changed); | ||
178 | lc->used = 0; | 213 | lc->used = 0; |
179 | lc->hits = 0; | 214 | lc->hits = 0; |
180 | lc->misses = 0; | 215 | lc->misses = 0; |
181 | lc->starving = 0; | 216 | lc->starving = 0; |
182 | lc->dirty = 0; | 217 | lc->locked = 0; |
183 | lc->changed = 0; | 218 | lc->changed = 0; |
219 | lc->pending_changes = 0; | ||
184 | lc->flags = 0; | 220 | lc->flags = 0; |
185 | lc->changing_element = NULL; | ||
186 | lc->new_number = LC_FREE; | ||
187 | memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); | 221 | memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); |
188 | 222 | ||
189 | for (i = 0; i < lc->nr_elements; i++) { | 223 | for (i = 0; i < lc->nr_elements; i++) { |
@@ -194,6 +228,7 @@ void lc_reset(struct lru_cache *lc) | |||
194 | /* re-init it */ | 228 | /* re-init it */ |
195 | e->lc_index = i; | 229 | e->lc_index = i; |
196 | e->lc_number = LC_FREE; | 230 | e->lc_number = LC_FREE; |
231 | e->lc_new_number = LC_FREE; | ||
197 | list_add(&e->list, &lc->free); | 232 | list_add(&e->list, &lc->free); |
198 | } | 233 | } |
199 | } | 234 | } |
@@ -208,14 +243,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) | |||
208 | /* NOTE: | 243 | /* NOTE: |
209 | * total calls to lc_get are | 244 | * total calls to lc_get are |
210 | * (starving + hits + misses) | 245 | * (starving + hits + misses) |
211 | * misses include "dirty" count (update from another thread in | 246 | * misses include "locked" count (update from another thread in
212 | * progress) and "changed", when this in fact led to a successful | 247 | * progress) and "changed", when this in fact led to a successful
213 | * update of the cache. | 248 | * update of the cache. |
214 | */ | 249 | */ |
215 | return seq_printf(seq, "\t%s: used:%u/%u " | 250 | return seq_printf(seq, "\t%s: used:%u/%u " |
216 | "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", | 251 | "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", |
217 | lc->name, lc->used, lc->nr_elements, | 252 | lc->name, lc->used, lc->nr_elements, |
218 | lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); | 253 | lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); |
219 | } | 254 | } |
220 | 255 | ||
221 | static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) | 256 | static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) |
@@ -224,16 +259,8 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) | |||
224 | } | 259 | } |
225 | 260 | ||
226 | 261 | ||
227 | /** | 262 | static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, |
228 | * lc_find - find element by label, if present in the hash table | 263 | bool include_changing) |
229 | * @lc: The lru_cache object | ||
230 | * @enr: element number | ||
231 | * | ||
232 | * Returns the pointer to an element, if the element with the requested | ||
233 | * "label" or element number is present in the hash table, | ||
234 | * or NULL if not found. Does not change the refcnt. | ||
235 | */ | ||
236 | struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) | ||
237 | { | 264 | { |
238 | struct hlist_node *n; | 265 | struct hlist_node *n; |
239 | struct lc_element *e; | 266 | struct lc_element *e; |
@@ -241,29 +268,48 @@ struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) | |||
241 | BUG_ON(!lc); | 268 | BUG_ON(!lc); |
242 | BUG_ON(!lc->nr_elements); | 269 | BUG_ON(!lc->nr_elements); |
243 | hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { | 270 | hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { |
244 | if (e->lc_number == enr) | 271 | /* "about to be changed" elements, pending transaction commit, |
272 | * are hashed by their "new number". "Normal" elements have | ||
273 | * lc_number == lc_new_number. */ | ||
274 | if (e->lc_new_number != enr) | ||
275 | continue; | ||
276 | if (e->lc_new_number == e->lc_number || include_changing) | ||
245 | return e; | 277 | return e; |
278 | break; | ||
246 | } | 279 | } |
247 | return NULL; | 280 | return NULL; |
248 | } | 281 | } |
249 | 282 | ||
250 | /* returned element will be "recycled" immediately */ | 283 | /** |
251 | static struct lc_element *lc_evict(struct lru_cache *lc) | 284 | * lc_find - find element by label, if present in the hash table |
285 | * @lc: The lru_cache object | ||
286 | * @enr: element number | ||
287 | * | ||
288 | * Returns the pointer to an element, if the element with the requested | ||
289 | * "label" or element number is present in the hash table, | ||
290 | * or NULL if not found. Does not change the refcnt. | ||
291 | * Ignores elements that are "about to be used", i.e. not yet in the active | ||
292 | * set, but still pending transaction commit. | ||
293 | */ | ||
294 | struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) | ||
252 | { | 295 | { |
253 | struct list_head *n; | 296 | return __lc_find(lc, enr, 0); |
254 | struct lc_element *e; | 297 | } |
255 | |||
256 | if (list_empty(&lc->lru)) | ||
257 | return NULL; | ||
258 | |||
259 | n = lc->lru.prev; | ||
260 | e = list_entry(n, struct lc_element, list); | ||
261 | |||
262 | PARANOIA_LC_ELEMENT(lc, e); | ||
263 | 298 | ||
264 | list_del(&e->list); | 299 | /** |
265 | hlist_del(&e->colision); | 300 | * lc_is_used - find element by label |
266 | return e; | 301 | * @lc: The lru_cache object |
302 | * @enr: element number | ||
303 | * | ||
304 | * Returns true, if the element with the requested "label" or element number is | ||
305 | * present in the hash table, and is used (refcnt > 0). | ||
306 | * Also finds elements that are not _currently_ used but only "about to be | ||
307 | * used", i.e. on the "to_be_changed" list, pending transaction commit. | ||
308 | */ | ||
309 | bool lc_is_used(struct lru_cache *lc, unsigned int enr) | ||
310 | { | ||
311 | struct lc_element *e = __lc_find(lc, enr, 1); | ||
312 | return e && e->refcnt; | ||
267 | } | 313 | } |
268 | 314 | ||
269 | /** | 315 | /** |
@@ -280,22 +326,34 @@ void lc_del(struct lru_cache *lc, struct lc_element *e) | |||
280 | PARANOIA_LC_ELEMENT(lc, e); | 326 | PARANOIA_LC_ELEMENT(lc, e); |
281 | BUG_ON(e->refcnt); | 327 | BUG_ON(e->refcnt); |
282 | 328 | ||
283 | e->lc_number = LC_FREE; | 329 | e->lc_number = e->lc_new_number = LC_FREE; |
284 | hlist_del_init(&e->colision); | 330 | hlist_del_init(&e->colision); |
285 | list_move(&e->list, &lc->free); | 331 | list_move(&e->list, &lc->free); |
286 | RETURN(); | 332 | RETURN(); |
287 | } | 333 | } |
288 | 334 | ||
289 | static struct lc_element *lc_get_unused_element(struct lru_cache *lc) | 335 | static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number) |
290 | { | 336 | { |
291 | struct list_head *n; | 337 | struct list_head *n; |
338 | struct lc_element *e; | ||
339 | |||
340 | if (!list_empty(&lc->free)) | ||
341 | n = lc->free.next; | ||
342 | else if (!list_empty(&lc->lru)) | ||
343 | n = lc->lru.prev; | ||
344 | else | ||
345 | return NULL; | ||
346 | |||
347 | e = list_entry(n, struct lc_element, list); | ||
348 | PARANOIA_LC_ELEMENT(lc, e); | ||
292 | 349 | ||
293 | if (list_empty(&lc->free)) | 350 | e->lc_new_number = new_number; |
294 | return lc_evict(lc); | 351 | if (!hlist_unhashed(&e->colision)) |
352 | __hlist_del(&e->colision); | ||
353 | hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); | ||
354 | list_move(&e->list, &lc->to_be_changed); | ||
295 | 355 | ||
296 | n = lc->free.next; | 356 | return e; |
297 | list_del(n); | ||
298 | return list_entry(n, struct lc_element, list); | ||
299 | } | 357 | } |
300 | 358 | ||
301 | static int lc_unused_element_available(struct lru_cache *lc) | 359 | static int lc_unused_element_available(struct lru_cache *lc) |
@@ -308,45 +366,7 @@ static int lc_unused_element_available(struct lru_cache *lc) | |||
308 | return 0; | 366 | return 0; |
309 | } | 367 | } |
310 | 368 | ||
311 | 369 | static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) | |
312 | /** | ||
313 | * lc_get - get element by label, maybe change the active set | ||
314 | * @lc: the lru cache to operate on | ||
315 | * @enr: the label to look up | ||
316 | * | ||
317 | * Finds an element in the cache, increases its usage count, | ||
318 | * "touches" and returns it. | ||
319 | * | ||
320 | * In case the requested number is not present, it needs to be added to the | ||
321 | * cache. Therefore it is possible that an other element becomes evicted from | ||
322 | * the cache. In either case, the user is notified so he is able to e.g. keep | ||
323 | * a persistent log of the cache changes, and therefore the objects in use. | ||
324 | * | ||
325 | * Return values: | ||
326 | * NULL | ||
327 | * The cache was marked %LC_STARVING, | ||
328 | * or the requested label was not in the active set | ||
329 | * and a changing transaction is still pending (@lc was marked %LC_DIRTY). | ||
330 | * Or no unused or free element could be recycled (@lc will be marked as | ||
331 | * %LC_STARVING, blocking further lc_get() operations). | ||
332 | * | ||
333 | * pointer to the element with the REQUESTED element number. | ||
334 | * In this case, it can be used right away | ||
335 | * | ||
336 | * pointer to an UNUSED element with some different element number, | ||
337 | * where that different number may also be %LC_FREE. | ||
338 | * | ||
339 | * In this case, the cache is marked %LC_DIRTY (blocking further changes), | ||
340 | * and the returned element pointer is removed from the lru list and | ||
341 | * hash collision chains. The user now should do whatever housekeeping | ||
342 | * is necessary. | ||
343 | * Then he must call lc_changed(lc,element_pointer), to finish | ||
344 | * the change. | ||
345 | * | ||
346 | * NOTE: The user needs to check the lc_number on EACH use, so he recognizes | ||
347 | * any cache set change. | ||
348 | */ | ||
349 | struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | ||
350 | { | 370 | { |
351 | struct lc_element *e; | 371 | struct lc_element *e; |
352 | 372 | ||
@@ -356,8 +376,12 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | |||
356 | RETURN(NULL); | 376 | RETURN(NULL); |
357 | } | 377 | } |
358 | 378 | ||
359 | e = lc_find(lc, enr); | 379 | e = __lc_find(lc, enr, 1); |
360 | if (e) { | 380 | /* if lc_new_number != lc_number, |
381 | * this enr is currently being pulled in already, | ||
382 | * and will be available once the pending transaction | ||
383 | * has been committed. */ | ||
384 | if (e && e->lc_new_number == e->lc_number) { | ||
361 | ++lc->hits; | 385 | ++lc->hits; |
362 | if (e->refcnt++ == 0) | 386 | if (e->refcnt++ == 0) |
363 | lc->used++; | 387 | lc->used++; |
@@ -366,6 +390,26 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | |||
366 | } | 390 | } |
367 | 391 | ||
368 | ++lc->misses; | 392 | ++lc->misses; |
393 | if (!may_change) | ||
394 | RETURN(NULL); | ||
395 | |||
396 | /* It has been found above, but on the "to_be_changed" list, not yet | ||
397 | * committed. Don't pull it in twice, wait for the transaction, then | ||
398 | * try again */ | ||
399 | if (e) | ||
400 | RETURN(NULL); | ||
401 | |||
402 | /* To avoid races with lc_try_lock(), first, mark us dirty | ||
403 | * (using test_and_set_bit, as it implies memory barriers), ... */ | ||
404 | test_and_set_bit(__LC_DIRTY, &lc->flags); | ||
405 | |||
406 | /* ... only then check if it is locked anyways. If lc_unlock clears | ||
407 | * the dirty bit again, that's not a problem, we will come here again. | ||
408 | */ | ||
409 | if (test_bit(__LC_LOCKED, &lc->flags)) { | ||
410 | ++lc->locked; | ||
411 | RETURN(NULL); | ||
412 | } | ||
369 | 413 | ||
370 | /* In case there is nothing available and we can not kick out | 414 | /* In case there is nothing available and we can not kick out |
371 | * the LRU element, we have to wait ... | 415 | * the LRU element, we have to wait ... |
@@ -375,71 +419,109 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | |||
375 | RETURN(NULL); | 419 | RETURN(NULL); |
376 | } | 420 | } |
377 | 421 | ||
378 | /* it was not present in the active set. | 422 | /* It was not present in the active set. We are going to recycle an |
379 | * we are going to recycle an unused (or even "free") element. | 423 | * unused (or even "free") element, but we won't accumulate more than |
380 | * user may need to commit a transaction to record that change. | 424 | * max_pending_changes changes. */ |
381 | * we serialize on flags & TF_DIRTY */ | 425 | if (lc->pending_changes >= lc->max_pending_changes) |
382 | if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { | ||
383 | ++lc->dirty; | ||
384 | RETURN(NULL); | 426 | RETURN(NULL); |
385 | } | ||
386 | 427 | ||
387 | e = lc_get_unused_element(lc); | 428 | e = lc_prepare_for_change(lc, enr); |
388 | BUG_ON(!e); | 429 | BUG_ON(!e); |
389 | 430 | ||
390 | clear_bit(__LC_STARVING, &lc->flags); | 431 | clear_bit(__LC_STARVING, &lc->flags); |
391 | BUG_ON(++e->refcnt != 1); | 432 | BUG_ON(++e->refcnt != 1); |
392 | lc->used++; | 433 | lc->used++; |
393 | 434 | lc->pending_changes++; | |
394 | lc->changing_element = e; | ||
395 | lc->new_number = enr; | ||
396 | 435 | ||
397 | RETURN(e); | 436 | RETURN(e); |
398 | } | 437 | } |
399 | 438 | ||
400 | /* similar to lc_get, | 439 | /** |
401 | * but only gets a new reference on an existing element. | 440 | * lc_get - get element by label, maybe change the active set |
402 | * you either get the requested element, or NULL. | 441 | * @lc: the lru cache to operate on |
403 | * will be consolidated into one function. | 442 | * @enr: the label to look up |
443 | * | ||
444 | * Finds an element in the cache, increases its usage count, | ||
445 | * "touches" and returns it. | ||
446 | * | ||
447 | * In case the requested number is not present, it needs to be added to the | ||
448 | * cache. Therefore it is possible that another element becomes evicted from | ||
449 | * the cache. In either case, the user is notified so he is able to e.g. keep | ||
450 | * a persistent log of the cache changes, and therefore the objects in use. | ||
451 | * | ||
452 | * Return values: | ||
453 | * NULL | ||
454 | * The cache was marked %LC_STARVING, | ||
455 | * or the requested label was not in the active set | ||
456 | * and a changing transaction is still pending (@lc was marked %LC_DIRTY). | ||
457 | * Or no unused or free element could be recycled (@lc will be marked as | ||
458 | * %LC_STARVING, blocking further lc_get() operations). | ||
459 | * | ||
460 | * pointer to the element with the REQUESTED element number. | ||
461 | * In this case, it can be used right away | ||
462 | * | ||
463 | * pointer to an UNUSED element with some different element number, | ||
464 | * where that different number may also be %LC_FREE. | ||
465 | * | ||
466 | * In this case, the cache is marked %LC_DIRTY, | ||
467 | * so lc_try_lock() will no longer succeed. | ||
468 | * The returned element pointer is moved to the "to_be_changed" list, | ||
469 | * and registered with the new element number on the hash collision chains, | ||
470 | * so it is possible to pick it up from lc_is_used(). | ||
471 | * Up to "max_pending_changes" (see lc_create()) can be accumulated. | ||
472 | * The user now should do whatever housekeeping is necessary, | ||
473 | * typically serialize on lc_try_lock_for_transaction(), then call | ||
474 | * lc_committed(lc) and lc_unlock(), to finish the change. | ||
475 | * | ||
476 | * NOTE: The user needs to check the lc_number on EACH use, so he recognizes | ||
477 | * any cache set change. | ||
404 | */ | 478 | */ |
405 | struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) | 479 | struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) |
406 | { | 480 | { |
407 | struct lc_element *e; | 481 | return __lc_get(lc, enr, 1); |
408 | 482 | } | |
409 | PARANOIA_ENTRY(); | ||
410 | if (lc->flags & LC_STARVING) { | ||
411 | ++lc->starving; | ||
412 | RETURN(NULL); | ||
413 | } | ||
414 | 483 | ||
415 | e = lc_find(lc, enr); | 484 | /** |
416 | if (e) { | 485 | * lc_try_get - get element by label, if present; do not change the active set |
417 | ++lc->hits; | 486 | * @lc: the lru cache to operate on |
418 | if (e->refcnt++ == 0) | 487 | * @enr: the label to look up |
419 | lc->used++; | 488 | * |
420 | list_move(&e->list, &lc->in_use); /* Not evictable... */ | 489 | * Finds an element in the cache, increases its usage count, |
421 | } | 490 | * "touches" and returns it. |
422 | RETURN(e); | 491 | * |
492 | * Return values: | ||
493 | * NULL | ||
494 | * The cache was marked %LC_STARVING, | ||
495 | * or the requested label was not in the active set | ||
496 | * | ||
497 | * pointer to the element with the REQUESTED element number. | ||
498 | * In this case, it can be used right away | ||
499 | */ | ||
500 | struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) | ||
501 | { | ||
502 | return __lc_get(lc, enr, 0); | ||
423 | } | 503 | } |
424 | 504 | ||
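After this rework, lc_get() and lc_try_get() are thin wrappers around the shared __lc_get() helper; what a caller has to honor is the contract spelled out in the kernel-doc above: check lc_number on every returned element, and either commit the pending change or back off and retry. A minimal sketch of that calling pattern, assuming a hypothetical cache created with lc_create() and a hypothetical external spinlock (the library leaves locking to its user, as DRBD does with its own al_lock):

	#include <linux/lru_cache.h>
	#include <linux/spinlock.h>

	/* hypothetical caller state, not part of this patch */
	static struct lru_cache *my_cache;	/* set up via lc_create() */
	static DEFINE_SPINLOCK(my_lock);	/* serializes all lc_* calls */

	/*
	 * Try to pin label @enr in the active set.  Returns NULL if the
	 * caller must back off and retry (cache starving, or too many
	 * uncommitted changes).  If the returned element has
	 * e->lc_number != enr, a slot was picked for relabeling and the
	 * change still has to be committed (see lc_committed() below).
	 */
	static struct lc_element *pin_label(unsigned int enr)
	{
		struct lc_element *e;

		spin_lock_irq(&my_lock);
		e = lc_get(my_cache, enr);
		spin_unlock_irq(&my_lock);
		return e;
	}

	/* Drop the reference taken by lc_get()/lc_try_get(). */
	static void unpin_label(struct lc_element *e)
	{
		spin_lock_irq(&my_lock);
		lc_put(my_cache, e);
		spin_unlock_irq(&my_lock);
	}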
425 | /** | 505 | /** |
426 | * lc_changed - tell @lc that the change has been recorded | 506 | * lc_committed - tell @lc that pending changes have been recorded |
427 | * @lc: the lru cache to operate on | 507 | * @lc: the lru cache to operate on |
428 | * @e: the element pending label change | 508 | * |
509 | * User is expected to serialize on explicit lc_try_lock_for_transaction() | ||
510 | * before the transaction is started, and later needs to lc_unlock() explicitly | ||
511 | * as well. | ||
429 | */ | 512 | */ |
430 | void lc_changed(struct lru_cache *lc, struct lc_element *e) | 513 | void lc_committed(struct lru_cache *lc) |
431 | { | 514 | { |
515 | struct lc_element *e, *tmp; | ||
516 | |||
432 | PARANOIA_ENTRY(); | 517 | PARANOIA_ENTRY(); |
433 | BUG_ON(e != lc->changing_element); | 518 | list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) { |
434 | PARANOIA_LC_ELEMENT(lc, e); | 519 | /* count number of changes, not number of transactions */ |
435 | ++lc->changed; | 520 | ++lc->changed; |
436 | e->lc_number = lc->new_number; | 521 | e->lc_number = e->lc_new_number; |
437 | list_add(&e->list, &lc->in_use); | 522 | list_move(&e->list, &lc->in_use); |
438 | hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); | 523 | } |
439 | lc->changing_element = NULL; | 524 | lc->pending_changes = 0; |
440 | lc->new_number = LC_FREE; | ||
441 | clear_bit(__LC_DIRTY, &lc->flags); | ||
442 | smp_mb__after_clear_bit(); | ||
443 | RETURN(); | 525 | RETURN(); |
444 | } | 526 | } |
445 | 527 | ||
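lc_committed() replaces the per-element lc_changed(): instead of finishing one label change at a time, the user accumulates up to max_pending_changes relabelings via lc_get(), persists them in a single transaction, and then commits them all at once. A sketch of that sequence, under the same assumptions as the previous sketch; write_transaction() is a hypothetical stand-in for however the user persists the batched changes (for DRBD, an on-disk activity-log transaction block):

	#include <linux/lru_cache.h>
	#include <linux/spinlock.h>

	/* hypothetical: persist the pending label changes; not part of the
	 * lru_cache API */
	extern void write_transaction(struct lru_cache *lc);

	/* Commit all label changes accumulated by lc_get() in one go. */
	static void commit_pending_changes(struct lru_cache *lc, spinlock_t *lock)
	{
		if (!lc_try_lock_for_transaction(lc))
			return;			/* another committer is active */

		if (lc->pending_changes) {
			write_transaction(lc);	/* persist the batched changes */
			spin_lock_irq(lock);
			lc_committed(lc);	/* lc_number := lc_new_number,
						 * move elements to in_use */
			spin_unlock_irq(lock);
		}
		lc_unlock(lc);			/* clear __LC_DIRTY, release the
						 * transaction lock */
	}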
@@ -458,13 +540,12 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) | |||
458 | PARANOIA_ENTRY(); | 540 | PARANOIA_ENTRY(); |
459 | PARANOIA_LC_ELEMENT(lc, e); | 541 | PARANOIA_LC_ELEMENT(lc, e); |
460 | BUG_ON(e->refcnt == 0); | 542 | BUG_ON(e->refcnt == 0); |
461 | BUG_ON(e == lc->changing_element); | 543 | BUG_ON(e->lc_number != e->lc_new_number); |
462 | if (--e->refcnt == 0) { | 544 | if (--e->refcnt == 0) { |
463 | /* move it to the front of LRU. */ | 545 | /* move it to the front of LRU. */ |
464 | list_move(&e->list, &lc->lru); | 546 | list_move(&e->list, &lc->lru); |
465 | lc->used--; | 547 | lc->used--; |
466 | clear_bit(__LC_STARVING, &lc->flags); | 548 | clear_bit_unlock(__LC_STARVING, &lc->flags); |
467 | smp_mb__after_clear_bit(); | ||
468 | } | 549 | } |
469 | RETURN(e->refcnt); | 550 | RETURN(e->refcnt); |
470 | } | 551 | } |
@@ -504,16 +585,24 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) | |||
504 | void lc_set(struct lru_cache *lc, unsigned int enr, int index) | 585 | void lc_set(struct lru_cache *lc, unsigned int enr, int index) |
505 | { | 586 | { |
506 | struct lc_element *e; | 587 | struct lc_element *e; |
588 | struct list_head *lh; | ||
507 | 589 | ||
508 | if (index < 0 || index >= lc->nr_elements) | 590 | if (index < 0 || index >= lc->nr_elements) |
509 | return; | 591 | return; |
510 | 592 | ||
511 | e = lc_element_by_index(lc, index); | 593 | e = lc_element_by_index(lc, index); |
512 | e->lc_number = enr; | 594 | BUG_ON(e->lc_number != e->lc_new_number); |
595 | BUG_ON(e->refcnt != 0); | ||
513 | 596 | ||
597 | e->lc_number = e->lc_new_number = enr; | ||
514 | hlist_del_init(&e->colision); | 598 | hlist_del_init(&e->colision); |
515 | hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); | 599 | if (enr == LC_FREE) |
516 | list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); | 600 | lh = &lc->free; |
601 | else { | ||
602 | hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); | ||
603 | lh = &lc->lru; | ||
604 | } | ||
605 | list_move(&e->list, lh); | ||
517 | } | 606 | } |
518 | 607 | ||
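lc_set() now keeps lc_new_number in sync with lc_number and, with the new free list, files an element labeled LC_FREE onto lc->free instead of the LRU. Its typical use is to preload the active set from persisted context, e.g. when re-reading an on-disk activity log after attach. A small sketch under that assumption; the on_disk[] array of labels is a hypothetical stand-in for whatever the user read back:

	#include <linux/lru_cache.h>

	/*
	 * Restore element labels from persisted context; LC_FREE entries
	 * mark slots that were unused when the context was written.
	 */
	static void restore_active_set(struct lru_cache *lc,
				       const unsigned int *on_disk,
				       unsigned int nr)
	{
		unsigned int i;

		for (i = 0; i < nr && i < lc->nr_elements; i++)
			lc_set(lc, on_disk[i], i);
	}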
519 | /** | 608 | /** |
@@ -553,8 +642,10 @@ EXPORT_SYMBOL(lc_try_get); | |||
553 | EXPORT_SYMBOL(lc_find); | 642 | EXPORT_SYMBOL(lc_find); |
554 | EXPORT_SYMBOL(lc_get); | 643 | EXPORT_SYMBOL(lc_get); |
555 | EXPORT_SYMBOL(lc_put); | 644 | EXPORT_SYMBOL(lc_put); |
556 | EXPORT_SYMBOL(lc_changed); | 645 | EXPORT_SYMBOL(lc_committed); |
557 | EXPORT_SYMBOL(lc_element_by_index); | 646 | EXPORT_SYMBOL(lc_element_by_index); |
558 | EXPORT_SYMBOL(lc_index_of); | 647 | EXPORT_SYMBOL(lc_index_of); |
559 | EXPORT_SYMBOL(lc_seq_printf_stats); | 648 | EXPORT_SYMBOL(lc_seq_printf_stats); |
560 | EXPORT_SYMBOL(lc_seq_dump_details); | 649 | EXPORT_SYMBOL(lc_seq_dump_details); |
650 | EXPORT_SYMBOL(lc_try_lock); | ||
651 | EXPORT_SYMBOL(lc_is_used); | ||