diff options
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 246 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 13 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 179 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 251 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 200 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 10 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 16 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 192 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 8 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 28 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_strings.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 24 |
12 files changed, 845 insertions, 323 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 92510f8ad013..6608076dc39e 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -104,7 +104,6 @@ struct update_al_work { | |||
104 | int err; | 104 | int err; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | static int al_write_transaction(struct drbd_conf *mdev); | ||
108 | 107 | ||
109 | void *drbd_md_get_buffer(struct drbd_conf *mdev) | 108 | void *drbd_md_get_buffer(struct drbd_conf *mdev) |
110 | { | 109 | { |
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
168 | bio->bi_end_io = drbd_md_io_complete; | 167 | bio->bi_end_io = drbd_md_io_complete; |
169 | bio->bi_rw = rw; | 168 | bio->bi_rw = rw; |
170 | 169 | ||
171 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ | 170 | if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL) |
171 | /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ | ||
172 | ; | ||
173 | else if (!get_ldev_if_state(mdev, D_ATTACHING)) { | ||
174 | /* Corresponding put_ldev in drbd_md_io_complete() */ | ||
172 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); | 175 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); |
173 | err = -ENODEV; | 176 | err = -ENODEV; |
174 | goto out; | 177 | goto out; |
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
199 | 202 | ||
200 | BUG_ON(!bdev->md_bdev); | 203 | BUG_ON(!bdev->md_bdev); |
201 | 204 | ||
202 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", | 205 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", |
203 | current->comm, current->pid, __func__, | 206 | current->comm, current->pid, __func__, |
204 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 207 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", |
208 | (void*)_RET_IP_ ); | ||
205 | 209 | ||
206 | if (sector < drbd_md_first_sector(bdev) || | 210 | if (sector < drbd_md_first_sector(bdev) || |
207 | sector + 7 > drbd_md_last_sector(bdev)) | 211 | sector + 7 > drbd_md_last_sector(bdev)) |
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
209 | current->comm, current->pid, __func__, | 213 | current->comm, current->pid, __func__, |
210 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 214 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
211 | 215 | ||
212 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); | 216 | /* we do all our meta data IO in aligned 4k blocks. */ |
217 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); | ||
213 | if (err) { | 218 | if (err) { |
214 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", | 219 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", |
215 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); | 220 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); |
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
217 | return err; | 222 | return err; |
218 | } | 223 | } |
219 | 224 | ||
220 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | 225 | static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr) |
221 | { | 226 | { |
222 | struct lc_element *al_ext; | ||
223 | struct lc_element *tmp; | 227 | struct lc_element *tmp; |
224 | int wake; | ||
225 | |||
226 | spin_lock_irq(&mdev->al_lock); | ||
227 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | 228 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); |
228 | if (unlikely(tmp != NULL)) { | 229 | if (unlikely(tmp != NULL)) { |
229 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | 230 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); |
230 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | 231 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) |
231 | wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); | 232 | return bm_ext; |
232 | spin_unlock_irq(&mdev->al_lock); | 233 | } |
233 | if (wake) | 234 | return NULL; |
234 | wake_up(&mdev->al_wait); | 235 | } |
235 | return NULL; | 236 | |
236 | } | 237 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock) |
238 | { | ||
239 | struct lc_element *al_ext; | ||
240 | struct bm_extent *bm_ext; | ||
241 | int wake; | ||
242 | |||
243 | spin_lock_irq(&mdev->al_lock); | ||
244 | bm_ext = find_active_resync_extent(mdev, enr); | ||
245 | if (bm_ext) { | ||
246 | wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); | ||
247 | spin_unlock_irq(&mdev->al_lock); | ||
248 | if (wake) | ||
249 | wake_up(&mdev->al_wait); | ||
250 | return NULL; | ||
237 | } | 251 | } |
238 | al_ext = lc_get(mdev->act_log, enr); | 252 | if (nonblock) |
253 | al_ext = lc_try_get(mdev->act_log, enr); | ||
254 | else | ||
255 | al_ext = lc_get(mdev->act_log, enr); | ||
239 | spin_unlock_irq(&mdev->al_lock); | 256 | spin_unlock_irq(&mdev->al_lock); |
240 | return al_ext; | 257 | return al_ext; |
241 | } | 258 | } |
242 | 259 | ||
243 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | 260 | bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) |
244 | { | 261 | { |
245 | /* for bios crossing activity log extent boundaries, | 262 | /* for bios crossing activity log extent boundaries, |
246 | * we may need to activate two extents in one go */ | 263 | * we may need to activate two extents in one go */ |
247 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | 264 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); |
248 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | 265 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); |
249 | unsigned enr; | ||
250 | bool locked = false; | ||
251 | 266 | ||
267 | D_ASSERT((unsigned)(last - first) <= 1); | ||
268 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | ||
269 | |||
270 | /* FIXME figure out a fast path for bios crossing AL extent boundaries */ | ||
271 | if (first != last) | ||
272 | return false; | ||
273 | |||
274 | return _al_get(mdev, first, true); | ||
275 | } | ||
276 | |||
277 | bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) | ||
278 | { | ||
279 | /* for bios crossing activity log extent boundaries, | ||
280 | * we may need to activate two extents in one go */ | ||
281 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
282 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
283 | unsigned enr; | ||
284 | bool need_transaction = false; | ||
252 | 285 | ||
253 | D_ASSERT(first <= last); | 286 | D_ASSERT(first <= last); |
254 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | 287 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); |
255 | 288 | ||
256 | for (enr = first; enr <= last; enr++) | 289 | for (enr = first; enr <= last; enr++) { |
257 | wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); | 290 | struct lc_element *al_ext; |
291 | wait_event(mdev->al_wait, | ||
292 | (al_ext = _al_get(mdev, enr, false)) != NULL); | ||
293 | if (al_ext->lc_number != enr) | ||
294 | need_transaction = true; | ||
295 | } | ||
296 | return need_transaction; | ||
297 | } | ||
298 | |||
299 | static int al_write_transaction(struct drbd_conf *mdev, bool delegate); | ||
300 | |||
301 | /* When called through generic_make_request(), we must delegate | ||
302 | * activity log I/O to the worker thread: a further request | ||
303 | * submitted via generic_make_request() within the same task | ||
304 | * would be queued on current->bio_list, and would only start | ||
305 | * after this function returns (see generic_make_request()). | ||
306 | * | ||
307 | * However, if we *are* the worker, we must not delegate to ourselves. | ||
308 | */ | ||
309 | |||
310 | /* | ||
311 | * @delegate: delegate activity log I/O to the worker thread | ||
312 | */ | ||
313 | void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate) | ||
314 | { | ||
315 | bool locked = false; | ||
316 | |||
317 | BUG_ON(delegate && current == mdev->tconn->worker.task); | ||
258 | 318 | ||
259 | /* Serialize multiple transactions. | 319 | /* Serialize multiple transactions. |
260 | * This uses test_and_set_bit, memory barrier is implicit. | 320 | * This uses test_and_set_bit, memory barrier is implicit. |
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
264 | (locked = lc_try_lock_for_transaction(mdev->act_log))); | 324 | (locked = lc_try_lock_for_transaction(mdev->act_log))); |
265 | 325 | ||
266 | if (locked) { | 326 | if (locked) { |
267 | /* drbd_al_write_transaction(mdev,al_ext,enr); | ||
268 | * recurses into generic_make_request(), which | ||
269 | * disallows recursion, bios being serialized on the | ||
270 | * current->bio_tail list now. | ||
271 | * we have to delegate updates to the activity log | ||
272 | * to the worker thread. */ | ||
273 | |||
274 | /* Double check: it may have been committed by someone else, | 327 | /* Double check: it may have been committed by someone else, |
275 | * while we have been waiting for the lock. */ | 328 | * while we have been waiting for the lock. */ |
276 | if (mdev->act_log->pending_changes) { | 329 | if (mdev->act_log->pending_changes) { |
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
280 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; | 333 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; |
281 | rcu_read_unlock(); | 334 | rcu_read_unlock(); |
282 | 335 | ||
283 | if (write_al_updates) { | 336 | if (write_al_updates) |
284 | al_write_transaction(mdev); | 337 | al_write_transaction(mdev, delegate); |
285 | mdev->al_writ_cnt++; | ||
286 | } | ||
287 | |||
288 | spin_lock_irq(&mdev->al_lock); | 338 | spin_lock_irq(&mdev->al_lock); |
289 | /* FIXME | 339 | /* FIXME |
290 | if (err) | 340 | if (err) |
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
298 | } | 348 | } |
299 | } | 349 | } |
300 | 350 | ||
351 | /* | ||
352 | * @delegate: delegate activity log I/O to the worker thread | ||
353 | */ | ||
354 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) | ||
355 | { | ||
356 | BUG_ON(delegate && current == mdev->tconn->worker.task); | ||
357 | |||
358 | if (drbd_al_begin_io_prepare(mdev, i)) | ||
359 | drbd_al_begin_io_commit(mdev, delegate); | ||
360 | } | ||
361 | |||
362 | int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i) | ||
363 | { | ||
364 | struct lru_cache *al = mdev->act_log; | ||
365 | /* for bios crossing activity log extent boundaries, | ||
366 | * we may need to activate two extents in one go */ | ||
367 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
368 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
369 | unsigned nr_al_extents; | ||
370 | unsigned available_update_slots; | ||
371 | unsigned enr; | ||
372 | |||
373 | D_ASSERT(first <= last); | ||
374 | |||
375 | nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */ | ||
376 | available_update_slots = min(al->nr_elements - al->used, | ||
377 | al->max_pending_changes - al->pending_changes); | ||
378 | |||
379 | /* We want all necessary updates for a given request within the same transaction | ||
380 | * We could first check how many updates are *actually* needed, | ||
381 | * and use that instead of the worst-case nr_al_extents */ | ||
382 | if (available_update_slots < nr_al_extents) | ||
383 | return -EWOULDBLOCK; | ||
384 | |||
385 | /* Is resync active in this area? */ | ||
386 | for (enr = first; enr <= last; enr++) { | ||
387 | struct lc_element *tmp; | ||
388 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | ||
389 | if (unlikely(tmp != NULL)) { | ||
390 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | ||
391 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
392 | if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) | ||
393 | return -EBUSY; | ||
394 | return -EWOULDBLOCK; | ||
395 | } | ||
396 | } | ||
397 | } | ||
398 | |||
399 | /* Checkout the refcounts. | ||
400 | * Given that we checked for available elements and update slots above, | ||
401 | * this has to be successful. */ | ||
402 | for (enr = first; enr <= last; enr++) { | ||
403 | struct lc_element *al_ext; | ||
404 | al_ext = lc_get_cumulative(mdev->act_log, enr); | ||
405 | if (!al_ext) | ||
406 | dev_info(DEV, "LOGIC BUG for enr=%u\n", enr); | ||
407 | } | ||
408 | return 0; | ||
409 | } | ||
410 | |||
301 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) | 411 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) |
302 | { | 412 | { |
303 | /* for bios crossing activity log extent boundaries, | 413 | /* for bios crossing activity log extent boundaries, |
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) | |||
350 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); | 460 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); |
351 | } | 461 | } |
352 | 462 | ||
463 | static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) | ||
464 | { | ||
465 | const unsigned int stripes = mdev->ldev->md.al_stripes; | ||
466 | const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k; | ||
467 | |||
468 | /* transaction number, modulo on-disk ring buffer wrap around */ | ||
469 | unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k); | ||
470 | |||
471 | /* ... to aligned 4k on disk block */ | ||
472 | t = ((t % stripes) * stripe_size_4kB) + t/stripes; | ||
473 | |||
474 | /* ... to 512 byte sector in activity log */ | ||
475 | t *= 8; | ||
476 | |||
477 | /* ... plus offset to the on disk position */ | ||
478 | return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; | ||
479 | } | ||
480 | |||
353 | static int | 481 | static int |
354 | _al_write_transaction(struct drbd_conf *mdev) | 482 | _al_write_transaction(struct drbd_conf *mdev) |
355 | { | 483 | { |
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev) | |||
432 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | 560 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) |
433 | mdev->al_tr_cycle = 0; | 561 | mdev->al_tr_cycle = 0; |
434 | 562 | ||
435 | sector = mdev->ldev->md.md_offset | 563 | sector = al_tr_number_to_on_disk_sector(mdev); |
436 | + mdev->ldev->md.al_offset | ||
437 | + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); | ||
438 | 564 | ||
439 | crc = crc32c(0, buffer, 4096); | 565 | crc = crc32c(0, buffer, 4096); |
440 | buffer->crc32c = cpu_to_be32(crc); | 566 | buffer->crc32c = cpu_to_be32(crc); |
441 | 567 | ||
442 | if (drbd_bm_write_hinted(mdev)) | 568 | if (drbd_bm_write_hinted(mdev)) |
443 | err = -EIO; | 569 | err = -EIO; |
444 | /* drbd_chk_io_error done already */ | 570 | else { |
445 | else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 571 | bool write_al_updates; |
446 | err = -EIO; | 572 | rcu_read_lock(); |
447 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 573 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; |
448 | } else { | 574 | rcu_read_unlock(); |
449 | /* advance ringbuffer position and transaction counter */ | 575 | if (write_al_updates) { |
450 | mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); | 576 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
451 | mdev->al_tr_number++; | 577 | err = -EIO; |
578 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
579 | } else { | ||
580 | mdev->al_tr_number++; | ||
581 | mdev->al_writ_cnt++; | ||
582 | } | ||
583 | } | ||
452 | } | 584 | } |
453 | 585 | ||
454 | drbd_md_put_buffer(mdev); | 586 | drbd_md_put_buffer(mdev); |
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused) | |||
474 | /* Calls from worker context (see w_restart_disk_io()) need to write the | 606 | /* Calls from worker context (see w_restart_disk_io()) need to write the |
475 | transaction directly. Others came through generic_make_request(), | 607 | transaction directly. Others came through generic_make_request(), |
476 | those need to delegate it to the worker. */ | 608 | those need to delegate it to the worker. */ |
477 | static int al_write_transaction(struct drbd_conf *mdev) | 609 | static int al_write_transaction(struct drbd_conf *mdev, bool delegate) |
478 | { | 610 | { |
479 | struct update_al_work al_work; | 611 | if (delegate) { |
480 | 612 | struct update_al_work al_work; | |
481 | if (current == mdev->tconn->worker.task) | 613 | init_completion(&al_work.event); |
614 | al_work.w.cb = w_al_write_transaction; | ||
615 | al_work.w.mdev = mdev; | ||
616 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
617 | wait_for_completion(&al_work.event); | ||
618 | return al_work.err; | ||
619 | } else | ||
482 | return _al_write_transaction(mdev); | 620 | return _al_write_transaction(mdev); |
483 | |||
484 | init_completion(&al_work.event); | ||
485 | al_work.w.cb = w_al_write_transaction; | ||
486 | al_work.w.mdev = mdev; | ||
487 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
488 | wait_for_completion(&al_work.event); | ||
489 | |||
490 | return al_work.err; | ||
491 | } | 621 | } |
492 | 622 | ||
493 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | 623 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) |
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 8dc29502dc08..64fbb8385cdc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) | |||
612 | } | 612 | } |
613 | } | 613 | } |
614 | 614 | ||
615 | /* For the layout, see comment above drbd_md_set_sector_offsets(). */ | ||
616 | static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) | ||
617 | { | ||
618 | u64 bitmap_sectors; | ||
619 | if (ldev->md.al_offset == 8) | ||
620 | bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; | ||
621 | else | ||
622 | bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; | ||
623 | return bitmap_sectors << (9 + 3); | ||
624 | } | ||
625 | |||
615 | /* | 626 | /* |
616 | * make sure the bitmap has enough room for the attached storage, | 627 | * make sure the bitmap has enough room for the attached storage, |
617 | * if necessary, resize. | 628 | * if necessary, resize. |
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) | |||
668 | words = ALIGN(bits, 64) >> LN2_BPL; | 679 | words = ALIGN(bits, 64) >> LN2_BPL; |
669 | 680 | ||
670 | if (get_ldev(mdev)) { | 681 | if (get_ldev(mdev)) { |
671 | u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; | 682 | u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev); |
672 | put_ldev(mdev); | 683 | put_ldev(mdev); |
673 | if (bits > bits_on_disk) { | 684 | if (bits > bits_on_disk) { |
674 | dev_info(DEV, "bits = %lu\n", bits); | 685 | dev_info(DEV, "bits = %lu\n", bits); |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6b51afa1aae1..f943aacfdad8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -753,13 +753,16 @@ struct drbd_md { | |||
753 | u32 flags; | 753 | u32 flags; |
754 | u32 md_size_sect; | 754 | u32 md_size_sect; |
755 | 755 | ||
756 | s32 al_offset; /* signed relative sector offset to al area */ | 756 | s32 al_offset; /* signed relative sector offset to activity log */ |
757 | s32 bm_offset; /* signed relative sector offset to bitmap */ | 757 | s32 bm_offset; /* signed relative sector offset to bitmap */ |
758 | 758 | ||
759 | /* u32 al_nr_extents; important for restoring the AL | 759 | /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ |
760 | * is stored into ldev->dc.al_extents, which in turn | 760 | s32 meta_dev_idx; |
761 | * gets applied to act_log->nr_elements | 761 | |
762 | */ | 762 | /* see al_tr_number_to_on_disk_sector() */ |
763 | u32 al_stripes; | ||
764 | u32 al_stripe_size_4k; | ||
765 | u32 al_size_4k; /* cached product of the above */ | ||
763 | }; | 766 | }; |
764 | 767 | ||
765 | struct drbd_backing_dev { | 768 | struct drbd_backing_dev { |
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */ | |||
891 | } send; | 894 | } send; |
892 | }; | 895 | }; |
893 | 896 | ||
897 | struct submit_worker { | ||
898 | struct workqueue_struct *wq; | ||
899 | struct work_struct worker; | ||
900 | |||
901 | spinlock_t lock; | ||
902 | struct list_head writes; | ||
903 | }; | ||
904 | |||
894 | struct drbd_conf { | 905 | struct drbd_conf { |
895 | struct drbd_tconn *tconn; | 906 | struct drbd_tconn *tconn; |
896 | int vnr; /* volume number within the connection */ | 907 | int vnr; /* volume number within the connection */ |
@@ -1009,7 +1020,6 @@ struct drbd_conf { | |||
1009 | struct lru_cache *act_log; /* activity log */ | 1020 | struct lru_cache *act_log; /* activity log */ |
1010 | unsigned int al_tr_number; | 1021 | unsigned int al_tr_number; |
1011 | int al_tr_cycle; | 1022 | int al_tr_cycle; |
1012 | int al_tr_pos; /* position of the next transaction in the journal */ | ||
1013 | wait_queue_head_t seq_wait; | 1023 | wait_queue_head_t seq_wait; |
1014 | atomic_t packet_seq; | 1024 | atomic_t packet_seq; |
1015 | unsigned int peer_seq; | 1025 | unsigned int peer_seq; |
@@ -1032,6 +1042,10 @@ struct drbd_conf { | |||
1032 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ | 1042 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ |
1033 | unsigned int peer_max_bio_size; | 1043 | unsigned int peer_max_bio_size; |
1034 | unsigned int local_max_bio_size; | 1044 | unsigned int local_max_bio_size; |
1045 | |||
1046 | /* any requests that would block in drbd_make_request() | ||
1047 | * are deferred to this single-threaded work queue */ | ||
1048 | struct submit_worker submit; | ||
1035 | }; | 1049 | }; |
1036 | 1050 | ||
1037 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | 1051 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) |
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | |||
1148 | char *why, enum bm_flag flags); | 1162 | char *why, enum bm_flag flags); |
1149 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | 1163 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); |
1150 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | 1164 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); |
1151 | extern void drbd_go_diskless(struct drbd_conf *mdev); | ||
1152 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); | 1165 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); |
1153 | 1166 | ||
1154 | /* Meta data layout | 1167 | /* Meta data layout |
1155 | We reserve a 128MB Block (4k aligned) | 1168 | * |
1156 | * either at the end of the backing device | 1169 | * We currently have two possible layouts. |
1157 | * or on a separate meta data device. */ | 1170 | * Offsets in (512 byte) sectors. |
1171 | * external: | ||
1172 | * |----------- md_size_sect ------------------| | ||
1173 | * [ 4k superblock ][ activity log ][ Bitmap ] | ||
1174 | * | al_offset == 8 | | ||
1175 | * | bm_offset = al_offset + X | | ||
1176 | * ==> bitmap sectors = md_size_sect - bm_offset | ||
1177 | * | ||
1178 | * Variants: | ||
1179 | * old, indexed fixed size meta data: | ||
1180 | * | ||
1181 | * internal: | ||
1182 | * |----------- md_size_sect ------------------| | ||
1183 | * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] | ||
1184 | * | al_offset < 0 | | ||
1185 | * | bm_offset = al_offset - Y | | ||
1186 | * ==> bitmap sectors = Y = al_offset - bm_offset | ||
1187 | * | ||
1188 | * [padding*] are zero or up to 7 unused 512 Byte sectors to the | ||
1189 | * end of the device, so that the [4k superblock] will be 4k aligned. | ||
1190 | * | ||
1191 | * The activity log consists of 4k transaction blocks, | ||
1192 | * which are written in a ring-buffer, or striped ring-buffer like fashion, | ||
1193 | * whose total size used to be fixed 32kB, | ||
1194 | * but is about to become configurable. | ||
1195 | */ | ||
1158 | 1196 | ||
1159 | /* The following numbers are sectors */ | 1197 | /* Our old fixed size meta data layout |
1160 | /* Allows up to about 3.8TB, so if you want more, | 1198 | * allows up to about 3.8TB, so if you want more, |
1161 | * you need to use the "flexible" meta data format. */ | 1199 | * you need to use the "flexible" meta data format. */ |
1162 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | 1200 | #define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */ |
1163 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | 1201 | #define MD_4kB_SECT 8 |
1164 | #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ | 1202 | #define MD_32kB_SECT 64 |
1165 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) | ||
1166 | |||
1167 | /* we do all meta data IO in 4k blocks */ | ||
1168 | #define MD_BLOCK_SHIFT 12 | ||
1169 | #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) | ||
1170 | 1203 | ||
1171 | /* One activity log extent represents 4M of storage */ | 1204 | /* One activity log extent represents 4M of storage */ |
1172 | #define AL_EXTENT_SHIFT 22 | 1205 | #define AL_EXTENT_SHIFT 22 |
@@ -1256,7 +1289,6 @@ struct bm_extent { | |||
1256 | 1289 | ||
1257 | /* in one sector of the bitmap, we have this many activity_log extents. */ | 1290 | /* in one sector of the bitmap, we have this many activity_log extents. */ |
1258 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) | 1291 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) |
1259 | #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
1260 | 1292 | ||
1261 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) | 1293 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) |
1262 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) | 1294 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) |
@@ -1276,16 +1308,18 @@ struct bm_extent { | |||
1276 | */ | 1308 | */ |
1277 | 1309 | ||
1278 | #define DRBD_MAX_SECTORS_32 (0xffffffffLU) | 1310 | #define DRBD_MAX_SECTORS_32 (0xffffffffLU) |
1279 | #define DRBD_MAX_SECTORS_BM \ | 1311 | /* we have a certain meta data variant that has a fixed on-disk size of 128 |
1280 | ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) | 1312 | * MiB, of which 4k are our "superblock", and 32k are the fixed size activity |
1281 | #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 | 1313 | * log, leaving this many sectors for the bitmap. |
1282 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | 1314 | */ |
1283 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM | 1315 | |
1284 | #elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 | 1316 | #define DRBD_MAX_SECTORS_FIXED_BM \ |
1317 | ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) | ||
1318 | #if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 | ||
1285 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 | 1319 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 |
1286 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 | 1320 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 |
1287 | #else | 1321 | #else |
1288 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | 1322 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM |
1289 | /* 16 TB in units of sectors */ | 1323 | /* 16 TB in units of sectors */ |
1290 | #if BITS_PER_LONG == 32 | 1324 | #if BITS_PER_LONG == 32 |
1291 | /* adjust by one page worth of bitmap, | 1325 | /* adjust by one page worth of bitmap, |
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn); | |||
1418 | extern int proc_details; | 1452 | extern int proc_details; |
1419 | 1453 | ||
1420 | /* drbd_req */ | 1454 | /* drbd_req */ |
1455 | extern void do_submit(struct work_struct *ws); | ||
1421 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); | 1456 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); |
1422 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); | 1457 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); |
1423 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | 1458 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); |
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s); | |||
1576 | extern const char *drbd_role_str(enum drbd_role s); | 1611 | extern const char *drbd_role_str(enum drbd_role s); |
1577 | 1612 | ||
1578 | /* drbd_actlog.c */ | 1613 | /* drbd_actlog.c */ |
1579 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); | 1614 | extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i); |
1615 | extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate); | ||
1616 | extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i); | ||
1617 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); | ||
1580 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); | 1618 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1581 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | 1619 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); |
1582 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1620 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1755 | * BTW, for internal meta data, this happens to be the maximum capacity | 1793 | * BTW, for internal meta data, this happens to be the maximum capacity |
1756 | * we could agree upon with our peer node. | 1794 | * we could agree upon with our peer node. |
1757 | */ | 1795 | */ |
1758 | static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) | 1796 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) |
1759 | { | 1797 | { |
1760 | switch (meta_dev_idx) { | 1798 | switch (bdev->md.meta_dev_idx) { |
1761 | case DRBD_MD_INDEX_INTERNAL: | 1799 | case DRBD_MD_INDEX_INTERNAL: |
1762 | case DRBD_MD_INDEX_FLEX_INT: | 1800 | case DRBD_MD_INDEX_FLEX_INT: |
1763 | return bdev->md.md_offset + bdev->md.bm_offset; | 1801 | return bdev->md.md_offset + bdev->md.bm_offset; |
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi | |||
1767 | } | 1805 | } |
1768 | } | 1806 | } |
1769 | 1807 | ||
1770 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1771 | { | ||
1772 | int meta_dev_idx; | ||
1773 | |||
1774 | rcu_read_lock(); | ||
1775 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1776 | rcu_read_unlock(); | ||
1777 | |||
1778 | return _drbd_md_first_sector(meta_dev_idx, bdev); | ||
1779 | } | ||
1780 | |||
1781 | /** | 1808 | /** |
1782 | * drbd_md_last_sector() - Return the last sector number of the meta data area | 1809 | * drbd_md_last_sector() - Return the last sector number of the meta data area |
1783 | * @bdev: Meta data block device. | 1810 | * @bdev: Meta data block device. |
1784 | */ | 1811 | */ |
1785 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | 1812 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) |
1786 | { | 1813 | { |
1787 | int meta_dev_idx; | 1814 | switch (bdev->md.meta_dev_idx) { |
1788 | |||
1789 | rcu_read_lock(); | ||
1790 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1791 | rcu_read_unlock(); | ||
1792 | |||
1793 | switch (meta_dev_idx) { | ||
1794 | case DRBD_MD_INDEX_INTERNAL: | 1815 | case DRBD_MD_INDEX_INTERNAL: |
1795 | case DRBD_MD_INDEX_FLEX_INT: | 1816 | case DRBD_MD_INDEX_FLEX_INT: |
1796 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | 1817 | return bdev->md.md_offset + MD_4kB_SECT -1; |
1797 | case DRBD_MD_INDEX_FLEX_EXT: | 1818 | case DRBD_MD_INDEX_FLEX_EXT: |
1798 | default: | 1819 | default: |
1799 | return bdev->md.md_offset + bdev->md.md_size_sect; | 1820 | return bdev->md.md_offset + bdev->md.md_size_sect -1; |
1800 | } | 1821 | } |
1801 | } | 1822 | } |
1802 | 1823 | ||
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) | |||
1818 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | 1839 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) |
1819 | { | 1840 | { |
1820 | sector_t s; | 1841 | sector_t s; |
1821 | int meta_dev_idx; | ||
1822 | 1842 | ||
1823 | rcu_read_lock(); | 1843 | switch (bdev->md.meta_dev_idx) { |
1824 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1825 | rcu_read_unlock(); | ||
1826 | |||
1827 | switch (meta_dev_idx) { | ||
1828 | case DRBD_MD_INDEX_INTERNAL: | 1844 | case DRBD_MD_INDEX_INTERNAL: |
1829 | case DRBD_MD_INDEX_FLEX_INT: | 1845 | case DRBD_MD_INDEX_FLEX_INT: |
1830 | s = drbd_get_capacity(bdev->backing_bdev) | 1846 | s = drbd_get_capacity(bdev->backing_bdev) |
1831 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | 1847 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, |
1832 | _drbd_md_first_sector(meta_dev_idx, bdev)) | 1848 | drbd_md_first_sector(bdev)) |
1833 | : 0; | 1849 | : 0; |
1834 | break; | 1850 | break; |
1835 | case DRBD_MD_INDEX_FLEX_EXT: | 1851 | case DRBD_MD_INDEX_FLEX_EXT: |
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | |||
1848 | } | 1864 | } |
1849 | 1865 | ||
1850 | /** | 1866 | /** |
1851 | * drbd_md_ss__() - Return the sector number of our meta data super block | 1867 | * drbd_md_ss() - Return the sector number of our meta data super block |
1852 | * @mdev: DRBD device. | ||
1853 | * @bdev: Meta data block device. | 1868 | * @bdev: Meta data block device. |
1854 | */ | 1869 | */ |
1855 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | 1870 | static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) |
1856 | struct drbd_backing_dev *bdev) | ||
1857 | { | 1871 | { |
1858 | int meta_dev_idx; | 1872 | const int meta_dev_idx = bdev->md.meta_dev_idx; |
1859 | 1873 | ||
1860 | rcu_read_lock(); | 1874 | if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) |
1861 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1862 | rcu_read_unlock(); | ||
1863 | |||
1864 | switch (meta_dev_idx) { | ||
1865 | default: /* external, some index */ | ||
1866 | return MD_RESERVED_SECT * meta_dev_idx; | ||
1867 | case DRBD_MD_INDEX_INTERNAL: | ||
1868 | /* with drbd08, internal meta data is always "flexible" */ | ||
1869 | case DRBD_MD_INDEX_FLEX_INT: | ||
1870 | /* sizeof(struct md_on_disk_07) == 4k | ||
1871 | * position: last 4k aligned block of 4k size */ | ||
1872 | if (!bdev->backing_bdev) { | ||
1873 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1874 | dev_err(DEV, "bdev->backing_bdev==NULL\n"); | ||
1875 | dump_stack(); | ||
1876 | } | ||
1877 | return 0; | ||
1878 | } | ||
1879 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) | ||
1880 | - MD_AL_OFFSET; | ||
1881 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1882 | return 0; | 1875 | return 0; |
1883 | } | 1876 | |
1877 | /* Since drbd08, internal meta data is always "flexible". | ||
1878 | * position: last 4k aligned block of 4k size */ | ||
1879 | if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | ||
1880 | meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) | ||
1881 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; | ||
1882 | |||
1883 | /* external, some index; this is the old fixed size layout */ | ||
1884 | return MD_128MB_SECT * bdev->md.meta_dev_idx; | ||
1884 | } | 1885 | } |
1885 | 1886 | ||
1886 | static inline void | 1887 | static inline void |
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev) | |||
2053 | if (mdev->state.disk == D_DISKLESS) | 2054 | if (mdev->state.disk == D_DISKLESS) |
2054 | /* even internal references gone, safe to destroy */ | 2055 | /* even internal references gone, safe to destroy */ |
2055 | drbd_ldev_destroy(mdev); | 2056 | drbd_ldev_destroy(mdev); |
2056 | if (mdev->state.disk == D_FAILED) | 2057 | if (mdev->state.disk == D_FAILED) { |
2057 | /* all application IO references gone. */ | 2058 | /* all application IO references gone. */ |
2058 | drbd_go_diskless(mdev); | 2059 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) |
2060 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); | ||
2061 | } | ||
2059 | wake_up(&mdev->misc_wait); | 2062 | wake_up(&mdev->misc_wait); |
2060 | } | 2063 | } |
2061 | } | 2064 | } |
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 298b868910dc..a5dca6affcbb 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -45,7 +45,7 @@ | |||
45 | #include <linux/reboot.h> | 45 | #include <linux/reboot.h> |
46 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | 48 | #include <linux/workqueue.h> | |
49 | #define __KERNEL_SYSCALLS__ | 49 | #define __KERNEL_SYSCALLS__ |
50 | #include <linux/unistd.h> | 50 | #include <linux/unistd.h> |
51 | #include <linux/vmalloc.h> | 51 | #include <linux/vmalloc.h> |
@@ -2299,6 +2299,7 @@ static void drbd_cleanup(void) | |||
2299 | idr_for_each_entry(&minors, mdev, i) { | 2299 | idr_for_each_entry(&minors, mdev, i) { |
2300 | idr_remove(&minors, mdev_to_minor(mdev)); | 2300 | idr_remove(&minors, mdev_to_minor(mdev)); |
2301 | idr_remove(&mdev->tconn->volumes, mdev->vnr); | 2301 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
2302 | destroy_workqueue(mdev->submit.wq); | ||
2302 | del_gendisk(mdev->vdisk); | 2303 | del_gendisk(mdev->vdisk); |
2303 | /* synchronize_rcu(); No other threads running at this point */ | 2304 | /* synchronize_rcu(); No other threads running at this point */ |
2304 | kref_put(&mdev->kref, &drbd_minor_destroy); | 2305 | kref_put(&mdev->kref, &drbd_minor_destroy); |
@@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref) | |||
2588 | kfree(tconn); | 2589 | kfree(tconn); |
2589 | } | 2590 | } |
2590 | 2591 | ||
2592 | int init_submitter(struct drbd_conf *mdev) | ||
2593 | { | ||
2594 | /* opencoded create_singlethread_workqueue(), | ||
2595 | * to be able to say "drbd%d", ..., minor */ | ||
2596 | mdev->submit.wq = alloc_workqueue("drbd%u_submit", | ||
2597 | WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor); | ||
2598 | if (!mdev->submit.wq) | ||
2599 | return -ENOMEM; | ||
2600 | |||
2601 | INIT_WORK(&mdev->submit.worker, do_submit); | ||
2602 | spin_lock_init(&mdev->submit.lock); | ||
2603 | INIT_LIST_HEAD(&mdev->submit.writes); | ||
2604 | return 0; | ||
2605 | } | ||
2606 | |||
2591 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) | 2607 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) |
2592 | { | 2608 | { |
2593 | struct drbd_conf *mdev; | 2609 | struct drbd_conf *mdev; |
@@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, | |||
2677 | goto out_idr_remove_minor; | 2693 | goto out_idr_remove_minor; |
2678 | } | 2694 | } |
2679 | 2695 | ||
2696 | if (init_submitter(mdev)) { | ||
2697 | err = ERR_NOMEM; | ||
2698 | drbd_msg_put_info("unable to create submit workqueue"); | ||
2699 | goto out_idr_remove_vol; | ||
2700 | } | ||
2701 | |||
2680 | add_disk(disk); | 2702 | add_disk(disk); |
2681 | kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ | 2703 | kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ |
2682 | 2704 | ||
@@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, | |||
2687 | 2709 | ||
2688 | return NO_ERROR; | 2710 | return NO_ERROR; |
2689 | 2711 | ||
2712 | out_idr_remove_vol: | ||
2713 | idr_remove(&tconn->volumes, vnr_got); | ||
2690 | out_idr_remove_minor: | 2714 | out_idr_remove_minor: |
2691 | idr_remove(&minors, minor_got); | 2715 | idr_remove(&minors, minor_got); |
2692 | synchronize_rcu(); | 2716 | synchronize_rcu(); |
@@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) | |||
2794 | blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2818 | blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2795 | blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2819 | blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2796 | 2820 | ||
2821 | kfree(ldev->disk_conf); | ||
2797 | kfree(ldev); | 2822 | kfree(ldev); |
2798 | } | 2823 | } |
2799 | 2824 | ||
@@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn) | |||
2833 | rcu_read_unlock(); | 2858 | rcu_read_unlock(); |
2834 | } | 2859 | } |
2835 | 2860 | ||
2861 | /* aligned 4kByte */ | ||
2836 | struct meta_data_on_disk { | 2862 | struct meta_data_on_disk { |
2837 | u64 la_size; /* last agreed size. */ | 2863 | u64 la_size_sect; /* last agreed size. */ |
2838 | u64 uuid[UI_SIZE]; /* UUIDs. */ | 2864 | u64 uuid[UI_SIZE]; /* UUIDs. */ |
2839 | u64 device_uuid; | 2865 | u64 device_uuid; |
2840 | u64 reserved_u64_1; | 2866 | u64 reserved_u64_1; |
@@ -2842,13 +2868,17 @@ struct meta_data_on_disk { | |||
2842 | u32 magic; | 2868 | u32 magic; |
2843 | u32 md_size_sect; | 2869 | u32 md_size_sect; |
2844 | u32 al_offset; /* offset to this block */ | 2870 | u32 al_offset; /* offset to this block */ |
2845 | u32 al_nr_extents; /* important for restoring the AL */ | 2871 | u32 al_nr_extents; /* important for restoring the AL (userspace) */ |
2846 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ | 2872 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ |
2847 | u32 bm_offset; /* offset to the bitmap, from here */ | 2873 | u32 bm_offset; /* offset to the bitmap, from here */ |
2848 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ | 2874 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ |
2849 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ | 2875 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ |
2850 | u32 reserved_u32[3]; | ||
2851 | 2876 | ||
2877 | /* see al_tr_number_to_on_disk_sector() */ | ||
2878 | u32 al_stripes; | ||
2879 | u32 al_stripe_size_4k; | ||
2880 | |||
2881 | u8 reserved_u8[4096 - (7*8 + 10*4)]; | ||
2852 | } __packed; | 2882 | } __packed; |
2853 | 2883 | ||
2854 | /** | 2884 | /** |
@@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2861 | sector_t sector; | 2891 | sector_t sector; |
2862 | int i; | 2892 | int i; |
2863 | 2893 | ||
2894 | /* Don't accidentally change the DRBD meta data layout. */ | ||
2895 | BUILD_BUG_ON(UI_SIZE != 4); | ||
2896 | BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); | ||
2897 | |||
2864 | del_timer(&mdev->md_sync_timer); | 2898 | del_timer(&mdev->md_sync_timer); |
2865 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | 2899 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ |
2866 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | 2900 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) |
@@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2875 | if (!buffer) | 2909 | if (!buffer) |
2876 | goto out; | 2910 | goto out; |
2877 | 2911 | ||
2878 | memset(buffer, 0, 512); | 2912 | memset(buffer, 0, sizeof(*buffer)); |
2879 | 2913 | ||
2880 | buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | 2914 | buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); |
2881 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2915 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
2882 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | 2916 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
2883 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | 2917 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); |
@@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2892 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); | 2926 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); |
2893 | buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); | 2927 | buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); |
2894 | 2928 | ||
2895 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | 2929 | buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes); |
2930 | buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k); | ||
2931 | |||
2932 | D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset); | ||
2896 | sector = mdev->ldev->md.md_offset; | 2933 | sector = mdev->ldev->md.md_offset; |
2897 | 2934 | ||
2898 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 2935 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
@@ -2910,13 +2947,141 @@ out: | |||
2910 | put_ldev(mdev); | 2947 | put_ldev(mdev); |
2911 | } | 2948 | } |
2912 | 2949 | ||
2950 | static int check_activity_log_stripe_size(struct drbd_conf *mdev, | ||
2951 | struct meta_data_on_disk *on_disk, | ||
2952 | struct drbd_md *in_core) | ||
2953 | { | ||
2954 | u32 al_stripes = be32_to_cpu(on_disk->al_stripes); | ||
2955 | u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); | ||
2956 | u64 al_size_4k; | ||
2957 | |||
2958 | /* both not set: default to old fixed size activity log */ | ||
2959 | if (al_stripes == 0 && al_stripe_size_4k == 0) { | ||
2960 | al_stripes = 1; | ||
2961 | al_stripe_size_4k = MD_32kB_SECT/8; | ||
2962 | } | ||
2963 | |||
2964 | /* some paranoia plausibility checks */ | ||
2965 | |||
2966 | /* we need both values to be set */ | ||
2967 | if (al_stripes == 0 || al_stripe_size_4k == 0) | ||
2968 | goto err; | ||
2969 | |||
2970 | al_size_4k = (u64)al_stripes * al_stripe_size_4k; | ||
2971 | |||
2972 | /* Upper limit of activity log area, to avoid potential overflow | ||
2973 | * problems in al_tr_number_to_on_disk_sector(). As right now, more | ||
2974 | * than 72 * 4k blocks total only increases the amount of history, | ||
2975 | * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ | ||
2976 | if (al_size_4k > (16 * 1024 * 1024/4)) | ||
2977 | goto err; | ||
2978 | |||
2979 | /* Lower limit: we need at least 8 transaction slots (32kB) | ||
2980 | * to not break existing setups */ | ||
2981 | if (al_size_4k < MD_32kB_SECT/8) | ||
2982 | goto err; | ||
2983 | |||
2984 | in_core->al_stripe_size_4k = al_stripe_size_4k; | ||
2985 | in_core->al_stripes = al_stripes; | ||
2986 | in_core->al_size_4k = al_size_4k; | ||
2987 | |||
2988 | return 0; | ||
2989 | err: | ||
2990 | dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", | ||
2991 | al_stripes, al_stripe_size_4k); | ||
2992 | return -EINVAL; | ||
2993 | } | ||
2994 | |||
2995 | static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
2996 | { | ||
2997 | sector_t capacity = drbd_get_capacity(bdev->md_bdev); | ||
2998 | struct drbd_md *in_core = &bdev->md; | ||
2999 | s32 on_disk_al_sect; | ||
3000 | s32 on_disk_bm_sect; | ||
3001 | |||
3002 | /* The on-disk size of the activity log, calculated from offsets, and | ||
3003 | * the size of the activity log calculated from the stripe settings, | ||
3004 | * should match. | ||
3005 | * Though we could relax this a bit: it is ok, if the striped activity log | ||
3006 | * fits in the available on-disk activity log size. | ||
3007 | * Right now, that would break how resize is implemented. | ||
3008 | * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware | ||
3009 | * of possible unused padding space in the on disk layout. */ | ||
3010 | if (in_core->al_offset < 0) { | ||
3011 | if (in_core->bm_offset > in_core->al_offset) | ||
3012 | goto err; | ||
3013 | on_disk_al_sect = -in_core->al_offset; | ||
3014 | on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; | ||
3015 | } else { | ||
3016 | if (in_core->al_offset != MD_4kB_SECT) | ||
3017 | goto err; | ||
3018 | if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) | ||
3019 | goto err; | ||
3020 | |||
3021 | on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; | ||
3022 | on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; | ||
3023 | } | ||
3024 | |||
3025 | /* old fixed size meta data is exactly that: fixed. */ | ||
3026 | if (in_core->meta_dev_idx >= 0) { | ||
3027 | if (in_core->md_size_sect != MD_128MB_SECT | ||
3028 | || in_core->al_offset != MD_4kB_SECT | ||
3029 | || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT | ||
3030 | || in_core->al_stripes != 1 | ||
3031 | || in_core->al_stripe_size_4k != MD_32kB_SECT/8) | ||
3032 | goto err; | ||
3033 | } | ||
3034 | |||
3035 | if (capacity < in_core->md_size_sect) | ||
3036 | goto err; | ||
3037 | if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) | ||
3038 | goto err; | ||
3039 | |||
3040 | /* should be aligned, and at least 32k */ | ||
3041 | if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) | ||
3042 | goto err; | ||
3043 | |||
3044 | /* should fit (for now: exactly) into the available on-disk space; | ||
3045 | * overflow prevention is in check_activity_log_stripe_size() above. */ | ||
3046 | if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) | ||
3047 | goto err; | ||
3048 | |||
3049 | /* again, should be aligned */ | ||
3050 | if (in_core->bm_offset & 7) | ||
3051 | goto err; | ||
3052 | |||
3053 | /* FIXME check for device grow with flex external meta data? */ | ||
3054 | |||
3055 | /* can the available bitmap space cover the last agreed device size? */ | ||
3056 | if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) | ||
3057 | goto err; | ||
3058 | |||
3059 | return 0; | ||
3060 | |||
3061 | err: | ||
3062 | dev_err(DEV, "meta data offsets don't make sense: idx=%d " | ||
3063 | "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " | ||
3064 | "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", | ||
3065 | in_core->meta_dev_idx, | ||
3066 | in_core->al_stripes, in_core->al_stripe_size_4k, | ||
3067 | in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, | ||
3068 | (unsigned long long)in_core->la_size_sect, | ||
3069 | (unsigned long long)capacity); | ||
3070 | |||
3071 | return -EINVAL; | ||
3072 | } | ||
3073 | |||
3074 | |||
2913 | /** | 3075 | /** |
2914 | * drbd_md_read() - Reads in the meta data super block | 3076 | * drbd_md_read() - Reads in the meta data super block |
2915 | * @mdev: DRBD device. | 3077 | * @mdev: DRBD device. |
2916 | * @bdev: Device from which the meta data should be read in. | 3078 | * @bdev: Device from which the meta data should be read in. |
2917 | * | 3079 | * |
2918 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case | 3080 | * Return NO_ERROR on success, and an enum drbd_ret_code in case |
2919 | * something goes wrong. | 3081 | * something goes wrong. |
3082 | * | ||
3083 | * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, | ||
3084 | * even before @bdev is assigned to @mdev->ldev. | ||
2920 | */ | 3085 | */ |
2921 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | 3086 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) |
2922 | { | 3087 | { |
@@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
2924 | u32 magic, flags; | 3089 | u32 magic, flags; |
2925 | int i, rv = NO_ERROR; | 3090 | int i, rv = NO_ERROR; |
2926 | 3091 | ||
2927 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 3092 | if (mdev->state.disk != D_DISKLESS) |
2928 | return ERR_IO_MD_DISK; | 3093 | return ERR_DISK_CONFIGURED; |
2929 | 3094 | ||
2930 | buffer = drbd_md_get_buffer(mdev); | 3095 | buffer = drbd_md_get_buffer(mdev); |
2931 | if (!buffer) | 3096 | if (!buffer) |
2932 | goto out; | 3097 | return ERR_NOMEM; |
3098 | |||
3099 | /* First, figure out where our meta data superblock is located, | ||
3100 | * and read it. */ | ||
3101 | bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; | ||
3102 | bdev->md.md_offset = drbd_md_ss(bdev); | ||
2933 | 3103 | ||
2934 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | 3104 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { |
2935 | /* NOTE: can't do normal error processing here as this is | 3105 | /* NOTE: can't do normal error processing here as this is |
@@ -2948,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
2948 | rv = ERR_MD_UNCLEAN; | 3118 | rv = ERR_MD_UNCLEAN; |
2949 | goto err; | 3119 | goto err; |
2950 | } | 3120 | } |
3121 | |||
3122 | rv = ERR_MD_INVALID; | ||
2951 | if (magic != DRBD_MD_MAGIC_08) { | 3123 | if (magic != DRBD_MD_MAGIC_08) { |
2952 | if (magic == DRBD_MD_MAGIC_07) | 3124 | if (magic == DRBD_MD_MAGIC_07) |
2953 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); | 3125 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); |
2954 | else | 3126 | else |
2955 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); | 3127 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); |
2956 | rv = ERR_MD_INVALID; | ||
2957 | goto err; | 3128 | goto err; |
2958 | } | 3129 | } |
2959 | if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { | 3130 | |
2960 | dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", | 3131 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { |
2961 | be32_to_cpu(buffer->al_offset), bdev->md.al_offset); | 3132 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", |
2962 | rv = ERR_MD_INVALID; | 3133 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); |
2963 | goto err; | 3134 | goto err; |
2964 | } | 3135 | } |
3136 | |||
3137 | |||
3138 | /* convert to in_core endian */ | ||
3139 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); | ||
3140 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3141 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
3142 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
3143 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
3144 | |||
3145 | bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); | ||
3146 | bdev->md.al_offset = be32_to_cpu(buffer->al_offset); | ||
3147 | bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); | ||
3148 | |||
3149 | if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) | ||
3150 | goto err; | ||
3151 | if (check_offsets_and_sizes(mdev, bdev)) | ||
3152 | goto err; | ||
3153 | |||
2965 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { | 3154 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { |
2966 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", | 3155 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", |
2967 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); | 3156 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); |
2968 | rv = ERR_MD_INVALID; | ||
2969 | goto err; | 3157 | goto err; |
2970 | } | 3158 | } |
2971 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { | 3159 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { |
2972 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", | 3160 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", |
2973 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); | 3161 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); |
2974 | rv = ERR_MD_INVALID; | ||
2975 | goto err; | 3162 | goto err; |
2976 | } | 3163 | } |
2977 | 3164 | ||
2978 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { | 3165 | rv = NO_ERROR; |
2979 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", | ||
2980 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); | ||
2981 | rv = ERR_MD_INVALID; | ||
2982 | goto err; | ||
2983 | } | ||
2984 | |||
2985 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); | ||
2986 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
2987 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
2988 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
2989 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
2990 | 3166 | ||
2991 | spin_lock_irq(&mdev->tconn->req_lock); | 3167 | spin_lock_irq(&mdev->tconn->req_lock); |
2992 | if (mdev->state.conn < C_CONNECTED) { | 3168 | if (mdev->state.conn < C_CONNECTED) { |
@@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
2999 | 3175 | ||
3000 | err: | 3176 | err: |
3001 | drbd_md_put_buffer(mdev); | 3177 | drbd_md_put_buffer(mdev); |
3002 | out: | ||
3003 | put_ldev(mdev); | ||
3004 | 3178 | ||
3005 | return rv; | 3179 | return rv; |
3006 | } | 3180 | } |
@@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused) | |||
3238 | * end up here after a failed attach, before ldev was even assigned. | 3412 | * end up here after a failed attach, before ldev was even assigned. |
3239 | */ | 3413 | */ |
3240 | if (mdev->bitmap && mdev->ldev) { | 3414 | if (mdev->bitmap && mdev->ldev) { |
3415 | /* An interrupted resync or similar is allowed to recounts bits | ||
3416 | * while we detach. | ||
3417 | * Any modifications would not be expected anymore, though. | ||
3418 | */ | ||
3241 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, | 3419 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, |
3242 | "detach", BM_LOCKED_MASK)) { | 3420 | "detach", BM_LOCKED_TEST_ALLOWED)) { |
3243 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { | 3421 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { |
3244 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | 3422 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); |
3245 | drbd_md_sync(mdev); | 3423 | drbd_md_sync(mdev); |
@@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused) | |||
3251 | return 0; | 3429 | return 0; |
3252 | } | 3430 | } |
3253 | 3431 | ||
3254 | void drbd_go_diskless(struct drbd_conf *mdev) | ||
3255 | { | ||
3256 | D_ASSERT(mdev->state.disk == D_FAILED); | ||
3257 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) | ||
3258 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); | ||
3259 | } | ||
3260 | |||
3261 | /** | 3432 | /** |
3262 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap | 3433 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap |
3263 | * @mdev: DRBD device. | 3434 | * @mdev: DRBD device. |
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 2af26fc95280..9e3f441e7e84 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -696,37 +696,52 @@ out: | |||
696 | return 0; | 696 | return 0; |
697 | } | 697 | } |
698 | 698 | ||
699 | /* initializes the md.*_offset members, so we are able to find | 699 | /* Initializes the md.*_offset members, so we are able to find |
700 | * the on disk meta data */ | 700 | * the on disk meta data. |
701 | * | ||
702 | * We currently have two possible layouts: | ||
703 | * external: | ||
704 | * |----------- md_size_sect ------------------| | ||
705 | * [ 4k superblock ][ activity log ][ Bitmap ] | ||
706 | * | al_offset == 8 | | ||
707 | * | bm_offset = al_offset + X | | ||
708 | * ==> bitmap sectors = md_size_sect - bm_offset | ||
709 | * | ||
710 | * internal: | ||
711 | * |----------- md_size_sect ------------------| | ||
712 | * [data.....][ Bitmap ][ activity log ][ 4k superblock ] | ||
713 | * | al_offset < 0 | | ||
714 | * | bm_offset = al_offset - Y | | ||
715 | * ==> bitmap sectors = Y = al_offset - bm_offset | ||
716 | * | ||
717 | * Activity log size used to be fixed 32kB, | ||
718 | * but is about to become configurable. | ||
719 | */ | ||
701 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | 720 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, |
702 | struct drbd_backing_dev *bdev) | 721 | struct drbd_backing_dev *bdev) |
703 | { | 722 | { |
704 | sector_t md_size_sect = 0; | 723 | sector_t md_size_sect = 0; |
705 | int meta_dev_idx; | 724 | unsigned int al_size_sect = bdev->md.al_size_4k * 8; |
706 | 725 | ||
707 | rcu_read_lock(); | 726 | bdev->md.md_offset = drbd_md_ss(bdev); |
708 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
709 | 727 | ||
710 | switch (meta_dev_idx) { | 728 | switch (bdev->md.meta_dev_idx) { |
711 | default: | 729 | default: |
712 | /* v07 style fixed size indexed meta data */ | 730 | /* v07 style fixed size indexed meta data */ |
713 | bdev->md.md_size_sect = MD_RESERVED_SECT; | 731 | bdev->md.md_size_sect = MD_128MB_SECT; |
714 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | 732 | bdev->md.al_offset = MD_4kB_SECT; |
715 | bdev->md.al_offset = MD_AL_OFFSET; | 733 | bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; |
716 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
717 | break; | 734 | break; |
718 | case DRBD_MD_INDEX_FLEX_EXT: | 735 | case DRBD_MD_INDEX_FLEX_EXT: |
719 | /* just occupy the full device; unit: sectors */ | 736 | /* just occupy the full device; unit: sectors */ |
720 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); | 737 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); |
721 | bdev->md.md_offset = 0; | 738 | bdev->md.al_offset = MD_4kB_SECT; |
722 | bdev->md.al_offset = MD_AL_OFFSET; | 739 | bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; |
723 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
724 | break; | 740 | break; |
725 | case DRBD_MD_INDEX_INTERNAL: | 741 | case DRBD_MD_INDEX_INTERNAL: |
726 | case DRBD_MD_INDEX_FLEX_INT: | 742 | case DRBD_MD_INDEX_FLEX_INT: |
727 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
728 | /* al size is still fixed */ | 743 | /* al size is still fixed */ |
729 | bdev->md.al_offset = -MD_AL_SECTORS; | 744 | bdev->md.al_offset = -al_size_sect; |
730 | /* we need (slightly less than) ~ this much bitmap sectors: */ | 745 | /* we need (slightly less than) ~ this much bitmap sectors: */ |
731 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | 746 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); |
732 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | 747 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); |
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
735 | 750 | ||
736 | /* plus the "drbd meta data super block", | 751 | /* plus the "drbd meta data super block", |
737 | * and the activity log; */ | 752 | * and the activity log; */ |
738 | md_size_sect += MD_BM_OFFSET; | 753 | md_size_sect += MD_4kB_SECT + al_size_sect; |
739 | 754 | ||
740 | bdev->md.md_size_sect = md_size_sect; | 755 | bdev->md.md_size_sect = md_size_sect; |
741 | /* bitmap offset is adjusted by 'super' block size */ | 756 | /* bitmap offset is adjusted by 'super' block size */ |
742 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | 757 | bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; |
743 | break; | 758 | break; |
744 | } | 759 | } |
745 | rcu_read_unlock(); | ||
746 | } | 760 | } |
747 | 761 | ||
748 | /* input size is expected to be in KB */ | 762 | /* input size is expected to be in KB */ |
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
805 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 819 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) |
806 | { | 820 | { |
807 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 821 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
808 | sector_t la_size, u_size; | 822 | sector_t la_size_sect, u_size; |
809 | sector_t size; | 823 | sector_t size; |
810 | char ppb[10]; | 824 | char ppb[10]; |
811 | 825 | ||
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
828 | 842 | ||
829 | prev_first_sect = drbd_md_first_sector(mdev->ldev); | 843 | prev_first_sect = drbd_md_first_sector(mdev->ldev); |
830 | prev_size = mdev->ldev->md.md_size_sect; | 844 | prev_size = mdev->ldev->md.md_size_sect; |
831 | la_size = mdev->ldev->md.la_size_sect; | 845 | la_size_sect = mdev->ldev->md.la_size_sect; |
832 | 846 | ||
833 | /* TODO: should only be some assert here, not (re)init... */ | 847 | /* TODO: should only be some assert here, not (re)init... */ |
834 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 848 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
864 | if (rv == dev_size_error) | 878 | if (rv == dev_size_error) |
865 | goto out; | 879 | goto out; |
866 | 880 | ||
867 | la_size_changed = (la_size != mdev->ldev->md.la_size_sect); | 881 | la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); |
868 | 882 | ||
869 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | 883 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) |
870 | || prev_size != mdev->ldev->md.md_size_sect; | 884 | || prev_size != mdev->ldev->md.md_size_sect; |
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
886 | drbd_md_mark_dirty(mdev); | 900 | drbd_md_mark_dirty(mdev); |
887 | } | 901 | } |
888 | 902 | ||
889 | if (size > la_size) | 903 | if (size > la_size_sect) |
890 | rv = grew; | 904 | rv = grew; |
891 | if (size < la_size) | 905 | if (size < la_size_sect) |
892 | rv = shrunk; | 906 | rv = shrunk; |
893 | out: | 907 | out: |
894 | lc_unlock(mdev->act_log); | 908 | lc_unlock(mdev->act_log); |
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
903 | sector_t u_size, int assume_peer_has_space) | 917 | sector_t u_size, int assume_peer_has_space) |
904 | { | 918 | { |
905 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | 919 | sector_t p_size = mdev->p_size; /* partner's disk size. */ |
906 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | 920 | sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ |
907 | sector_t m_size; /* my size */ | 921 | sector_t m_size; /* my size */ |
908 | sector_t size = 0; | 922 | sector_t size = 0; |
909 | 923 | ||
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
917 | if (p_size && m_size) { | 931 | if (p_size && m_size) { |
918 | size = min_t(sector_t, p_size, m_size); | 932 | size = min_t(sector_t, p_size, m_size); |
919 | } else { | 933 | } else { |
920 | if (la_size) { | 934 | if (la_size_sect) { |
921 | size = la_size; | 935 | size = la_size_sect; |
922 | if (m_size && m_size < size) | 936 | if (m_size && m_size < size) |
923 | size = m_size; | 937 | size = m_size; |
924 | if (p_size && p_size < size) | 938 | if (p_size && p_size < size) |
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info) | |||
1127 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); | 1141 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); |
1128 | } | 1142 | } |
1129 | 1143 | ||
1130 | static void enforce_disk_conf_limits(struct disk_conf *dc) | 1144 | static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) |
1131 | { | 1145 | { |
1132 | if (dc->al_extents < DRBD_AL_EXTENTS_MIN) | 1146 | /* This is limited by 16 bit "slot" numbers, |
1133 | dc->al_extents = DRBD_AL_EXTENTS_MIN; | 1147 | * and by available on-disk context storage. |
1134 | if (dc->al_extents > DRBD_AL_EXTENTS_MAX) | 1148 | * |
1135 | dc->al_extents = DRBD_AL_EXTENTS_MAX; | 1149 | * Also (u16)~0 is special (denotes a "free" extent). |
1150 | * | ||
1151 | * One transaction occupies one 4kB on-disk block, | ||
1152 | * we have n such blocks in the on disk ring buffer, | ||
1153 | * the "current" transaction may fail (n-1), | ||
1154 | * and there is 919 slot numbers context information per transaction. | ||
1155 | * | ||
1156 | * 72 transaction blocks amounts to more than 2**16 context slots, | ||
1157 | * so cap there first. | ||
1158 | */ | ||
1159 | const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; | ||
1160 | const unsigned int sufficient_on_disk = | ||
1161 | (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) | ||
1162 | /AL_CONTEXT_PER_TRANSACTION; | ||
1163 | |||
1164 | unsigned int al_size_4k = bdev->md.al_size_4k; | ||
1165 | |||
1166 | if (al_size_4k > sufficient_on_disk) | ||
1167 | return max_al_nr; | ||
1136 | 1168 | ||
1137 | if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | 1169 | return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; |
1138 | dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1139 | } | 1170 | } |
1140 | 1171 | ||
1141 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | 1172 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) |
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | |||
1182 | if (!expect(new_disk_conf->resync_rate >= 1)) | 1213 | if (!expect(new_disk_conf->resync_rate >= 1)) |
1183 | new_disk_conf->resync_rate = 1; | 1214 | new_disk_conf->resync_rate = 1; |
1184 | 1215 | ||
1185 | enforce_disk_conf_limits(new_disk_conf); | 1216 | if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) |
1217 | new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1218 | if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev)) | ||
1219 | new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev); | ||
1220 | |||
1221 | if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | ||
1222 | new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1186 | 1223 | ||
1187 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 1224 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; |
1188 | if (fifo_size != mdev->rs_plan_s->size) { | 1225 | if (fifo_size != mdev->rs_plan_s->size) { |
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1330 | goto fail; | 1367 | goto fail; |
1331 | } | 1368 | } |
1332 | 1369 | ||
1333 | enforce_disk_conf_limits(new_disk_conf); | 1370 | if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) |
1371 | new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1334 | 1372 | ||
1335 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); | 1373 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); |
1336 | if (!new_plan) { | 1374 | if (!new_plan) { |
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1343 | goto fail; | 1381 | goto fail; |
1344 | } | 1382 | } |
1345 | 1383 | ||
1384 | write_lock_irq(&global_state_lock); | ||
1385 | retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); | ||
1386 | write_unlock_irq(&global_state_lock); | ||
1387 | if (retcode != NO_ERROR) | ||
1388 | goto fail; | ||
1389 | |||
1346 | rcu_read_lock(); | 1390 | rcu_read_lock(); |
1347 | nc = rcu_dereference(mdev->tconn->net_conf); | 1391 | nc = rcu_dereference(mdev->tconn->net_conf); |
1348 | if (nc) { | 1392 | if (nc) { |
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1399 | goto fail; | 1443 | goto fail; |
1400 | } | 1444 | } |
1401 | 1445 | ||
1402 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | 1446 | /* Read our meta data super block early. |
1403 | drbd_md_set_sector_offsets(mdev, nbc); | 1447 | * This also sets other on-disk offsets. */ |
1448 | retcode = drbd_md_read(mdev, nbc); | ||
1449 | if (retcode != NO_ERROR) | ||
1450 | goto fail; | ||
1451 | |||
1452 | if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) | ||
1453 | new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1454 | if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) | ||
1455 | new_disk_conf->al_extents = drbd_al_extents_max(nbc); | ||
1404 | 1456 | ||
1405 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { | 1457 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { |
1406 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | 1458 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", |
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1416 | min_md_device_sectors = (2<<10); | 1468 | min_md_device_sectors = (2<<10); |
1417 | } else { | 1469 | } else { |
1418 | max_possible_sectors = DRBD_MAX_SECTORS; | 1470 | max_possible_sectors = DRBD_MAX_SECTORS; |
1419 | min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); | 1471 | min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); |
1420 | } | 1472 | } |
1421 | 1473 | ||
1422 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | 1474 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { |
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1467 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 1519 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
1468 | goto force_diskless; | 1520 | goto force_diskless; |
1469 | 1521 | ||
1470 | drbd_md_set_sector_offsets(mdev, nbc); | ||
1471 | |||
1472 | if (!mdev->bitmap) { | 1522 | if (!mdev->bitmap) { |
1473 | if (drbd_bm_init(mdev)) { | 1523 | if (drbd_bm_init(mdev)) { |
1474 | retcode = ERR_NOMEM; | 1524 | retcode = ERR_NOMEM; |
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1476 | } | 1526 | } |
1477 | } | 1527 | } |
1478 | 1528 | ||
1479 | retcode = drbd_md_read(mdev, nbc); | ||
1480 | if (retcode != NO_ERROR) | ||
1481 | goto force_diskless_dec; | ||
1482 | |||
1483 | if (mdev->state.conn < C_CONNECTED && | 1529 | if (mdev->state.conn < C_CONNECTED && |
1484 | mdev->state.role == R_PRIMARY && | 1530 | mdev->state.role == R_PRIMARY && |
1485 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { | 1531 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { |
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for | |||
2158 | return SS_SUCCESS; | 2204 | return SS_SUCCESS; |
2159 | case SS_PRIMARY_NOP: | 2205 | case SS_PRIMARY_NOP: |
2160 | /* Our state checking code wants to see the peer outdated. */ | 2206 | /* Our state checking code wants to see the peer outdated. */ |
2161 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, | 2207 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); |
2162 | pdsk, D_OUTDATED), CS_VERBOSE); | 2208 | |
2209 | if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ | ||
2210 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE); | ||
2211 | |||
2163 | break; | 2212 | break; |
2164 | case SS_CW_FAILED_BY_PEER: | 2213 | case SS_CW_FAILED_BY_PEER: |
2165 | /* The peer probably wants to see us outdated. */ | 2214 | /* The peer probably wants to see us outdated. */ |
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) | |||
2406 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 2455 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2407 | drbd_flush_workqueue(mdev); | 2456 | drbd_flush_workqueue(mdev); |
2408 | 2457 | ||
2409 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); | 2458 | /* If we happen to be C_STANDALONE R_SECONDARY, just change to |
2410 | 2459 | * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, | |
2411 | if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) | 2460 | * try to start a resync handshake as sync target for full sync. |
2412 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2461 | */ |
2413 | 2462 | if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) { | |
2414 | while (retcode == SS_NEED_CONNECTION) { | 2463 | retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT)); |
2415 | spin_lock_irq(&mdev->tconn->req_lock); | 2464 | if (retcode >= SS_SUCCESS) { |
2416 | if (mdev->state.conn < C_CONNECTED) | 2465 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, |
2417 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | 2466 | "set_n_write from invalidate", BM_LOCKED_MASK)) |
2418 | spin_unlock_irq(&mdev->tconn->req_lock); | 2467 | retcode = ERR_IO_MD_DISK; |
2419 | 2468 | } | |
2420 | if (retcode != SS_NEED_CONNECTION) | 2469 | } else |
2421 | break; | ||
2422 | |||
2423 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2470 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); |
2424 | } | ||
2425 | drbd_resume_io(mdev); | 2471 | drbd_resume_io(mdev); |
2426 | 2472 | ||
2427 | out: | 2473 | out: |
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) | |||
2475 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 2521 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2476 | drbd_flush_workqueue(mdev); | 2522 | drbd_flush_workqueue(mdev); |
2477 | 2523 | ||
2478 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); | 2524 | /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits |
2479 | if (retcode < SS_SUCCESS) { | 2525 | * in the bitmap. Otherwise, try to start a resync handshake |
2480 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { | 2526 | * as sync source for full sync. |
2481 | /* The peer will get a resync upon connect anyways. | 2527 | */ |
2482 | * Just make that into a full resync. */ | 2528 | if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) { |
2483 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); | 2529 | /* The peer will get a resync upon connect anyways. Just make that |
2484 | if (retcode >= SS_SUCCESS) { | 2530 | into a full resync. */ |
2485 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, | 2531 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); |
2486 | "set_n_write from invalidate_peer", | 2532 | if (retcode >= SS_SUCCESS) { |
2487 | BM_LOCKED_SET_ALLOWED)) | 2533 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, |
2488 | retcode = ERR_IO_MD_DISK; | 2534 | "set_n_write from invalidate_peer", |
2489 | } | 2535 | BM_LOCKED_SET_ALLOWED)) |
2490 | } else | 2536 | retcode = ERR_IO_MD_DISK; |
2491 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | 2537 | } |
2492 | } | 2538 | } else |
2539 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | ||
2493 | drbd_resume_io(mdev); | 2540 | drbd_resume_io(mdev); |
2494 | 2541 | ||
2495 | out: | 2542 | out: |
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) | |||
3162 | CS_VERBOSE + CS_WAIT_COMPLETE); | 3209 | CS_VERBOSE + CS_WAIT_COMPLETE); |
3163 | idr_remove(&mdev->tconn->volumes, mdev->vnr); | 3210 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
3164 | idr_remove(&minors, mdev_to_minor(mdev)); | 3211 | idr_remove(&minors, mdev_to_minor(mdev)); |
3212 | destroy_workqueue(mdev->submit.wq); | ||
3165 | del_gendisk(mdev->vdisk); | 3213 | del_gendisk(mdev->vdisk); |
3166 | synchronize_rcu(); | 3214 | synchronize_rcu(); |
3167 | kref_put(&mdev->kref, &drbd_minor_destroy); | 3215 | kref_put(&mdev->kref, &drbd_minor_destroy); |
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 928adb815b09..bf31d41dbaad 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
313 | 313 | ||
314 | static int drbd_proc_open(struct inode *inode, struct file *file) | 314 | static int drbd_proc_open(struct inode *inode, struct file *file) |
315 | { | 315 | { |
316 | if (try_module_get(THIS_MODULE)) | 316 | int err; |
317 | return single_open(file, drbd_seq_show, PDE_DATA(inode)); | 317 | |
318 | if (try_module_get(THIS_MODULE)) { | ||
319 | err = single_open(file, drbd_seq_show, PDE_DATA(inode)); | ||
320 | if (err) | ||
321 | module_put(THIS_MODULE); | ||
322 | return err; | ||
323 | } | ||
318 | return -ENODEV; | 324 | return -ENODEV; |
319 | } | 325 | } |
320 | 326 | ||
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 83c5ae0ed56b..4222affff488 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev) | |||
850 | err = drbd_send_current_state(mdev); | 850 | err = drbd_send_current_state(mdev); |
851 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | 851 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); |
852 | clear_bit(RESIZE_PENDING, &mdev->flags); | 852 | clear_bit(RESIZE_PENDING, &mdev->flags); |
853 | atomic_set(&mdev->ap_in_flight, 0); | ||
853 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | 854 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ |
854 | return err; | 855 | return err; |
855 | } | 856 | } |
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) | |||
2266 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); | 2267 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); |
2267 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; | 2268 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; |
2268 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; | 2269 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; |
2269 | drbd_al_begin_io(mdev, &peer_req->i); | 2270 | drbd_al_begin_io(mdev, &peer_req->i, true); |
2270 | } | 2271 | } |
2271 | 2272 | ||
2272 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); | 2273 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); |
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | |||
2662 | if (hg == -1 && mdev->state.role == R_PRIMARY) { | 2663 | if (hg == -1 && mdev->state.role == R_PRIMARY) { |
2663 | enum drbd_state_rv rv2; | 2664 | enum drbd_state_rv rv2; |
2664 | 2665 | ||
2665 | drbd_set_role(mdev, R_SECONDARY, 0); | ||
2666 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | 2666 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, |
2667 | * we might be here in C_WF_REPORT_PARAMS which is transient. | 2667 | * we might be here in C_WF_REPORT_PARAMS which is transient. |
2668 | * we do not need to wait for the after state change work either. */ | 2668 | * we do not need to wait for the after state change work either. */ |
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3993 | 3993 | ||
3994 | clear_bit(DISCARD_MY_DATA, &mdev->flags); | 3994 | clear_bit(DISCARD_MY_DATA, &mdev->flags); |
3995 | 3995 | ||
3996 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | 3996 | drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */ |
3997 | 3997 | ||
3998 | return 0; | 3998 | return 0; |
3999 | } | 3999 | } |
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn) | |||
4660 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | 4660 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) |
4661 | static int drbd_do_auth(struct drbd_tconn *tconn) | 4661 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4662 | { | 4662 | { |
4663 | dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); | 4663 | conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); |
4664 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | 4664 | conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); |
4665 | return -1; | 4665 | return -1; |
4666 | } | 4666 | } |
4667 | #else | 4667 | #else |
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi) | |||
5258 | bool ping_timeout_active = false; | 5258 | bool ping_timeout_active = false; |
5259 | struct net_conf *nc; | 5259 | struct net_conf *nc; |
5260 | int ping_timeo, tcp_cork, ping_int; | 5260 | int ping_timeo, tcp_cork, ping_int; |
5261 | struct sched_param param = { .sched_priority = 2 }; | ||
5261 | 5262 | ||
5262 | current->policy = SCHED_RR; /* Make this a realtime task! */ | 5263 | rv = sched_setscheduler(current, SCHED_RR, ¶m); |
5263 | current->rt_priority = 2; /* more important than all other tasks */ | 5264 | if (rv < 0) |
5265 | conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv); | ||
5264 | 5266 | ||
5265 | while (get_t_state(thi) == RUNNING) { | 5267 | while (get_t_state(thi) == RUNNING) { |
5266 | drbd_thread_current_set_cpu(thi); | 5268 | drbd_thread_current_set_cpu(thi); |
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2b8303ad63c9..c24379ffd4e3 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -34,14 +34,14 @@ | |||
34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); | 34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); |
35 | 35 | ||
36 | /* Update disk stats at start of I/O request */ | 36 | /* Update disk stats at start of I/O request */ |
37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | 37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req) |
38 | { | 38 | { |
39 | const int rw = bio_data_dir(bio); | 39 | const int rw = bio_data_dir(req->master_bio); |
40 | int cpu; | 40 | int cpu; |
41 | cpu = part_stat_lock(); | 41 | cpu = part_stat_lock(); |
42 | part_round_stats(cpu, &mdev->vdisk->part0); | 42 | part_round_stats(cpu, &mdev->vdisk->part0); |
43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | 43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); |
44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | 44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9); |
45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like | 45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like |
46 | the compiler warning about cpu only assigned but never used... */ | 46 | the compiler warning about cpu only assigned but never used... */ |
47 | part_inc_in_flight(&mdev->vdisk->part0, rw); | 47 | part_inc_in_flight(&mdev->vdisk->part0, rw); |
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) | |||
263 | else | 263 | else |
264 | root = &mdev->read_requests; | 264 | root = &mdev->read_requests; |
265 | drbd_remove_request_interval(root, req); | 265 | drbd_remove_request_interval(root, req); |
266 | } else if (!(s & RQ_POSTPONED)) | 266 | } |
267 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
268 | 267 | ||
269 | /* Before we can signal completion to the upper layers, | 268 | /* Before we can signal completion to the upper layers, |
270 | * we may need to close the current transfer log epoch. | 269 | * we may need to close the current transfer log epoch. |
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
755 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 754 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
756 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); | 755 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); |
757 | break; | 756 | break; |
757 | |||
758 | case QUEUE_AS_DRBD_BARRIER: | ||
759 | start_new_tl_epoch(mdev->tconn); | ||
760 | mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); | ||
761 | break; | ||
758 | }; | 762 | }; |
759 | 763 | ||
760 | return rv; | 764 | return rv; |
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev) | |||
861 | bool congested = false; | 865 | bool congested = false; |
862 | enum drbd_on_congestion on_congestion; | 866 | enum drbd_on_congestion on_congestion; |
863 | 867 | ||
868 | rcu_read_lock(); | ||
864 | nc = rcu_dereference(tconn->net_conf); | 869 | nc = rcu_dereference(tconn->net_conf); |
865 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; | 870 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; |
871 | rcu_read_unlock(); | ||
866 | if (on_congestion == OC_BLOCK || | 872 | if (on_congestion == OC_BLOCK || |
867 | tconn->agreed_pro_version < 96) | 873 | tconn->agreed_pro_version < 96) |
868 | return; | 874 | return; |
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req) | |||
956 | struct drbd_conf *mdev = req->w.mdev; | 962 | struct drbd_conf *mdev = req->w.mdev; |
957 | int remote, send_oos; | 963 | int remote, send_oos; |
958 | 964 | ||
959 | rcu_read_lock(); | ||
960 | remote = drbd_should_do_remote(mdev->state); | 965 | remote = drbd_should_do_remote(mdev->state); |
961 | if (remote) { | ||
962 | maybe_pull_ahead(mdev); | ||
963 | remote = drbd_should_do_remote(mdev->state); | ||
964 | } | ||
965 | send_oos = drbd_should_send_out_of_sync(mdev->state); | 966 | send_oos = drbd_should_send_out_of_sync(mdev->state); |
966 | rcu_read_unlock(); | ||
967 | 967 | ||
968 | /* Need to replicate writes. Unless it is an empty flush, | 968 | /* Need to replicate writes. Unless it is an empty flush, |
969 | * which is better mapped to a DRBD P_BARRIER packet, | 969 | * which is better mapped to a DRBD P_BARRIER packet, |
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req) | |||
975 | /* The only size==0 bios we expect are empty flushes. */ | 975 | /* The only size==0 bios we expect are empty flushes. */ |
976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); | 976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); |
977 | if (remote) | 977 | if (remote) |
978 | start_new_tl_epoch(mdev->tconn); | 978 | _req_mod(req, QUEUE_AS_DRBD_BARRIER); |
979 | return 0; | 979 | return remote; |
980 | } | 980 | } |
981 | 981 | ||
982 | if (!remote && !send_oos) | 982 | if (!remote && !send_oos) |
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req) | |||
1020 | bio_endio(bio, -EIO); | 1020 | bio_endio(bio, -EIO); |
1021 | } | 1021 | } |
1022 | 1022 | ||
1023 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | 1023 | static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req) |
1024 | { | 1024 | { |
1025 | const int rw = bio_rw(bio); | 1025 | spin_lock(&mdev->submit.lock); |
1026 | struct bio_and_error m = { NULL, }; | 1026 | list_add_tail(&req->tl_requests, &mdev->submit.writes); |
1027 | spin_unlock(&mdev->submit.lock); | ||
1028 | queue_work(mdev->submit.wq, &mdev->submit.worker); | ||
1029 | } | ||
1030 | |||
1031 | /* returns the new drbd_request pointer, if the caller is expected to | ||
1032 | * drbd_send_and_submit() it (to save latency), or NULL if we queued the | ||
1033 | * request on the submitter thread. | ||
1034 | * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. | ||
1035 | */ | ||
1036 | struct drbd_request * | ||
1037 | drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
1038 | { | ||
1039 | const int rw = bio_data_dir(bio); | ||
1027 | struct drbd_request *req; | 1040 | struct drbd_request *req; |
1028 | bool no_remote = false; | ||
1029 | 1041 | ||
1030 | /* allocate outside of all locks; */ | 1042 | /* allocate outside of all locks; */ |
1031 | req = drbd_req_new(mdev, bio); | 1043 | req = drbd_req_new(mdev, bio); |
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long | |||
1035 | * if user cannot handle io errors, that's not our business. */ | 1047 | * if user cannot handle io errors, that's not our business. */ |
1036 | dev_err(DEV, "could not kmalloc() req\n"); | 1048 | dev_err(DEV, "could not kmalloc() req\n"); |
1037 | bio_endio(bio, -ENOMEM); | 1049 | bio_endio(bio, -ENOMEM); |
1038 | return; | 1050 | return ERR_PTR(-ENOMEM); |
1039 | } | 1051 | } |
1040 | req->start_time = start_time; | 1052 | req->start_time = start_time; |
1041 | 1053 | ||
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long | |||
1044 | req->private_bio = NULL; | 1056 | req->private_bio = NULL; |
1045 | } | 1057 | } |
1046 | 1058 | ||
1047 | /* For WRITES going to the local disk, grab a reference on the target | 1059 | /* Update disk stats */ |
1048 | * extent. This waits for any resync activity in the corresponding | 1060 | _drbd_start_io_acct(mdev, req); |
1049 | * resync extent to finish, and, if necessary, pulls in the target | 1061 | |
1050 | * extent into the activity log, which involves further disk io because | ||
1051 | * of transactional on-disk meta data updates. | ||
1052 | * Empty flushes don't need to go into the activity log, they can only | ||
1053 | * flush data for pending writes which are already in there. */ | ||
1054 | if (rw == WRITE && req->private_bio && req->i.size | 1062 | if (rw == WRITE && req->private_bio && req->i.size |
1055 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | 1063 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { |
1064 | if (!drbd_al_begin_io_fastpath(mdev, &req->i)) { | ||
1065 | drbd_queue_write(mdev, req); | ||
1066 | return NULL; | ||
1067 | } | ||
1056 | req->rq_state |= RQ_IN_ACT_LOG; | 1068 | req->rq_state |= RQ_IN_ACT_LOG; |
1057 | drbd_al_begin_io(mdev, &req->i); | ||
1058 | } | 1069 | } |
1059 | 1070 | ||
1071 | return req; | ||
1072 | } | ||
1073 | |||
1074 | static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req) | ||
1075 | { | ||
1076 | const int rw = bio_rw(req->master_bio); | ||
1077 | struct bio_and_error m = { NULL, }; | ||
1078 | bool no_remote = false; | ||
1079 | |||
1060 | spin_lock_irq(&mdev->tconn->req_lock); | 1080 | spin_lock_irq(&mdev->tconn->req_lock); |
1061 | if (rw == WRITE) { | 1081 | if (rw == WRITE) { |
1062 | /* This may temporarily give up the req_lock, | 1082 | /* This may temporarily give up the req_lock, |
1063 | * but will re-aquire it before it returns here. | 1083 | * but will re-aquire it before it returns here. |
1064 | * Needs to be before the check on drbd_suspended() */ | 1084 | * Needs to be before the check on drbd_suspended() */ |
1065 | complete_conflicting_writes(req); | 1085 | complete_conflicting_writes(req); |
1086 | /* no more giving up req_lock from now on! */ | ||
1087 | |||
1088 | /* check for congestion, and potentially stop sending | ||
1089 | * full data updates, but start sending "dirty bits" only. */ | ||
1090 | maybe_pull_ahead(mdev); | ||
1066 | } | 1091 | } |
1067 | 1092 | ||
1068 | /* no more giving up req_lock from now on! */ | ||
1069 | 1093 | ||
1070 | if (drbd_suspended(mdev)) { | 1094 | if (drbd_suspended(mdev)) { |
1071 | /* push back and retry: */ | 1095 | /* push back and retry: */ |
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long | |||
1078 | goto out; | 1102 | goto out; |
1079 | } | 1103 | } |
1080 | 1104 | ||
1081 | /* Update disk stats */ | ||
1082 | _drbd_start_io_acct(mdev, req, bio); | ||
1083 | |||
1084 | /* We fail READ/READA early, if we can not serve it. | 1105 | /* We fail READ/READA early, if we can not serve it. |
1085 | * We must do this before req is registered on any lists. | 1106 | * We must do this before req is registered on any lists. |
1086 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ | 1107 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ |
@@ -1137,7 +1158,116 @@ out: | |||
1137 | 1158 | ||
1138 | if (m.bio) | 1159 | if (m.bio) |
1139 | complete_master_bio(mdev, &m); | 1160 | complete_master_bio(mdev, &m); |
1140 | return; | 1161 | } |
1162 | |||
1163 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
1164 | { | ||
1165 | struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time); | ||
1166 | if (IS_ERR_OR_NULL(req)) | ||
1167 | return; | ||
1168 | drbd_send_and_submit(mdev, req); | ||
1169 | } | ||
1170 | |||
1171 | static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming) | ||
1172 | { | ||
1173 | struct drbd_request *req, *tmp; | ||
1174 | list_for_each_entry_safe(req, tmp, incoming, tl_requests) { | ||
1175 | const int rw = bio_data_dir(req->master_bio); | ||
1176 | |||
1177 | if (rw == WRITE /* rw != WRITE should not even end up here! */ | ||
1178 | && req->private_bio && req->i.size | ||
1179 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | ||
1180 | if (!drbd_al_begin_io_fastpath(mdev, &req->i)) | ||
1181 | continue; | ||
1182 | |||
1183 | req->rq_state |= RQ_IN_ACT_LOG; | ||
1184 | } | ||
1185 | |||
1186 | list_del_init(&req->tl_requests); | ||
1187 | drbd_send_and_submit(mdev, req); | ||
1188 | } | ||
1189 | } | ||
1190 | |||
1191 | static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev, | ||
1192 | struct list_head *incoming, | ||
1193 | struct list_head *pending) | ||
1194 | { | ||
1195 | struct drbd_request *req, *tmp; | ||
1196 | int wake = 0; | ||
1197 | int err; | ||
1198 | |||
1199 | spin_lock_irq(&mdev->al_lock); | ||
1200 | list_for_each_entry_safe(req, tmp, incoming, tl_requests) { | ||
1201 | err = drbd_al_begin_io_nonblock(mdev, &req->i); | ||
1202 | if (err == -EBUSY) | ||
1203 | wake = 1; | ||
1204 | if (err) | ||
1205 | continue; | ||
1206 | req->rq_state |= RQ_IN_ACT_LOG; | ||
1207 | list_move_tail(&req->tl_requests, pending); | ||
1208 | } | ||
1209 | spin_unlock_irq(&mdev->al_lock); | ||
1210 | if (wake) | ||
1211 | wake_up(&mdev->al_wait); | ||
1212 | |||
1213 | return !list_empty(pending); | ||
1214 | } | ||
1215 | |||
1216 | void do_submit(struct work_struct *ws) | ||
1217 | { | ||
1218 | struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker); | ||
1219 | LIST_HEAD(incoming); | ||
1220 | LIST_HEAD(pending); | ||
1221 | struct drbd_request *req, *tmp; | ||
1222 | |||
1223 | for (;;) { | ||
1224 | spin_lock(&mdev->submit.lock); | ||
1225 | list_splice_tail_init(&mdev->submit.writes, &incoming); | ||
1226 | spin_unlock(&mdev->submit.lock); | ||
1227 | |||
1228 | submit_fast_path(mdev, &incoming); | ||
1229 | if (list_empty(&incoming)) | ||
1230 | break; | ||
1231 | |||
1232 | wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending)); | ||
1233 | /* Maybe more was queued, while we prepared the transaction? | ||
1234 | * Try to stuff them into this transaction as well. | ||
1235 | * Be strictly non-blocking here, no wait_event, we already | ||
1236 | * have something to commit. | ||
1237 | * Stop if we don't make any more progress. | ||
1238 | */ | ||
1239 | for (;;) { | ||
1240 | LIST_HEAD(more_pending); | ||
1241 | LIST_HEAD(more_incoming); | ||
1242 | bool made_progress; | ||
1243 | |||
1244 | /* It is ok to look outside the lock, | ||
1245 | * it's only an optimization anyways */ | ||
1246 | if (list_empty(&mdev->submit.writes)) | ||
1247 | break; | ||
1248 | |||
1249 | spin_lock(&mdev->submit.lock); | ||
1250 | list_splice_tail_init(&mdev->submit.writes, &more_incoming); | ||
1251 | spin_unlock(&mdev->submit.lock); | ||
1252 | |||
1253 | if (list_empty(&more_incoming)) | ||
1254 | break; | ||
1255 | |||
1256 | made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending); | ||
1257 | |||
1258 | list_splice_tail_init(&more_pending, &pending); | ||
1259 | list_splice_tail_init(&more_incoming, &incoming); | ||
1260 | |||
1261 | if (!made_progress) | ||
1262 | break; | ||
1263 | } | ||
1264 | drbd_al_begin_io_commit(mdev, false); | ||
1265 | |||
1266 | list_for_each_entry_safe(req, tmp, &pending, tl_requests) { | ||
1267 | list_del_init(&req->tl_requests); | ||
1268 | drbd_send_and_submit(mdev, req); | ||
1269 | } | ||
1270 | } | ||
1141 | } | 1271 | } |
1142 | 1272 | ||
1143 | void drbd_make_request(struct request_queue *q, struct bio *bio) | 1273 | void drbd_make_request(struct request_queue *q, struct bio *bio) |
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c08d22964d06..978cb1addc98 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -88,6 +88,14 @@ enum drbd_req_event { | |||
88 | QUEUE_FOR_NET_READ, | 88 | QUEUE_FOR_NET_READ, |
89 | QUEUE_FOR_SEND_OOS, | 89 | QUEUE_FOR_SEND_OOS, |
90 | 90 | ||
91 | /* An empty flush is queued as P_BARRIER, | ||
92 | * which will cause it to complete "successfully", | ||
93 | * even if the local disk flush failed. | ||
94 | * | ||
95 | * Just like "real" requests, empty flushes (blkdev_issue_flush()) will | ||
96 | * only see an error if neither local nor remote data is reachable. */ | ||
97 | QUEUE_AS_DRBD_BARRIER, | ||
98 | |||
91 | SEND_CANCELED, | 99 | SEND_CANCELED, |
92 | SEND_FAILED, | 100 | SEND_FAILED, |
93 | HANDED_OVER_TO_NETWORK, | 101 | HANDED_OVER_TO_NETWORK, |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0fe220cfb9e9..90c5be2b1d30 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | |||
570 | mdev->tconn->agreed_pro_version < 88) | 570 | mdev->tconn->agreed_pro_version < 88) |
571 | rv = SS_NOT_SUPPORTED; | 571 | rv = SS_NOT_SUPPORTED; |
572 | 572 | ||
573 | else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
574 | rv = SS_NO_UP_TO_DATE_DISK; | ||
575 | |||
576 | else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
577 | ns.pdsk == D_UNKNOWN) | ||
578 | rv = SS_NEED_CONNECTION; | ||
579 | |||
573 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | 580 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) |
574 | rv = SS_CONNECTED_OUTDATES; | 581 | rv = SS_CONNECTED_OUTDATES; |
575 | 582 | ||
@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t | |||
635 | && os.conn < C_WF_REPORT_PARAMS) | 642 | && os.conn < C_WF_REPORT_PARAMS) |
636 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | 643 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ |
637 | 644 | ||
645 | if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && | ||
646 | os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) | ||
647 | rv = SS_OUTDATE_WO_CONN; | ||
648 | |||
638 | return rv; | 649 | return rv; |
639 | } | 650 | } |
640 | 651 | ||
@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1377 | &drbd_bmio_set_n_write, &abw_start_sync, | 1388 | &drbd_bmio_set_n_write, &abw_start_sync, |
1378 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | 1389 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); |
1379 | 1390 | ||
1380 | /* We are invalidating our self... */ | ||
1381 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1382 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1383 | /* other bitmap operation expected during this phase */ | ||
1384 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1385 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1386 | |||
1387 | /* first half of local IO error, failure to attach, | 1391 | /* first half of local IO error, failure to attach, |
1388 | * or administrative detach */ | 1392 | * or administrative detach */ |
1389 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | 1393 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { |
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state | |||
1748 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) | 1752 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) |
1749 | return SS_CW_FAILED_BY_PEER; | 1753 | return SS_CW_FAILED_BY_PEER; |
1750 | 1754 | ||
1751 | rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; | 1755 | rv = conn_is_valid_transition(tconn, mask, val, 0); |
1752 | 1756 | if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS) | |
1753 | if (rv == SS_UNKNOWN_ERROR) | 1757 | rv = SS_UNKNOWN_ERROR; /* continue waiting */ |
1754 | rv = conn_is_valid_transition(tconn, mask, val, 0); | ||
1755 | |||
1756 | if (rv == SS_SUCCESS) | ||
1757 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
1758 | 1758 | ||
1759 | return rv; | 1759 | return rv; |
1760 | } | 1760 | } |
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 9a664bd27404..58e08ff2b2ce 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { | |||
89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | 89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", |
90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | 90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", |
91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | 91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", |
92 | [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", | ||
92 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", | 93 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", |
93 | }; | 94 | }; |
94 | 95 | ||
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 424dc7bdf9b7..891c0ecaa292 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error) | |||
89 | md_io->done = 1; | 89 | md_io->done = 1; |
90 | wake_up(&mdev->misc_wait); | 90 | wake_up(&mdev->misc_wait); |
91 | bio_put(bio); | 91 | bio_put(bio); |
92 | put_ldev(mdev); | 92 | if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ |
93 | put_ldev(mdev); | ||
93 | } | 94 | } |
94 | 95 | ||
95 | /* reads on behalf of the partner, | 96 | /* reads on behalf of the partner, |
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) | |||
1410 | struct drbd_conf *mdev = w->mdev; | 1411 | struct drbd_conf *mdev = w->mdev; |
1411 | 1412 | ||
1412 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) | 1413 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) |
1413 | drbd_al_begin_io(mdev, &req->i); | 1414 | drbd_al_begin_io(mdev, &req->i, false); |
1414 | 1415 | ||
1415 | drbd_req_make_private_bio(req, req->master_bio); | 1416 | drbd_req_make_private_bio(req, req->master_bio); |
1416 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | 1417 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; |
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) | |||
1425 | int resync_after; | 1426 | int resync_after; |
1426 | 1427 | ||
1427 | while (1) { | 1428 | while (1) { |
1428 | if (!odev->ldev) | 1429 | if (!odev->ldev || odev->state.disk == D_DISKLESS) |
1429 | return 1; | 1430 | return 1; |
1430 | rcu_read_lock(); | 1431 | rcu_read_lock(); |
1431 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | 1432 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; |
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) | |||
1433 | if (resync_after == -1) | 1434 | if (resync_after == -1) |
1434 | return 1; | 1435 | return 1; |
1435 | odev = minor_to_mdev(resync_after); | 1436 | odev = minor_to_mdev(resync_after); |
1436 | if (!expect(odev)) | 1437 | if (!odev) |
1437 | return 1; | 1438 | return 1; |
1438 | if ((odev->state.conn >= C_SYNC_SOURCE && | 1439 | if ((odev->state.conn >= C_SYNC_SOURCE && |
1439 | odev->state.conn <= C_PAUSED_SYNC_T) || | 1440 | odev->state.conn <= C_PAUSED_SYNC_T) || |
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | |||
1515 | 1516 | ||
1516 | if (o_minor == -1) | 1517 | if (o_minor == -1) |
1517 | return NO_ERROR; | 1518 | return NO_ERROR; |
1518 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | 1519 | if (o_minor < -1 || o_minor > MINORMASK) |
1519 | return ERR_RESYNC_AFTER; | 1520 | return ERR_RESYNC_AFTER; |
1520 | 1521 | ||
1521 | /* check for loops */ | 1522 | /* check for loops */ |
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | |||
1524 | if (odev == mdev) | 1525 | if (odev == mdev) |
1525 | return ERR_RESYNC_AFTER_CYCLE; | 1526 | return ERR_RESYNC_AFTER_CYCLE; |
1526 | 1527 | ||
1528 | /* You are free to depend on diskless, non-existing, | ||
1529 | * or not yet/no longer existing minors. | ||
1530 | * We only reject dependency loops. | ||
1531 | * We cannot follow the dependency chain beyond a detached or | ||
1532 | * missing minor. | ||
1533 | */ | ||
1534 | if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) | ||
1535 | return NO_ERROR; | ||
1536 | |||
1527 | rcu_read_lock(); | 1537 | rcu_read_lock(); |
1528 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | 1538 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; |
1529 | rcu_read_unlock(); | 1539 | rcu_read_unlock(); |
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1652 | clear_bit(B_RS_H_DONE, &mdev->flags); | 1662 | clear_bit(B_RS_H_DONE, &mdev->flags); |
1653 | 1663 | ||
1654 | write_lock_irq(&global_state_lock); | 1664 | write_lock_irq(&global_state_lock); |
1655 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | 1665 | /* Did some connection breakage or IO error race with us? */ |
1666 | if (mdev->state.conn < C_CONNECTED | ||
1667 | || !get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1656 | write_unlock_irq(&global_state_lock); | 1668 | write_unlock_irq(&global_state_lock); |
1657 | mutex_unlock(mdev->state_mutex); | 1669 | mutex_unlock(mdev->state_mutex); |
1658 | return; | 1670 | return; |