aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Lars Ellenberg <lars.ellenberg@linbit.com> 2015-06-08 09:18:45 -0400
committer: Jens Axboe <axboe@fb.com> 2015-11-25 11:22:03 -0500
commit: 5f7c01249bea67c32a1a1551a8f2fe0b8b801ab4 (patch)
tree: 369efade0df17d0170e1dcade7b9043e559f372a
parent: 603ee2c8c78b2fb5a9dc14fb8b2bb2650ebcab1f (diff)
drbd: avoid potential deadlock during handshake
During handshake communication, we also reconsider our device size, using drbd_determine_dev_size(). Just in case we need to change the offsets or layout of our on-disk metadata, we lock out application and other meta data IO, and wait for the activity log to be "idle" (no more referenced extents). If this handshake happens just after a connection loss, with a fencing policy of "resource-and-stonith", we have frozen IO. If, additionally, the activity log was "starving" (too many incoming random writes at that point in time), it won't become idle, ever, because of the frozen IO, and this would be a lockup of the receiver thread, and consequently of DRBD. Previous logic (re-)initialized with a special "empty" transaction block, which required the activity log to fully drain first. Instead, write out some standard activity log transactions. Unlike lc_try_lock(), lc_try_lock_for_transaction() does not care about pending activity log references, so using it avoids the potential deadlock. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com> Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r-- drivers/block/drbd/drbd_actlog.c 19
-rw-r--r-- drivers/block/drbd/drbd_int.h 2
-rw-r--r-- drivers/block/drbd/drbd_nl.c 33
3 files changed, 31 insertions, 23 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 4b484ac1d8cb..10459a145062 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -614,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device)
614 wake_up(&device->al_wait); 614 wake_up(&device->al_wait);
615} 615}
616 616
617int drbd_initialize_al(struct drbd_device *device, void *buffer) 617int drbd_al_initialize(struct drbd_device *device, void *buffer)
618{ 618{
619 struct al_transaction_on_disk *al = buffer; 619 struct al_transaction_on_disk *al = buffer;
620 struct drbd_md *md = &device->ldev->md; 620 struct drbd_md *md = &device->ldev->md;
621 sector_t al_base = md->md_offset + md->al_offset;
622 int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 621 int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
623 int i; 622 int i;
624 623
625 memset(al, 0, 4096); 624 __al_write_transaction(device, al);
626 al->magic = cpu_to_be32(DRBD_AL_MAGIC); 625 /* There may or may not have been a pending transaction. */
627 al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); 626 spin_lock_irq(&device->al_lock);
628 al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); 627 lc_committed(device->act_log);
628 spin_unlock_irq(&device->al_lock);
629 629
630 for (i = 0; i < al_size_4k; i++) { 630 /* The rest of the transactions will have an empty "updates" list, and
631 int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); 631 * are written out only to provide the context, and to initialize the
632 * on-disk ring buffer. */
633 for (i = 1; i < al_size_4k; i++) {
634 int err = __al_write_transaction(device, al);
632 if (err) 635 if (err)
633 return err; 636 return err;
634 } 637 }
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index df3d89d5777a..b6844feb9f9b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1667,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s
1667#define drbd_rs_failed_io(device, sector, size) \ 1667#define drbd_rs_failed_io(device, sector, size) \
1668 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) 1668 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
1669extern void drbd_al_shrink(struct drbd_device *device); 1669extern void drbd_al_shrink(struct drbd_device *device);
1670extern int drbd_initialize_al(struct drbd_device *, void *); 1670extern int drbd_al_initialize(struct drbd_device *, void *);
1671 1671
1672/* drbd_nl.c */ 1672/* drbd_nl.c */
1673/* state info broadcast */ 1673/* state info broadcast */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c7cd3df8107e..f4ca27359541 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -903,15 +903,14 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
903 int md_moved, la_size_changed; 903 int md_moved, la_size_changed;
904 enum determine_dev_size rv = DS_UNCHANGED; 904 enum determine_dev_size rv = DS_UNCHANGED;
905 905
906 /* race: 906 /* We may change the on-disk offsets of our meta data below. Lock out
907 * application request passes inc_ap_bio, 907 * anything that may cause meta data IO, to avoid acting on incomplete
908 * but then cannot get an AL-reference. 908 * layout changes or scribbling over meta data that is in the process
909 * this function later may wait on ap_bio_cnt == 0. -> deadlock. 909 * of being moved.
910 * 910 *
911 * to avoid that: 911 * Move is not exactly correct, btw, currently we have all our meta
912 * Suspend IO right here. 912 * data in core memory, to "move" it we just write it all out, there
913 * still lock the act_log to not trigger ASSERTs there. 913 * are no reads. */
914 */
915 drbd_suspend_io(device); 914 drbd_suspend_io(device);
916 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ 915 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
917 if (!buffer) { 916 if (!buffer) {
@@ -919,9 +918,6 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
919 return DS_ERROR; 918 return DS_ERROR;
920 } 919 }
921 920
922 /* no wait necessary anymore, actually we could assert that */
923 wait_event(device->al_wait, lc_try_lock(device->act_log));
924
925 prev_first_sect = drbd_md_first_sector(device->ldev); 921 prev_first_sect = drbd_md_first_sector(device->ldev);
926 prev_size = device->ldev->md.md_size_sect; 922 prev_size = device->ldev->md.md_size_sect;
927 la_size_sect = device->ldev->md.la_size_sect; 923 la_size_sect = device->ldev->md.la_size_sect;
@@ -997,20 +993,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
997 * Clear the timer, to avoid scary "timer expired!" messages, 993 * Clear the timer, to avoid scary "timer expired!" messages,
998 * "Superblock" is written out at least twice below, anyways. */ 994 * "Superblock" is written out at least twice below, anyways. */
999 del_timer(&device->md_sync_timer); 995 del_timer(&device->md_sync_timer);
1000 drbd_al_shrink(device); /* All extents inactive. */
1001 996
997 /* We won't change the "al-extents" setting, we just may need
998 * to move the on-disk location of the activity log ringbuffer.
999 * Lock for transaction is good enough, it may well be "dirty"
1000 * or even "starving". */
1001 wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
1002
1003 /* mark current on-disk bitmap and activity log as unreliable */
1002 prev_flags = md->flags; 1004 prev_flags = md->flags;
1003 md->flags &= ~MDF_PRIMARY_IND; 1005 md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
1004 drbd_md_write(device, buffer); 1006 drbd_md_write(device, buffer);
1005 1007
1008 drbd_al_initialize(device, buffer);
1009
1006 drbd_info(device, "Writing the whole bitmap, %s\n", 1010 drbd_info(device, "Writing the whole bitmap, %s\n",
1007 la_size_changed && md_moved ? "size changed and md moved" : 1011 la_size_changed && md_moved ? "size changed and md moved" :
1008 la_size_changed ? "size changed" : "md moved"); 1012 la_size_changed ? "size changed" : "md moved");
1009 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ 1013 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
1010 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, 1014 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
1011 "size changed", BM_LOCKED_MASK); 1015 "size changed", BM_LOCKED_MASK);
1012 drbd_initialize_al(device, buffer);
1013 1016
1017 /* on-disk bitmap and activity log is authoritative again
1018 * (unless there was an IO error meanwhile...) */
1014 md->flags = prev_flags; 1019 md->flags = prev_flags;
1015 drbd_md_write(device, buffer); 1020 drbd_md_write(device, buffer);
1016 1021