diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2010-10-16 06:13:47 -0400 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2010-10-22 09:46:11 -0400 |
commit | 82f59cc6353889b426cf13b6596d5a3d100fa09e (patch) | |
tree | 6d5a678516334f0a37a56a509b84322a0352719b /drivers/block/drbd/drbd_main.c | |
parent | 3beec1d446fba335f07787636920892dd3b2c658 (diff) |
drbd: fix potential deadlock on detach
If we have contention in drbd_al_begin_io (heavy random IO),
an administrative request to detach the disk may deadlock,
for reasons similar to the recently fixed deadlock when detaching
because of an IO error.
The approach taken here is to either go through the intermediate
cleanup state D_FAILED, or to first lock out application IO,
rather than going directly to D_DISKLESS.
We need an additional state bit (WAS_IO_ERROR) to distinguish
a transition -> D_FAILED caused by an IO error from one caused by other failures.
Sanitize D_ATTACHING -> D_FAILED to D_ATTACHING -> D_DISKLESS.
While we are only attaching, ldev may still be missing, but it would be referenced
from within the after_state_ch for -> D_FAILED, potentially
dereferencing a NULL pointer.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_main.c')
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 138 |
1 files changed, 82 insertions, 56 deletions
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8d029b14e7c..992c3aecdf7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -834,6 +834,15 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state | |||
834 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) | 834 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) |
835 | ns.conn = os.conn; | 835 | ns.conn = os.conn; |
836 | 836 | ||
837 | /* we cannot fail (again) if we already detached */ | ||
838 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
839 | ns.disk = D_DISKLESS; | ||
840 | |||
841 | /* if we are only D_ATTACHING yet, | ||
842 | * we can (and should) go directly to D_DISKLESS. */ | ||
843 | if (ns.disk == D_FAILED && os.disk == D_ATTACHING) | ||
844 | ns.disk = D_DISKLESS; | ||
845 | |||
837 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | 846 | /* After C_DISCONNECTING only C_STANDALONE may follow */ |
838 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | 847 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) |
839 | ns.conn = os.conn; | 848 | ns.conn = os.conn; |
@@ -1055,7 +1064,15 @@ int __drbd_set_state(struct drbd_conf *mdev, | |||
1055 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) | 1064 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) |
1056 | set_bit(DEVICE_DYING, &mdev->flags); | 1065 | set_bit(DEVICE_DYING, &mdev->flags); |
1057 | 1066 | ||
1058 | mdev->state.i = ns.i; | 1067 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference |
1068 | * on the ldev here, to be sure the transition -> D_DISKLESS resp. | ||
1069 | * drbd_ldev_destroy() won't happen before our corresponding | ||
1070 | * after_state_ch works run, where we put_ldev again. */ | ||
1071 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
1072 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
1073 | atomic_inc(&mdev->local_cnt); | ||
1074 | |||
1075 | mdev->state = ns; | ||
1059 | wake_up(&mdev->misc_wait); | 1076 | wake_up(&mdev->misc_wait); |
1060 | wake_up(&mdev->state_wait); | 1077 | wake_up(&mdev->state_wait); |
1061 | 1078 | ||
@@ -1363,63 +1380,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1363 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | 1380 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) |
1364 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); | 1381 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); |
1365 | 1382 | ||
1366 | /* first half of local IO error */ | 1383 | /* first half of local IO error, failure to attach, |
1367 | if (os.disk > D_FAILED && ns.disk == D_FAILED) { | 1384 | * or administrative detach */ |
1368 | enum drbd_io_error_p eh = EP_PASS_ON; | 1385 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { |
1386 | enum drbd_io_error_p eh; | ||
1387 | int was_io_error; | ||
1388 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1389 | * our cleanup here with the transition to D_DISKLESS, | ||
1390 | * so it is safe to dreference ldev here. */ | ||
1391 | eh = mdev->ldev->dc.on_io_error; | ||
1392 | was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1393 | |||
1394 | /* current state still has to be D_FAILED, | ||
1395 | * there is only one way out: to D_DISKLESS, | ||
1396 | * and that may only happen after our put_ldev below. */ | ||
1397 | if (mdev->state.disk != D_FAILED) | ||
1398 | dev_err(DEV, | ||
1399 | "ASSERT FAILED: disk is %s during detach\n", | ||
1400 | drbd_disk_str(mdev->state.disk)); | ||
1369 | 1401 | ||
1370 | if (drbd_send_state(mdev)) | 1402 | if (drbd_send_state(mdev)) |
1371 | dev_warn(DEV, "Notified peer that my disk is broken.\n"); | 1403 | dev_warn(DEV, "Notified peer that I am detaching my disk\n"); |
1372 | else | 1404 | else |
1373 | dev_err(DEV, "Sending state for drbd_io_error() failed\n"); | 1405 | dev_err(DEV, "Sending state for detaching disk failed\n"); |
1374 | 1406 | ||
1375 | drbd_rs_cancel_all(mdev); | 1407 | drbd_rs_cancel_all(mdev); |
1376 | 1408 | ||
1377 | if (get_ldev_if_state(mdev, D_FAILED)) { | 1409 | /* In case we want to get something to stable storage still, |
1378 | eh = mdev->ldev->dc.on_io_error; | 1410 | * this may be the last chance. |
1379 | put_ldev(mdev); | 1411 | * Following put_ldev may transition to D_DISKLESS. */ |
1380 | } | 1412 | drbd_md_sync(mdev); |
1381 | if (eh == EP_CALL_HELPER) | 1413 | put_ldev(mdev); |
1414 | |||
1415 | if (was_io_error && eh == EP_CALL_HELPER) | ||
1382 | drbd_khelper(mdev, "local-io-error"); | 1416 | drbd_khelper(mdev, "local-io-error"); |
1383 | } | 1417 | } |
1384 | 1418 | ||
1419 | /* second half of local IO error, failure to attach, | ||
1420 | * or administrative detach, | ||
1421 | * after local_cnt references have reached zero again */ | ||
1422 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1423 | /* We must still be diskless, | ||
1424 | * re-attach has to be serialized with this! */ | ||
1425 | if (mdev->state.disk != D_DISKLESS) | ||
1426 | dev_err(DEV, | ||
1427 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1428 | drbd_disk_str(mdev->state.disk)); | ||
1385 | 1429 | ||
1386 | /* second half of local IO error handling, | 1430 | mdev->rs_total = 0; |
1387 | * after local_cnt references have reached zero: */ | 1431 | mdev->rs_failed = 0; |
1388 | if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { | 1432 | atomic_set(&mdev->rs_pending_cnt, 0); |
1389 | mdev->rs_total = 0; | ||
1390 | mdev->rs_failed = 0; | ||
1391 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1392 | } | ||
1393 | |||
1394 | if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1395 | /* We must still be diskless, | ||
1396 | * re-attach has to be serialized with this! */ | ||
1397 | if (mdev->state.disk != D_DISKLESS) | ||
1398 | dev_err(DEV, | ||
1399 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1400 | drbd_disk_str(mdev->state.disk)); | ||
1401 | 1433 | ||
1402 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state | ||
1403 | * will inc/dec it frequently. Since we became D_DISKLESS, no | ||
1404 | * one has touched the protected members anymore, though, so we | ||
1405 | * are safe to free them here. */ | ||
1406 | if (drbd_send_state(mdev)) | 1434 | if (drbd_send_state(mdev)) |
1407 | dev_warn(DEV, "Notified peer that I detached my disk.\n"); | 1435 | dev_warn(DEV, "Notified peer that I'm now diskless.\n"); |
1408 | else | 1436 | else |
1409 | dev_err(DEV, "Sending state for detach failed\n"); | 1437 | dev_err(DEV, "Sending state for being diskless failed\n"); |
1410 | 1438 | /* corresponding get_ldev in __drbd_set_state | |
1411 | lc_destroy(mdev->resync); | 1439 | * this may finaly trigger drbd_ldev_destroy. */ |
1412 | mdev->resync = NULL; | 1440 | put_ldev(mdev); |
1413 | lc_destroy(mdev->act_log); | ||
1414 | mdev->act_log = NULL; | ||
1415 | __no_warn(local, | ||
1416 | drbd_free_bc(mdev->ldev); | ||
1417 | mdev->ldev = NULL;); | ||
1418 | |||
1419 | if (mdev->md_io_tmpp) { | ||
1420 | __free_page(mdev->md_io_tmpp); | ||
1421 | mdev->md_io_tmpp = NULL; | ||
1422 | } | ||
1423 | } | 1441 | } |
1424 | 1442 | ||
1425 | /* Disks got bigger while they were detached */ | 1443 | /* Disks got bigger while they were detached */ |
@@ -2897,7 +2915,6 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
2897 | D_ASSERT(list_empty(&mdev->resync_work.list)); | 2915 | D_ASSERT(list_empty(&mdev->resync_work.list)); |
2898 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | 2916 | D_ASSERT(list_empty(&mdev->unplug_work.list)); |
2899 | D_ASSERT(list_empty(&mdev->go_diskless.list)); | 2917 | D_ASSERT(list_empty(&mdev->go_diskless.list)); |
2900 | |||
2901 | } | 2918 | } |
2902 | 2919 | ||
2903 | 2920 | ||
@@ -3756,19 +3773,31 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | |||
3756 | return 1; | 3773 | return 1; |
3757 | } | 3774 | } |
3758 | 3775 | ||
3776 | void drbd_ldev_destroy(struct drbd_conf *mdev) | ||
3777 | { | ||
3778 | lc_destroy(mdev->resync); | ||
3779 | mdev->resync = NULL; | ||
3780 | lc_destroy(mdev->act_log); | ||
3781 | mdev->act_log = NULL; | ||
3782 | __no_warn(local, | ||
3783 | drbd_free_bc(mdev->ldev); | ||
3784 | mdev->ldev = NULL;); | ||
3785 | |||
3786 | if (mdev->md_io_tmpp) { | ||
3787 | __free_page(mdev->md_io_tmpp); | ||
3788 | mdev->md_io_tmpp = NULL; | ||
3789 | } | ||
3790 | clear_bit(GO_DISKLESS, &mdev->flags); | ||
3791 | } | ||
3792 | |||
3759 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3793 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) |
3760 | { | 3794 | { |
3761 | D_ASSERT(mdev->state.disk == D_FAILED); | 3795 | D_ASSERT(mdev->state.disk == D_FAILED); |
3762 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will | 3796 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will |
3763 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch | 3797 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch |
3764 | * the protected members anymore, though, so in the after_state_ch work | 3798 | * the protected members anymore, though, so once put_ldev reaches zero |
3765 | * it will be safe to free them. */ | 3799 | * again, it will be safe to free them. */ |
3766 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | 3800 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
3767 | /* We need to wait for return of references checked out while we still | ||
3768 | * have been D_FAILED, though (drbd_md_sync, bitmap io). */ | ||
3769 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | ||
3770 | |||
3771 | clear_bit(GO_DISKLESS, &mdev->flags); | ||
3772 | return 1; | 3801 | return 1; |
3773 | } | 3802 | } |
3774 | 3803 | ||
@@ -3777,9 +3806,6 @@ void drbd_go_diskless(struct drbd_conf *mdev) | |||
3777 | D_ASSERT(mdev->state.disk == D_FAILED); | 3806 | D_ASSERT(mdev->state.disk == D_FAILED); |
3778 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) | 3807 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) |
3779 | drbd_queue_work(&mdev->data.work, &mdev->go_diskless); | 3808 | drbd_queue_work(&mdev->data.work, &mdev->go_diskless); |
3780 | /* don't drbd_queue_work_front, | ||
3781 | * we need to serialize with the after_state_ch work | ||
3782 | * of the -> D_FAILED transition. */ | ||
3783 | } | 3809 | } |
3784 | 3810 | ||
3785 | /** | 3811 | /** |