aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block/drbd
diff options
context:
space:
mode:
author: Lars Ellenberg <lars.ellenberg@linbit.com> 2010-10-16 06:13:47 -0400
committer: Philipp Reisner <philipp.reisner@linbit.com> 2010-10-22 09:46:11 -0400
commit: 82f59cc6353889b426cf13b6596d5a3d100fa09e (patch)
tree: 6d5a678516334f0a37a56a509b84322a0352719b /drivers/block/drbd
parent: 3beec1d446fba335f07787636920892dd3b2c658 (diff)
drbd: fix potential deadlock on detach
If we have contention in drbd_al_begin_io (heavy random IO), an administrative request to detach the disk may deadlock, for reasons similar to the recently fixed deadlock when detaching because of an IO error. The approach taken here is to either go through the intermediate cleanup state D_FAILED, or first lock out application IO, rather than going directly to D_DISKLESS. We need an additional state bit (WAS_IO_ERROR) to distinguish the transition -> D_FAILED caused by an IO error from other failures. Sanitize D_ATTACHING -> D_FAILED to D_ATTACHING -> D_DISKLESS: if we are only attaching, ldev may still be missing, but it would be referenced from within the after_state_ch for -> D_FAILED, potentially dereferencing a NULL pointer.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- drivers/block/drbd/drbd_int.h 20
-rw-r--r-- drivers/block/drbd/drbd_main.c 138
-rw-r--r-- drivers/block/drbd/drbd_nl.c 16
-rw-r--r-- drivers/block/drbd/drbd_receiver.c 2
4 files changed, 113 insertions, 63 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e0e0bf6f16a1..03c15e317c37 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -852,7 +852,8 @@ enum {
852 BITMAP_IO, /* suspend application io; 852 BITMAP_IO, /* suspend application io;
853 once no more io in flight, start bitmap io */ 853 once no more io in flight, start bitmap io */
854 BITMAP_IO_QUEUED, /* Started bitmap IO */ 854 BITMAP_IO_QUEUED, /* Started bitmap IO */
855 GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ 855 GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
856 WAS_IO_ERROR, /* Local disk failed returned IO error */
856 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ 857 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
857 NET_CONGESTED, /* The data socket is congested */ 858 NET_CONGESTED, /* The data socket is congested */
858 859
@@ -1281,6 +1282,7 @@ extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1281extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1282extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1282extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); 1283extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1283extern void drbd_go_diskless(struct drbd_conf *mdev); 1284extern void drbd_go_diskless(struct drbd_conf *mdev);
1285extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1284 1286
1285 1287
1286/* Meta data layout 1288/* Meta data layout
@@ -1798,17 +1800,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
1798 case EP_PASS_ON: 1800 case EP_PASS_ON:
1799 if (!forcedetach) { 1801 if (!forcedetach) {
1800 if (__ratelimit(&drbd_ratelimit_state)) 1802 if (__ratelimit(&drbd_ratelimit_state))
1801 dev_err(DEV, "Local IO failed in %s." 1803 dev_err(DEV, "Local IO failed in %s.\n", where);
1802 "Passing error on...\n", where);
1803 break; 1804 break;
1804 } 1805 }
1805 /* NOTE fall through to detach case if forcedetach set */ 1806 /* NOTE fall through to detach case if forcedetach set */
1806 case EP_DETACH: 1807 case EP_DETACH:
1807 case EP_CALL_HELPER: 1808 case EP_CALL_HELPER:
1809 set_bit(WAS_IO_ERROR, &mdev->flags);
1808 if (mdev->state.disk > D_FAILED) { 1810 if (mdev->state.disk > D_FAILED) {
1809 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); 1811 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1810 dev_err(DEV, "Local IO failed in %s." 1812 dev_err(DEV,
1811 "Detaching...\n", where); 1813 "Local IO failed in %s. Detaching...\n", where);
1812 } 1814 }
1813 break; 1815 break;
1814 } 1816 }
@@ -2127,7 +2129,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
2127 __release(local); 2129 __release(local);
2128 D_ASSERT(i >= 0); 2130 D_ASSERT(i >= 0);
2129 if (i == 0) { 2131 if (i == 0) {
2132 if (mdev->state.disk == D_DISKLESS)
2133 /* even internal references gone, safe to destroy */
2134 drbd_ldev_destroy(mdev);
2130 if (mdev->state.disk == D_FAILED) 2135 if (mdev->state.disk == D_FAILED)
2136 /* all application IO references gone. */
2131 drbd_go_diskless(mdev); 2137 drbd_go_diskless(mdev);
2132 wake_up(&mdev->misc_wait); 2138 wake_up(&mdev->misc_wait);
2133 } 2139 }
@@ -2138,6 +2144,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat
2138{ 2144{
2139 int io_allowed; 2145 int io_allowed;
2140 2146
2147 /* never get a reference while D_DISKLESS */
2148 if (mdev->state.disk == D_DISKLESS)
2149 return 0;
2150
2141 atomic_inc(&mdev->local_cnt); 2151 atomic_inc(&mdev->local_cnt);
2142 io_allowed = (mdev->state.disk >= mins); 2152 io_allowed = (mdev->state.disk >= mins);
2143 if (!io_allowed) 2153 if (!io_allowed)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8d029b14e7cc..992c3aecdf7e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -834,6 +834,15 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
834 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) 834 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
835 ns.conn = os.conn; 835 ns.conn = os.conn;
836 836
837 /* we cannot fail (again) if we already detached */
838 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
839 ns.disk = D_DISKLESS;
840
841 /* if we are only D_ATTACHING yet,
842 * we can (and should) go directly to D_DISKLESS. */
843 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
844 ns.disk = D_DISKLESS;
845
837 /* After C_DISCONNECTING only C_STANDALONE may follow */ 846 /* After C_DISCONNECTING only C_STANDALONE may follow */
838 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) 847 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
839 ns.conn = os.conn; 848 ns.conn = os.conn;
@@ -1055,7 +1064,15 @@ int __drbd_set_state(struct drbd_conf *mdev,
1055 !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) 1064 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1056 set_bit(DEVICE_DYING, &mdev->flags); 1065 set_bit(DEVICE_DYING, &mdev->flags);
1057 1066
1058 mdev->state.i = ns.i; 1067 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1068 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1069 * drbd_ldev_destroy() won't happen before our corresponding
1070 * after_state_ch works run, where we put_ldev again. */
1071 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1072 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1073 atomic_inc(&mdev->local_cnt);
1074
1075 mdev->state = ns;
1059 wake_up(&mdev->misc_wait); 1076 wake_up(&mdev->misc_wait);
1060 wake_up(&mdev->state_wait); 1077 wake_up(&mdev->state_wait);
1061 1078
@@ -1363,63 +1380,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1363 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1380 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1364 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1381 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1365 1382
1366 /* first half of local IO error */ 1383 /* first half of local IO error, failure to attach,
1367 if (os.disk > D_FAILED && ns.disk == D_FAILED) { 1384 * or administrative detach */
1368 enum drbd_io_error_p eh = EP_PASS_ON; 1385 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1386 enum drbd_io_error_p eh;
1387 int was_io_error;
1388 /* corresponding get_ldev was in __drbd_set_state, to serialize
1389 * our cleanup here with the transition to D_DISKLESS,
1390 * so it is safe to dreference ldev here. */
1391 eh = mdev->ldev->dc.on_io_error;
1392 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1393
1394 /* current state still has to be D_FAILED,
1395 * there is only one way out: to D_DISKLESS,
1396 * and that may only happen after our put_ldev below. */
1397 if (mdev->state.disk != D_FAILED)
1398 dev_err(DEV,
1399 "ASSERT FAILED: disk is %s during detach\n",
1400 drbd_disk_str(mdev->state.disk));
1369 1401
1370 if (drbd_send_state(mdev)) 1402 if (drbd_send_state(mdev))
1371 dev_warn(DEV, "Notified peer that my disk is broken.\n"); 1403 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1372 else 1404 else
1373 dev_err(DEV, "Sending state for drbd_io_error() failed\n"); 1405 dev_err(DEV, "Sending state for detaching disk failed\n");
1374 1406
1375 drbd_rs_cancel_all(mdev); 1407 drbd_rs_cancel_all(mdev);
1376 1408
1377 if (get_ldev_if_state(mdev, D_FAILED)) { 1409 /* In case we want to get something to stable storage still,
1378 eh = mdev->ldev->dc.on_io_error; 1410 * this may be the last chance.
1379 put_ldev(mdev); 1411 * Following put_ldev may transition to D_DISKLESS. */
1380 } 1412 drbd_md_sync(mdev);
1381 if (eh == EP_CALL_HELPER) 1413 put_ldev(mdev);
1414
1415 if (was_io_error && eh == EP_CALL_HELPER)
1382 drbd_khelper(mdev, "local-io-error"); 1416 drbd_khelper(mdev, "local-io-error");
1383 } 1417 }
1384 1418
1419 /* second half of local IO error, failure to attach,
1420 * or administrative detach,
1421 * after local_cnt references have reached zero again */
1422 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1423 /* We must still be diskless,
1424 * re-attach has to be serialized with this! */
1425 if (mdev->state.disk != D_DISKLESS)
1426 dev_err(DEV,
1427 "ASSERT FAILED: disk is %s while going diskless\n",
1428 drbd_disk_str(mdev->state.disk));
1385 1429
1386 /* second half of local IO error handling, 1430 mdev->rs_total = 0;
1387 * after local_cnt references have reached zero: */ 1431 mdev->rs_failed = 0;
1388 if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { 1432 atomic_set(&mdev->rs_pending_cnt, 0);
1389 mdev->rs_total = 0;
1390 mdev->rs_failed = 0;
1391 atomic_set(&mdev->rs_pending_cnt, 0);
1392 }
1393
1394 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1395 /* We must still be diskless,
1396 * re-attach has to be serialized with this! */
1397 if (mdev->state.disk != D_DISKLESS)
1398 dev_err(DEV,
1399 "ASSERT FAILED: disk is %s while going diskless\n",
1400 drbd_disk_str(mdev->state.disk));
1401 1433
1402 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state
1403 * will inc/dec it frequently. Since we became D_DISKLESS, no
1404 * one has touched the protected members anymore, though, so we
1405 * are safe to free them here. */
1406 if (drbd_send_state(mdev)) 1434 if (drbd_send_state(mdev))
1407 dev_warn(DEV, "Notified peer that I detached my disk.\n"); 1435 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1408 else 1436 else
1409 dev_err(DEV, "Sending state for detach failed\n"); 1437 dev_err(DEV, "Sending state for being diskless failed\n");
1410 1438 /* corresponding get_ldev in __drbd_set_state
1411 lc_destroy(mdev->resync); 1439 * this may finaly trigger drbd_ldev_destroy. */
1412 mdev->resync = NULL; 1440 put_ldev(mdev);
1413 lc_destroy(mdev->act_log);
1414 mdev->act_log = NULL;
1415 __no_warn(local,
1416 drbd_free_bc(mdev->ldev);
1417 mdev->ldev = NULL;);
1418
1419 if (mdev->md_io_tmpp) {
1420 __free_page(mdev->md_io_tmpp);
1421 mdev->md_io_tmpp = NULL;
1422 }
1423 } 1441 }
1424 1442
1425 /* Disks got bigger while they were detached */ 1443 /* Disks got bigger while they were detached */
@@ -2897,7 +2915,6 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2897 D_ASSERT(list_empty(&mdev->resync_work.list)); 2915 D_ASSERT(list_empty(&mdev->resync_work.list));
2898 D_ASSERT(list_empty(&mdev->unplug_work.list)); 2916 D_ASSERT(list_empty(&mdev->unplug_work.list));
2899 D_ASSERT(list_empty(&mdev->go_diskless.list)); 2917 D_ASSERT(list_empty(&mdev->go_diskless.list));
2900
2901} 2918}
2902 2919
2903 2920
@@ -3756,19 +3773,31 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3756 return 1; 3773 return 1;
3757} 3774}
3758 3775
3776void drbd_ldev_destroy(struct drbd_conf *mdev)
3777{
3778 lc_destroy(mdev->resync);
3779 mdev->resync = NULL;
3780 lc_destroy(mdev->act_log);
3781 mdev->act_log = NULL;
3782 __no_warn(local,
3783 drbd_free_bc(mdev->ldev);
3784 mdev->ldev = NULL;);
3785
3786 if (mdev->md_io_tmpp) {
3787 __free_page(mdev->md_io_tmpp);
3788 mdev->md_io_tmpp = NULL;
3789 }
3790 clear_bit(GO_DISKLESS, &mdev->flags);
3791}
3792
3759static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3793static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3760{ 3794{
3761 D_ASSERT(mdev->state.disk == D_FAILED); 3795 D_ASSERT(mdev->state.disk == D_FAILED);
3762 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will 3796 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3763 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch 3797 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3764 * the protected members anymore, though, so in the after_state_ch work 3798 * the protected members anymore, though, so once put_ldev reaches zero
3765 * it will be safe to free them. */ 3799 * again, it will be safe to free them. */
3766 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3800 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3767 /* We need to wait for return of references checked out while we still
3768 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
3769 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
3770
3771 clear_bit(GO_DISKLESS, &mdev->flags);
3772 return 1; 3801 return 1;
3773} 3802}
3774 3803
@@ -3777,9 +3806,6 @@ void drbd_go_diskless(struct drbd_conf *mdev)
3777 D_ASSERT(mdev->state.disk == D_FAILED); 3806 D_ASSERT(mdev->state.disk == D_FAILED);
3778 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) 3807 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3779 drbd_queue_work(&mdev->data.work, &mdev->go_diskless); 3808 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3780 /* don't drbd_queue_work_front,
3781 * we need to serialize with the after_state_ch work
3782 * of the -> D_FAILED transition. */
3783} 3809}
3784 3810
3785/** 3811/**
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c498c4827de4..0cba7d3d2b5d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -870,6 +870,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
870 retcode = ERR_DISK_CONFIGURED; 870 retcode = ERR_DISK_CONFIGURED;
871 goto fail; 871 goto fail;
872 } 872 }
873 /* It may just now have detached because of IO error. Make sure
874 * drbd_ldev_destroy is done already, we may end up here very fast,
875 * e.g. if someone calls attach from the on-io-error handler,
876 * to realize a "hot spare" feature (not that I'd recommend that) */
877 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
873 878
874 /* allocation not in the IO path, cqueue thread context */ 879 /* allocation not in the IO path, cqueue thread context */
875 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); 880 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
@@ -1262,7 +1267,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1262 force_diskless_dec: 1267 force_diskless_dec:
1263 put_ldev(mdev); 1268 put_ldev(mdev);
1264 force_diskless: 1269 force_diskless:
1265 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 1270 drbd_force_state(mdev, NS(disk, D_FAILED));
1266 drbd_md_sync(mdev); 1271 drbd_md_sync(mdev);
1267 release_bdev2_fail: 1272 release_bdev2_fail:
1268 if (nbc) 1273 if (nbc)
@@ -1285,10 +1290,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1285 return 0; 1290 return 0;
1286} 1291}
1287 1292
1293/* Detaching the disk is a process in multiple stages. First we need to lock
1294 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1295 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1296 * internal references as well.
1297 * Only then we have finally detached. */
1288static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1298static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1289 struct drbd_nl_cfg_reply *reply) 1299 struct drbd_nl_cfg_reply *reply)
1290{ 1300{
1301 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1291 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); 1302 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1303 if (mdev->state.disk == D_DISKLESS)
1304 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1305 drbd_resume_io(mdev);
1292 return 0; 1306 return 0;
1293} 1307}
1294 1308
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6ec922c623a1..04a823b01da5 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3363,7 +3363,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3363 if (ns.conn == C_MASK) { 3363 if (ns.conn == C_MASK) {
3364 ns.conn = C_CONNECTED; 3364 ns.conn = C_CONNECTED;
3365 if (mdev->state.disk == D_NEGOTIATING) { 3365 if (mdev->state.disk == D_NEGOTIATING) {
3366 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3366 drbd_force_state(mdev, NS(disk, D_FAILED));
3367 } else if (peer_state.disk == D_NEGOTIATING) { 3367 } else if (peer_state.disk == D_NEGOTIATING) {
3368 dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); 3368 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3369 peer_state.disk = D_DISKLESS; 3369 peer_state.disk = D_DISKLESS;