diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2012-07-30 03:10:41 -0400 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2012-11-08 10:58:40 -0500 |
commit | a324896b173e569fb831c5caa04ccd02ec0bc9ca (patch) | |
tree | fedb4c82e66c304c6ced91a9e83538af735ddb45 /drivers/block/drbd | |
parent | 8a943170711b7a4d63528ea8eb6a41cc91e79309 (diff) |
drbd: do not reset rs_pending_cnt too early
Fix asserts like
block drbd0: in got_BlockAck:4634: rs_pending_cnt = -35 < 0 !
We reset the resync LRU cache and related information (rs_pending_cnt)
once we have successfully finished a resync or online verify, or if the
replication connection is lost.
We also need to reset it if a resync or online verify is aborted
because a lower level disk failed.
In that case the replication link is still established,
and we may still have packets queued in the network buffers
which want to touch rs_pending_cnt.
We do not have any synchronization mechanism to know for sure when all
such pending resync-related packets have been drained.
To prevent this counter from going negative (and violating the ASSERT that
it will always be >= 0), just do not reset it when we lose a disk.
It is good enough to make sure it is re-initialized before the next
resync can start: reset it when we re-attach a disk.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 5 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 11 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 8 |
3 files changed, 12 insertions, 12 deletions
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 05ed4804c72c..a2925dedc23f 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -1309,6 +1309,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1309 | /* make sure there is no leftover from previous force-detach attempts */ | 1309 | /* make sure there is no leftover from previous force-detach attempts */ |
1310 | clear_bit(FORCE_DETACH, &mdev->flags); | 1310 | clear_bit(FORCE_DETACH, &mdev->flags); |
1311 | 1311 | ||
1312 | /* and no leftover from previously aborted resync or verify, either */ | ||
1313 | mdev->rs_total = 0; | ||
1314 | mdev->rs_failed = 0; | ||
1315 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1316 | |||
1312 | /* allocation not in the IO path, drbdsetup context */ | 1317 | /* allocation not in the IO path, drbdsetup context */ |
1313 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | 1318 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); |
1314 | if (!nbc) { | 1319 | if (!nbc) { |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index c9ec7d37632c..ad307fb8dc28 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -1216,6 +1216,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1216 | /* Do not change the order of the if above and the two below... */ | 1216 | /* Do not change the order of the if above and the two below... */ |
1217 | if (os.pdsk == D_DISKLESS && | 1217 | if (os.pdsk == D_DISKLESS && |
1218 | ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ | 1218 | ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ |
1219 | /* we probably will start a resync soon. | ||
1220 | * make sure those things are properly reset. */ | ||
1221 | mdev->rs_total = 0; | ||
1222 | mdev->rs_failed = 0; | ||
1223 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1224 | drbd_rs_cancel_all(mdev); | ||
1225 | |||
1219 | drbd_send_uuids(mdev); | 1226 | drbd_send_uuids(mdev); |
1220 | drbd_send_state(mdev, ns); | 1227 | drbd_send_state(mdev, ns); |
1221 | } | 1228 | } |
@@ -1386,10 +1393,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1386 | "ASSERT FAILED: disk is %s while going diskless\n", | 1393 | "ASSERT FAILED: disk is %s while going diskless\n", |
1387 | drbd_disk_str(mdev->state.disk)); | 1394 | drbd_disk_str(mdev->state.disk)); |
1388 | 1395 | ||
1389 | mdev->rs_total = 0; | ||
1390 | mdev->rs_failed = 0; | ||
1391 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1392 | |||
1393 | if (ns.conn >= C_CONNECTED) | 1396 | if (ns.conn >= C_CONNECTED) |
1394 | drbd_send_state(mdev, ns); | 1397 | drbd_send_state(mdev, ns); |
1395 | /* corresponding get_ldev in __drbd_set_state | 1398 | /* corresponding get_ldev in __drbd_set_state |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 07a4046dd8c3..9d7e1fb0f431 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -1549,14 +1549,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1549 | return; | 1549 | return; |
1550 | } | 1550 | } |
1551 | 1551 | ||
1552 | if (mdev->state.conn < C_AHEAD) { | ||
1553 | /* In case a previous resync run was aborted by an IO error/detach on the peer. */ | ||
1554 | drbd_rs_cancel_all(mdev); | ||
1555 | /* This should be done when we abort the resync. We definitely do not | ||
1556 | want to have this for connections going back and forth between | ||
1557 | Ahead/Behind and SyncSource/SyncTarget */ | ||
1558 | } | ||
1559 | |||
1560 | if (!test_bit(B_RS_H_DONE, &mdev->flags)) { | 1552 | if (!test_bit(B_RS_H_DONE, &mdev->flags)) { |
1561 | if (side == C_SYNC_TARGET) { | 1553 | if (side == C_SYNC_TARGET) { |
1562 | /* Since application IO was locked out during C_WF_BITMAP_T and | 1554 | /* Since application IO was locked out during C_WF_BITMAP_T and |