diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2010-10-07 10:07:55 -0400 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2010-10-14 12:38:48 -0400 |
commit | af85e8e83d160f72a10e4467852646ac08614260 (patch) | |
tree | b2c842d6129065bbdd787a810038752e6239b0ef /drivers/block | |
parent | e9ef7bb6f9696471ddddf0065afac8b435e5d051 (diff) |
drbd: fix for spurious fullsync (uuids rotated too fast)
If the resync was "empty", the SyncSource may already have "finished"
it and rotated its UUIDs before noticing the connection loss (and then,
if Primary, generated a new UUID, rotating once more), while the
SyncTarget did not change its UUIDs at all, or only got as far as the
previous sync-UUID.
On the next handshake, this would once again lead to a full sync
(see also Bug #251).
Fix:
Use an explicit resync-finished notification even for empty resyncs;
do not finish an empty resync implicitly on the SyncSource.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 5 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 42 |
2 files changed, 36 insertions, 11 deletions
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index accb37d1215..63f45d730f3 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -1426,6 +1426,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1426 | (os.user_isp && !ns.user_isp)) | 1426 | (os.user_isp && !ns.user_isp)) |
1427 | resume_next_sg(mdev); | 1427 | resume_next_sg(mdev); |
1428 | 1428 | ||
1429 | /* sync target done with resync. Explicitly notify peer, even though | ||
1430 | * it should (at least for non-empty resyncs) already know itself. */ | ||
1431 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1432 | drbd_send_state(mdev); | ||
1433 | |||
1429 | /* free tl_hash if we Got thawed and are C_STANDALONE */ | 1434 | /* free tl_hash if we Got thawed and are C_STANDALONE */ |
1430 | if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) | 1435 | if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) |
1431 | drbd_free_tl_hash(mdev); | 1436 | drbd_free_tl_hash(mdev); |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 166b51ec7b6..88be45ad84e 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -522,6 +522,12 @@ int w_make_resync_request(struct drbd_conf *mdev, | |||
522 | dev_err(DEV, "%s in w_make_resync_request\n", | 522 | dev_err(DEV, "%s in w_make_resync_request\n", |
523 | drbd_conn_str(mdev->state.conn)); | 523 | drbd_conn_str(mdev->state.conn)); |
524 | 524 | ||
525 | if (mdev->rs_total == 0) { | ||
526 | /* empty resync? */ | ||
527 | drbd_resync_finished(mdev); | ||
528 | return 1; | ||
529 | } | ||
530 | |||
525 | if (!get_ldev(mdev)) { | 531 | if (!get_ldev(mdev)) { |
526 | /* Since we only need to access mdev->rsync a | 532 | /* Since we only need to access mdev->rsync a |
527 | get_ldev_if_state(mdev,D_FAILED) would be sufficient, but | 533 | get_ldev_if_state(mdev,D_FAILED) would be sufficient, but |
@@ -768,6 +774,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
768 | return 1; | 774 | return 1; |
769 | } | 775 | } |
770 | 776 | ||
777 | static void ping_peer(struct drbd_conf *mdev) | ||
778 | { | ||
779 | clear_bit(GOT_PING_ACK, &mdev->flags); | ||
780 | request_ping(mdev); | ||
781 | wait_event(mdev->misc_wait, | ||
782 | test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); | ||
783 | } | ||
784 | |||
771 | int drbd_resync_finished(struct drbd_conf *mdev) | 785 | int drbd_resync_finished(struct drbd_conf *mdev) |
772 | { | 786 | { |
773 | unsigned long db, dt, dbdt; | 787 | unsigned long db, dt, dbdt; |
@@ -807,6 +821,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
807 | if (!get_ldev(mdev)) | 821 | if (!get_ldev(mdev)) |
808 | goto out; | 822 | goto out; |
809 | 823 | ||
824 | ping_peer(mdev); | ||
825 | |||
810 | spin_lock_irq(&mdev->req_lock); | 826 | spin_lock_irq(&mdev->req_lock); |
811 | os = mdev->state; | 827 | os = mdev->state; |
812 | 828 | ||
@@ -1420,14 +1436,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na) | |||
1420 | return retcode; | 1436 | return retcode; |
1421 | } | 1437 | } |
1422 | 1438 | ||
1423 | static void ping_peer(struct drbd_conf *mdev) | ||
1424 | { | ||
1425 | clear_bit(GOT_PING_ACK, &mdev->flags); | ||
1426 | request_ping(mdev); | ||
1427 | wait_event(mdev->misc_wait, | ||
1428 | test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); | ||
1429 | } | ||
1430 | |||
1431 | /** | 1439 | /** |
1432 | * drbd_start_resync() - Start the resync process | 1440 | * drbd_start_resync() - Start the resync process |
1433 | * @mdev: DRBD device. | 1441 | * @mdev: DRBD device. |
@@ -1527,9 +1535,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1527 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | 1535 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), |
1528 | (unsigned long) mdev->rs_total); | 1536 | (unsigned long) mdev->rs_total); |
1529 | 1537 | ||
1530 | if (mdev->rs_total == 0) { | 1538 | if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { |
1531 | /* Peer still reachable? Beware of failing before-resync-target handlers! */ | 1539 | /* This still has a race (about when exactly the peers |
1532 | ping_peer(mdev); | 1540 | * detect connection loss) that can lead to a full sync |
1541 | * on next handshake. In 8.3.9 we fixed this with explicit | ||
1542 | * resync-finished notifications, but the fix | ||
1543 | * introduces a protocol change. Sleeping for some | ||
1544 | * time longer than the ping interval + timeout on the | ||
1545 | * SyncSource, to give the SyncTarget the chance to | ||
1546 | * detect connection loss, then waiting for a ping | ||
1547 | * response (implicit in drbd_resync_finished) reduces | ||
1548 | * the race considerably, but does not solve it. */ | ||
1549 | if (side == C_SYNC_SOURCE) | ||
1550 | schedule_timeout_interruptible( | ||
1551 | mdev->net_conf->ping_int * HZ + | ||
1552 | mdev->net_conf->ping_timeo*HZ/9); | ||
1533 | drbd_resync_finished(mdev); | 1553 | drbd_resync_finished(mdev); |
1534 | } | 1554 | } |
1535 | 1555 | ||