aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
author: Lars Ellenberg <lars.ellenberg@linbit.com> 2010-10-07 10:07:55 -0400
committer: Philipp Reisner <philipp.reisner@linbit.com> 2010-10-14 12:38:48 -0400
commit: af85e8e83d160f72a10e4467852646ac08614260 (patch)
tree: b2c842d6129065bbdd787a810038752e6239b0ef /drivers/block
parent: e9ef7bb6f9696471ddddf0065afac8b435e5d051 (diff)
drbd: fix for spurious fullsync (uuids rotated too fast)
If it was an "empty" resync, the SyncSource may have already "finished" the resync and rotated the UUIDs, before noticing the connection loss (and generating a new uuid, if Primary, rotating again), while the SyncTarget did not change its uuids at all, or only got to the previous sync-uuid. This would then again lead to a full sync on next handshake (see also Bug #251).

Fix: Use explicit resync finished notification even for empty resyncs; do not finish an empty resync implicitly on the SyncSource.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- drivers/block/drbd/drbd_main.c 5
-rw-r--r-- drivers/block/drbd/drbd_worker.c 42
2 files changed, 36 insertions, 11 deletions
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index accb37d1215..63f45d730f3 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1426,6 +1426,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1426 (os.user_isp && !ns.user_isp)) 1426 (os.user_isp && !ns.user_isp))
1427 resume_next_sg(mdev); 1427 resume_next_sg(mdev);
1428 1428
1429 /* sync target done with resync. Explicitly notify peer, even though
1430 * it should (at least for non-empty resyncs) already know itself. */
1431 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1432 drbd_send_state(mdev);
1433
1429 /* free tl_hash if we Got thawed and are C_STANDALONE */ 1434 /* free tl_hash if we Got thawed and are C_STANDALONE */
1430 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) 1435 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1431 drbd_free_tl_hash(mdev); 1436 drbd_free_tl_hash(mdev);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 166b51ec7b6..88be45ad84e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -522,6 +522,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
522 dev_err(DEV, "%s in w_make_resync_request\n", 522 dev_err(DEV, "%s in w_make_resync_request\n",
523 drbd_conn_str(mdev->state.conn)); 523 drbd_conn_str(mdev->state.conn));
524 524
525 if (mdev->rs_total == 0) {
526 /* empty resync? */
527 drbd_resync_finished(mdev);
528 return 1;
529 }
530
525 if (!get_ldev(mdev)) { 531 if (!get_ldev(mdev)) {
526 /* Since we only need to access mdev->rsync a 532 /* Since we only need to access mdev->rsync a
527 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but 533 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -768,6 +774,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
768 return 1; 774 return 1;
769} 775}
770 776
777static void ping_peer(struct drbd_conf *mdev)
778{
779 clear_bit(GOT_PING_ACK, &mdev->flags);
780 request_ping(mdev);
781 wait_event(mdev->misc_wait,
782 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
783}
784
771int drbd_resync_finished(struct drbd_conf *mdev) 785int drbd_resync_finished(struct drbd_conf *mdev)
772{ 786{
773 unsigned long db, dt, dbdt; 787 unsigned long db, dt, dbdt;
@@ -807,6 +821,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
807 if (!get_ldev(mdev)) 821 if (!get_ldev(mdev))
808 goto out; 822 goto out;
809 823
824 ping_peer(mdev);
825
810 spin_lock_irq(&mdev->req_lock); 826 spin_lock_irq(&mdev->req_lock);
811 os = mdev->state; 827 os = mdev->state;
812 828
@@ -1420,14 +1436,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
1420 return retcode; 1436 return retcode;
1421} 1437}
1422 1438
1423static void ping_peer(struct drbd_conf *mdev)
1424{
1425 clear_bit(GOT_PING_ACK, &mdev->flags);
1426 request_ping(mdev);
1427 wait_event(mdev->misc_wait,
1428 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
1429}
1430
1431/** 1439/**
1432 * drbd_start_resync() - Start the resync process 1440 * drbd_start_resync() - Start the resync process
1433 * @mdev: DRBD device. 1441 * @mdev: DRBD device.
@@ -1527,9 +1535,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1527 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), 1535 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1528 (unsigned long) mdev->rs_total); 1536 (unsigned long) mdev->rs_total);
1529 1537
1530 if (mdev->rs_total == 0) { 1538 if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
1531 /* Peer still reachable? Beware of failing before-resync-target handlers! */ 1539 /* This still has a race (about when exactly the peers
1532 ping_peer(mdev); 1540 * detect connection loss) that can lead to a full sync
1541 * on next handshake. In 8.3.9 we fixed this with explicit
1542 * resync-finished notifications, but the fix
1543 * introduces a protocol change. Sleeping for some
1544 * time longer than the ping interval + timeout on the
1545 * SyncSource, to give the SyncTarget the chance to
1546 * detect connection loss, then waiting for a ping
1547 * response (implicit in drbd_resync_finished) reduces
1548 * the race considerably, but does not solve it. */
1549 if (side == C_SYNC_SOURCE)
1550 schedule_timeout_interruptible(
1551 mdev->net_conf->ping_int * HZ +
1552 mdev->net_conf->ping_timeo*HZ/9);
1533 drbd_resync_finished(mdev); 1553 drbd_resync_finished(mdev);
1534 } 1554 }
1535 1555