md-cluster/raid10: support add disk under grow mode

For the clustered raid10 scenario, all nodes need to know that a new disk has been added to the array. The reshape caused by adding a new member only needs to happen on one node, but the other nodes must know about the change. A reshape reads data from a region that is already used by the array and writes it to an unused region, so it would obviously be awful if one node were reading from an address while another node was writing to the same address. Since we have already implemented suspending writes in the resyncing area, we can simply broadcast the reading address to the other nodes to avoid the trouble. The master node calls reshape_request and then updates the sb during the reshape period. To avoid the above trouble, we call resync_info_update in reshape_request to send a RESYNCING message. From a slave node's point of view, it then receives two types of message: 1. RESYNCING message: the slave node adds the address (where the master node is reading data from) to its suspend list. 2. METADATA_UPDATED message: once a slave node knows that the reshape has started on the master node, it is time to update the reshape position and call start_reshape to follow the master node's steps. After the reshape is done, only the reshape position needs to be updated, so the majority of the reshape work happens on the master node. Reviewed-by: NeilBrown <neilb@suse.com> Signed-off-by: Guoqing Jiang <gqjiang@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
author: Guoqing Jiang <gqjiang@suse.com> 2018-10-18 04:37:42 -0400
committer: Shaohua Li <shli@fb.com> 2018-10-18 12:34:56 -0400
commit: 7564beda19b3646d781934d04fc382b738053e6f (patch)
tree: de2367e01d0a2392b7c5cecdadd95da72afa6251 /drivers/md
parent: afd75628608337cf427a1f9ca0e46698a74f25d8 (diff)
3 files changed, 59 insertions, 0 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4c0f3e0331d5..e07096c4ff20 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9230,6 +9230,30 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
        if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
                update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+        /*
+         * mddev->delta_disks has already been updated in update_raid_disks,
+         * so it is time to check reshape.
+         */
+        if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+            (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+                /*
+                 * reshape is happening in the remote node, we need to
+                 * update reshape_position and call start_reshape.
+                 */
+                mddev->reshape_position = sb->reshape_position;
+                if (mddev->pers->update_reshape_pos)
+                        mddev->pers->update_reshape_pos(mddev);
+                if (mddev->pers->start_reshape)
+                        mddev->pers->start_reshape(mddev);
+        } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+                   mddev->reshape_position != MaxSector &&
+                   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+                /* reshape is just done in another node. */
+                mddev->reshape_position = MaxSector;
+                if (mddev->pers->update_reshape_pos)
+                        mddev->pers->update_reshape_pos(mddev);
+        }
        /* Finally set the event to be up to date */
        mddev->events = le64_to_cpu(sb->events);
 }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8afd6bfdbfb9..c52afb52c776 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -557,6 +557,7 @@ struct md_personality
        int (*check_reshape) (struct mddev *mddev);
        int (*start_reshape) (struct mddev *mddev);
        void (*finish_reshape) (struct mddev *mddev);
+        void (*update_reshape_pos) (struct mddev *mddev);
        /* quiesce suspends or resumes internal processing.
         * 1 - stop new actions and wait for action io to complete
         * 0 - return to normal behaviour
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 72e52921c545..1edd58a3098b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4605,6 +4605,32 @@ read_more:
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+        /*
+         * Broadcast RESYNC message to other nodes, so that no node writes
+         * to the region while it is being reshaped, avoiding conflicts.
+         */
+        if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+                struct mdp_superblock_1 *sb = NULL;
+                int sb_reshape_pos = 0;
+                conf->cluster_sync_low = sector_nr;
+                conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+                sb = page_address(rdev->sb_page);
+                if (sb) {
+                        sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+                        /*
+                         * Set cluster_sync_low again if the next address for
+                         * array reshape is less than cluster_sync_low, since we
+                         * can't update cluster_sync_low until reshape finishes.
+                         */
+                        if (sb_reshape_pos < conf->cluster_sync_low)
+                                conf->cluster_sync_low = sb_reshape_pos;
+                }
+                md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+                                                          conf->cluster_sync_high);
+        }
        /* Now find the locations in the new layout */
        __raid10_find_phys(&conf->geo, r10_bio);
@@ -4756,6 +4782,13 @@ static void end_reshape(struct r10conf *conf)
        conf->fullsync = 0;
 }
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+        struct r10conf *conf = mddev->private;
+        conf->reshape_progress = mddev->reshape_position;
+}
 static int handle_reshape_read_error(struct mddev *mddev,
                                     struct r10bio *r10_bio)
 {
@@ -4924,6 +4957,7 @@ static struct md_personality raid10_personality =
        .check_reshape  = raid10_check_reshape,
        .start_reshape  = raid10_start_reshape,
        .finish_reshape = raid10_finish_reshape,
+        .update_reshape_pos = raid10_update_reshape_pos,
        .congested      = raid10_congested,
 };
author	Guoqing Jiang <gqjiang@suse.com>	2018-10-18 04:37:42 -0400
committer	Shaohua Li <shli@fb.com>	2018-10-18 12:34:56 -0400
commit	7564beda19b3646d781934d04fc382b738053e6f (patch)
tree	de2367e01d0a2392b7c5cecdadd95da72afa6251 /drivers/md
parent	afd75628608337cf427a1f9ca0e46698a74f25d8 (diff)

diff --git a/drivers/md/md.c b/drivers/md/md.c index 4c0f3e0331d5..e07096c4ff20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c
@@ -9230,6 +9230,30 @@ static void check_sb_changes(struct mddev mddev, struct md_rdev rdev)
9230	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))	9230	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
9231	update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));	9231	update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9232		9232
		9233	/*
		9234	* Since mddev->delta_disks has already updated in update_raid_disks,
		9235	* so it is time to check reshape.
		9236	*/
		9237	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		9238	(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		9239	/*
		9240	* reshape is happening in the remote node, we need to
		9241	* update reshape_position and call start_reshape.
		9242	*/
		9243	mddev->reshape_position = sb->reshape_position;
		9244	if (mddev->pers->update_reshape_pos)
		9245	mddev->pers->update_reshape_pos(mddev);
		9246	if (mddev->pers->start_reshape)
		9247	mddev->pers->start_reshape(mddev);
		9248	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		9249	mddev->reshape_position != MaxSector &&
		9250	!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		9251	/* reshape is just done in another node. */
		9252	mddev->reshape_position = MaxSector;
		9253	if (mddev->pers->update_reshape_pos)
		9254	mddev->pers->update_reshape_pos(mddev);
		9255	}
		9256
9233	/* Finally set the event to be up to date */	9257	/* Finally set the event to be up to date */
9234	mddev->events = le64_to_cpu(sb->events);	9258	mddev->events = le64_to_cpu(sb->events);
9235	}	9259	}


diff --git a/drivers/md/md.h b/drivers/md/md.h index 8afd6bfdbfb9..c52afb52c776 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h
@@ -557,6 +557,7 @@ struct md_personality
557	int (check_reshape) (struct mddev mddev);	557	int (check_reshape) (struct mddev mddev);
558	int (start_reshape) (struct mddev mddev);	558	int (start_reshape) (struct mddev mddev);
559	void (finish_reshape) (struct mddev mddev);	559	void (finish_reshape) (struct mddev mddev);
		560	void (update_reshape_pos) (struct mddev mddev);
560	/* quiesce suspends or resumes internal processing.	561	/* quiesce suspends or resumes internal processing.
561	* 1 - stop new actions and wait for action io to complete	562	* 1 - stop new actions and wait for action io to complete
562	* 0 - return to normal behaviour	563	* 0 - return to normal behaviour


diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 72e52921c545..1edd58a3098b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c
@@ -4605,6 +4605,32 @@ read_more:
4605	r10_bio->master_bio = read_bio;	4605	r10_bio->master_bio = read_bio;
4606	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;	4606	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4607		4607
		4608	/*
		4609	* Broadcast RESYNC message to other nodes, so all nodes would not
		4610	* write to the region to avoid conflict.
		4611	*/
		4612	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
		4613	struct mdp_superblock_1 *sb = NULL;
		4614	int sb_reshape_pos = 0;
		4615
		4616	conf->cluster_sync_low = sector_nr;
		4617	conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
		4618	sb = page_address(rdev->sb_page);
		4619	if (sb) {
		4620	sb_reshape_pos = le64_to_cpu(sb->reshape_position);
		4621	/*
		4622	* Set cluster_sync_low again if next address for array
		4623	* reshape is less than cluster_sync_low. Since we can't
		4624	* update cluster_sync_low until it has finished reshape.
		4625	*/
		4626	if (sb_reshape_pos < conf->cluster_sync_low)
		4627	conf->cluster_sync_low = sb_reshape_pos;
		4628	}
		4629
		4630	md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
		4631	conf->cluster_sync_high);
		4632	}
		4633
4608	/* Now find the locations in the new layout */	4634	/* Now find the locations in the new layout */
4609	__raid10_find_phys(&conf->geo, r10_bio);	4635	__raid10_find_phys(&conf->geo, r10_bio);
4610		4636
@@ -4756,6 +4782,13 @@ static void end_reshape(struct r10conf *conf)
4756	conf->fullsync = 0;	4782	conf->fullsync = 0;
4757	}	4783	}
4758		4784
		4785	static void raid10_update_reshape_pos(struct mddev *mddev)
		4786	{
		4787	struct r10conf *conf = mddev->private;
		4788
		4789	conf->reshape_progress = mddev->reshape_position;
		4790	}
		4791
4759	static int handle_reshape_read_error(struct mddev *mddev,	4792	static int handle_reshape_read_error(struct mddev *mddev,
4760	struct r10bio *r10_bio)	4793	struct r10bio *r10_bio)
4761	{	4794	{
@@ -4924,6 +4957,7 @@ static struct md_personality raid10_personality =
4924	.check_reshape = raid10_check_reshape,	4957	.check_reshape = raid10_check_reshape,
4925	.start_reshape = raid10_start_reshape,	4958	.start_reshape = raid10_start_reshape,
4926	.finish_reshape = raid10_finish_reshape,	4959	.finish_reshape = raid10_finish_reshape,
		4960	.update_reshape_pos = raid10_update_reshape_pos,
4927	.congested = raid10_congested,	4961	.congested = raid10_congested,
4928	};	4962	};
4929		4963