commit 674806d62fb02a22eea948c9f1b5e58e0947b728
tree 3367850a95d62713aa96acd2aecc493b66779398
parent 70fffd0bfab1558a8c64c5e903dea1fb84cd9f6b
author NeilBrown <neilb@suse.de> 2010-06-16 03:17:53 -0400
committer NeilBrown <neilb@suse.de> 2010-06-23 23:35:27 -0400
md/raid5: More careful check for "has array failed".
When we are reshaping an array, the device failure combinations that
cause us to decide that the array has failed are more subtle.

In particular, any 'spare' will be fully in-sync in the section of the
array that has already been reshaped, thus failures that affect only
that section are less critical.

So encode this subtlety in a new function and call it as appropriate.

The case that showed this problem was a 4-drive RAID5 to 8-drive RAID6
conversion where the last two devices failed.  This resulted in:

  good good good good incomplete good good failed failed

while converting a 5-drive RAID6 to 8-drive RAID5.  The incomplete
device causes the whole array to look bad, but as it was actually good
for the section that had been converted to 8 drives, all the data was
actually safe.

Reported-by: Terry Morris <tbmorris@tbmorris.com>
Signed-off-by: NeilBrown <neilb@suse.de>
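To make the two-section check concrete before the patch itself, here is a
minimal userspace model of the logic the new has_failed() implements. This
is an illustrative sketch, not kernel code: the model struct, the dev_state
values, and the 4-to-8 grow scenario in main() are invented stand-ins for
conf->disks[], the In_sync/Faulty rdev flags, and a case like the one
reported above.

#include <stdio.h>

/* Invented stand-ins for the kernel's per-device state: FAILED covers
 * both a missing slot (!rdev) and a device with the Faulty bit set;
 * RECOVERING is a spare that is present but not yet In_sync. */
enum dev_state { IN_SYNC, RECOVERING, FAILED };

struct model {
	int previous_raid_disks;	/* width before the reshape */
	int raid_disks;			/* width after the reshape  */
	int max_degraded;		/* 1 for RAID5, 2 for RAID6 */
	enum dev_state dev[16];
};

/* Mirrors the two passes of has_failed(): first the section still in
 * the old geometry, then the section already reshaped. */
static int model_has_failed(const struct model *m)
{
	int degraded = 0;
	int i;

	for (i = 0; i < m->previous_raid_disks; i++) {
		if (m->dev[i] == FAILED)
			degraded++;
		else if (m->dev[i] == RECOVERING &&
			 m->raid_disks >= m->previous_raid_disks)
			/* a spare being filled in by a growing reshape
			 * is not yet valid in the old geometry */
			degraded++;
	}
	if (degraded > m->max_degraded)
		return 1;

	degraded = 0;
	for (i = 0; i < m->raid_disks; i++) {
		if (m->dev[i] == FAILED)
			degraded++;
		else if (m->dev[i] == RECOVERING &&
			 m->raid_disks <= m->previous_raid_disks)
			/* only counts against us when shrinking; when
			 * growing, the reshaped section covers it */
			degraded++;
	}
	return degraded > m->max_degraded;
}

int main(void)
{
	/* Hypothetical grow of a 4-drive RAID6 to an 8-drive RAID6:
	 * the four original devices are healthy, two added spares are
	 * still recovering, and the last two devices have failed. */
	struct model m = {
		.previous_raid_disks = 4,
		.raid_disks = 8,
		.max_degraded = 2,
		.dev = { IN_SYNC, IN_SYNC, IN_SYNC, IN_SYNC,
			 RECOVERING, RECOVERING, FAILED, FAILED },
	};
	int working = 0;
	int i;

	for (i = 0; i < m.raid_disks; i++)
		if (m.dev[i] == IN_SYNC)
			working++;

	/* The old test used the single global count computed in run():
	 * degraded = max(raid_disks, previous_raid_disks) - working. */
	printf("old global check says failed: %d\n",
	       m.raid_disks - working > m.max_degraded);	/* prints 1 */
	printf("per-section check says failed: %d\n",
	       model_has_failed(&m));				/* prints 0 */
	return 0;
}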
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c  75
1 file changed, 71 insertions(+), 4 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f972a94bbc32..d4b233c25f2e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -366,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ * - start an array
+ * - remove non-faulty devices
+ * - add a spare
+ * - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
@@ -5006,7 +5073,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 		       " (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5244,7 +5311,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	 * isn't possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
-	    mddev->degraded <= conf->max_degraded &&
+	    !has_failed(conf) &&
 	    number < conf->raid_disks) {
 		err = -EBUSY;
 		goto abort;
@@ -5272,7 +5339,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5364,7 +5431,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must
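All four call sites that previously compared mddev->degraded against
conf->max_degraded (array start-up in run(), raid5_remove_disk(),
raid5_add_disk(), and check_reshape()) now go through the single
has_failed() predicate, so the reshape-aware policy lives in one place.
In the hypothetical grow modeled above, the old global comparison (4
non-in-sync slots out of 8 against a max_degraded of 2) would have
rejected the array even though each section was independently within
its redundancy limit.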