md/raid5: Allow dirty-degraded arrays to be assembled when only parity is degraded.

Normally it is not safe to allow a raid5 that is both dirty and degraded to be assembled without explicit request from the admin, as it can cause hidden data corruption. This is because 'dirty' means that the parity cannot be trusted, and 'degraded' means that the parity needs to be used. However, if the device that is missing contains only parity, then there is no issue and assembly can continue. This particularly applies when a RAID5 is being converted to a RAID6 and there is an unclean shutdown while the conversion is happening. So check for whether the degraded space only contains parity, and in that case, allow the assembly. Signed-off-by: NeilBrown <neilb@suse.de>
author: NeilBrown <neilb@suse.de> 2009-11-13 01:47:00 -0500
committer: NeilBrown <neilb@suse.de> 2009-11-13 01:47:00 -0500
commit: c148ffdcda00b6599b70f8b65e6a1fadd1dbb127 (patch)
tree: 3d50cc9dbef926f62a588dc1f45f1df304e1bf31 /drivers/md
parent: 7ef90146a14c2bb1de2e22399f147ebec5b74f0b (diff)
1 files changed, 75 insertions, 4 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ab40529bdabe..d29215d966da 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4823,11 +4823,40 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                return ERR_PTR(-ENOMEM);
 }
+static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
+{      /* Report whether device slot 'raid_disk' is one that, under layout
+        * 'algo' in an array of 'raid_disks' devices, is treated as holding
+        * only parity.  Returns 1 if so, 0 otherwise (including for every
+        * layout not listed in the switch below).
+        * NOTE(review): presumably the listed layouts are the ones that keep
+        * parity on fixed devices - confirm against the ALGORITHM_*
+        * definitions.
+        */
+        switch (algo) {
+        case ALGORITHM_PARITY_0:        /* slots below max_degraded qualify */
+                if (raid_disk < max_degraded)
+                        return 1;
+                break;
+        case ALGORITHM_PARITY_N:        /* the last max_degraded slots qualify */
+                if (raid_disk >= raid_disks - max_degraded)
+                        return 1;
+                break;
+        case ALGORITHM_PARITY_0_6:      /* the first and the last slot qualify */
+                if (raid_disk == 0 || 
+                    raid_disk == raid_disks - 1)
+                        return 1;
+                break;
+        case ALGORITHM_LEFT_ASYMMETRIC_6:
+        case ALGORITHM_RIGHT_ASYMMETRIC_6:
+        case ALGORITHM_LEFT_SYMMETRIC_6:
+        case ALGORITHM_RIGHT_SYMMETRIC_6:       /* only the last slot qualifies */
+                if (raid_disk == raid_disks - 1)
+                        return 1;
+        }       /* last case needs no break: it simply leaves the switch */
+        return 0;       /* any other layout or slot */
+}
 static int run(mddev_t *mddev)
 {
        raid5_conf_t *conf;
        int working_disks = 0, chunk_size;
+        int dirty_parity_disks = 0;
        mdk_rdev_t *rdev;
+        sector_t reshape_offset = 0;
        if (mddev->recovery_cp != MaxSector)
                printk(KERN_NOTICE "raid5: %s is not clean"
@@ -4861,6 +4890,7 @@ static int run(mddev_t *mddev)
                               "on a stripe boundary\n");
                        return -EINVAL;
                }
+                reshape_offset = here_new * mddev->new_chunk_sectors;
                /* here_new is the stripe we will write to */
                here_old = mddev->reshape_position;
                sector_div(here_old, mddev->chunk_sectors *
@@ -4916,10 +4946,51 @@ static int run(mddev_t *mddev)
        /*
         * 0 for a fully functional array, 1 or 2 for a degraded array.
         */
-        list_for_each_entry(rdev, &mddev->disks, same_set)
+        list_for_each_entry(rdev, &mddev->disks, same_set) {
-                if (rdev->raid_disk >= 0 &&
+                if (rdev->raid_disk < 0)
-                    test_bit(In_sync, &rdev->flags))
+                        continue;
+                if (test_bit(In_sync, &rdev->flags))
                        working_disks++;
+                /* This disc is not fully in-sync.  However if it
+                 * just stored parity (beyond the recovery_offset),
+                 * when we don't need to be concerned about the
+                 * array being dirty.
+                 * When reshape goes 'backwards', we never have
+                 * partially completed devices, so we only need
+                 * to worry about reshape going forwards.
+                 */
+                /* Hack because v0.91 doesn't store recovery_offset properly. */
+                if (mddev->major_version == 0 &&
+                    mddev->minor_version > 90)
+                        rdev->recovery_offset = reshape_offset;
+                        
+                printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
+                       rdev->raid_disk, working_disks, conf->prev_algo,
+                       conf->previous_raid_disks, conf->max_degraded,
+                       conf->algorithm, conf->raid_disks, 
+                       only_parity(rdev->raid_disk,
+                                   conf->prev_algo,
+                                   conf->previous_raid_disks,
+                                   conf->max_degraded),
+                       only_parity(rdev->raid_disk,
+                                   conf->algorithm,
+                                   conf->raid_disks,
+                                   conf->max_degraded));
+                if (rdev->recovery_offset < reshape_offset) {
+                        /* We need to check old and new layout */
+                        if (!only_parity(rdev->raid_disk,
+                                         conf->algorithm,
+                                         conf->raid_disks,
+                                         conf->max_degraded))
+                                continue;
+                }
+                if (!only_parity(rdev->raid_disk,
+                                 conf->prev_algo,
+                                 conf->previous_raid_disks,
+                                 conf->max_degraded))
+                        continue;
+                dirty_parity_disks++;
+        }
        mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
                           - working_disks);
@@ -4935,7 +5006,7 @@ static int run(mddev_t *mddev)
        mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
        mddev->resync_max_sectors = mddev->dev_sectors;
-        if (mddev->degraded > 0 &&
+        if (mddev->degraded > dirty_parity_disks &&
            mddev->recovery_cp != MaxSector) {
                if (mddev->ok_start_degraded)
                        printk(KERN_WARNING
author	NeilBrown <neilb@suse.de>	2009-11-13 01:47:00 -0500
committer	NeilBrown <neilb@suse.de>	2009-11-13 01:47:00 -0500
commit	c148ffdcda00b6599b70f8b65e6a1fadd1dbb127 (patch)
tree	3d50cc9dbef926f62a588dc1f45f1df304e1bf31 /drivers/md
parent	7ef90146a14c2bb1de2e22399f147ebec5b74f0b (diff)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ab40529bdabe..d29215d966da 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4823,11 +4823,40 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4823	return ERR_PTR(-ENOMEM);	4823	return ERR_PTR(-ENOMEM);
4824	}	4824	}
4825		4825
		4826
		4827	static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
		4828	{
		4829	switch (algo) {
		4830	case ALGORITHM_PARITY_0:
		4831	if (raid_disk < max_degraded)
		4832	return 1;
		4833	break;
		4834	case ALGORITHM_PARITY_N:
		4835	if (raid_disk >= raid_disks - max_degraded)
		4836	return 1;
		4837	break;
		4838	case ALGORITHM_PARITY_0_6:
		4839	if (raid_disk == 0 ||
		4840	raid_disk == raid_disks - 1)
		4841	return 1;
		4842	break;
		4843	case ALGORITHM_LEFT_ASYMMETRIC_6:
		4844	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		4845	case ALGORITHM_LEFT_SYMMETRIC_6:
		4846	case ALGORITHM_RIGHT_SYMMETRIC_6:
		4847	if (raid_disk == raid_disks - 1)
		4848	return 1;
		4849	}
		4850	return 0;
		4851	}
		4852
4826	static int run(mddev_t *mddev)	4853	static int run(mddev_t *mddev)
4827	{	4854	{
4828	raid5_conf_t *conf;	4855	raid5_conf_t *conf;
4829	int working_disks = 0, chunk_size;	4856	int working_disks = 0, chunk_size;
		4857	int dirty_parity_disks = 0;
4830	mdk_rdev_t *rdev;	4858	mdk_rdev_t *rdev;
		4859	sector_t reshape_offset = 0;
4831		4860
4832	if (mddev->recovery_cp != MaxSector)	4861	if (mddev->recovery_cp != MaxSector)
4833	printk(KERN_NOTICE "raid5: %s is not clean"	4862	printk(KERN_NOTICE "raid5: %s is not clean"
@@ -4861,6 +4890,7 @@ static int run(mddev_t *mddev)
4861	"on a stripe boundary\n");	4890	"on a stripe boundary\n");
4862	return -EINVAL;	4891	return -EINVAL;
4863	}	4892	}
		4893	reshape_offset = here_new * mddev->new_chunk_sectors;
4864	/* here_new is the stripe we will write to */	4894	/* here_new is the stripe we will write to */
4865	here_old = mddev->reshape_position;	4895	here_old = mddev->reshape_position;
4866	sector_div(here_old, mddev->chunk_sectors *	4896	sector_div(here_old, mddev->chunk_sectors *
@@ -4916,10 +4946,51 @@ static int run(mddev_t *mddev)
4916	/*	4946	/*
4917	* 0 for a fully functional array, 1 or 2 for a degraded array.	4947	* 0 for a fully functional array, 1 or 2 for a degraded array.
4918	*/	4948	*/
4919	list_for_each_entry(rdev, &mddev->disks, same_set)	4949	list_for_each_entry(rdev, &mddev->disks, same_set) {
4920	if (rdev->raid_disk >= 0 &&	4950	if (rdev->raid_disk < 0)
4921	test_bit(In_sync, &rdev->flags))	4951	continue;
		4952	if (test_bit(In_sync, &rdev->flags))
4922	working_disks++;	4953	working_disks++;
		4954	/* This disc is not fully in-sync. However if it
		4955	* just stored parity (beyond the recovery_offset),
		4956	* when we don't need to be concerned about the
		4957	* array being dirty.
		4958	* When reshape goes 'backwards', we never have
		4959	* partially completed devices, so we only need
		4960	* to worry about reshape going forwards.
		4961	*/
		4962	/* Hack because v0.91 doesn't store recovery_offset properly. */
		4963	if (mddev->major_version == 0 &&
		4964	mddev->minor_version > 90)
		4965	rdev->recovery_offset = reshape_offset;
		4966
		4967	printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
		4968	rdev->raid_disk, working_disks, conf->prev_algo,
		4969	conf->previous_raid_disks, conf->max_degraded,
		4970	conf->algorithm, conf->raid_disks,
		4971	only_parity(rdev->raid_disk,
		4972	conf->prev_algo,
		4973	conf->previous_raid_disks,
		4974	conf->max_degraded),
		4975	only_parity(rdev->raid_disk,
		4976	conf->algorithm,
		4977	conf->raid_disks,
		4978	conf->max_degraded));
		4979	if (rdev->recovery_offset < reshape_offset) {
		4980	/* We need to check old and new layout */
		4981	if (!only_parity(rdev->raid_disk,
		4982	conf->algorithm,
		4983	conf->raid_disks,
		4984	conf->max_degraded))
		4985	continue;
		4986	}
		4987	if (!only_parity(rdev->raid_disk,
		4988	conf->prev_algo,
		4989	conf->previous_raid_disks,
		4990	conf->max_degraded))
		4991	continue;
		4992	dirty_parity_disks++;
		4993	}
4923		4994
4924	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)	4995	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
4925	- working_disks);	4996	- working_disks);
@@ -4935,7 +5006,7 @@ static int run(mddev_t *mddev)
4935	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);	5006	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
4936	mddev->resync_max_sectors = mddev->dev_sectors;	5007	mddev->resync_max_sectors = mddev->dev_sectors;
4937		5008
4938	if (mddev->degraded > 0 &&	5009	if (mddev->degraded > dirty_parity_disks &&
4939	mddev->recovery_cp != MaxSector) {	5010	mddev->recovery_cp != MaxSector) {
4940	if (mddev->ok_start_degraded)	5011	if (mddev->ok_start_degraded)
4941	printk(KERN_WARNING	5012	printk(KERN_WARNING