commit 93416253073511716f7e70c06e32c3810c3deac4
tree   7e6a4c7dab40596f6b622f0eaa4b3366ed671b79
parent b4322e7057ca851b0a3e15f29e26806efeada100
parent 3424bf6a772cff606fc4bc24a3639c937afb547f
author    Linus Torvalds <torvalds@linux-foundation.org>  2010-06-28 01:56:32 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2010-06-28 01:56:32 -0400
Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md/raid5: don't include 'spare' drives when reshaping to fewer devices.
  md/raid5: add a missing 'continue' in a loop.
  md/raid5: Allow recovered part of partially recovered devices to be in-sync
  md/raid5: More careful check for "has array failed".
  md: Don't update ->recovery_offset when reshaping an array to fewer devices.
  md/raid5: avoid oops when number of devices is reduced then increased.
  md: enable raid4->raid0 takeover
  md: clear layout after ->raid0 takeover
  md: fix raid10 takeover: use new_layout for setup_conf
  md: fix handling of array level takeover that re-arranges devices.
  md: raid10: Fix null pointer dereference in fix_read_error()
  Restore partition detection of newly created md arrays.
-rw-r--r--  drivers/md/md.c      38
-rw-r--r--  drivers/md/md.h       3
-rw-r--r--  drivers/md/raid0.c   21
-rw-r--r--  drivers/md/raid0.h    3
-rw-r--r--  drivers/md/raid10.c  46
-rw-r--r--  drivers/md/raid10.h   5
-rw-r--r--  drivers/md/raid5.c  150
7 files changed, 187 insertions(+), 79 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 46b3a044eadf..cb20d0b0555a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 	/* First make sure individual recovery_offsets are correct */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk >= 0 &&
+		    mddev->delta_disks >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    mddev->curr_resync_completed > rdev->recovery_offset)
 			rdev->recovery_offset = mddev->curr_resync_completed;
@@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EINVAL;
 	}
 
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		rdev->new_raid_disk = rdev->raid_disk;
+
 	/* ->takeover must set new_* and/or delta_disks
 	 * if it succeeds, and may set them when it fails.
 	 */
@@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		mddev->safemode = 0;
 	}
 
-	module_put(mddev->pers->owner);
-	/* Invalidate devices that are now superfluous */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= mddev->raid_disks) {
-			rdev->raid_disk = -1;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		char nm[20];
+		if (rdev->raid_disk < 0)
+			continue;
+		if (rdev->new_raid_disk > mddev->raid_disks)
+			rdev->new_raid_disk = -1;
+		if (rdev->new_raid_disk == rdev->raid_disk)
+			continue;
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		sysfs_remove_link(&mddev->kobj, nm);
+	}
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk < 0)
+			continue;
+		if (rdev->new_raid_disk == rdev->raid_disk)
+			continue;
+		rdev->raid_disk = rdev->new_raid_disk;
+		if (rdev->raid_disk < 0)
 			clear_bit(In_sync, &rdev->flags);
+		else {
+			char nm[20];
+			sprintf(nm, "rd%d", rdev->raid_disk);
+			if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+				printk("md: cannot register %s for %s after level change\n",
+				       nm, mdname(mddev));
 		}
+	}
+
+	module_put(mddev->pers->owner);
 	mddev->pers = pers;
 	mddev->private = priv;
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
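
A note on the level_store() hunk above: the sysfs "rd%d" links are moved in two passes. Every stale link is removed while the old slot numbers are still valid, and only then are the new links created, which also avoids name collisions when a takeover permutes slots. A minimal userspace sketch of the same pattern (toy types and printf stand-ins, not kernel code):

/*
 * Hypothetical userspace sketch of the remove-all/create-all pattern used
 * in level_store().  When a takeover permutes slots (raid0->raid10 doubles
 * them, for instance), creating "rd%d" links in a single pass could try a
 * name that a later device has not yet released.  Removing every stale
 * link first makes the order of creation irrelevant.
 */
#include <stdio.h>

struct toy_rdev { int raid_disk; int new_raid_disk; };

static void move_links(struct toy_rdev *devs, int n)
{
	int i;

	/* pass 1: drop links for every device whose slot changes */
	for (i = 0; i < n; i++)
		if (devs[i].raid_disk >= 0 &&
		    devs[i].new_raid_disk != devs[i].raid_disk)
			printf("remove rd%d\n", devs[i].raid_disk);

	/* pass 2: every target name is now free; create the new links */
	for (i = 0; i < n; i++)
		if (devs[i].raid_disk >= 0 &&
		    devs[i].new_raid_disk != devs[i].raid_disk) {
			devs[i].raid_disk = devs[i].new_raid_disk;
			printf("create rd%d\n", devs[i].raid_disk);
		}
}

int main(void)
{
	/* raid0 -> raid10 takeover: slots 0,1 become 0,2 */
	struct toy_rdev devs[] = { { 0, 0 }, { 1, 2 } };

	move_links(devs, 2);
	return 0;
}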
@@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	atomic_inc(&mddev->openers);
 	mutex_unlock(&mddev->open_mutex);
 
+	check_disk_size_change(mddev->gendisk, bdev);
  out:
 	return err;
 }
@@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev)
 	rcu_read_lock();
 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
+		    mddev->delta_disks >= 0 &&
 		    !test_bit(Faulty, &rdev->flags) &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    rdev->recovery_offset < mddev->curr_resync)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7ab5ea155452..10597bfec000 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -78,6 +78,9 @@ struct mdk_rdev_s
 
 	int desc_nr;		/* descriptor index in the superblock */
 	int raid_disk;		/* role of device in array */
+	int new_raid_disk;	/* role that the device will have in
+				 * the array after a level-change completes.
+				 */
 	int saved_raid_disk;	/* role that device used to have in the
 				 * array and could again if we did a partial
 				 * resync from the bitmap
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e70f004c99e8..563abed5a2cb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
 
-		if (mddev->level == 10)
+		if (mddev->level == 10) {
 			/* taking over a raid10-n2 array */
 			j /= 2;
+			rdev1->new_raid_disk = j;
+		}
 
 		if (j < 0 || j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
@@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev)
 		mddev->private = conf;
 	}
 	conf = mddev->private;
-	if (conf->scale_raid_disks) {
-		int i;
-		for (i=0; i < conf->strip_zone[0].nb_dev; i++)
-			conf->devlist[i]->raid_disk /= conf->scale_raid_disks;
-		/* FIXME update sysfs rd links */
-	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
-static void *raid0_takeover_raid5(mddev_t *mddev)
+static void *raid0_takeover_raid45(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 	raid0_conf_t *priv_conf;
@@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev)
 
 	/* Set new parameters */
 	mddev->new_level = 0;
+	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->raid_disks--;
 	mddev->delta_disks = -1;
@@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 
 	/* Set new parameters */
 	mddev->new_level = 0;
+	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = - mddev->raid_disks / 2;
 	mddev->raid_disks += mddev->delta_disks;
@@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 	mddev->recovery_cp = MaxSector;
 
 	create_strip_zones(mddev, &priv_conf);
-	priv_conf->scale_raid_disks = 2;
 	return priv_conf;
 }
 
 static void *raid0_takeover(mddev_t *mddev)
 {
 	/* raid0 can take over:
+	 *  raid4 - if all data disks are active.
 	 *  raid5 - providing it is Raid4 layout and one disk is faulty
 	 *  raid10 - assuming we have all necessary active disks
 	 */
+	if (mddev->level == 4)
+		return raid0_takeover_raid45(mddev);
+
 	if (mddev->level == 5) {
 		if (mddev->layout == ALGORITHM_PARITY_N)
-			return raid0_takeover_raid5(mddev);
+			return raid0_takeover_raid45(mddev);
 
 		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
 		       mdname(mddev), ALGORITHM_PARITY_N);
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index d724e664ca4d..91f8e876ee64 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -13,9 +13,6 @@ struct raid0_private_data
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-	int scale_raid_disks; /* divide rdev->raid_disks by this in run()
-			       * to handle conversion from raid10
-			       */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 03724992cdf2..42e64e4e5e25 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int sectors = r10_bio->sectors;
 	mdk_rdev_t*rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
 	rcu_read_lock();
-	{
-		int d = r10_bio->devs[r10_bio->read_slot].devnum;
+	rdev = rcu_dereference(conf->mirrors[d].rdev);
+	if (rdev) { /* If rdev is not NULL */
 		char b[BDEVNAME_SIZE];
 		int cur_read_error_count = 0;
 
-		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		bdevname(rdev->bdev, b);
 
 		if (test_bit(Faulty, &rdev->flags)) {
@@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 
 		rcu_read_lock();
 		do {
-			int d = r10_bio->devs[sl].devnum;
+			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags)) {
@@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		rcu_read_lock();
 		while (sl != r10_bio->read_slot) {
 			char b[BDEVNAME_SIZE];
-			int d;
+
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		}
 		sl = start;
 		while (sl != r10_bio->read_slot) {
-			int d;
+
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev)
 	sector_t stride, size;
 	int err = -EINVAL;
 
-	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
-	    !is_power_of_2(mddev->chunk_sectors)) {
+	if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
+	    !is_power_of_2(mddev->new_chunk_sectors)) {
 		printk(KERN_ERR "md/raid10:%s: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
 		       mdname(mddev), PAGE_SIZE);
 		goto out;
 	}
 
-	nc = mddev->layout & 255;
-	fc = (mddev->layout >> 8) & 255;
-	fo = mddev->layout & (1<<16);
+	nc = mddev->new_layout & 255;
+	fc = (mddev->new_layout >> 8) & 255;
+	fo = mddev->new_layout & (1<<16);
 
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
-	    (mddev->layout >> 17)) {
+	    (mddev->new_layout >> 17)) {
 		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
-		       mdname(mddev), mddev->layout);
+		       mdname(mddev), mddev->new_layout);
 		goto out;
 	}
 
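
For reference, the raid10 layout word unpacked above carries three packed fields. A standalone sketch of the same decoding (illustration only, mirroring the nc/fc/fo extraction in setup_conf()):

/* Bits 0-7 hold the near-copy count, bits 8-15 the far-copy count, and
 * bit 16 selects "far offset" mode.  The value (1 << 8) + 2 installed by
 * raid10_takeover_raid0 therefore means near_copies = 2, far_copies = 1,
 * offset mode off: a plain n2 layout.
 */
#include <stdio.h>

static void decode_layout(int layout)
{
	int nc = layout & 255;          /* near copies */
	int fc = (layout >> 8) & 255;   /* far copies */
	int fo = layout & (1 << 16);    /* far copies use "offset" style */

	printf("layout 0x%x: near=%d far=%d offset=%s\n",
	       layout, nc, fc, fo ? "yes" : "no");
}

int main(void)
{
	decode_layout((1 << 8) + 2);	/* the n2 layout used for takeover */
	return 0;
}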
@@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev)
 	if (!conf->thread)
 		goto out;
 
-	conf->scale_disks = 0;
 	conf->mddev = mddev;
 	return conf;
 
@@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev)
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
 			continue;
-		if (conf->scale_disks) {
-			disk_idx *= conf->scale_disks;
-			rdev->raid_disk = disk_idx;
-			/* MOVE 'rd%d' link !! */
-		}
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
@@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 		return ERR_PTR(-EINVAL);
 	}
 
-	/* Update slot numbers to obtain
-	 * degraded raid10 with missing mirrors
-	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		rdev->raid_disk *= 2;
-	}
-
 	/* Set new parameters */
 	mddev->new_level = 10;
 	/* new layout: far_copies = 1, near_copies = 2 */
 	mddev->new_layout = (1<<8) + 2;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = mddev->raid_disks;
-	mddev->degraded = mddev->raid_disks;
 	mddev->raid_disks *= 2;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;
 
 	conf = setup_conf(mddev);
-	conf->scale_disks = 2;
+	if (!IS_ERR(conf))
+		list_for_each_entry(rdev, &mddev->disks, same_set)
+			if (rdev->raid_disk >= 0)
+				rdev->new_raid_disk = rdev->raid_disk * 2;
+
 	return conf;
 }
 
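The new_raid_disk assignment above yields a degraded near-2 raid10: each original raid0 member lands in an even slot and every odd slot is a missing mirror, to be filled by later recovery. A quick sketch of the resulting slot map (illustration only):

/* An n-disk raid0 becomes a 2n-slot raid10-n2 where old disk k takes
 * slot 2k and each slot 2k+1 is an absent mirror copy. */
#include <stdio.h>

int main(void)
{
	int raid_disks = 3;	/* example raid0 with three members */
	int slot;

	for (slot = 0; slot < raid_disks * 2; slot++) {
		if (slot % 2 == 0)
			printf("slot %d: old raid0 disk %d\n", slot, slot / 2);
		else
			printf("slot %d: missing mirror\n", slot);
	}
	return 0;
}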
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3824a087e17c..2316ac2e8e21 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -38,11 +38,6 @@ struct r10_private_data_s {
 	int chunk_shift; /* shift from chunks to sectors */
 	sector_t chunk_mask;
 
-	int scale_disks;  /* When starting array, multiply
-			   * each ->raid_disk by this.
-			   * Need for raid0->raid10 migration
-			   */
-
 	struct list_head retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list pending_bio_list;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d2c0f94fa37d..96c690279fc6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -277,12 +277,13 @@ out:
 	return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
 	struct page *p;
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num ; i++) {
+	for (i = 0; i < num ; i++) {
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 	}
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num; i++) {
+	for (i = 0; i < num; i++) {
 		struct page *page;
 
 		if (!(page = alloc_page(GFP_KERNEL))) {
@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
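A concrete case showing why has_failed() must inspect both sections: shrink a 4-device raid5 (max_degraded = 1) while one device is faulty and another is only partially recovered. A plain degraded-count test sees one failure and passes, but the post-reshape section has two unusable members. A toy model of the same logic (userspace sketch, not kernel code):

/* Devices: 0 = missing/faulty, 1 = fully in-sync, 2 = present but only
 * partially recovered. */
#include <stdio.h>

#define FAULTY  0
#define INSYNC  1
#define PARTIAL 2

static int section_degraded(const int *dev, int n, int counts_partial)
{
	int i, degraded = 0;

	for (i = 0; i < n; i++) {
		if (dev[i] == FAULTY)
			degraded++;
		else if (dev[i] == PARTIAL && counts_partial)
			degraded++;
	}
	return degraded;
}

static int toy_has_failed(const int *dev, int prev_disks, int raid_disks,
			  int max_degraded)
{
	/* 'previous' section: a partial device counts as degraded only
	 * when the reshape is growing (it is still being recovered). */
	if (section_degraded(dev, prev_disks,
			     raid_disks >= prev_disks) > max_degraded)
		return 1;
	/* 'new' section: a partial device counts when the array is
	 * shrinking (that part has almost certainly not been recovered). */
	if (section_degraded(dev, raid_disks,
			     raid_disks <= prev_disks) > max_degraded)
		return 1;
	return 0;
}

int main(void)
{
	/* shrinking 4 -> 3: slot 0 faulty, slot 2 only partially
	 * recovered.  The new section has two unusable members, so the
	 * check reports failure even though only one device is faulty. */
	int shrink[4] = { FAULTY, INSYNC, PARTIAL, INSYNC };

	printf("shrink case: has_failed = %d\n",
	       toy_has_failed(shrink, 4, 3, 1));	/* prints 1 */
	return 0;
}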
@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	int disks = max(conf->raid_disks, conf->previous_raid_disks);
 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 #ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 #endif
 
-	if (grow_buffers(sh, disks)) {
-		shrink_buffers(sh, disks);
+	if (grow_buffers(sh)) {
+		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
 	if (!sh)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
-	shrink_buffers(sh, conf->pool_size);
+	shrink_buffers(sh);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	return 1;
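The grow_buffers()/shrink_buffers() change above removes the caller-supplied count: both sides now read conf->pool_size, so a stripe is always freed with the same page count it was allocated with, even if raid_disks changes across a reshape (the "reduced then increased" oops in the merge log). A userspace sketch of the symmetric pattern (toy types, assumed 4K pages, not kernel code):

/* If the element count is passed separately to the alloc and free paths,
 * a value that changes in between desynchronizes the pair.  Deriving the
 * count from one field at both sites makes the pair symmetric by
 * construction. */
#include <stdlib.h>

struct toy_conf { int pool_size; };
struct toy_stripe { struct toy_conf *conf; void **pages; };

static int toy_grow_buffers(struct toy_stripe *sh)
{
	int i, num = sh->conf->pool_size;	/* single source of truth */

	sh->pages = calloc(num, sizeof(void *));
	if (!sh->pages)
		return -1;
	for (i = 0; i < num; i++)
		sh->pages[i] = malloc(4096);	/* stand-in for alloc_page() */
	return 0;
}

static void toy_shrink_buffers(struct toy_stripe *sh)
{
	int i, num = sh->conf->pool_size;	/* same field, same count */

	for (i = 0; i < num; i++)
		free(sh->pages[i]);
	free(sh->pages);
}

int main(void)
{
	struct toy_conf conf = { .pool_size = 5 };
	struct toy_stripe sh = { .conf = &conf };

	if (toy_grow_buffers(&sh) == 0)
		toy_shrink_buffers(&sh);
	return 0;
}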
@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		mdk_rdev_t *rdev;
 
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
 			"written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* could be in-sync depending on recovery/reshape status */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			s.failed++;
 			s.failed_num = i;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
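The reworked R5_Insync logic above makes the in-sync decision per stripe rather than per device: a device that is still recovering can serve any stripe lying entirely below its recovery_offset. A standalone illustration (STRIPE_SECTORS is STRIPE_SIZE >> 9, i.e. 8 sectors with 4K pages):

#include <stdio.h>

#define STRIPE_SECTORS 8

typedef unsigned long long sector_t;

/* Mirrors the per-stripe test added above: the whole stripe must fall
 * inside the already-recovered region of the device. */
static int stripe_insync(sector_t sh_sector, sector_t recovery_offset)
{
	return sh_sector + STRIPE_SECTORS <= recovery_offset;
}

int main(void)
{
	sector_t recovery_offset = 1024;	/* recovered up to here */

	printf("stripe at 1000: %s\n",
	       stripe_insync(1000, recovery_offset) ? "in-sync" : "not yet");
	printf("stripe at 1020: %s\n",
	       stripe_insync(1020, recovery_offset) ? "in-sync" : "not yet");
	return 0;
}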
@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* in sync if before recovery_offset */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s.failed < 2)
 				r6s.failed_num[s.failed] = i;
 			s.failed++;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
 			continue;
-		if (test_bit(In_sync, &rdev->flags))
+		if (test_bit(In_sync, &rdev->flags)) {
 			working_disks++;
+			continue;
+		}
 		/* This disc is not fully in-sync. However if it
 		 * just stored parity (beyond the recovery_offset),
 		 * when we don't need to be concerned about the
@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 		       " (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->disks + i;
 		if (tmp->rdev
+		    && tmp->rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			unsigned long flags;
@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	 * isn't possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
-	    mddev->degraded <= conf->max_degraded &&
+	    !has_failed(conf) &&
 	    number < conf->raid_disks) {
 		err = -EBUSY;
 		goto abort;
@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must
@@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev)
 
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
+	 * Don't add devices if we are reducing the number of
+	 * devices in the array.  This is because it is not possible
+	 * to correctly record the "partially reconstructed" state of
+	 * such devices during the reshape and confusion could result.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	if (mddev->delta_disks >= 0)
+		list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	}
 
 	/* When a reshape changes the number of devices, ->degraded
-	 * is measured against the large of the pre and post number of
+	 * is measured against the larger of the pre and post number of
 	 * devices.*/
 	if (mddev->delta_disks > 0) {
 		spin_lock_irqsave(&conf->device_lock, flags);