md/raid5: allow for change in data_offset while managing a reshape.

The important issue here is incorporating the difference in data_offset into calculations concerning when we might need to over-write data that is still thought to be valid. To this end we find the minimum offset difference across all devices and add that where appropriate.

Signed-off-by: NeilBrown <neilb@suse.de>
author: NeilBrown <neilb@suse.de> 2012-05-20 19:27:01 -0400
committer: NeilBrown <neilb@suse.de> 2012-05-20 19:27:01 -0400
commit: b5254dd5fdd9abcacadb5101beb35df9ae8cc564 (patch)
tree: 73d32b8dd7c0dc9ecfe61468965b06741070dee7 /drivers/md/raid5.c
parent: 05616be5e11f66888b66554957dbecdd90658a84 (diff)
1 files changed, 76 insertions, 33 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 71d1de909ba5..0172bdd37b48 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4165,13 +4165,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
        else
                reshape_sectors = mddev->chunk_sectors;
-        /* we update the metadata when there is more than 3Meg
+        /* We update the metadata at least every 10 seconds, or when
-         * in the block range (that is rather arbitrary, should
+         * the data about to be copied would over-write the source of
-         * probably be time based) or when the data about to be
+         * the data at the front of the range.  i.e. one new_stripe
-         * copied would over-write the source of the data at
+         * along from reshape_progress new_maps to after where
-         * the front of the range.
+         * reshape_safe old_maps to
-         * i.e. one new_stripe along from reshape_progress new_maps
-         * to after where reshape_safe old_maps to
         */
        writepos = conf->reshape_progress;
        sector_div(writepos, new_data_disks);
@@ -4189,11 +4187,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
                safepos -= min_t(sector_t, reshape_sectors, safepos);
        }
+        /* Having calculated the 'writepos' possibly use it
+         * to set 'stripe_addr' which is where we will write to.
+         */
+        if (mddev->reshape_backwards) {
+                BUG_ON(conf->reshape_progress == 0);
+                stripe_addr = writepos;
+                BUG_ON((mddev->dev_sectors &
+                        ~((sector_t)reshape_sectors - 1))
+                       - reshape_sectors - stripe_addr
+                       != sector_nr);
+        } else {
+                BUG_ON(writepos != sector_nr + reshape_sectors);
+                stripe_addr = sector_nr;
+        }
        /* 'writepos' is the most advanced device address we might write.
         * 'readpos' is the least advanced device address we might read.
         * 'safepos' is the least address recorded in the metadata as having
         *     been reshaped.
-         * If 'readpos' is behind 'writepos', then there is no way that we can
+         * If there is a min_offset_diff, these are adjusted either by
+         * increasing the safepos/readpos if diff is negative, or
+         * increasing writepos if diff is positive.
+         * If 'readpos' is then behind 'writepos', there is no way that we can
         * ensure safety in the face of a crash - that must be done by userspace
         * making a backup of the data.  So in that case there is no particular
         * rush to update metadata.
@@ -4206,6 +4222,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
         * Maybe that number should be configurable, but I'm not sure it is
         * worth it.... maybe it could be a multiple of safemode_delay???
         */
+        if (conf->min_offset_diff < 0) {
+                safepos += -conf->min_offset_diff;
+                readpos += -conf->min_offset_diff;
+        } else
+                writepos += conf->min_offset_diff;
        if ((mddev->reshape_backwards
             ? (safepos > writepos && readpos < writepos)
             : (safepos < writepos && readpos > writepos)) ||
@@ -4227,17 +4249,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        }
-        if (mddev->reshape_backwards) {
-                BUG_ON(conf->reshape_progress == 0);
-                stripe_addr = writepos;
-                BUG_ON((mddev->dev_sectors &
-                        ~((sector_t)reshape_sectors - 1))
-                       - reshape_sectors - stripe_addr
-                       != sector_nr);
-        } else {
-                BUG_ON(writepos != sector_nr + reshape_sectors);
-                stripe_addr = sector_nr;
-        }
        INIT_LIST_HEAD(&stripes);
        for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
                int j;
@@ -4984,16 +4995,42 @@ static int run(struct mddev *mddev)
        struct md_rdev *rdev;
        sector_t reshape_offset = 0;
        int i;
+        long long min_offset_diff = 0;
+        int first = 1;
        if (mddev->recovery_cp != MaxSector)
                printk(KERN_NOTICE "md/raid:%s: not clean"
                       " -- starting background reconstruction\n",
                       mdname(mddev));
+        rdev_for_each(rdev, mddev) {
+                long long diff;
+                if (rdev->raid_disk < 0)
+                        continue;
+                diff = (rdev->new_data_offset - rdev->data_offset);
+                if (first) {
+                        min_offset_diff = diff;
+                        first = 0;
+                } else if (mddev->reshape_backwards &&
+                         diff < min_offset_diff)
+                        min_offset_diff = diff;
+                else if (!mddev->reshape_backwards &&
+                         diff > min_offset_diff)
+                        min_offset_diff = diff;
+        }
        if (mddev->reshape_position != MaxSector) {
                /* Check that we can continue the reshape.
-                 * Currently only disks can change, it must
+                 * Difficulties arise if the stripe we would write to
-                 * increase, and we must be past the point where
+                 * next is at or after the stripe we would read from next.
-                 * a stripe over-writes itself
+                 * For a reshape that changes the number of devices, this
+                 * is only possible for a very short time, and mdadm makes
+                 * sure that time appears to have passed before assembling
+                 * the array.  So we fail if that time hasn't passed.
+                 * For a reshape that keeps the number of devices the same
+                 * mdadm must be monitoring the reshape and keeping the
+                 * critical areas read-only and backed up.  It will start
+                 * the array in read-only mode, so we check for that.
                 */
                sector_t here_new, here_old;
                int old_disks;
@@ -5025,26 +5062,34 @@ static int run(struct mddev *mddev)
                /* here_old is the first stripe that we might need to read
                 * from */
                if (mddev->delta_disks == 0) {
+                        if ((here_new * mddev->new_chunk_sectors !=
+                             here_old * mddev->chunk_sectors)) {
+                                printk(KERN_ERR "md/raid:%s: reshape position is"
+                                       " confused - aborting\n", mdname(mddev));
+                                return -EINVAL;
+                        }
                        /* We cannot be sure it is safe to start an in-place
-                         * reshape.  It is only safe if user-space if monitoring
+                         * reshape.  It is only safe if user-space is monitoring
                         * and taking constant backups.
                         * mdadm always starts a situation like this in
                         * readonly mode so it can take control before
                         * allowing any writes.  So just check for that.
                         */
-                        if ((here_new * mddev->new_chunk_sectors != 
+                        if (abs(min_offset_diff) >= mddev->chunk_sectors &&
-                             here_old * mddev->chunk_sectors) ||
+                            abs(min_offset_diff) >= mddev->new_chunk_sectors)
-                            mddev->ro == 0) {
+                                /* not really in-place - so OK */;
-                                printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
+                        else if (mddev->ro == 0) {
-                                       " in read-only mode - aborting\n",
+                                printk(KERN_ERR "md/raid:%s: in-place reshape "
+                                       "must be started in read-only mode "
+                                       "- aborting\n",
                                       mdname(mddev));
                                return -EINVAL;
                        }
                } else if (mddev->reshape_backwards
-                    ? (here_new * mddev->new_chunk_sectors <=
+                    ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
                       here_old * mddev->chunk_sectors)
                    : (here_new * mddev->new_chunk_sectors >=
-                       here_old * mddev->chunk_sectors)) {
+                       here_old * mddev->chunk_sectors + (-min_offset_diff))) {
                        /* Reading from the same stripe as writing to - bad */
                        printk(KERN_ERR "md/raid:%s: reshape_position too early for "
                               "auto-recovery - aborting.\n",
@@ -5069,6 +5114,7 @@ static int run(struct mddev *mddev)
        if (IS_ERR(conf))
                return PTR_ERR(conf);
+        conf->min_offset_diff = min_offset_diff;
        mddev->thread = conf->thread;
        conf->thread = NULL;
        mddev->private = conf;
@@ -5541,9 +5587,6 @@ static int raid5_start_reshape(struct mddev *mddev)
                return -ENOSPC;
        rdev_for_each(rdev, mddev) {
-                /* Don't support changing data_offset yet */
-                if (rdev->new_data_offset != rdev->data_offset)
-                        return -EINVAL;
                if (!test_bit(In_sync, &rdev->flags)
                    && !test_bit(Faulty, &rdev->flags))
                        spares++;
author	NeilBrown <neilb@suse.de>	2012-05-20 19:27:01 -0400
committer	NeilBrown <neilb@suse.de>	2012-05-20 19:27:01 -0400
commit	b5254dd5fdd9abcacadb5101beb35df9ae8cc564 (patch)
tree	73d32b8dd7c0dc9ecfe61468965b06741070dee7 /drivers/md/raid5.c
parent	05616be5e11f66888b66554957dbecdd90658a84 (diff)