about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/md
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2012-05-20 19:27:01 -0400
committer: NeilBrown <neilb@suse.de> 2012-05-20 19:27:01 -0400
commit: b5254dd5fdd9abcacadb5101beb35df9ae8cc564 (patch)
tree: 73d32b8dd7c0dc9ecfe61468965b06741070dee7 /drivers/md
parent: 05616be5e11f66888b66554957dbecdd90658a84 (diff)
md/raid5: allow for change in data_offset while managing a reshape.
The important issue here is incorporating the difference in data_offset into calculations concerning when we might need to over-write data that is still thought to be valid. To this end we find the minimum offset difference across all devices and add that where appropriate. Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/raid5.c 109
-rw-r--r-- drivers/md/raid5.h 6
2 files changed, 82 insertions, 33 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 71d1de909ba5..0172bdd37b48 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4165,13 +4165,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4165 else 4165 else
4166 reshape_sectors = mddev->chunk_sectors; 4166 reshape_sectors = mddev->chunk_sectors;
4167 4167
4168 /* we update the metadata when there is more than 3Meg 4168 /* We update the metadata at least every 10 seconds, or when
4169 * in the block range (that is rather arbitrary, should 4169 * the data about to be copied would over-write the source of
4170 * probably be time based) or when the data about to be 4170 * the data at the front of the range. i.e. one new_stripe
4171 * copied would over-write the source of the data at 4171 * along from reshape_progress new_maps to after where
4172 * the front of the range. 4172 * reshape_safe old_maps to
4173 * i.e. one new_stripe along from reshape_progress new_maps
4174 * to after where reshape_safe old_maps to
4175 */ 4173 */
4176 writepos = conf->reshape_progress; 4174 writepos = conf->reshape_progress;
4177 sector_div(writepos, new_data_disks); 4175 sector_div(writepos, new_data_disks);
@@ -4189,11 +4187,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4189 safepos -= min_t(sector_t, reshape_sectors, safepos); 4187 safepos -= min_t(sector_t, reshape_sectors, safepos);
4190 } 4188 }
4191 4189
4190 /* Having calculated the 'writepos' possibly use it
4191 * to set 'stripe_addr' which is where we will write to.
4192 */
4193 if (mddev->reshape_backwards) {
4194 BUG_ON(conf->reshape_progress == 0);
4195 stripe_addr = writepos;
4196 BUG_ON((mddev->dev_sectors &
4197 ~((sector_t)reshape_sectors - 1))
4198 - reshape_sectors - stripe_addr
4199 != sector_nr);
4200 } else {
4201 BUG_ON(writepos != sector_nr + reshape_sectors);
4202 stripe_addr = sector_nr;
4203 }
4204
4192 /* 'writepos' is the most advanced device address we might write. 4205 /* 'writepos' is the most advanced device address we might write.
4193 * 'readpos' is the least advanced device address we might read. 4206 * 'readpos' is the least advanced device address we might read.
4194 * 'safepos' is the least address recorded in the metadata as having 4207 * 'safepos' is the least address recorded in the metadata as having
4195 * been reshaped. 4208 * been reshaped.
4196 * If 'readpos' is behind 'writepos', then there is no way that we can 4209 * If there is a min_offset_diff, these are adjusted either by
4210 * increasing the safepos/readpos if diff is negative, or
4211 * increasing writepos if diff is positive.
4212 * If 'readpos' is then behind 'writepos', there is no way that we can
4197 * ensure safety in the face of a crash - that must be done by userspace 4213 * ensure safety in the face of a crash - that must be done by userspace
4198 * making a backup of the data. So in that case there is no particular 4214 * making a backup of the data. So in that case there is no particular
4199 * rush to update metadata. 4215 * rush to update metadata.
@@ -4206,6 +4222,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4206 * Maybe that number should be configurable, but I'm not sure it is 4222 * Maybe that number should be configurable, but I'm not sure it is
4207 * worth it.... maybe it could be a multiple of safemode_delay??? 4223 * worth it.... maybe it could be a multiple of safemode_delay???
4208 */ 4224 */
4225 if (conf->min_offset_diff < 0) {
4226 safepos += -conf->min_offset_diff;
4227 readpos += -conf->min_offset_diff;
4228 } else
4229 writepos += conf->min_offset_diff;
4230
4209 if ((mddev->reshape_backwards 4231 if ((mddev->reshape_backwards
4210 ? (safepos > writepos && readpos < writepos) 4232 ? (safepos > writepos && readpos < writepos)
4211 : (safepos < writepos && readpos > writepos)) || 4233 : (safepos < writepos && readpos > writepos)) ||
@@ -4227,17 +4249,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4227 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4249 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4228 } 4250 }
4229 4251
4230 if (mddev->reshape_backwards) {
4231 BUG_ON(conf->reshape_progress == 0);
4232 stripe_addr = writepos;
4233 BUG_ON((mddev->dev_sectors &
4234 ~((sector_t)reshape_sectors - 1))
4235 - reshape_sectors - stripe_addr
4236 != sector_nr);
4237 } else {
4238 BUG_ON(writepos != sector_nr + reshape_sectors);
4239 stripe_addr = sector_nr;
4240 }
4241 INIT_LIST_HEAD(&stripes); 4252 INIT_LIST_HEAD(&stripes);
4242 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4253 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4243 int j; 4254 int j;
@@ -4984,16 +4995,42 @@ static int run(struct mddev *mddev)
4984 struct md_rdev *rdev; 4995 struct md_rdev *rdev;
4985 sector_t reshape_offset = 0; 4996 sector_t reshape_offset = 0;
4986 int i; 4997 int i;
4998 long long min_offset_diff = 0;
4999 int first = 1;
4987 5000
4988 if (mddev->recovery_cp != MaxSector) 5001 if (mddev->recovery_cp != MaxSector)
4989 printk(KERN_NOTICE "md/raid:%s: not clean" 5002 printk(KERN_NOTICE "md/raid:%s: not clean"
4990 " -- starting background reconstruction\n", 5003 " -- starting background reconstruction\n",
4991 mdname(mddev)); 5004 mdname(mddev));
5005
5006 rdev_for_each(rdev, mddev) {
5007 long long diff;
5008 if (rdev->raid_disk < 0)
5009 continue;
5010 diff = (rdev->new_data_offset - rdev->data_offset);
5011 if (first) {
5012 min_offset_diff = diff;
5013 first = 0;
5014 } else if (mddev->reshape_backwards &&
5015 diff < min_offset_diff)
5016 min_offset_diff = diff;
5017 else if (!mddev->reshape_backwards &&
5018 diff > min_offset_diff)
5019 min_offset_diff = diff;
5020 }
5021
4992 if (mddev->reshape_position != MaxSector) { 5022 if (mddev->reshape_position != MaxSector) {
4993 /* Check that we can continue the reshape. 5023 /* Check that we can continue the reshape.
4994 * Currently only disks can change, it must 5024 * Difficulties arise if the stripe we would write to
4995 * increase, and we must be past the point where 5025 * next is at or after the stripe we would read from next.
4996 * a stripe over-writes itself 5026 * For a reshape that changes the number of devices, this
5027 * is only possible for a very short time, and mdadm makes
5028 * sure that time appears to have passed before assembling
5029 * the array. So we fail if that time hasn't passed.
5030 * For a reshape that keeps the number of devices the same
5031 * mdadm must be monitoring the reshape and keeping the
5032 * critical areas read-only and backed up. It will start
5033 * the array in read-only mode, so we check for that.
4997 */ 5034 */
4998 sector_t here_new, here_old; 5035 sector_t here_new, here_old;
4999 int old_disks; 5036 int old_disks;
@@ -5025,26 +5062,34 @@ static int run(struct mddev *mddev)
5025 /* here_old is the first stripe that we might need to read 5062 /* here_old is the first stripe that we might need to read
5026 * from */ 5063 * from */
5027 if (mddev->delta_disks == 0) { 5064 if (mddev->delta_disks == 0) {
5065 if ((here_new * mddev->new_chunk_sectors !=
5066 here_old * mddev->chunk_sectors)) {
5067 printk(KERN_ERR "md/raid:%s: reshape position is"
5068 " confused - aborting\n", mdname(mddev));
5069 return -EINVAL;
5070 }
5028 /* We cannot be sure it is safe to start an in-place 5071 /* We cannot be sure it is safe to start an in-place
5029 * reshape. It is only safe if user-space if monitoring 5072 * reshape. It is only safe if user-space is monitoring
5030 * and taking constant backups. 5073 * and taking constant backups.
5031 * mdadm always starts a situation like this in 5074 * mdadm always starts a situation like this in
5032 * readonly mode so it can take control before 5075 * readonly mode so it can take control before
5033 * allowing any writes. So just check for that. 5076 * allowing any writes. So just check for that.
5034 */ 5077 */
5035 if ((here_new * mddev->new_chunk_sectors != 5078 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
5036 here_old * mddev->chunk_sectors) || 5079 abs(min_offset_diff) >= mddev->new_chunk_sectors)
5037 mddev->ro == 0) { 5080 /* not really in-place - so OK */;
5038 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 5081 else if (mddev->ro == 0) {
5039 " in read-only mode - aborting\n", 5082 printk(KERN_ERR "md/raid:%s: in-place reshape "
5083 "must be started in read-only mode "
5084 "- aborting\n",
5040 mdname(mddev)); 5085 mdname(mddev));
5041 return -EINVAL; 5086 return -EINVAL;
5042 } 5087 }
5043 } else if (mddev->reshape_backwards 5088 } else if (mddev->reshape_backwards
5044 ? (here_new * mddev->new_chunk_sectors <= 5089 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
5045 here_old * mddev->chunk_sectors) 5090 here_old * mddev->chunk_sectors)
5046 : (here_new * mddev->new_chunk_sectors >= 5091 : (here_new * mddev->new_chunk_sectors >=
5047 here_old * mddev->chunk_sectors)) { 5092 here_old * mddev->chunk_sectors + (-min_offset_diff))) {
5048 /* Reading from the same stripe as writing to - bad */ 5093 /* Reading from the same stripe as writing to - bad */
5049 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5094 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
5050 "auto-recovery - aborting.\n", 5095 "auto-recovery - aborting.\n",
@@ -5069,6 +5114,7 @@ static int run(struct mddev *mddev)
5069 if (IS_ERR(conf)) 5114 if (IS_ERR(conf))
5070 return PTR_ERR(conf); 5115 return PTR_ERR(conf);
5071 5116
5117 conf->min_offset_diff = min_offset_diff;
5072 mddev->thread = conf->thread; 5118 mddev->thread = conf->thread;
5073 conf->thread = NULL; 5119 conf->thread = NULL;
5074 mddev->private = conf; 5120 mddev->private = conf;
@@ -5541,9 +5587,6 @@ static int raid5_start_reshape(struct mddev *mddev)
5541 return -ENOSPC; 5587 return -ENOSPC;
5542 5588
5543 rdev_for_each(rdev, mddev) { 5589 rdev_for_each(rdev, mddev) {
5544 /* Don't support changing data_offset yet */
5545 if (rdev->new_data_offset != rdev->data_offset)
5546 return -EINVAL;
5547 if (!test_bit(In_sync, &rdev->flags) 5590 if (!test_bit(In_sync, &rdev->flags)
5548 && !test_bit(Faulty, &rdev->flags)) 5591 && !test_bit(Faulty, &rdev->flags))
5549 spares++; 5592 spares++;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8d8e13934a48..c6bdfa01d987 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -385,6 +385,12 @@ struct r5conf {
385 short generation; /* increments with every reshape */ 385 short generation; /* increments with every reshape */
386 unsigned long reshape_checkpoint; /* Time we last updated 386 unsigned long reshape_checkpoint; /* Time we last updated
387 * metadata */ 387 * metadata */
388 long long min_offset_diff; /* minimum difference between
389 * data_offset and
390 * new_data_offset across all
391 * devices. May be negative,
392 * but is closest to zero.
393 */
388 394
389 struct list_head handle_list; /* stripes needing handling */ 395 struct list_head handle_list; /* stripes needing handling */
390 struct list_head hold_list; /* preread ready stripes */ 396 struct list_head hold_list; /* preread ready stripes */