diff options
author | NeilBrown <neilb@suse.de> | 2012-05-20 19:27:01 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2012-05-20 19:27:01 -0400 |
commit | b5254dd5fdd9abcacadb5101beb35df9ae8cc564 (patch) | |
tree | 73d32b8dd7c0dc9ecfe61468965b06741070dee7 /drivers/md | |
parent | 05616be5e11f66888b66554957dbecdd90658a84 (diff) |
md/raid5: allow for change in data_offset while managing a reshape.
The important issue here is incorporating the difference in data_offset
into calculations concerning when we might need to over-write data
that is still thought to be valid.
To this end we find the minimum offset difference across all devices
and add that where appropriate.
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/raid5.c | 109 | ||||
-rw-r--r-- | drivers/md/raid5.h | 6 |
2 files changed, 82 insertions, 33 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 71d1de909ba5..0172bdd37b48 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -4165,13 +4165,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4165 | else | 4165 | else |
4166 | reshape_sectors = mddev->chunk_sectors; | 4166 | reshape_sectors = mddev->chunk_sectors; |
4167 | 4167 | ||
4168 | /* we update the metadata when there is more than 3Meg | 4168 | /* We update the metadata at least every 10 seconds, or when |
4169 | * in the block range (that is rather arbitrary, should | 4169 | * the data about to be copied would over-write the source of |
4170 | * probably be time based) or when the data about to be | 4170 | * the data at the front of the range. i.e. one new_stripe |
4171 | * copied would over-write the source of the data at | 4171 | * along from reshape_progress new_maps to after where |
4172 | * the front of the range. | 4172 | * reshape_safe old_maps to |
4173 | * i.e. one new_stripe along from reshape_progress new_maps | ||
4174 | * to after where reshape_safe old_maps to | ||
4175 | */ | 4173 | */ |
4176 | writepos = conf->reshape_progress; | 4174 | writepos = conf->reshape_progress; |
4177 | sector_div(writepos, new_data_disks); | 4175 | sector_div(writepos, new_data_disks); |
@@ -4189,11 +4187,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4189 | safepos -= min_t(sector_t, reshape_sectors, safepos); | 4187 | safepos -= min_t(sector_t, reshape_sectors, safepos); |
4190 | } | 4188 | } |
4191 | 4189 | ||
4190 | /* Having calculated the 'writepos' possibly use it | ||
4191 | * to set 'stripe_addr' which is where we will write to. | ||
4192 | */ | ||
4193 | if (mddev->reshape_backwards) { | ||
4194 | BUG_ON(conf->reshape_progress == 0); | ||
4195 | stripe_addr = writepos; | ||
4196 | BUG_ON((mddev->dev_sectors & | ||
4197 | ~((sector_t)reshape_sectors - 1)) | ||
4198 | - reshape_sectors - stripe_addr | ||
4199 | != sector_nr); | ||
4200 | } else { | ||
4201 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
4202 | stripe_addr = sector_nr; | ||
4203 | } | ||
4204 | |||
4192 | /* 'writepos' is the most advanced device address we might write. | 4205 | /* 'writepos' is the most advanced device address we might write. |
4193 | * 'readpos' is the least advanced device address we might read. | 4206 | * 'readpos' is the least advanced device address we might read. |
4194 | * 'safepos' is the least address recorded in the metadata as having | 4207 | * 'safepos' is the least address recorded in the metadata as having |
4195 | * been reshaped. | 4208 | * been reshaped. |
4196 | * If 'readpos' is behind 'writepos', then there is no way that we can | 4209 | * If there is a min_offset_diff, these are adjusted either by |
4210 | * increasing the safepos/readpos if diff is negative, or | ||
4211 | * increasing writepos if diff is positive. | ||
4212 | * If 'readpos' is then behind 'writepos', there is no way that we can | ||
4197 | * ensure safety in the face of a crash - that must be done by userspace | 4213 | * ensure safety in the face of a crash - that must be done by userspace |
4198 | * making a backup of the data. So in that case there is no particular | 4214 | * making a backup of the data. So in that case there is no particular |
4199 | * rush to update metadata. | 4215 | * rush to update metadata. |
@@ -4206,6 +4222,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4206 | * Maybe that number should be configurable, but I'm not sure it is | 4222 | * Maybe that number should be configurable, but I'm not sure it is |
4207 | * worth it.... maybe it could be a multiple of safemode_delay??? | 4223 | * worth it.... maybe it could be a multiple of safemode_delay??? |
4208 | */ | 4224 | */ |
4225 | if (conf->min_offset_diff < 0) { | ||
4226 | safepos += -conf->min_offset_diff; | ||
4227 | readpos += -conf->min_offset_diff; | ||
4228 | } else | ||
4229 | writepos += conf->min_offset_diff; | ||
4230 | |||
4209 | if ((mddev->reshape_backwards | 4231 | if ((mddev->reshape_backwards |
4210 | ? (safepos > writepos && readpos < writepos) | 4232 | ? (safepos > writepos && readpos < writepos) |
4211 | : (safepos < writepos && readpos > writepos)) || | 4233 | : (safepos < writepos && readpos > writepos)) || |
@@ -4227,17 +4249,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4227 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4249 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
4228 | } | 4250 | } |
4229 | 4251 | ||
4230 | if (mddev->reshape_backwards) { | ||
4231 | BUG_ON(conf->reshape_progress == 0); | ||
4232 | stripe_addr = writepos; | ||
4233 | BUG_ON((mddev->dev_sectors & | ||
4234 | ~((sector_t)reshape_sectors - 1)) | ||
4235 | - reshape_sectors - stripe_addr | ||
4236 | != sector_nr); | ||
4237 | } else { | ||
4238 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
4239 | stripe_addr = sector_nr; | ||
4240 | } | ||
4241 | INIT_LIST_HEAD(&stripes); | 4252 | INIT_LIST_HEAD(&stripes); |
4242 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { | 4253 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { |
4243 | int j; | 4254 | int j; |
@@ -4984,16 +4995,42 @@ static int run(struct mddev *mddev) | |||
4984 | struct md_rdev *rdev; | 4995 | struct md_rdev *rdev; |
4985 | sector_t reshape_offset = 0; | 4996 | sector_t reshape_offset = 0; |
4986 | int i; | 4997 | int i; |
4998 | long long min_offset_diff = 0; | ||
4999 | int first = 1; | ||
4987 | 5000 | ||
4988 | if (mddev->recovery_cp != MaxSector) | 5001 | if (mddev->recovery_cp != MaxSector) |
4989 | printk(KERN_NOTICE "md/raid:%s: not clean" | 5002 | printk(KERN_NOTICE "md/raid:%s: not clean" |
4990 | " -- starting background reconstruction\n", | 5003 | " -- starting background reconstruction\n", |
4991 | mdname(mddev)); | 5004 | mdname(mddev)); |
5005 | |||
5006 | rdev_for_each(rdev, mddev) { | ||
5007 | long long diff; | ||
5008 | if (rdev->raid_disk < 0) | ||
5009 | continue; | ||
5010 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
5011 | if (first) { | ||
5012 | min_offset_diff = diff; | ||
5013 | first = 0; | ||
5014 | } else if (mddev->reshape_backwards && | ||
5015 | diff < min_offset_diff) | ||
5016 | min_offset_diff = diff; | ||
5017 | else if (!mddev->reshape_backwards && | ||
5018 | diff > min_offset_diff) | ||
5019 | min_offset_diff = diff; | ||
5020 | } | ||
5021 | |||
4992 | if (mddev->reshape_position != MaxSector) { | 5022 | if (mddev->reshape_position != MaxSector) { |
4993 | /* Check that we can continue the reshape. | 5023 | /* Check that we can continue the reshape. |
4994 | * Currently only disks can change, it must | 5024 | * Difficulties arise if the stripe we would write to |
4995 | * increase, and we must be past the point where | 5025 | * next is at or after the stripe we would read from next. |
4996 | * a stripe over-writes itself | 5026 | * For a reshape that changes the number of devices, this |
5027 | * is only possible for a very short time, and mdadm makes | ||
5029 | * sure that time appears to have passed before assembling | ||
5029 | * the array. So we fail if that time hasn't passed. | ||
5030 | * For a reshape that keeps the number of devices the same | ||
5032 | * mdadm must be monitoring the reshape and keeping the | ||
5032 | * critical areas read-only and backed up. It will start | ||
5033 | * the array in read-only mode, so we check for that. | ||
4997 | */ | 5034 | */ |
4998 | sector_t here_new, here_old; | 5035 | sector_t here_new, here_old; |
4999 | int old_disks; | 5036 | int old_disks; |
@@ -5025,26 +5062,34 @@ static int run(struct mddev *mddev) | |||
5025 | /* here_old is the first stripe that we might need to read | 5062 | /* here_old is the first stripe that we might need to read |
5026 | * from */ | 5063 | * from */ |
5027 | if (mddev->delta_disks == 0) { | 5064 | if (mddev->delta_disks == 0) { |
5065 | if ((here_new * mddev->new_chunk_sectors != | ||
5066 | here_old * mddev->chunk_sectors)) { | ||
5067 | printk(KERN_ERR "md/raid:%s: reshape position is" | ||
5068 | " confused - aborting\n", mdname(mddev)); | ||
5069 | return -EINVAL; | ||
5070 | } | ||
5028 | /* We cannot be sure it is safe to start an in-place | 5071 | /* We cannot be sure it is safe to start an in-place |
5029 | * reshape. It is only safe if user-space if monitoring | 5072 | * reshape. It is only safe if user-space is monitoring |
5030 | * and taking constant backups. | 5073 | * and taking constant backups. |
5031 | * mdadm always starts a situation like this in | 5074 | * mdadm always starts a situation like this in |
5032 | * readonly mode so it can take control before | 5075 | * readonly mode so it can take control before |
5033 | * allowing any writes. So just check for that. | 5076 | * allowing any writes. So just check for that. |
5034 | */ | 5077 | */ |
5035 | if ((here_new * mddev->new_chunk_sectors != | 5078 | if (abs(min_offset_diff) >= mddev->chunk_sectors && |
5036 | here_old * mddev->chunk_sectors) || | 5079 | abs(min_offset_diff) >= mddev->new_chunk_sectors) |
5037 | mddev->ro == 0) { | 5080 | /* not really in-place - so OK */; |
5038 | printk(KERN_ERR "md/raid:%s: in-place reshape must be started" | 5081 | else if (mddev->ro == 0) { |
5039 | " in read-only mode - aborting\n", | 5082 | printk(KERN_ERR "md/raid:%s: in-place reshape " |
5083 | "must be started in read-only mode " | ||
5084 | "- aborting\n", | ||
5040 | mdname(mddev)); | 5085 | mdname(mddev)); |
5041 | return -EINVAL; | 5086 | return -EINVAL; |
5042 | } | 5087 | } |
5043 | } else if (mddev->reshape_backwards | 5088 | } else if (mddev->reshape_backwards |
5044 | ? (here_new * mddev->new_chunk_sectors <= | 5089 | ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= |
5045 | here_old * mddev->chunk_sectors) | 5090 | here_old * mddev->chunk_sectors) |
5046 | : (here_new * mddev->new_chunk_sectors >= | 5091 | : (here_new * mddev->new_chunk_sectors >= |
5047 | here_old * mddev->chunk_sectors)) { | 5092 | here_old * mddev->chunk_sectors + (-min_offset_diff))) { |
5048 | /* Reading from the same stripe as writing to - bad */ | 5093 | /* Reading from the same stripe as writing to - bad */ |
5049 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " | 5094 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " |
5050 | "auto-recovery - aborting.\n", | 5095 | "auto-recovery - aborting.\n", |
@@ -5069,6 +5114,7 @@ static int run(struct mddev *mddev) | |||
5069 | if (IS_ERR(conf)) | 5114 | if (IS_ERR(conf)) |
5070 | return PTR_ERR(conf); | 5115 | return PTR_ERR(conf); |
5071 | 5116 | ||
5117 | conf->min_offset_diff = min_offset_diff; | ||
5072 | mddev->thread = conf->thread; | 5118 | mddev->thread = conf->thread; |
5073 | conf->thread = NULL; | 5119 | conf->thread = NULL; |
5074 | mddev->private = conf; | 5120 | mddev->private = conf; |
@@ -5541,9 +5587,6 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5541 | return -ENOSPC; | 5587 | return -ENOSPC; |
5542 | 5588 | ||
5543 | rdev_for_each(rdev, mddev) { | 5589 | rdev_for_each(rdev, mddev) { |
5544 | /* Don't support changing data_offset yet */ | ||
5545 | if (rdev->new_data_offset != rdev->data_offset) | ||
5546 | return -EINVAL; | ||
5547 | if (!test_bit(In_sync, &rdev->flags) | 5590 | if (!test_bit(In_sync, &rdev->flags) |
5548 | && !test_bit(Faulty, &rdev->flags)) | 5591 | && !test_bit(Faulty, &rdev->flags)) |
5549 | spares++; | 5592 | spares++; |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 8d8e13934a48..c6bdfa01d987 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -385,6 +385,12 @@ struct r5conf { | |||
385 | short generation; /* increments with every reshape */ | 385 | short generation; /* increments with every reshape */ |
386 | unsigned long reshape_checkpoint; /* Time we last updated | 386 | unsigned long reshape_checkpoint; /* Time we last updated |
387 | * metadata */ | 387 | * metadata */ |
388 | long long min_offset_diff; /* minimum difference between | ||
389 | * data_offset and | ||
390 | * new_data_offset across all | ||
391 | * devices. May be negative, | ||
392 | * but is closest to zero. | ||
393 | */ | ||
388 | 394 | ||
389 | struct list_head handle_list; /* stripes needing handling */ | 395 | struct list_head handle_list; /* stripes needing handling */ |
390 | struct list_head hold_list; /* preread ready stripes */ | 396 | struct list_head hold_list; /* preread ready stripes */ |