aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2009-03-31 00:28:40 -0400
committer NeilBrown <neilb@suse.de> 2009-03-31 00:28:40 -0400
commit c8f517c444e4f9f55b5b5ca202b8404691a35805 (patch)
tree e679ae13a07e1a2644da0dfc4a4d66bf73f83626 /drivers
parent b0f9ec047b79a92e8b8a9dfbf97537c8fbef234a (diff)
md/raid5 revise rules for when to update metadata during reshape
We currently update the metadata: 1/ every 3 megabytes; 2/ when the place we will write new-layout data to is recorded in the metadata as still containing old-layout data. Rule one exists to avoid having to re-do too much reshaping in the face of a crash/restart. So it should really be time based rather than size based. So change it to "every 10 seconds". Rule two turns out to be too harsh when restriping an array 'in-place', as in that case the metadata must be updated for every stripe. For the in-place update, it can only possibly be safe from a crash if some user-space program makes a backup of every, e.g., few hundred stripes before allowing them to be reshaped. In that case, the constant metadata update is pointless. So only update the metadata if the new metadata will report that the end of the 'old-layout' data is beyond where we are currently writing 'new-layout' data. Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/md/raid5.c 34
-rw-r--r-- drivers/md/raid5.h 2
2 files changed, 30 insertions, 6 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bb4b12e370df..3bbc6d647044 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3766,7 +3766,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3766 int new_data_disks = conf->raid_disks - conf->max_degraded; 3766 int new_data_disks = conf->raid_disks - conf->max_degraded;
3767 int i; 3767 int i;
3768 int dd_idx; 3768 int dd_idx;
3769 sector_t writepos, safepos, gap; 3769 sector_t writepos, readpos, safepos;
3770 sector_t stripe_addr; 3770 sector_t stripe_addr;
3771 int reshape_sectors; 3771 int reshape_sectors;
3772 struct list_head stripes; 3772 struct list_head stripes;
@@ -3806,26 +3806,46 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3806 */ 3806 */
3807 writepos = conf->reshape_progress; 3807 writepos = conf->reshape_progress;
3808 sector_div(writepos, new_data_disks); 3808 sector_div(writepos, new_data_disks);
3809 readpos = conf->reshape_progress;
3810 sector_div(readpos, data_disks);
3809 safepos = conf->reshape_safe; 3811 safepos = conf->reshape_safe;
3810 sector_div(safepos, data_disks); 3812 sector_div(safepos, data_disks);
3811 if (mddev->delta_disks < 0) { 3813 if (mddev->delta_disks < 0) {
3812 writepos -= reshape_sectors; 3814 writepos -= reshape_sectors;
3815 readpos += reshape_sectors;
3813 safepos += reshape_sectors; 3816 safepos += reshape_sectors;
3814 gap = conf->reshape_safe - conf->reshape_progress;
3815 } else { 3817 } else {
3816 writepos += reshape_sectors; 3818 writepos += reshape_sectors;
3819 readpos -= reshape_sectors;
3817 safepos -= reshape_sectors; 3820 safepos -= reshape_sectors;
3818 gap = conf->reshape_progress - conf->reshape_safe;
3819 } 3821 }
3820 3822
3823 /* 'writepos' is the most advanced device address we might write.
3824 * 'readpos' is the least advanced device address we might read.
3825 * 'safepos' is the least address recorded in the metadata as having
3826 * been reshaped.
3827 * If 'readpos' is behind 'writepos', then there is no way that we can
3828 * ensure safety in the face of a crash - that must be done by userspace
3829 * making a backup of the data. So in that case there is no particular
3830 * rush to update metadata.
3831 * Otherwise if 'safepos' is behind 'writepos', then we really need to
3832 * update the metadata to advance 'safepos' to match 'readpos' so that
3833 * we can be safe in the event of a crash.
3834 * So we insist on updating metadata if safepos is behind writepos and
3835 * readpos is beyond writepos.
3836 * In any case, update the metadata every 10 seconds.
3837 * Maybe that number should be configurable, but I'm not sure it is
3838 * worth it.... maybe it could be a multiple of safemode_delay???
3839 */
3821 if ((mddev->delta_disks < 0 3840 if ((mddev->delta_disks < 0
3822 ? writepos < safepos 3841 ? (safepos > writepos && readpos < writepos)
3823 : writepos > safepos) || 3842 : (safepos < writepos && readpos > writepos)) ||
3824 gap > (new_data_disks)*3000*2 /*3Meg*/) { 3843 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
3825 /* Cannot proceed until we've updated the superblock... */ 3844 /* Cannot proceed until we've updated the superblock... */
3826 wait_event(conf->wait_for_overlap, 3845 wait_event(conf->wait_for_overlap,
3827 atomic_read(&conf->reshape_stripes)==0); 3846 atomic_read(&conf->reshape_stripes)==0);
3828 mddev->reshape_position = conf->reshape_progress; 3847 mddev->reshape_position = conf->reshape_progress;
3848 conf->reshape_checkpoint = jiffies;
3829 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3849 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3830 md_wakeup_thread(mddev->thread); 3850 md_wakeup_thread(mddev->thread);
3831 wait_event(mddev->sb_wait, mddev->flags == 0 || 3851 wait_event(mddev->sb_wait, mddev->flags == 0 ||
@@ -3923,6 +3943,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3923 wait_event(conf->wait_for_overlap, 3943 wait_event(conf->wait_for_overlap,
3924 atomic_read(&conf->reshape_stripes) == 0); 3944 atomic_read(&conf->reshape_stripes) == 0);
3925 mddev->reshape_position = conf->reshape_progress; 3945 mddev->reshape_position = conf->reshape_progress;
3946 conf->reshape_checkpoint = jiffies;
3926 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3947 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3927 md_wakeup_thread(mddev->thread); 3948 md_wakeup_thread(mddev->thread);
3928 wait_event(mddev->sb_wait, 3949 wait_event(mddev->sb_wait,
@@ -4957,6 +4978,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4957 spin_unlock_irq(&conf->device_lock); 4978 spin_unlock_irq(&conf->device_lock);
4958 return -EAGAIN; 4979 return -EAGAIN;
4959 } 4980 }
4981 conf->reshape_checkpoint = jiffies;
4960 md_wakeup_thread(mddev->sync_thread); 4982 md_wakeup_thread(mddev->sync_thread);
4961 md_new_event(mddev); 4983 md_new_event(mddev);
4962 return 0; 4984 return 0;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cdd045681720..52ba99954dec 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -352,6 +352,8 @@ struct raid5_private_data {
352 int previous_raid_disks; 352 int previous_raid_disks;
353 int prev_chunk, prev_algo; 353 int prev_chunk, prev_algo;
354 short generation; /* increments with every reshape */ 354 short generation; /* increments with every reshape */
355 unsigned long reshape_checkpoint; /* Time we last updated
356 * metadata */
355 357
356 struct list_head handle_list; /* stripes needing handling */ 358 struct list_head handle_list; /* stripes needing handling */
357 struct list_head hold_list; /* preread ready stripes */ 359 struct list_head hold_list; /* preread ready stripes */