diff options
author | NeilBrown <neilb@suse.de> | 2009-03-31 00:28:40 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2009-03-31 00:28:40 -0400 |
commit | c8f517c444e4f9f55b5b5ca202b8404691a35805 (patch) | |
tree | e679ae13a07e1a2644da0dfc4a4d66bf73f83626 /drivers/md | |
parent | b0f9ec047b79a92e8b8a9dfbf97537c8fbef234a (diff) |
md/raid5 revise rules for when to update metadata during reshape
We currently update the metadata:
1/ every 3Megabytes
2/ When the place we will write new-layout data to is recorded in
the metadata as still containing old-layout data.
Rule one exists to avoid having to re-do too much reshaping in the
face of a crash/restart. So it should really be time based rather
than size based. So change it to "every 10 seconds".
Rule two turns out to be too harsh when restriping an array
'in-place', as in that case the metadata must be updated for every
stripe.
For the in-place update, it can only possibly be safe from a crash if
some user-space program takes a backup of every e.g. few hundred
stripes before allowing them to be reshaped. In that case, the
constant metadata update is pointless.
So only update the metadata if the new metadata will report that the
end of the 'old-layout' data is beyond where we are currently
writing 'new-layout' data.
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/raid5.c | 34 | ||||
-rw-r--r-- | drivers/md/raid5.h | 2 |
2 files changed, 30 insertions, 6 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb4b12e370df..3bbc6d647044 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -3766,7 +3766,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3766 | int new_data_disks = conf->raid_disks - conf->max_degraded; | 3766 | int new_data_disks = conf->raid_disks - conf->max_degraded; |
3767 | int i; | 3767 | int i; |
3768 | int dd_idx; | 3768 | int dd_idx; |
3769 | sector_t writepos, safepos, gap; | 3769 | sector_t writepos, readpos, safepos; |
3770 | sector_t stripe_addr; | 3770 | sector_t stripe_addr; |
3771 | int reshape_sectors; | 3771 | int reshape_sectors; |
3772 | struct list_head stripes; | 3772 | struct list_head stripes; |
@@ -3806,26 +3806,46 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3806 | */ | 3806 | */ |
3807 | writepos = conf->reshape_progress; | 3807 | writepos = conf->reshape_progress; |
3808 | sector_div(writepos, new_data_disks); | 3808 | sector_div(writepos, new_data_disks); |
3809 | readpos = conf->reshape_progress; | ||
3810 | sector_div(readpos, data_disks); | ||
3809 | safepos = conf->reshape_safe; | 3811 | safepos = conf->reshape_safe; |
3810 | sector_div(safepos, data_disks); | 3812 | sector_div(safepos, data_disks); |
3811 | if (mddev->delta_disks < 0) { | 3813 | if (mddev->delta_disks < 0) { |
3812 | writepos -= reshape_sectors; | 3814 | writepos -= reshape_sectors; |
3815 | readpos += reshape_sectors; | ||
3813 | safepos += reshape_sectors; | 3816 | safepos += reshape_sectors; |
3814 | gap = conf->reshape_safe - conf->reshape_progress; | ||
3815 | } else { | 3817 | } else { |
3816 | writepos += reshape_sectors; | 3818 | writepos += reshape_sectors; |
3819 | readpos -= reshape_sectors; | ||
3817 | safepos -= reshape_sectors; | 3820 | safepos -= reshape_sectors; |
3818 | gap = conf->reshape_progress - conf->reshape_safe; | ||
3819 | } | 3821 | } |
3820 | 3822 | ||
3823 | /* 'writepos' is the most advanced device address we might write. | ||
3824 | * 'readpos' is the least advanced device address we might read. | ||
3825 | * 'safepos' is the least address recorded in the metadata as having | ||
3826 | * been reshaped. | ||
3827 | * If 'readpos' is behind 'writepos', then there is no way that we can | ||
3828 | * ensure safety in the face of a crash - that must be done by userspace | ||
3829 | * making a backup of the data. So in that case there is no particular | ||
3830 | * rush to update metadata. | ||
3831 | * Otherwise if 'safepos' is behind 'writepos', then we really need to | ||
3832 | * update the metadata to advance 'safepos' to match 'readpos' so that | ||
3833 | * we can be safe in the event of a crash. | ||
3834 | * So we insist on updating metadata if safepos is behind writepos and | ||
3835 | * readpos is beyond writepos. | ||
3836 | * In any case, update the metadata every 10 seconds. | ||
3837 | * Maybe that number should be configurable, but I'm not sure it is | ||
3838 | * worth it.... maybe it could be a multiple of safemode_delay??? | ||
3839 | */ | ||
3821 | if ((mddev->delta_disks < 0 | 3840 | if ((mddev->delta_disks < 0 |
3822 | ? writepos < safepos | 3841 | ? (safepos > writepos && readpos < writepos) |
3823 | : writepos > safepos) || | 3842 | : (safepos < writepos && readpos > writepos)) || |
3824 | gap > (new_data_disks)*3000*2 /*3Meg*/) { | 3843 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
3825 | /* Cannot proceed until we've updated the superblock... */ | 3844 | /* Cannot proceed until we've updated the superblock... */ |
3826 | wait_event(conf->wait_for_overlap, | 3845 | wait_event(conf->wait_for_overlap, |
3827 | atomic_read(&conf->reshape_stripes)==0); | 3846 | atomic_read(&conf->reshape_stripes)==0); |
3828 | mddev->reshape_position = conf->reshape_progress; | 3847 | mddev->reshape_position = conf->reshape_progress; |
3848 | conf->reshape_checkpoint = jiffies; | ||
3829 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3849 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3830 | md_wakeup_thread(mddev->thread); | 3850 | md_wakeup_thread(mddev->thread); |
3831 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 3851 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
@@ -3923,6 +3943,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3923 | wait_event(conf->wait_for_overlap, | 3943 | wait_event(conf->wait_for_overlap, |
3924 | atomic_read(&conf->reshape_stripes) == 0); | 3944 | atomic_read(&conf->reshape_stripes) == 0); |
3925 | mddev->reshape_position = conf->reshape_progress; | 3945 | mddev->reshape_position = conf->reshape_progress; |
3946 | conf->reshape_checkpoint = jiffies; | ||
3926 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3947 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3927 | md_wakeup_thread(mddev->thread); | 3948 | md_wakeup_thread(mddev->thread); |
3928 | wait_event(mddev->sb_wait, | 3949 | wait_event(mddev->sb_wait, |
@@ -4957,6 +4978,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4957 | spin_unlock_irq(&conf->device_lock); | 4978 | spin_unlock_irq(&conf->device_lock); |
4958 | return -EAGAIN; | 4979 | return -EAGAIN; |
4959 | } | 4980 | } |
4981 | conf->reshape_checkpoint = jiffies; | ||
4960 | md_wakeup_thread(mddev->sync_thread); | 4982 | md_wakeup_thread(mddev->sync_thread); |
4961 | md_new_event(mddev); | 4983 | md_new_event(mddev); |
4962 | return 0; | 4984 | return 0; |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cdd045681720..52ba99954dec 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -352,6 +352,8 @@ struct raid5_private_data { | |||
352 | int previous_raid_disks; | 352 | int previous_raid_disks; |
353 | int prev_chunk, prev_algo; | 353 | int prev_chunk, prev_algo; |
354 | short generation; /* increments with every reshape */ | 354 | short generation; /* increments with every reshape */ |
355 | unsigned long reshape_checkpoint; /* Time we last updated | ||
356 | * metadata */ | ||
355 | 357 | ||
356 | struct list_head handle_list; /* stripes needing handling */ | 358 | struct list_head handle_list; /* stripes needing handling */ |
357 | struct list_head hold_list; /* preread ready stripes */ | 359 | struct list_head hold_list; /* preread ready stripes */ |