about summary refs log tree commit diff stats
path: root/drivers
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2009-03-31 00:16:46 -0400
committerNeilBrown <neilb@suse.de>2009-03-31 00:16:46 -0400
commitfef9c61fdfabf97a307c2cf3621a6949f0a4b995 (patch)
tree82b128341c12205db62fe092d692d32103a7ea9f /drivers
parentcea9c22800773cecb1d41f4a6139f9eb6a95368b (diff)
md/raid5: change reshape-progress measurement to cope with reshaping backwards.
When reducing the number of devices in a raid4/5/6, the reshape process has to start at the end of the array and work down to the beginning. So we need to handle expand_progress and expand_lo differently. This patch renames "expand_progress" and "expand_lo" to avoid the implication that anything is getting bigger (expand->reshape) and every place they are used, we make sure that they are used the right way depending on whether delta_disks is positive or negative. Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/raid5.c | 94
-rw-r--r--  drivers/md/raid5.h | 15
2 files changed, 71 insertions, 38 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a0f22dd33234..1023c4e48a91 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3593,24 +3593,28 @@ static int make_request(struct request_queue *q, struct bio * bi)
3593 retry: 3593 retry:
3594 previous = 0; 3594 previous = 0;
3595 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3595 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3596 if (likely(conf->expand_progress == MaxSector)) 3596 if (likely(conf->reshape_progress == MaxSector))
3597 disks = conf->raid_disks; 3597 disks = conf->raid_disks;
3598 else { 3598 else {
3599 /* spinlock is needed as expand_progress may be 3599 /* spinlock is needed as reshape_progress may be
3600 * 64bit on a 32bit platform, and so it might be 3600 * 64bit on a 32bit platform, and so it might be
3601 * possible to see a half-updated value 3601 * possible to see a half-updated value
3602 * Ofcourse expand_progress could change after 3602 * Ofcourse reshape_progress could change after
3603 * the lock is dropped, so once we get a reference 3603 * the lock is dropped, so once we get a reference
3604 * to the stripe that we think it is, we will have 3604 * to the stripe that we think it is, we will have
3605 * to check again. 3605 * to check again.
3606 */ 3606 */
3607 spin_lock_irq(&conf->device_lock); 3607 spin_lock_irq(&conf->device_lock);
3608 disks = conf->raid_disks; 3608 disks = conf->raid_disks;
3609 if (logical_sector >= conf->expand_progress) { 3609 if (mddev->delta_disks < 0
3610 ? logical_sector < conf->reshape_progress
3611 : logical_sector >= conf->reshape_progress) {
3610 disks = conf->previous_raid_disks; 3612 disks = conf->previous_raid_disks;
3611 previous = 1; 3613 previous = 1;
3612 } else { 3614 } else {
3613 if (logical_sector >= conf->expand_lo) { 3615 if (mddev->delta_disks < 0
3616 ? logical_sector < conf->reshape_safe
3617 : logical_sector >= conf->reshape_safe) {
3614 spin_unlock_irq(&conf->device_lock); 3618 spin_unlock_irq(&conf->device_lock);
3615 schedule(); 3619 schedule();
3616 goto retry; 3620 goto retry;
@@ -3630,7 +3634,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3630 sh = get_active_stripe(conf, new_sector, previous, 3634 sh = get_active_stripe(conf, new_sector, previous,
3631 (bi->bi_rw&RWA_MASK)); 3635 (bi->bi_rw&RWA_MASK));
3632 if (sh) { 3636 if (sh) {
3633 if (unlikely(conf->expand_progress != MaxSector)) { 3637 if (unlikely(conf->reshape_progress != MaxSector)) {
3634 /* expansion might have moved on while waiting for a 3638 /* expansion might have moved on while waiting for a
3635 * stripe, so we must do the range check again. 3639 * stripe, so we must do the range check again.
3636 * Expansion could still move past after this 3640 * Expansion could still move past after this
@@ -3641,8 +3645,10 @@ static int make_request(struct request_queue *q, struct bio * bi)
3641 */ 3645 */
3642 int must_retry = 0; 3646 int must_retry = 0;
3643 spin_lock_irq(&conf->device_lock); 3647 spin_lock_irq(&conf->device_lock);
3644 if (logical_sector < conf->expand_progress && 3648 if ((mddev->delta_disks < 0
3645 disks == conf->previous_raid_disks) 3649 ? logical_sector >= conf->reshape_progress
3650 : logical_sector < conf->reshape_progress)
3651 && disks == conf->previous_raid_disks)
3646 /* mismatch, need to try again */ 3652 /* mismatch, need to try again */
3647 must_retry = 1; 3653 must_retry = 1;
3648 spin_unlock_irq(&conf->device_lock); 3654 spin_unlock_irq(&conf->device_lock);
@@ -3720,13 +3726,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3720 int dd_idx; 3726 int dd_idx;
3721 sector_t writepos, safepos, gap; 3727 sector_t writepos, safepos, gap;
3722 3728
3723 if (sector_nr == 0 && 3729 if (sector_nr == 0) {
3724 conf->expand_progress != 0) { 3730 /* If restarting in the middle, skip the initial sectors */
3725 /* restarting in the middle, skip the initial sectors */ 3731 if (mddev->delta_disks < 0 &&
3726 sector_nr = conf->expand_progress; 3732 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
3733 sector_nr = raid5_size(mddev, 0, 0)
3734 - conf->reshape_progress;
3735 } else if (mddev->delta_disks > 0 &&
3736 conf->reshape_progress > 0)
3737 sector_nr = conf->reshape_progress;
3727 sector_div(sector_nr, new_data_disks); 3738 sector_div(sector_nr, new_data_disks);
3728 *skipped = 1; 3739 if (sector_nr) {
3729 return sector_nr; 3740 *skipped = 1;
3741 return sector_nr;
3742 }
3730 } 3743 }
3731 3744
3732 /* we update the metadata when there is more than 3Meg 3745 /* we update the metadata when there is more than 3Meg
@@ -3734,28 +3747,37 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3734 * probably be time based) or when the data about to be 3747 * probably be time based) or when the data about to be
3735 * copied would over-write the source of the data at 3748 * copied would over-write the source of the data at
3736 * the front of the range. 3749 * the front of the range.
3737 * i.e. one new_stripe forward from expand_progress new_maps 3750 * i.e. one new_stripe along from reshape_progress new_maps
3738 * to after where expand_lo old_maps to 3751 * to after where reshape_safe old_maps to
3739 */ 3752 */
3740 writepos = conf->expand_progress + 3753 writepos = conf->reshape_progress;
3741 conf->chunk_size/512*(new_data_disks);
3742 sector_div(writepos, new_data_disks); 3754 sector_div(writepos, new_data_disks);
3743 safepos = conf->expand_lo; 3755 safepos = conf->reshape_safe;
3744 sector_div(safepos, data_disks); 3756 sector_div(safepos, data_disks);
3745 gap = conf->expand_progress - conf->expand_lo; 3757 if (mddev->delta_disks < 0) {
3758 writepos -= conf->chunk_size/512;
3759 safepos += conf->chunk_size/512;
3760 gap = conf->reshape_safe - conf->reshape_progress;
3761 } else {
3762 writepos += conf->chunk_size/512;
3763 safepos -= conf->chunk_size/512;
3764 gap = conf->reshape_progress - conf->reshape_safe;
3765 }
3746 3766
3747 if (writepos >= safepos || 3767 if ((mddev->delta_disks < 0
3768 ? writepos < safepos
3769 : writepos > safepos) ||
3748 gap > (new_data_disks)*3000*2 /*3Meg*/) { 3770 gap > (new_data_disks)*3000*2 /*3Meg*/) {
3749 /* Cannot proceed until we've updated the superblock... */ 3771 /* Cannot proceed until we've updated the superblock... */
3750 wait_event(conf->wait_for_overlap, 3772 wait_event(conf->wait_for_overlap,
3751 atomic_read(&conf->reshape_stripes)==0); 3773 atomic_read(&conf->reshape_stripes)==0);
3752 mddev->reshape_position = conf->expand_progress; 3774 mddev->reshape_position = conf->reshape_progress;
3753 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3775 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3754 md_wakeup_thread(mddev->thread); 3776 md_wakeup_thread(mddev->thread);
3755 wait_event(mddev->sb_wait, mddev->flags == 0 || 3777 wait_event(mddev->sb_wait, mddev->flags == 0 ||
3756 kthread_should_stop()); 3778 kthread_should_stop());
3757 spin_lock_irq(&conf->device_lock); 3779 spin_lock_irq(&conf->device_lock);
3758 conf->expand_lo = mddev->reshape_position; 3780 conf->reshape_safe = mddev->reshape_position;
3759 spin_unlock_irq(&conf->device_lock); 3781 spin_unlock_irq(&conf->device_lock);
3760 wake_up(&conf->wait_for_overlap); 3782 wake_up(&conf->wait_for_overlap);
3761 } 3783 }
@@ -3792,7 +3814,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3792 release_stripe(sh); 3814 release_stripe(sh);
3793 } 3815 }
3794 spin_lock_irq(&conf->device_lock); 3816 spin_lock_irq(&conf->device_lock);
3795 conf->expand_progress = (sector_nr + i) * new_data_disks; 3817 if (mddev->delta_disks < 0)
3818 conf->reshape_progress -= i * new_data_disks;
3819 else
3820 conf->reshape_progress += i * new_data_disks;
3796 spin_unlock_irq(&conf->device_lock); 3821 spin_unlock_irq(&conf->device_lock);
3797 /* Ok, those stripe are ready. We can start scheduling 3822 /* Ok, those stripe are ready. We can start scheduling
3798 * reads on the source stripes. 3823 * reads on the source stripes.
@@ -3823,14 +3848,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3823 /* Cannot proceed until we've updated the superblock... */ 3848 /* Cannot proceed until we've updated the superblock... */
3824 wait_event(conf->wait_for_overlap, 3849 wait_event(conf->wait_for_overlap,
3825 atomic_read(&conf->reshape_stripes) == 0); 3850 atomic_read(&conf->reshape_stripes) == 0);
3826 mddev->reshape_position = conf->expand_progress; 3851 mddev->reshape_position = conf->reshape_progress;
3827 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3852 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3828 md_wakeup_thread(mddev->thread); 3853 md_wakeup_thread(mddev->thread);
3829 wait_event(mddev->sb_wait, 3854 wait_event(mddev->sb_wait,
3830 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 3855 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
3831 || kthread_should_stop()); 3856 || kthread_should_stop());
3832 spin_lock_irq(&conf->device_lock); 3857 spin_lock_irq(&conf->device_lock);
3833 conf->expand_lo = mddev->reshape_position; 3858 conf->reshape_safe = mddev->reshape_position;
3834 spin_unlock_irq(&conf->device_lock); 3859 spin_unlock_irq(&conf->device_lock);
3835 wake_up(&conf->wait_for_overlap); 3860 wake_up(&conf->wait_for_overlap);
3836 } 3861 }
@@ -4283,7 +4308,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4283 conf->max_degraded = 1; 4308 conf->max_degraded = 1;
4284 conf->algorithm = mddev->new_layout; 4309 conf->algorithm = mddev->new_layout;
4285 conf->max_nr_stripes = NR_STRIPES; 4310 conf->max_nr_stripes = NR_STRIPES;
4286 conf->expand_progress = mddev->reshape_position; 4311 conf->reshape_progress = mddev->reshape_position;
4287 4312
4288 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4313 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4289 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4314 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
@@ -4441,9 +4466,9 @@ static int run(mddev_t *mddev)
4441 4466
4442 print_raid5_conf(conf); 4467 print_raid5_conf(conf);
4443 4468
4444 if (conf->expand_progress != MaxSector) { 4469 if (conf->reshape_progress != MaxSector) {
4445 printk("...ok start reshape thread\n"); 4470 printk("...ok start reshape thread\n");
4446 conf->expand_lo = conf->expand_progress; 4471 conf->reshape_safe = conf->reshape_progress;
4447 atomic_set(&conf->reshape_stripes, 0); 4472 atomic_set(&conf->reshape_stripes, 0);
4448 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4473 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4449 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4474 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4782,8 +4807,11 @@ static int raid5_start_reshape(mddev_t *mddev)
4782 spin_lock_irq(&conf->device_lock); 4807 spin_lock_irq(&conf->device_lock);
4783 conf->previous_raid_disks = conf->raid_disks; 4808 conf->previous_raid_disks = conf->raid_disks;
4784 conf->raid_disks += mddev->delta_disks; 4809 conf->raid_disks += mddev->delta_disks;
4785 conf->expand_progress = 0; 4810 if (mddev->delta_disks < 0)
4786 conf->expand_lo = 0; 4811 conf->reshape_progress = raid5_size(mddev, 0, 0);
4812 else
4813 conf->reshape_progress = 0;
4814 conf->reshape_safe = conf->reshape_progress;
4787 spin_unlock_irq(&conf->device_lock); 4815 spin_unlock_irq(&conf->device_lock);
4788 4816
4789 /* Add some new drives, as many as will fit. 4817 /* Add some new drives, as many as will fit.
@@ -4825,7 +4853,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4825 mddev->recovery = 0; 4853 mddev->recovery = 0;
4826 spin_lock_irq(&conf->device_lock); 4854 spin_lock_irq(&conf->device_lock);
4827 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 4855 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
4828 conf->expand_progress = MaxSector; 4856 conf->reshape_progress = MaxSector;
4829 spin_unlock_irq(&conf->device_lock); 4857 spin_unlock_irq(&conf->device_lock);
4830 return -EAGAIN; 4858 return -EAGAIN;
4831 } 4859 }
@@ -4842,7 +4870,7 @@ static void end_reshape(raid5_conf_t *conf)
4842 4870
4843 spin_lock_irq(&conf->device_lock); 4871 spin_lock_irq(&conf->device_lock);
4844 conf->previous_raid_disks = conf->raid_disks; 4872 conf->previous_raid_disks = conf->raid_disks;
4845 conf->expand_progress = MaxSector; 4873 conf->reshape_progress = MaxSector;
4846 spin_unlock_irq(&conf->device_lock); 4874 spin_unlock_irq(&conf->device_lock);
4847 4875
4848 /* read-ahead size must cover two whole stripes, which is 4876 /* read-ahead size must cover two whole stripes, which is
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c2f37f25ef44..b2edcc434e41 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -337,11 +337,16 @@ struct raid5_private_data {
337 int raid_disks; 337 int raid_disks;
338 int max_nr_stripes; 338 int max_nr_stripes;
339 339
340 /* used during an expand */ 340 /* reshape_progress is the leading edge of a 'reshape'
341 sector_t expand_progress; /* MaxSector when no expand happening */ 341 * It has value MaxSector when no reshape is happening
342 sector_t expand_lo; /* from here up to expand_progress it out-of-bounds 342 * If delta_disks < 0, it is the last sector we started work on,
343 * as we haven't flushed the metadata yet 343 * else is it the next sector to work on.
344 */ 344 */
345 sector_t reshape_progress;
346 /* reshape_safe is the trailing edge of a reshape. We know that
347 * before (or after) this address, all reshape has completed.
348 */
349 sector_t reshape_safe;
345 int previous_raid_disks; 350 int previous_raid_disks;
346 351
347 struct list_head handle_list; /* stripes needing handling */ 352 struct list_head handle_list; /* stripes needing handling */