md: allow a maximum extent to be set for resyncing

This allows userspace to control resync/reshape progress and synchronise it with other activities, such as shared access in a SAN, or backing up critical sections during a tricky reshape.

Writing a number of sectors (which must be a multiple of the chunk size if such is meaningful) causes a resync to pause when it gets to that point.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: NeilBrown <neilb@suse.de> 2008-02-06 04:39:52 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-06 13:41:18 -0500
commit: c620727779f7cc8ea96efb71f0651a26349e59c1 (patch)
tree: 777abdad9c9ef10cb4df5c0efc736e6c64851ed8
parent: c303da6d713b87b7b3f999f5acce8ecc76ff1adb (diff)
6 files changed, 107 insertions, 10 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 5818628207b5..396cdd982c26 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -416,6 +416,16 @@ also have
     sectors in total that could need to be processed.  The two
     numbers are separated by a '/'  thus effectively showing one
     value, a fraction of the process that is complete.
+     A 'select' on this attribute will return when resync completes,
+     when it reaches the current sync_max (below) and possibly at
+     other times.
+   sync_max
+     This is a number of sectors at which point a resync/recovery
+     process will pause.  When a resync is active, the value can
+     only ever be increased, never decreased.  The value of 'max'
+     effectively disables the limit.
   sync_speed
     This shows the current actual speed, in K/sec, of the current
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 00788c56276f..79eb63fdb4b3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit)
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        new->reshape_position = MaxSector;
+        new->resync_max = MaxSector;
        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
@@ -2921,6 +2922,43 @@ sync_completed_show(mddev_t *mddev, char *page)
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 static ssize_t
+max_sync_show(mddev_t *mddev, char *page)
+{
+        if (mddev->resync_max == MaxSector)
+                return sprintf(page, "max\n");
+        else
+                return sprintf(page, "%llu\n",
+                               (unsigned long long)mddev->resync_max);
+}
+static ssize_t
+max_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+        if (strncmp(buf, "max", 3) == 0)
+                mddev->resync_max = MaxSector;
+        else {
+                char *ep;
+                unsigned long long max = simple_strtoull(buf, &ep, 10);
+                if (ep == buf || (*ep != 0 && *ep != '\n'))
+                        return -EINVAL;
+                if (max < mddev->resync_max &&
+                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+                        return -EBUSY;
+                /* Must be a multiple of chunk_size */
+                if (mddev->chunk_size) {
+                        if (max & (sector_t)((mddev->chunk_size>>9)-1))
+                                return -EINVAL;
+                }
+                mddev->resync_max = max;
+        }
+        wake_up(&mddev->recovery_wait);
+        return len;
+}
+static struct md_sysfs_entry md_max_sync =
+__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
+static ssize_t
 suspend_lo_show(mddev_t *mddev, char *page)
 {
        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
@@ -3030,6 +3068,7 @@ static struct attribute *md_redundancy_attrs[] = {
        &md_sync_max.attr,
        &md_sync_speed.attr,
        &md_sync_completed.attr,
+        &md_max_sync.attr,
        &md_suspend_lo.attr,
        &md_suspend_hi.attr,
        &md_bitmap.attr,
@@ -3579,6 +3618,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
                mddev->size = 0;
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
+                mddev->resync_max = MaxSector;
                mddev->reshape_position = MaxSector;
                mddev->external = 0;
@@ -5443,8 +5483,16 @@ void md_do_sync(mddev_t *mddev)
                sector_t sectors;
                skipped = 0;
+                if (j >= mddev->resync_max) {
+                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+                        wait_event(mddev->recovery_wait,
+                                   mddev->resync_max > j
+                                   || kthread_should_stop());
+                }
+                if (kthread_should_stop())
+                        goto interrupted;
                sectors = mddev->pers->sync_request(mddev, j, &skipped,
-                                            currspeed < speed_min(mddev));
+                                                  currspeed < speed_min(mddev));
                if (sectors == 0) {
                        set_bit(MD_RECOVERY_ERR, &mddev->recovery);
                        goto out;
@@ -5486,15 +5534,9 @@ void md_do_sync(mddev_t *mddev)
                }
-                if (kthread_should_stop()) {
+                if (kthread_should_stop())
-                        /*
+                        goto interrupted;
-                         * got a signal, exit.
-                         */
-                        printk(KERN_INFO 
-                                "md: md_do_sync() got signal ... exiting\n");
-                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                        goto out;
-                }
                /*
                 * this loop exits only if either when we are slower than
@@ -5558,9 +5600,22 @@ void md_do_sync(mddev_t *mddev)
 skip:
        mddev->curr_resync = 0;
+        mddev->resync_max = MaxSector;
+        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        wake_up(&resync_wait);
        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
+        return;
+ interrupted:
+        /*
+         * got a signal, exit.
+         */
+        printk(KERN_INFO
+               "md: md_do_sync() got signal ... exiting\n");
+        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+        goto out;
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e0b8d0dd7a87..ae7c15207df5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1767,6 +1767,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                return rv;
        }
+        if (max_sector > mddev->resync_max)
+                max_sector = mddev->resync_max; /* Don't do IO beyond here */
        nr_sectors = 0;
        sync_blocks = 0;
        do {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ba125277c6c4..d6f12882424d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1657,6 +1657,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                return (max_sector - sector_nr) + sectors_skipped;
        }
+        if (max_sector > mddev->resync_max)
+                max_sector = mddev->resync_max; /* Don't do IO beyond here */
        /* make sure whole request will fit in a chunk - if chunks
         * are meaningful
         */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 388a974d63ef..e946de6f46bc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3698,6 +3698,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                release_stripe(sh);
                first_sector += STRIPE_SECTORS;
        }
+        /* If this takes us to the resync_max point where we have to pause,
+         * then we need to write out the superblock.
+         */
+        sector_nr += conf->chunk_size>>9;
+        if (sector_nr >= mddev->resync_max) {
+                /* Cannot proceed until we've updated the superblock... */
+                wait_event(conf->wait_for_overlap,
+                           atomic_read(&conf->reshape_stripes) == 0);
+                mddev->reshape_position = conf->expand_progress;
+                set_bit(MD_CHANGE_DEVS, &mddev->flags);
+                md_wakeup_thread(mddev->thread);
+                wait_event(mddev->sb_wait,
+                           !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+                           || kthread_should_stop());
+                spin_lock_irq(&conf->device_lock);
+                conf->expand_lo = mddev->reshape_position;
+                spin_unlock_irq(&conf->device_lock);
+                wake_up(&conf->wait_for_overlap);
+        }
        return conf->chunk_size>>9;
 }
@@ -3734,6 +3753,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                return reshape_request(mddev, sector_nr, skipped);
+        /* No need to check resync_max as we never do more than one
+         * stripe, and as resync_max will always be on a chunk boundary,
+         * if the check in md_do_sync didn't fire, there is no chance
+         * of overstepping resync_max here
+         */
        /* if there is too many failed drives and we are trying
         * to resync, then assert that we are finished, because there is
         * nothing we can do.
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index b579cc628303..c77dca3221ed 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -219,6 +219,8 @@ struct mddev_s
        atomic_t                        recovery_active; /* blocks scheduled, but not written */
        wait_queue_head_t               recovery_wait;
        sector_t                        recovery_cp;
+        sector_t                        resync_max;     /* resync should pause
+                                                         * when it gets here */
        spinlock_t                      write_lock;
        wait_queue_head_t               sb_wait;        /* for waiting on superblock updates */
author	NeilBrown <neilb@suse.de>	2008-02-06 04:39:52 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-06 13:41:18 -0500
commit	c620727779f7cc8ea96efb71f0651a26349e59c1 (patch)
tree	777abdad9c9ef10cb4df5c0efc736e6c64851ed8
parent	c303da6d713b87b7b3f999f5acce8ecc76ff1adb (diff)

diff --git a/Documentation/md.txt b/Documentation/md.txt index 5818628207b5..396cdd982c26 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt
@@ -416,6 +416,16 @@ also have
416	sectors in total that could need to be processed. The two	416	sectors in total that could need to be processed. The two
417	numbers are separated by a '/' thus effectively showing one	417	numbers are separated by a '/' thus effectively showing one
418	value, a fraction of the process that is complete.	418	value, a fraction of the process that is complete.
		419	A 'select' on this attribute will return when resync completes,
		420	when it reaches the current sync_max (below) and possibly at
		421	other times.
		422
		423	sync_max
		424	This is a number of sectors at which point a resync/recovery
		425	process will pause. When a resync is active, the value can
		426	only ever be increased, never decreased. The value of 'max'
		427	effectively disables the limit.
		428
419		429
420	sync_speed	430	sync_speed
421	This shows the current actual speed, in K/sec, of the current	431	This shows the current actual speed, in K/sec, of the current


diff --git a/drivers/md/md.c b/drivers/md/md.c index 00788c56276f..79eb63fdb4b3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c
@@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit)
275	spin_lock_init(&new->write_lock);	275	spin_lock_init(&new->write_lock);
276	init_waitqueue_head(&new->sb_wait);	276	init_waitqueue_head(&new->sb_wait);
277	new->reshape_position = MaxSector;	277	new->reshape_position = MaxSector;
		278	new->resync_max = MaxSector;
278		279
279	new->queue = blk_alloc_queue(GFP_KERNEL);	280	new->queue = blk_alloc_queue(GFP_KERNEL);
280	if (!new->queue) {	281	if (!new->queue) {
@@ -2921,6 +2922,43 @@ sync_completed_show(mddev_t *mddev, char *page)
2921	static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);	2922	static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2922		2923
2923	static ssize_t	2924	static ssize_t
		2925	max_sync_show(mddev_t *mddev, char *page)
		2926	{
		2927	if (mddev->resync_max == MaxSector)
		2928	return sprintf(page, "max\n");
		2929	else
		2930	return sprintf(page, "%llu\n",
		2931	(unsigned long long)mddev->resync_max);
		2932	}
		2933	static ssize_t
		2934	max_sync_store(mddev_t *mddev, const char *buf, size_t len)
		2935	{
		2936	if (strncmp(buf, "max", 3) == 0)
		2937	mddev->resync_max = MaxSector;
		2938	else {
		2939	char *ep;
		2940	unsigned long long max = simple_strtoull(buf, &ep, 10);
		2941	if (ep == buf || (*ep != 0 && *ep != '\n'))
		2942	return -EINVAL;
		2943	if (max < mddev->resync_max &&
		2944	test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		2945	return -EBUSY;
		2946
		2947	/* Must be a multiple of chunk_size */
		2948	if (mddev->chunk_size) {
		2949	if (max & (sector_t)((mddev->chunk_size>>9)-1))
		2950	return -EINVAL;
		2951	}
		2952	mddev->resync_max = max;
		2953	}
		2954	wake_up(&mddev->recovery_wait);
		2955	return len;
		2956	}
		2957
		2958	static struct md_sysfs_entry md_max_sync =
		2959	__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
		2960
		2961	static ssize_t
2924	suspend_lo_show(mddev_t *mddev, char *page)	2962	suspend_lo_show(mddev_t *mddev, char *page)
2925	{	2963	{
2926	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);	2964	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
@@ -3030,6 +3068,7 @@ static struct attribute *md_redundancy_attrs[] = {
3030	&md_sync_max.attr,	3068	&md_sync_max.attr,
3031	&md_sync_speed.attr,	3069	&md_sync_speed.attr,
3032	&md_sync_completed.attr,	3070	&md_sync_completed.attr,
		3071	&md_max_sync.attr,
3033	&md_suspend_lo.attr,	3072	&md_suspend_lo.attr,
3034	&md_suspend_hi.attr,	3073	&md_suspend_hi.attr,
3035	&md_bitmap.attr,	3074	&md_bitmap.attr,
@@ -3579,6 +3618,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
3579	mddev->size = 0;	3618	mddev->size = 0;
3580	mddev->raid_disks = 0;	3619	mddev->raid_disks = 0;
3581	mddev->recovery_cp = 0;	3620	mddev->recovery_cp = 0;
		3621	mddev->resync_max = MaxSector;
3582	mddev->reshape_position = MaxSector;	3622	mddev->reshape_position = MaxSector;
3583	mddev->external = 0;	3623	mddev->external = 0;
3584		3624
@@ -5443,8 +5483,16 @@ void md_do_sync(mddev_t *mddev)
5443	sector_t sectors;	5483	sector_t sectors;
5444		5484
5445	skipped = 0;	5485	skipped = 0;
		5486	if (j >= mddev->resync_max) {
		5487	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		5488	wait_event(mddev->recovery_wait,
		5489	mddev->resync_max > j
		5490	|| kthread_should_stop());
		5491	}
		5492	if (kthread_should_stop())
		5493	goto interrupted;
5446	sectors = mddev->pers->sync_request(mddev, j, &skipped,	5494	sectors = mddev->pers->sync_request(mddev, j, &skipped,
5447	currspeed < speed_min(mddev));	5495	currspeed < speed_min(mddev));
5448	if (sectors == 0) {	5496	if (sectors == 0) {
5449	set_bit(MD_RECOVERY_ERR, &mddev->recovery);	5497	set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5450	goto out;	5498	goto out;
@@ -5486,15 +5534,9 @@ void md_do_sync(mddev_t *mddev)
5486	}	5534	}
5487		5535
5488		5536
5489	if (kthread_should_stop()) {	5537	if (kthread_should_stop())
5490	/*	5538	goto interrupted;
5491	* got a signal, exit.	5539
5492	*/
5493	printk(KERN_INFO
5494	"md: md_do_sync() got signal ... exiting\n");
5495	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5496	goto out;
5497	}
5498		5540
5499	/*	5541	/*
5500	* this loop exits only if either when we are slower than	5542	* this loop exits only if either when we are slower than
@@ -5558,9 +5600,22 @@ void md_do_sync(mddev_t *mddev)
5558		5600
5559	skip:	5601	skip:
5560	mddev->curr_resync = 0;	5602	mddev->curr_resync = 0;
		5603	mddev->resync_max = MaxSector;
		5604	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5561	wake_up(&resync_wait);	5605	wake_up(&resync_wait);
5562	set_bit(MD_RECOVERY_DONE, &mddev->recovery);	5606	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5563	md_wakeup_thread(mddev->thread);	5607	md_wakeup_thread(mddev->thread);
		5608	return;
		5609
		5610	interrupted:
		5611	/*
		5612	* got a signal, exit.
		5613	*/
		5614	printk(KERN_INFO
		5615	"md: md_do_sync() got signal ... exiting\n");
		5616	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		5617	goto out;
		5618
5564	}	5619	}
5565	EXPORT_SYMBOL_GPL(md_do_sync);	5620	EXPORT_SYMBOL_GPL(md_do_sync);
5566		5621


diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e0b8d0dd7a87..ae7c15207df5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c
@@ -1767,6 +1767,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1767	return rv;	1767	return rv;
1768	}	1768	}
1769		1769
		1770	if (max_sector > mddev->resync_max)
		1771	max_sector = mddev->resync_max; /* Don't do IO beyond here */
1770	nr_sectors = 0;	1772	nr_sectors = 0;
1771	sync_blocks = 0;	1773	sync_blocks = 0;
1772	do {	1774	do {


diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ba125277c6c4..d6f12882424d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c
@@ -1657,6 +1657,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1657	return (max_sector - sector_nr) + sectors_skipped;	1657	return (max_sector - sector_nr) + sectors_skipped;
1658	}	1658	}
1659		1659
		1660	if (max_sector > mddev->resync_max)
		1661	max_sector = mddev->resync_max; /* Don't do IO beyond here */
		1662
1660	/* make sure whole request will fit in a chunk - if chunks	1663	/* make sure whole request will fit in a chunk - if chunks
1661	* are meaningful	1664	* are meaningful
1662	*/	1665	*/


diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 388a974d63ef..e946de6f46bc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c
@@ -3698,6 +3698,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3698	release_stripe(sh);	3698	release_stripe(sh);
3699	first_sector += STRIPE_SECTORS;	3699	first_sector += STRIPE_SECTORS;
3700	}	3700	}
		3701	/* If this takes us to the resync_max point where we have to pause,
		3702	* then we need to write out the superblock.
		3703	*/
		3704	sector_nr += conf->chunk_size>>9;
		3705	if (sector_nr >= mddev->resync_max) {
		3706	/* Cannot proceed until we've updated the superblock... */
		3707	wait_event(conf->wait_for_overlap,
		3708	atomic_read(&conf->reshape_stripes) == 0);
		3709	mddev->reshape_position = conf->expand_progress;
		3710	set_bit(MD_CHANGE_DEVS, &mddev->flags);
		3711	md_wakeup_thread(mddev->thread);
		3712	wait_event(mddev->sb_wait,
		3713	!test_bit(MD_CHANGE_DEVS, &mddev->flags)
		3714	|| kthread_should_stop());
		3715	spin_lock_irq(&conf->device_lock);
		3716	conf->expand_lo = mddev->reshape_position;
		3717	spin_unlock_irq(&conf->device_lock);
		3718	wake_up(&conf->wait_for_overlap);
		3719	}
3701	return conf->chunk_size>>9;	3720	return conf->chunk_size>>9;
3702	}	3721	}
3703		3722
@@ -3734,6 +3753,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3734	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))	3753	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3735	return reshape_request(mddev, sector_nr, skipped);	3754	return reshape_request(mddev, sector_nr, skipped);
3736		3755
		3756	/* No need to check resync_max as we never do more than one
		3757	* stripe, and as resync_max will always be on a chunk boundary,
		3758	* if the check in md_do_sync didn't fire, there is no chance
		3759	* of overstepping resync_max here
		3760	*/
		3761
3737	/* if there is too many failed drives and we are trying	3762	/* if there is too many failed drives and we are trying
3738	* to resync, then assert that we are finished, because there is	3763	* to resync, then assert that we are finished, because there is
3739	* nothing we can do.	3764	* nothing we can do.


diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index b579cc628303..c77dca3221ed 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h
@@ -219,6 +219,8 @@ struct mddev_s
219	atomic_t recovery_active; /* blocks scheduled, but not written */	219	atomic_t recovery_active; /* blocks scheduled, but not written */
220	wait_queue_head_t recovery_wait;	220	wait_queue_head_t recovery_wait;
221	sector_t recovery_cp;	221	sector_t recovery_cp;
		222	sector_t resync_max; /* resync should pause
		223	* when it gets here */
222		224
223	spinlock_t write_lock;	225	spinlock_t write_lock;
224	wait_queue_head_t sb_wait; /* for waiting on superblock updates */	226	wait_queue_head_t sb_wait; /* for waiting on superblock updates */