author      NeilBrown <neilb@suse.de>   2011-07-27 21:39:23 -0400
committer   NeilBrown <neilb@suse.de>   2011-07-27 21:39:23 -0400
commit      856e08e23762dfb92ffc68fd0a8d228f9e152160 (patch)
tree        fa9977a39da542eebb2129712703c11009a56ff2   /drivers/md/raid10.c
parent      560f8e5532d63a314271bfb99d3d1d53c938ed14 (diff)
md/raid10: avoid reading from known bad blocks - part 1
This patch just covers the basic read path:

1/ read_balance needs to check for badblocks, and return not only the
   chosen slot, but also how many good blocks are available there.

2/ read submission must be ready to issue multiple reads to different
   devices as different bad blocks on different devices could mean that
   a single large read cannot be served by any one device, but can
   still be served by the array.
   This requires keeping count of the number of outstanding requests
   per bio.  This count is stored in 'bi_phys_segments'.

On read error we currently just fail the request if another target
cannot handle the whole request.  Next patch refines that a bit.

Signed-off-by: NeilBrown <neilb@suse.de>
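As a side note on the bi_phys_segments accounting described in point 2, the
following is a minimal userspace sketch of the counting pattern, not kernel
code: a pthread mutex stands in for conf->device_lock, struct master_bio
stands in for the original bio, and the helper names note_extra_read() and
end_sub_read() are purely illustrative. It only models the increment-on-split
and decrement-on-completion logic that the patch adds to make_request() and
raid_end_bio_io().

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct master_bio {
        int segments;           /* models bio->bi_phys_segments */
        bool uptodate;
        pthread_mutex_t lock;   /* stands in for conf->device_lock */
};

/* Called each time the read has to be split across one more device
 * (the patch does this in make_request() before reschedule_retry()). */
static void note_extra_read(struct master_bio *m)
{
        pthread_mutex_lock(&m->lock);
        if (m->segments == 0)
                m->segments = 2;        /* first split: original plus one more */
        else
                m->segments++;
        pthread_mutex_unlock(&m->lock);
}

/* Called as each sub-read completes (the equivalent lives in
 * raid_end_bio_io()); the master request ends when the count hits zero. */
static void end_sub_read(struct master_bio *m, bool ok)
{
        bool done;

        pthread_mutex_lock(&m->lock);
        if (m->segments) {
                m->segments--;
                done = (m->segments == 0);
        } else {
                done = true;            /* never split: only one completion */
        }
        if (!ok)
                m->uptodate = false;
        pthread_mutex_unlock(&m->lock);

        if (done)
                printf("master bio complete, uptodate=%d\n", m->uptodate);
}

int main(void)
{
        struct master_bio m = { 0, true, PTHREAD_MUTEX_INITIALIZER };

        note_extra_read(&m);            /* request was split into two reads */
        end_sub_read(&m, true);
        end_sub_read(&m, true);
        return 0;
}

The count starts at 2 on the first split because the original request itself
is one of the outstanding reads; leaving it at 0 lets the common unsplit case
complete in raid_end_bio_io() without ever taking device_lock.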
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--   drivers/md/raid10.c   141
1 file changed, 125 insertions, 16 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f1b749c21717..872bf948f33a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
 {
         conf_t *conf = r10_bio->mddev->private;
 
-        /*
-         * Wake up any possible resync thread that waits for the device
-         * to go idle.
-         */
-        allow_barrier(conf);
-
         put_all_bios(conf, r10_bio);
         mempool_free(r10_bio, conf->r10bio_pool);
 }
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
 static void raid_end_bio_io(r10bio_t *r10_bio)
 {
         struct bio *bio = r10_bio->master_bio;
+        int done;
+        conf_t *conf = r10_bio->mddev->private;
 
-        bio_endio(bio,
-                test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
+        if (bio->bi_phys_segments) {
+                unsigned long flags;
+                spin_lock_irqsave(&conf->device_lock, flags);
+                bio->bi_phys_segments--;
+                done = (bio->bi_phys_segments == 0);
+                spin_unlock_irqrestore(&conf->device_lock, flags);
+        } else
+                done = 1;
+        if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        if (done) {
+                bio_endio(bio, 0);
+                /*
+                 * Wake up any possible resync thread that waits for the device
+                 * to go idle.
+                 */
+                allow_barrier(conf);
+        }
         free_r10bio(r10_bio);
 }
 
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
                                mdname(conf->mddev),
                                bdevname(conf->mirrors[dev].rdev->bdev, b),
                                (unsigned long long)r10_bio->sector);
+                set_bit(R10BIO_ReadError, &r10_bio->state);
                 reschedule_retry(r10_bio);
         }
 }
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(conf_t *conf, r10bio_t *r10_bio)
+static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
 {
         const sector_t this_sector = r10_bio->sector;
         int disk, slot;
-        const int sectors = r10_bio->sectors;
+        int sectors = r10_bio->sectors;
+        int best_good_sectors;
         sector_t new_distance, best_dist;
         mdk_rdev_t *rdev;
         int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
         raid10_find_phys(conf, r10_bio);
         rcu_read_lock();
 retry:
+        sectors = r10_bio->sectors;
         best_slot = -1;
         best_dist = MaxSector;
+        best_good_sectors = 0;
         do_balance = 1;
         /*
          * Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
                 do_balance = 0;
 
         for (slot = 0; slot < conf->copies ; slot++) {
+                sector_t first_bad;
+                int bad_sectors;
+                sector_t dev_sector;
+
                 if (r10_bio->devs[slot].bio == IO_BLOCKED)
                         continue;
                 disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
                 if (!test_bit(In_sync, &rdev->flags))
                         continue;
 
+                dev_sector = r10_bio->devs[slot].addr;
+                if (is_badblock(rdev, dev_sector, sectors,
+                                &first_bad, &bad_sectors)) {
+                        if (best_dist < MaxSector)
+                                /* Already have a better slot */
+                                continue;
+                        if (first_bad <= dev_sector) {
+                                /* Cannot read here. If this is the
+                                 * 'primary' device, then we must not read
+                                 * beyond 'bad_sectors' from another device.
+                                 */
+                                bad_sectors -= (dev_sector - first_bad);
+                                if (!do_balance && sectors > bad_sectors)
+                                        sectors = bad_sectors;
+                                if (best_good_sectors > sectors)
+                                        best_good_sectors = sectors;
+                        } else {
+                                sector_t good_sectors =
+                                        first_bad - dev_sector;
+                                if (good_sectors > best_good_sectors) {
+                                        best_good_sectors = good_sectors;
+                                        best_slot = slot;
+                                }
+                                if (!do_balance)
+                                        /* Must read from here */
+                                        break;
+                        }
+                        continue;
+                } else
+                        best_good_sectors = sectors;
+
                 if (!do_balance)
                         break;
 
@@ -582,6 +633,7 @@ retry:
         } else
                 disk = -1;
         rcu_read_unlock();
+        *max_sectors = best_good_sectors;
 
         return disk;
 }
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         r10_bio->sector = bio->bi_sector;
         r10_bio->state = 0;
 
+        /* We might need to issue multiple reads to different
+         * devices if there are bad blocks around, so we keep
+         * track of the number of reads in bio->bi_phys_segments.
+         * If this is 0, there is only one r10_bio and no locking
+         * will be needed when the request completes. If it is
+         * non-zero, then it is the number of not-completed requests.
+         */
+        bio->bi_phys_segments = 0;
+        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
         if (rw == READ) {
                 /*
                  * read balancing logic:
                  */
-                int disk = read_balance(conf, r10_bio);
-                int slot = r10_bio->read_slot;
+                int max_sectors;
+                int disk;
+                int slot;
+
+read_again:
+                disk = read_balance(conf, r10_bio, &max_sectors);
+                slot = r10_bio->read_slot;
                 if (disk < 0) {
                         raid_end_bio_io(r10_bio);
                         return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 mirror = conf->mirrors + disk;
 
                 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
+                            max_sectors);
 
                 r10_bio->devs[slot].bio = read_bio;
 
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 read_bio->bi_rw = READ | do_sync;
                 read_bio->bi_private = r10_bio;
 
-                generic_make_request(read_bio);
+                if (max_sectors < r10_bio->sectors) {
+                        /* Could not read all from this device, so we will
+                         * need another r10_bio.
+                         */
+                        int sectors_handled;
+
+                        sectors_handled = (r10_bio->sectors + max_sectors
+                                           - bio->bi_sector);
+                        r10_bio->sectors = max_sectors;
+                        spin_lock_irq(&conf->device_lock);
+                        if (bio->bi_phys_segments == 0)
+                                bio->bi_phys_segments = 2;
+                        else
+                                bio->bi_phys_segments++;
+                        spin_unlock(&conf->device_lock);
+                        /* Cannot call generic_make_request directly
+                         * as that will be queued in __generic_make_request
+                         * and subsequent mempool_alloc might block
+                         * waiting for it. so hand bio over to raid10d.
+                         */
+                        reschedule_retry(r10_bio);
+
+                        r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+                        r10_bio->master_bio = bio;
+                        r10_bio->sectors = ((bio->bi_size >> 9)
+                                            - sectors_handled);
+                        r10_bio->state = 0;
+                        r10_bio->mddev = mddev;
+                        r10_bio->sector = bio->bi_sector + sectors_handled;
+                        goto read_again;
+                } else
+                        generic_make_request(read_bio);
                 return 0;
         }
 
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
         mdk_rdev_t *rdev;
         char b[BDEVNAME_SIZE];
         unsigned long do_sync;
+        int max_sectors;
 
         /* we got a read error. Maybe the drive is bad. Maybe just
          * the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
         bio = r10_bio->devs[slot].bio;
         r10_bio->devs[slot].bio =
                 mddev->ro ? IO_BLOCKED : NULL;
-        mirror = read_balance(conf, r10_bio);
-        if (mirror == -1) {
+        mirror = read_balance(conf, r10_bio, &max_sectors);
+        if (mirror == -1 || max_sectors < r10_bio->sectors) {
                 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
                        " read error for block %llu\n",
                        mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
                         sync_request_write(mddev, r10_bio);
                 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                         recovery_request_write(mddev, r10_bio);
-                else
+                else if (test_bit(R10BIO_ReadError, &r10_bio->state))
                         handle_read_error(mddev, r10_bio);
+                else {
+                        /* just a partial read to be scheduled from a
+                         * separate context
+                         */
+                        int slot = r10_bio->read_slot;
+                        generic_make_request(r10_bio->devs[slot].bio);
+                }
 
                 cond_resched();
                 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))