2 files changed, 129 insertions, 16 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f1b749c21717..872bf948f33a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
 {
        conf_t *conf = r10_bio->mddev->private;
-        /*
-         * Wake up any possible resync thread that waits for the device
-         * to go idle.
-         */
-        allow_barrier(conf);
        put_all_bios(conf, r10_bio);
        mempool_free(r10_bio, conf->r10bio_pool);
 }
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
 static void raid_end_bio_io(r10bio_t *r10_bio)
 {
        struct bio *bio = r10_bio->master_bio;
+        int done;
+        conf_t *conf = r10_bio->mddev->private;
-        bio_endio(bio,
+        if (bio->bi_phys_segments) {
-                test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
+                unsigned long flags;
+                spin_lock_irqsave(&conf->device_lock, flags);
+                bio->bi_phys_segments--;
+                done = (bio->bi_phys_segments == 0);
+                spin_unlock_irqrestore(&conf->device_lock, flags);
+        } else
+                done = 1;
+        if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        if (done) {
+                bio_endio(bio, 0);
+                /*
+                 * Wake up any possible resync thread that waits for the device
+                 * to go idle.
+                 */
+                allow_barrier(conf);
+        }
        free_r10bio(r10_bio);
 }
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
                                   mdname(conf->mddev),
                                   bdevname(conf->mirrors[dev].rdev->bdev, b),
                                   (unsigned long long)r10_bio->sector);
+                set_bit(R10BIO_ReadError, &r10_bio->state);
                reschedule_retry(r10_bio);
        }
 }
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
-static int read_balance(conf_t *conf, r10bio_t *r10_bio)
+static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
 {
        const sector_t this_sector = r10_bio->sector;
        int disk, slot;
-        const int sectors = r10_bio->sectors;
+        int sectors = r10_bio->sectors;
+        int best_good_sectors;
        sector_t new_distance, best_dist;
        mdk_rdev_t *rdev;
        int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
        raid10_find_phys(conf, r10_bio);
        rcu_read_lock();
 retry:
+        sectors = r10_bio->sectors;
        best_slot = -1;
        best_dist = MaxSector;
+        best_good_sectors = 0;
        do_balance = 1;
        /*
         * Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
                do_balance = 0;
        for (slot = 0; slot < conf->copies ; slot++) {
+                sector_t first_bad;
+                int bad_sectors;
+                sector_t dev_sector;
                if (r10_bio->devs[slot].bio == IO_BLOCKED)
                        continue;
                disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
                if (!test_bit(In_sync, &rdev->flags))
                        continue;
+                dev_sector = r10_bio->devs[slot].addr;
+                if (is_badblock(rdev, dev_sector, sectors,
+                                &first_bad, &bad_sectors)) {
+                        if (best_dist < MaxSector)
+                                /* Already have a better slot */
+                                continue;
+                        if (first_bad <= dev_sector) {
+                                /* Cannot read here.  If this is the
+                                 * 'primary' device, then we must not read
+                                 * beyond 'bad_sectors' from another device.
+                                 */
+                                bad_sectors -= (dev_sector - first_bad);
+                                if (!do_balance && sectors > bad_sectors)
+                                        sectors = bad_sectors;
+                                if (best_good_sectors > sectors)
+                                        best_good_sectors = sectors;
+                        } else {
+                                sector_t good_sectors =
+                                        first_bad - dev_sector;
+                                if (good_sectors > best_good_sectors) {
+                                        best_good_sectors = good_sectors;
+                                        best_slot = slot;
+                                }
+                                if (!do_balance)
+                                        /* Must read from here */
+                                        break;
+                        }
+                        continue;
+                } else
+                        best_good_sectors = sectors;
                if (!do_balance)
                        break;
@@ -582,6 +633,7 @@ retry:
        } else
                disk = -1;
        rcu_read_unlock();
+        *max_sectors = best_good_sectors;
        return disk;
 }
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        r10_bio->sector = bio->bi_sector;
        r10_bio->state = 0;
+        /* We might need to issue multiple reads to different
+         * devices if there are bad blocks around, so we keep
+         * track of the number of reads in bio->bi_phys_segments.
+         * If this is 0, there is only one r10_bio and no locking
+         * will be needed when the request completes.  If it is
+         * non-zero, then it is the number of not-completed requests.
+         */
+        bio->bi_phys_segments = 0;
+        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
        if (rw == READ) {
                /*
                 * read balancing logic:
                 */
-                int disk = read_balance(conf, r10_bio);
+                int max_sectors;
-                int slot = r10_bio->read_slot;
+                int disk;
+                int slot;
+read_again:
+                disk = read_balance(conf, r10_bio, &max_sectors);
+                slot = r10_bio->read_slot;
                if (disk < 0) {
                        raid_end_bio_io(r10_bio);
                        return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                mirror = conf->mirrors + disk;
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
+                            max_sectors);
                r10_bio->devs[slot].bio = read_bio;
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                read_bio->bi_rw = READ | do_sync;
                read_bio->bi_private = r10_bio;
-                generic_make_request(read_bio);
+                if (max_sectors < r10_bio->sectors) {
+                        /* Could not read all from this device, so we will
+                         * need another r10_bio.
+                         */
+                        int sectors_handled;
+                        sectors_handled = (r10_bio->sector + max_sectors
+                                           - bio->bi_sector);
+                        r10_bio->sectors = max_sectors;
+                        spin_lock_irq(&conf->device_lock);
+                        if (bio->bi_phys_segments == 0)
+                                bio->bi_phys_segments = 2;
+                        else
+                                bio->bi_phys_segments++;
+                        spin_unlock_irq(&conf->device_lock);
+                        /* Cannot call generic_make_request directly
+                         * as that will be queued in __generic_make_request
+                         * and subsequent mempool_alloc might block
+                         * waiting for it.  So hand the bio over to raid10d.
+                         */
+                        reschedule_retry(r10_bio);
+                        r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+                        r10_bio->master_bio = bio;
+                        r10_bio->sectors = ((bio->bi_size >> 9)
+                                            - sectors_handled);
+                        r10_bio->state = 0;
+                        r10_bio->mddev = mddev;
+                        r10_bio->sector = bio->bi_sector + sectors_handled;
+                        goto read_again;
+                } else
+                        generic_make_request(read_bio);
                return 0;
        }
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
        mdk_rdev_t *rdev;
        char b[BDEVNAME_SIZE];
        unsigned long do_sync;
+        int max_sectors;
        /* we got a read error. Maybe the drive is bad.  Maybe just
         * the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
        bio = r10_bio->devs[slot].bio;
        r10_bio->devs[slot].bio =
                mddev->ro ? IO_BLOCKED : NULL;
-        mirror = read_balance(conf, r10_bio);
+        mirror = read_balance(conf, r10_bio, &max_sectors);
-        if (mirror == -1) {
+        if (mirror == -1 || max_sectors < r10_bio->sectors) {
                printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
                       " read error for block %llu\n",
                       mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
                        sync_request_write(mddev, r10_bio);
                else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                        recovery_request_write(mddev, r10_bio);
-                else
+                else if (test_bit(R10BIO_ReadError, &r10_bio->state))
                        handle_read_error(mddev, r10_bio);
+                else {
+                        /* just a partial read to be scheduled from a
+                         * separate context
+                         */
+                        int slot = r10_bio->read_slot;
+                        generic_make_request(r10_bio->devs[slot].bio);
+                }
                cond_resched();
                if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index a485914c48c1..c646152ba4e4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -124,4 +124,8 @@ struct r10bio_s {
 #define R10BIO_IsSync   1
 #define R10BIO_IsRecover 2
 #define R10BIO_Degraded 3
+/* Set ReadError on bios that experience a read error
+ * so that raid10d knows what to do with them.
+ */
+#define R10BIO_ReadError 4
 #endif

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f1b749c21717..872bf948f33a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
191	{	191	{
192	conf_t *conf = r10_bio->mddev->private;	192	conf_t *conf = r10_bio->mddev->private;
193		193
194	/*
195	* Wake up any possible resync thread that waits for the device
196	* to go idle.
197	*/
198	allow_barrier(conf);
199
200	put_all_bios(conf, r10_bio);	194	put_all_bios(conf, r10_bio);
201	mempool_free(r10_bio, conf->r10bio_pool);	195	mempool_free(r10_bio, conf->r10bio_pool);
202	}	196	}
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
235	static void raid_end_bio_io(r10bio_t *r10_bio)	229	static void raid_end_bio_io(r10bio_t *r10_bio)
236	{	230	{
237	struct bio *bio = r10_bio->master_bio;	231	struct bio *bio = r10_bio->master_bio;
		232	int done;
		233	conf_t *conf = r10_bio->mddev->private;
238		234
239	bio_endio(bio,	235	if (bio->bi_phys_segments) {
240	test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);	236	unsigned long flags;
		237	spin_lock_irqsave(&conf->device_lock, flags);
		238	bio->bi_phys_segments--;
		239	done = (bio->bi_phys_segments == 0);
		240	spin_unlock_irqrestore(&conf->device_lock, flags);
		241	} else
		242	done = 1;
		243	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		244	clear_bit(BIO_UPTODATE, &bio->bi_flags);
		245	if (done) {
		246	bio_endio(bio, 0);
		247	/*
		248	* Wake up any possible resync thread that waits for the device
		249	* to go idle.
		250	*/
		251	allow_barrier(conf);
		252	}
241	free_r10bio(r10_bio);	253	free_r10bio(r10_bio);
242	}	254	}
243		255
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
307	mdname(conf->mddev),	319	mdname(conf->mddev),
308	bdevname(conf->mirrors[dev].rdev->bdev, b),	320	bdevname(conf->mirrors[dev].rdev->bdev, b),
309	(unsigned long long)r10_bio->sector);	321	(unsigned long long)r10_bio->sector);
		322	set_bit(R10BIO_ReadError, &r10_bio->state);
310	reschedule_retry(r10_bio);	323	reschedule_retry(r10_bio);
311	}	324	}
312	}	325	}
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
505	* FIXME: possibly should rethink readbalancing and do it differently	518	* FIXME: possibly should rethink readbalancing and do it differently
506	* depending on near_copies / far_copies geometry.	519	* depending on near_copies / far_copies geometry.
507	*/	520	*/
508	static int read_balance(conf_t *conf, r10bio_t *r10_bio)	521	static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
509	{	522	{
510	const sector_t this_sector = r10_bio->sector;	523	const sector_t this_sector = r10_bio->sector;
511	int disk, slot;	524	int disk, slot;
512	const int sectors = r10_bio->sectors;	525	int sectors = r10_bio->sectors;
		526	int best_good_sectors;
513	sector_t new_distance, best_dist;	527	sector_t new_distance, best_dist;
514	mdk_rdev_t *rdev;	528	mdk_rdev_t *rdev;
515	int do_balance;	529	int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
518	raid10_find_phys(conf, r10_bio);	532	raid10_find_phys(conf, r10_bio);
519	rcu_read_lock();	533	rcu_read_lock();
520	retry:	534	retry:
		535	sectors = r10_bio->sectors;
521	best_slot = -1;	536	best_slot = -1;
522	best_dist = MaxSector;	537	best_dist = MaxSector;
		538	best_good_sectors = 0;
523	do_balance = 1;	539	do_balance = 1;
524	/*	540	/*
525	* Check if we can balance. We can balance on the whole	541	* Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
532	do_balance = 0;	548	do_balance = 0;
533		549
534	for (slot = 0; slot < conf->copies ; slot++) {	550	for (slot = 0; slot < conf->copies ; slot++) {
		551	sector_t first_bad;
		552	int bad_sectors;
		553	sector_t dev_sector;
		554
535	if (r10_bio->devs[slot].bio == IO_BLOCKED)	555	if (r10_bio->devs[slot].bio == IO_BLOCKED)
536	continue;	556	continue;
537	disk = r10_bio->devs[slot].devnum;	557	disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
541	if (!test_bit(In_sync, &rdev->flags))	561	if (!test_bit(In_sync, &rdev->flags))
542	continue;	562	continue;
543		563
		564	dev_sector = r10_bio->devs[slot].addr;
		565	if (is_badblock(rdev, dev_sector, sectors,
		566	&first_bad, &bad_sectors)) {
		567	if (best_dist < MaxSector)
		568	/* Already have a better slot */
		569	continue;
		570	if (first_bad <= dev_sector) {
		571	/* Cannot read here. If this is the
		572	* 'primary' device, then we must not read
		573	* beyond 'bad_sectors' from another device.
		574	*/
		575	bad_sectors -= (dev_sector - first_bad);
		576	if (!do_balance && sectors > bad_sectors)
		577	sectors = bad_sectors;
		578	if (best_good_sectors > sectors)
		579	best_good_sectors = sectors;
		580	} else {
		581	sector_t good_sectors =
		582	first_bad - dev_sector;
		583	if (good_sectors > best_good_sectors) {
		584	best_good_sectors = good_sectors;
		585	best_slot = slot;
		586	}
		587	if (!do_balance)
		588	/* Must read from here */
		589	break;
		590	}
		591	continue;
		592	} else
		593	best_good_sectors = sectors;
		594
544	if (!do_balance)	595	if (!do_balance)
545	break;	596	break;
546		597
@@ -582,6 +633,7 @@ retry:
582	} else	633	} else
583	disk = -1;	634	disk = -1;
584	rcu_read_unlock();	635	rcu_read_unlock();
		636	*max_sectors = best_good_sectors;
585		637
586	return disk;	638	return disk;
587	}	639	}
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
829	r10_bio->sector = bio->bi_sector;	881	r10_bio->sector = bio->bi_sector;
830	r10_bio->state = 0;	882	r10_bio->state = 0;
831		883
		884	/* We might need to issue multiple reads to different
		885	* devices if there are bad blocks around, so we keep
		886	* track of the number of reads in bio->bi_phys_segments.
		887	* If this is 0, there is only one r10_bio and no locking
		888	* will be needed when the request completes. If it is
		889	* non-zero, then it is the number of not-completed requests.
		890	*/
		891	bio->bi_phys_segments = 0;
		892	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
		893
832	if (rw == READ) {	894	if (rw == READ) {
833	/*	895	/*
834	* read balancing logic:	896	* read balancing logic:
835	*/	897	*/
836	int disk = read_balance(conf, r10_bio);	898	int max_sectors;
837	int slot = r10_bio->read_slot;	899	int disk;
		900	int slot;
		901
		902	read_again:
		903	disk = read_balance(conf, r10_bio, &max_sectors);
		904	slot = r10_bio->read_slot;
838	if (disk < 0) {	905	if (disk < 0) {
839	raid_end_bio_io(r10_bio);	906	raid_end_bio_io(r10_bio);
840	return 0;	907	return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
842	mirror = conf->mirrors + disk;	909	mirror = conf->mirrors + disk;
843		910
844	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);	911	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
		912	md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
		913	max_sectors);
845		914
846	r10_bio->devs[slot].bio = read_bio;	915	r10_bio->devs[slot].bio = read_bio;
847		916
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
852	read_bio->bi_rw = READ | do_sync;	921	read_bio->bi_rw = READ | do_sync;
853	read_bio->bi_private = r10_bio;	922	read_bio->bi_private = r10_bio;
854		923
855	generic_make_request(read_bio);	924	if (max_sectors < r10_bio->sectors) {
		925	/* Could not read all from this device, so we will
		926	* need another r10_bio.
		927	*/
		928	int sectors_handled;
		929
		930	sectors_handled = (r10_bio->sector + max_sectors
		931	- bio->bi_sector);
		932	r10_bio->sectors = max_sectors;
		933	spin_lock_irq(&conf->device_lock);
		934	if (bio->bi_phys_segments == 0)
		935	bio->bi_phys_segments = 2;
		936	else
		937	bio->bi_phys_segments++;
		938	spin_unlock_irq(&conf->device_lock);
		939	/* Cannot call generic_make_request directly
		940	* as that will be queued in __generic_make_request
		941	* and subsequent mempool_alloc might block
		942	* waiting for it. So hand the bio over to raid10d.
		943	*/
		944	reschedule_retry(r10_bio);
		945
		946	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
		947
		948	r10_bio->master_bio = bio;
		949	r10_bio->sectors = ((bio->bi_size >> 9)
		950	- sectors_handled);
		951	r10_bio->state = 0;
		952	r10_bio->mddev = mddev;
		953	r10_bio->sector = bio->bi_sector + sectors_handled;
		954	goto read_again;
		955	} else
		956	generic_make_request(read_bio);
856	return 0;	957	return 0;
857	}	958	}
858		959
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
1627	mdk_rdev_t *rdev;	1728	mdk_rdev_t *rdev;
1628	char b[BDEVNAME_SIZE];	1729	char b[BDEVNAME_SIZE];
1629	unsigned long do_sync;	1730	unsigned long do_sync;
		1731	int max_sectors;
1630		1732
1631	/* we got a read error. Maybe the drive is bad. Maybe just	1733	/* we got a read error. Maybe the drive is bad. Maybe just
1632	* the block and we can fix it.	1734	* the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
1646	bio = r10_bio->devs[slot].bio;	1748	bio = r10_bio->devs[slot].bio;
1647	r10_bio->devs[slot].bio =	1749	r10_bio->devs[slot].bio =
1648	mddev->ro ? IO_BLOCKED : NULL;	1750	mddev->ro ? IO_BLOCKED : NULL;
1649	mirror = read_balance(conf, r10_bio);	1751	mirror = read_balance(conf, r10_bio, &max_sectors);
1650	if (mirror == -1) {	1752	if (mirror == -1 || max_sectors < r10_bio->sectors) {
1651	printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"	1753	printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1652	" read error for block %llu\n",	1754	" read error for block %llu\n",
1653	mdname(mddev),	1755	mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
1712	sync_request_write(mddev, r10_bio);	1814	sync_request_write(mddev, r10_bio);
1713	else if (test_bit(R10BIO_IsRecover, &r10_bio->state))	1815	else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1714	recovery_request_write(mddev, r10_bio);	1816	recovery_request_write(mddev, r10_bio);
1715	else	1817	else if (test_bit(R10BIO_ReadError, &r10_bio->state))
1716	handle_read_error(mddev, r10_bio);	1818	handle_read_error(mddev, r10_bio);
		1819	else {
		1820	/* just a partial read to be scheduled from a
		1821	* separate context
		1822	*/
		1823	int slot = r10_bio->read_slot;
		1824	generic_make_request(r10_bio->devs[slot].bio);
		1825	}
1717		1826
1718	cond_resched();	1827	cond_resched();
1719	if (mddev->flags & ~(1<<MD_CHANGE_PENDING))	1828	if (mddev->flags & ~(1<<MD_CHANGE_PENDING))


diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index a485914c48c1..c646152ba4e4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h
@@ -124,4 +124,8 @@ struct r10bio_s {
124	#define R10BIO_IsSync 1	124	#define R10BIO_IsSync 1
125	#define R10BIO_IsRecover 2	125	#define R10BIO_IsRecover 2
126	#define R10BIO_Degraded 3	126	#define R10BIO_Degraded 3
		127	/* Set ReadError on bios that experience a read error
		128	* so that raid10d knows what to do with them.
		129	*/
		130	#define R10BIO_ReadError 4
127	#endif	131	#endif