aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2011-07-27 21:39:23 -0400
committer NeilBrown <neilb@suse.de> 2011-07-27 21:39:23 -0400
commit 856e08e23762dfb92ffc68fd0a8d228f9e152160 (patch)
tree fa9977a39da542eebb2129712703c11009a56ff2 /drivers/md
parent 560f8e5532d63a314271bfb99d3d1d53c938ed14 (diff)
md/raid10: avoid reading from known bad blocks - part 1
This patch just covers the basic read path:
 1/ read_balance needs to check for badblocks, and return not only
    the chosen slot, but also how many good blocks are available
    there.
 2/ read submission must be ready to issue multiple reads to
    different devices as different bad blocks on different devices
    could mean that a single large read cannot be served by any one
    device, but can still be served by the array.
    This requires keeping count of the number of outstanding requests
    per bio.  This count is stored in 'bi_phys_segments'.

On read error we currently just fail the request if another target
cannot handle the whole request.  Next patch refines that a bit.

Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/raid10.c 141
-rw-r--r-- drivers/md/raid10.h 4
2 files changed, 129 insertions, 16 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f1b749c21717..872bf948f33a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
191{ 191{
192 conf_t *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
193 193
194 /*
195 * Wake up any possible resync thread that waits for the device
196 * to go idle.
197 */
198 allow_barrier(conf);
199
200 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
201 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
202} 196}
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
235static void raid_end_bio_io(r10bio_t *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
236{ 230{
237 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
232 int done;
233 conf_t *conf = r10_bio->mddev->private;
238 234
239 bio_endio(bio, 235 if (bio->bi_phys_segments) {
240 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); 236 unsigned long flags;
237 spin_lock_irqsave(&conf->device_lock, flags);
238 bio->bi_phys_segments--;
239 done = (bio->bi_phys_segments == 0);
240 spin_unlock_irqrestore(&conf->device_lock, flags);
241 } else
242 done = 1;
243 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
244 clear_bit(BIO_UPTODATE, &bio->bi_flags);
245 if (done) {
246 bio_endio(bio, 0);
247 /*
248 * Wake up any possible resync thread that waits for the device
249 * to go idle.
250 */
251 allow_barrier(conf);
252 }
241 free_r10bio(r10_bio); 253 free_r10bio(r10_bio);
242} 254}
243 255
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
307 mdname(conf->mddev), 319 mdname(conf->mddev),
308 bdevname(conf->mirrors[dev].rdev->bdev, b), 320 bdevname(conf->mirrors[dev].rdev->bdev, b),
309 (unsigned long long)r10_bio->sector); 321 (unsigned long long)r10_bio->sector);
322 set_bit(R10BIO_ReadError, &r10_bio->state);
310 reschedule_retry(r10_bio); 323 reschedule_retry(r10_bio);
311 } 324 }
312} 325}
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
505 * FIXME: possibly should rethink readbalancing and do it differently 518 * FIXME: possibly should rethink readbalancing and do it differently
506 * depending on near_copies / far_copies geometry. 519 * depending on near_copies / far_copies geometry.
507 */ 520 */
508static int read_balance(conf_t *conf, r10bio_t *r10_bio) 521static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
509{ 522{
510 const sector_t this_sector = r10_bio->sector; 523 const sector_t this_sector = r10_bio->sector;
511 int disk, slot; 524 int disk, slot;
512 const int sectors = r10_bio->sectors; 525 int sectors = r10_bio->sectors;
526 int best_good_sectors;
513 sector_t new_distance, best_dist; 527 sector_t new_distance, best_dist;
514 mdk_rdev_t *rdev; 528 mdk_rdev_t *rdev;
515 int do_balance; 529 int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
518 raid10_find_phys(conf, r10_bio); 532 raid10_find_phys(conf, r10_bio);
519 rcu_read_lock(); 533 rcu_read_lock();
520retry: 534retry:
535 sectors = r10_bio->sectors;
521 best_slot = -1; 536 best_slot = -1;
522 best_dist = MaxSector; 537 best_dist = MaxSector;
538 best_good_sectors = 0;
523 do_balance = 1; 539 do_balance = 1;
524 /* 540 /*
525 * Check if we can balance. We can balance on the whole 541 * Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
532 do_balance = 0; 548 do_balance = 0;
533 549
534 for (slot = 0; slot < conf->copies ; slot++) { 550 for (slot = 0; slot < conf->copies ; slot++) {
551 sector_t first_bad;
552 int bad_sectors;
553 sector_t dev_sector;
554
535 if (r10_bio->devs[slot].bio == IO_BLOCKED) 555 if (r10_bio->devs[slot].bio == IO_BLOCKED)
536 continue; 556 continue;
537 disk = r10_bio->devs[slot].devnum; 557 disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
541 if (!test_bit(In_sync, &rdev->flags)) 561 if (!test_bit(In_sync, &rdev->flags))
542 continue; 562 continue;
543 563
564 dev_sector = r10_bio->devs[slot].addr;
565 if (is_badblock(rdev, dev_sector, sectors,
566 &first_bad, &bad_sectors)) {
567 if (best_dist < MaxSector)
568 /* Already have a better slot */
569 continue;
570 if (first_bad <= dev_sector) {
571 /* Cannot read here. If this is the
572 * 'primary' device, then we must not read
573 * beyond 'bad_sectors' from another device.
574 */
575 bad_sectors -= (dev_sector - first_bad);
576 if (!do_balance && sectors > bad_sectors)
577 sectors = bad_sectors;
578 if (best_good_sectors > sectors)
579 best_good_sectors = sectors;
580 } else {
581 sector_t good_sectors =
582 first_bad - dev_sector;
583 if (good_sectors > best_good_sectors) {
584 best_good_sectors = good_sectors;
585 best_slot = slot;
586 }
587 if (!do_balance)
588 /* Must read from here */
589 break;
590 }
591 continue;
592 } else
593 best_good_sectors = sectors;
594
544 if (!do_balance) 595 if (!do_balance)
545 break; 596 break;
546 597
@@ -582,6 +633,7 @@ retry:
582 } else 633 } else
583 disk = -1; 634 disk = -1;
584 rcu_read_unlock(); 635 rcu_read_unlock();
636 *max_sectors = best_good_sectors;
585 637
586 return disk; 638 return disk;
587} 639}
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
829 r10_bio->sector = bio->bi_sector; 881 r10_bio->sector = bio->bi_sector;
830 r10_bio->state = 0; 882 r10_bio->state = 0;
831 883
884 /* We might need to issue multiple reads to different
885 * devices if there are bad blocks around, so we keep
886 * track of the number of reads in bio->bi_phys_segments.
887 * If this is 0, there is only one r10_bio and no locking
888 * will be needed when the request completes. If it is
889 * non-zero, then it is the number of not-completed requests.
890 */
891 bio->bi_phys_segments = 0;
892 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
893
832 if (rw == READ) { 894 if (rw == READ) {
833 /* 895 /*
834 * read balancing logic: 896 * read balancing logic:
835 */ 897 */
836 int disk = read_balance(conf, r10_bio); 898 int max_sectors;
837 int slot = r10_bio->read_slot; 899 int disk;
900 int slot;
901
902read_again:
903 disk = read_balance(conf, r10_bio, &max_sectors);
904 slot = r10_bio->read_slot;
838 if (disk < 0) { 905 if (disk < 0) {
839 raid_end_bio_io(r10_bio); 906 raid_end_bio_io(r10_bio);
840 return 0; 907 return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
842 mirror = conf->mirrors + disk; 909 mirror = conf->mirrors + disk;
843 910
844 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 911 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
912 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
913 max_sectors);
845 914
846 r10_bio->devs[slot].bio = read_bio; 915 r10_bio->devs[slot].bio = read_bio;
847 916
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
852 read_bio->bi_rw = READ | do_sync; 921 read_bio->bi_rw = READ | do_sync;
853 read_bio->bi_private = r10_bio; 922 read_bio->bi_private = r10_bio;
854 923
855 generic_make_request(read_bio); 924 if (max_sectors < r10_bio->sectors) {
925 /* Could not read all from this device, so we will
926 * need another r10_bio.
927 */
928 int sectors_handled;
929
930 sectors_handled = (r10_bio->sectors + max_sectors
931 - bio->bi_sector);
932 r10_bio->sectors = max_sectors;
933 spin_lock_irq(&conf->device_lock);
934 if (bio->bi_phys_segments == 0)
935 bio->bi_phys_segments = 2;
936 else
937 bio->bi_phys_segments++;
938 spin_unlock(&conf->device_lock);
939 /* Cannot call generic_make_request directly
940 * as that will be queued in __generic_make_request
941 * and subsequent mempool_alloc might block
942 * waiting for it. so hand bio over to raid10d.
943 */
944 reschedule_retry(r10_bio);
945
946 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
947
948 r10_bio->master_bio = bio;
949 r10_bio->sectors = ((bio->bi_size >> 9)
950 - sectors_handled);
951 r10_bio->state = 0;
952 r10_bio->mddev = mddev;
953 r10_bio->sector = bio->bi_sector + sectors_handled;
954 goto read_again;
955 } else
956 generic_make_request(read_bio);
856 return 0; 957 return 0;
857 } 958 }
858 959
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
1627 mdk_rdev_t *rdev; 1728 mdk_rdev_t *rdev;
1628 char b[BDEVNAME_SIZE]; 1729 char b[BDEVNAME_SIZE];
1629 unsigned long do_sync; 1730 unsigned long do_sync;
1731 int max_sectors;
1630 1732
1631 /* we got a read error. Maybe the drive is bad. Maybe just 1733 /* we got a read error. Maybe the drive is bad. Maybe just
1632 * the block and we can fix it. 1734 * the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
1646 bio = r10_bio->devs[slot].bio; 1748 bio = r10_bio->devs[slot].bio;
1647 r10_bio->devs[slot].bio = 1749 r10_bio->devs[slot].bio =
1648 mddev->ro ? IO_BLOCKED : NULL; 1750 mddev->ro ? IO_BLOCKED : NULL;
1649 mirror = read_balance(conf, r10_bio); 1751 mirror = read_balance(conf, r10_bio, &max_sectors);
1650 if (mirror == -1) { 1752 if (mirror == -1 || max_sectors < r10_bio->sectors) {
1651 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 1753 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1652 " read error for block %llu\n", 1754 " read error for block %llu\n",
1653 mdname(mddev), 1755 mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
1712 sync_request_write(mddev, r10_bio); 1814 sync_request_write(mddev, r10_bio);
1713 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 1815 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1714 recovery_request_write(mddev, r10_bio); 1816 recovery_request_write(mddev, r10_bio);
1715 else 1817 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
1716 handle_read_error(mddev, r10_bio); 1818 handle_read_error(mddev, r10_bio);
1819 else {
1820 /* just a partial read to be scheduled from a
1821 * separate context
1822 */
1823 int slot = r10_bio->read_slot;
1824 generic_make_request(r10_bio->devs[slot].bio);
1825 }
1717 1826
1718 cond_resched(); 1827 cond_resched();
1719 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 1828 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index a485914c48c1..c646152ba4e4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -124,4 +124,8 @@ struct r10bio_s {
124#define R10BIO_IsSync 1 124#define R10BIO_IsSync 1
125#define R10BIO_IsRecover 2 125#define R10BIO_IsRecover 2
126#define R10BIO_Degraded 3 126#define R10BIO_Degraded 3
127/* Set ReadError on bios that experience a read error
128 * so that raid10d knows what to do with them.
129 */
130#define R10BIO_ReadError 4
127#endif 131#endif