diff options
author | NeilBrown <neilb@suse.de> | 2011-07-27 21:39:23 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2011-07-27 21:39:23 -0400 |
commit | 856e08e23762dfb92ffc68fd0a8d228f9e152160 (patch) | |
tree | fa9977a39da542eebb2129712703c11009a56ff2 /drivers/md | |
parent | 560f8e5532d63a314271bfb99d3d1d53c938ed14 (diff) |
md/raid10: avoid reading from known bad blocks - part 1
This patch just covers the basic read path:
1/ read_balance needs to check for bad blocks, and return not only
the chosen slot, but also how many good blocks are available
there.
2/ read submission must be ready to issue multiple reads to
different devices, as different bad blocks on different devices
could mean that a single large read cannot be served by any one
device, but can still be served by the array.
This requires keeping count of the number of outstanding requests
per bio. This count is stored in 'bi_phys_segments'.
On read error we currently just fail the request if another target
cannot handle the whole request. The next patch refines that a bit.
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/raid10.c | 141 | ||||
-rw-r--r-- | drivers/md/raid10.h | 4 |
2 files changed, 129 insertions, 16 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f1b749c21717..872bf948f33a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
191 | { | 191 | { |
192 | conf_t *conf = r10_bio->mddev->private; | 192 | conf_t *conf = r10_bio->mddev->private; |
193 | 193 | ||
194 | /* | ||
195 | * Wake up any possible resync thread that waits for the device | ||
196 | * to go idle. | ||
197 | */ | ||
198 | allow_barrier(conf); | ||
199 | |||
200 | put_all_bios(conf, r10_bio); | 194 | put_all_bios(conf, r10_bio); |
201 | mempool_free(r10_bio, conf->r10bio_pool); | 195 | mempool_free(r10_bio, conf->r10bio_pool); |
202 | } | 196 | } |
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
235 | static void raid_end_bio_io(r10bio_t *r10_bio) | 229 | static void raid_end_bio_io(r10bio_t *r10_bio) |
236 | { | 230 | { |
237 | struct bio *bio = r10_bio->master_bio; | 231 | struct bio *bio = r10_bio->master_bio; |
232 | int done; | ||
233 | conf_t *conf = r10_bio->mddev->private; | ||
238 | 234 | ||
239 | bio_endio(bio, | 235 | if (bio->bi_phys_segments) { |
240 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | 236 | unsigned long flags; |
237 | spin_lock_irqsave(&conf->device_lock, flags); | ||
238 | bio->bi_phys_segments--; | ||
239 | done = (bio->bi_phys_segments == 0); | ||
240 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
241 | } else | ||
242 | done = 1; | ||
243 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
244 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
245 | if (done) { | ||
246 | bio_endio(bio, 0); | ||
247 | /* | ||
248 | * Wake up any possible resync thread that waits for the device | ||
249 | * to go idle. | ||
250 | */ | ||
251 | allow_barrier(conf); | ||
252 | } | ||
241 | free_r10bio(r10_bio); | 253 | free_r10bio(r10_bio); |
242 | } | 254 | } |
243 | 255 | ||
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
307 | mdname(conf->mddev), | 319 | mdname(conf->mddev), |
308 | bdevname(conf->mirrors[dev].rdev->bdev, b), | 320 | bdevname(conf->mirrors[dev].rdev->bdev, b), |
309 | (unsigned long long)r10_bio->sector); | 321 | (unsigned long long)r10_bio->sector); |
322 | set_bit(R10BIO_ReadError, &r10_bio->state); | ||
310 | reschedule_retry(r10_bio); | 323 | reschedule_retry(r10_bio); |
311 | } | 324 | } |
312 | } | 325 | } |
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
505 | * FIXME: possibly should rethink readbalancing and do it differently | 518 | * FIXME: possibly should rethink readbalancing and do it differently |
506 | * depending on near_copies / far_copies geometry. | 519 | * depending on near_copies / far_copies geometry. |
507 | */ | 520 | */ |
508 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 521 | static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) |
509 | { | 522 | { |
510 | const sector_t this_sector = r10_bio->sector; | 523 | const sector_t this_sector = r10_bio->sector; |
511 | int disk, slot; | 524 | int disk, slot; |
512 | const int sectors = r10_bio->sectors; | 525 | int sectors = r10_bio->sectors; |
526 | int best_good_sectors; | ||
513 | sector_t new_distance, best_dist; | 527 | sector_t new_distance, best_dist; |
514 | mdk_rdev_t *rdev; | 528 | mdk_rdev_t *rdev; |
515 | int do_balance; | 529 | int do_balance; |
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
518 | raid10_find_phys(conf, r10_bio); | 532 | raid10_find_phys(conf, r10_bio); |
519 | rcu_read_lock(); | 533 | rcu_read_lock(); |
520 | retry: | 534 | retry: |
535 | sectors = r10_bio->sectors; | ||
521 | best_slot = -1; | 536 | best_slot = -1; |
522 | best_dist = MaxSector; | 537 | best_dist = MaxSector; |
538 | best_good_sectors = 0; | ||
523 | do_balance = 1; | 539 | do_balance = 1; |
524 | /* | 540 | /* |
525 | * Check if we can balance. We can balance on the whole | 541 | * Check if we can balance. We can balance on the whole |
@@ -532,6 +548,10 @@ retry: | |||
532 | do_balance = 0; | 548 | do_balance = 0; |
533 | 549 | ||
534 | for (slot = 0; slot < conf->copies ; slot++) { | 550 | for (slot = 0; slot < conf->copies ; slot++) { |
551 | sector_t first_bad; | ||
552 | int bad_sectors; | ||
553 | sector_t dev_sector; | ||
554 | |||
535 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 555 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
536 | continue; | 556 | continue; |
537 | disk = r10_bio->devs[slot].devnum; | 557 | disk = r10_bio->devs[slot].devnum; |
@@ -541,6 +561,37 @@ retry: | |||
541 | if (!test_bit(In_sync, &rdev->flags)) | 561 | if (!test_bit(In_sync, &rdev->flags)) |
542 | continue; | 562 | continue; |
543 | 563 | ||
564 | dev_sector = r10_bio->devs[slot].addr; | ||
565 | if (is_badblock(rdev, dev_sector, sectors, | ||
566 | &first_bad, &bad_sectors)) { | ||
567 | if (best_dist < MaxSector) | ||
568 | /* Already have a better slot */ | ||
569 | continue; | ||
570 | if (first_bad <= dev_sector) { | ||
571 | /* Cannot read here. If this is the | ||
572 | * 'primary' device, then we must not read | ||
573 | * beyond 'bad_sectors' from another device. | ||
574 | */ | ||
575 | bad_sectors -= (dev_sector - first_bad); | ||
576 | if (!do_balance && sectors > bad_sectors) | ||
577 | sectors = bad_sectors; | ||
578 | if (best_good_sectors > sectors) | ||
579 | best_good_sectors = sectors; | ||
580 | } else { | ||
581 | sector_t good_sectors = | ||
582 | first_bad - dev_sector; | ||
583 | if (good_sectors > best_good_sectors) { | ||
584 | best_good_sectors = good_sectors; | ||
585 | best_slot = slot; | ||
586 | } | ||
587 | if (!do_balance) | ||
588 | /* Must read from here */ | ||
589 | break; | ||
590 | } | ||
591 | continue; | ||
592 | } else | ||
593 | best_good_sectors = sectors; | ||
594 | |||
544 | if (!do_balance) | 595 | if (!do_balance) |
545 | break; | 596 | break; |
546 | 597 | ||
@@ -582,6 +633,7 @@ retry: | |||
582 | } else | 633 | } else |
583 | disk = -1; | 634 | disk = -1; |
584 | rcu_read_unlock(); | 635 | rcu_read_unlock(); |
636 | *max_sectors = best_good_sectors; | ||
585 | 637 | ||
586 | return disk; | 638 | return disk; |
587 | } | 639 | } |
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
829 | r10_bio->sector = bio->bi_sector; | 881 | r10_bio->sector = bio->bi_sector; |
830 | r10_bio->state = 0; | 882 | r10_bio->state = 0; |
831 | 883 | ||
884 | /* We might need to issue multiple reads to different | ||
885 | * devices if there are bad blocks around, so we keep | ||
886 | * track of the number of reads in bio->bi_phys_segments. | ||
887 | * If this is 0, there is only one r10_bio and no locking | ||
888 | * will be needed when the request completes. If it is | ||
889 | * non-zero, then it is the number of not-completed requests. | ||
890 | */ | ||
891 | bio->bi_phys_segments = 0; | ||
892 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
893 | |||
832 | if (rw == READ) { | 894 | if (rw == READ) { |
833 | /* | 895 | /* |
834 | * read balancing logic: | 896 | * read balancing logic: |
835 | */ | 897 | */ |
836 | int disk = read_balance(conf, r10_bio); | 898 | int max_sectors; |
837 | int slot = r10_bio->read_slot; | 899 | int disk; |
900 | int slot; | ||
901 | |||
902 | read_again: | ||
903 | disk = read_balance(conf, r10_bio, &max_sectors); | ||
904 | slot = r10_bio->read_slot; | ||
838 | if (disk < 0) { | 905 | if (disk < 0) { |
839 | raid_end_bio_io(r10_bio); | 906 | raid_end_bio_io(r10_bio); |
840 | return 0; | 907 | return 0; |
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
842 | mirror = conf->mirrors + disk; | 909 | mirror = conf->mirrors + disk; |
843 | 910 | ||
844 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 911 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
912 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | ||
913 | max_sectors); | ||
845 | 914 | ||
846 | r10_bio->devs[slot].bio = read_bio; | 915 | r10_bio->devs[slot].bio = read_bio; |
847 | 916 | ||
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
852 | read_bio->bi_rw = READ | do_sync; | 921 | read_bio->bi_rw = READ | do_sync; |
853 | read_bio->bi_private = r10_bio; | 922 | read_bio->bi_private = r10_bio; |
854 | 923 | ||
855 | generic_make_request(read_bio); | 924 | if (max_sectors < r10_bio->sectors) { |
925 | /* Could not read all from this device, so we will | ||
926 | * need another r10_bio. | ||
927 | */ | ||
928 | int sectors_handled; | ||
929 | |||
930 | sectors_handled = (r10_bio->sectors + max_sectors | ||
931 | - bio->bi_sector); | ||
932 | r10_bio->sectors = max_sectors; | ||
933 | spin_lock_irq(&conf->device_lock); | ||
934 | if (bio->bi_phys_segments == 0) | ||
935 | bio->bi_phys_segments = 2; | ||
936 | else | ||
937 | bio->bi_phys_segments++; | ||
938 | spin_unlock(&conf->device_lock); | ||
939 | /* Cannot call generic_make_request directly | ||
940 | * as that will be queued in __generic_make_request | ||
941 | * and subsequent mempool_alloc might block | ||
942 | * waiting for it. so hand bio over to raid10d. | ||
943 | */ | ||
944 | reschedule_retry(r10_bio); | ||
945 | |||
946 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
947 | |||
948 | r10_bio->master_bio = bio; | ||
949 | r10_bio->sectors = ((bio->bi_size >> 9) | ||
950 | - sectors_handled); | ||
951 | r10_bio->state = 0; | ||
952 | r10_bio->mddev = mddev; | ||
953 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
954 | goto read_again; | ||
955 | } else | ||
956 | generic_make_request(read_bio); | ||
856 | return 0; | 957 | return 0; |
857 | } | 958 | } |
858 | 959 | ||
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | |||
1627 | mdk_rdev_t *rdev; | 1728 | mdk_rdev_t *rdev; |
1628 | char b[BDEVNAME_SIZE]; | 1729 | char b[BDEVNAME_SIZE]; |
1629 | unsigned long do_sync; | 1730 | unsigned long do_sync; |
1731 | int max_sectors; | ||
1630 | 1732 | ||
1631 | /* we got a read error. Maybe the drive is bad. Maybe just | 1733 | /* we got a read error. Maybe the drive is bad. Maybe just |
1632 | * the block and we can fix it. | 1734 | * the block and we can fix it. |
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | |||
1646 | bio = r10_bio->devs[slot].bio; | 1748 | bio = r10_bio->devs[slot].bio; |
1647 | r10_bio->devs[slot].bio = | 1749 | r10_bio->devs[slot].bio = |
1648 | mddev->ro ? IO_BLOCKED : NULL; | 1750 | mddev->ro ? IO_BLOCKED : NULL; |
1649 | mirror = read_balance(conf, r10_bio); | 1751 | mirror = read_balance(conf, r10_bio, &max_sectors); |
1650 | if (mirror == -1) { | 1752 | if (mirror == -1 || max_sectors < r10_bio->sectors) { |
1651 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | 1753 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" |
1652 | " read error for block %llu\n", | 1754 | " read error for block %llu\n", |
1653 | mdname(mddev), | 1755 | mdname(mddev), |
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev) | |||
1712 | sync_request_write(mddev, r10_bio); | 1814 | sync_request_write(mddev, r10_bio); |
1713 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 1815 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1714 | recovery_request_write(mddev, r10_bio); | 1816 | recovery_request_write(mddev, r10_bio); |
1715 | else | 1817 | else if (test_bit(R10BIO_ReadError, &r10_bio->state)) |
1716 | handle_read_error(mddev, r10_bio); | 1818 | handle_read_error(mddev, r10_bio); |
1819 | else { | ||
1820 | /* just a partial read to be scheduled from a | ||
1821 | * separate context | ||
1822 | */ | ||
1823 | int slot = r10_bio->read_slot; | ||
1824 | generic_make_request(r10_bio->devs[slot].bio); | ||
1825 | } | ||
1717 | 1826 | ||
1718 | cond_resched(); | 1827 | cond_resched(); |
1719 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 1828 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index a485914c48c1..c646152ba4e4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -124,4 +124,8 @@ struct r10bio_s { | |||
124 | #define R10BIO_IsSync 1 | 124 | #define R10BIO_IsSync 1 |
125 | #define R10BIO_IsRecover 2 | 125 | #define R10BIO_IsRecover 2 |
126 | #define R10BIO_Degraded 3 | 126 | #define R10BIO_Degraded 3 |
127 | /* Set ReadError on bios that experience a read error | ||
128 | * so that raid10d knows what to do with them. | ||
129 | */ | ||
130 | #define R10BIO_ReadError 4 | ||
127 | #endif | 131 | #endif |