diff options
-rw-r--r-- | drivers/md/raid10.c | 141 | ||||
-rw-r--r-- | drivers/md/raid10.h | 4 |
2 files changed, 129 insertions, 16 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f1b749c21717..872bf948f33a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
191 | { | 191 | { |
192 | conf_t *conf = r10_bio->mddev->private; | 192 | conf_t *conf = r10_bio->mddev->private; |
193 | 193 | ||
194 | /* | ||
195 | * Wake up any possible resync thread that waits for the device | ||
196 | * to go idle. | ||
197 | */ | ||
198 | allow_barrier(conf); | ||
199 | |||
200 | put_all_bios(conf, r10_bio); | 194 | put_all_bios(conf, r10_bio); |
201 | mempool_free(r10_bio, conf->r10bio_pool); | 195 | mempool_free(r10_bio, conf->r10bio_pool); |
202 | } | 196 | } |
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
235 | static void raid_end_bio_io(r10bio_t *r10_bio) | 229 | static void raid_end_bio_io(r10bio_t *r10_bio) |
236 | { | 230 | { |
237 | struct bio *bio = r10_bio->master_bio; | 231 | struct bio *bio = r10_bio->master_bio; |
232 | int done; | ||
233 | conf_t *conf = r10_bio->mddev->private; | ||
238 | 234 | ||
239 | bio_endio(bio, | 235 | if (bio->bi_phys_segments) { |
240 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | 236 | unsigned long flags; |
237 | spin_lock_irqsave(&conf->device_lock, flags); | ||
238 | bio->bi_phys_segments--; | ||
239 | done = (bio->bi_phys_segments == 0); | ||
240 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
241 | } else | ||
242 | done = 1; | ||
243 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
244 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
245 | if (done) { | ||
246 | bio_endio(bio, 0); | ||
247 | /* | ||
248 | * Wake up any possible resync thread that waits for the device | ||
249 | * to go idle. | ||
250 | */ | ||
251 | allow_barrier(conf); | ||
252 | } | ||
241 | free_r10bio(r10_bio); | 253 | free_r10bio(r10_bio); |
242 | } | 254 | } |
243 | 255 | ||
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
307 | mdname(conf->mddev), | 319 | mdname(conf->mddev), |
308 | bdevname(conf->mirrors[dev].rdev->bdev, b), | 320 | bdevname(conf->mirrors[dev].rdev->bdev, b), |
309 | (unsigned long long)r10_bio->sector); | 321 | (unsigned long long)r10_bio->sector); |
322 | set_bit(R10BIO_ReadError, &r10_bio->state); | ||
310 | reschedule_retry(r10_bio); | 323 | reschedule_retry(r10_bio); |
311 | } | 324 | } |
312 | } | 325 | } |
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
505 | * FIXME: possibly should rethink readbalancing and do it differently | 518 | * FIXME: possibly should rethink readbalancing and do it differently |
506 | * depending on near_copies / far_copies geometry. | 519 | * depending on near_copies / far_copies geometry. |
507 | */ | 520 | */ |
508 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 521 | static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) |
509 | { | 522 | { |
510 | const sector_t this_sector = r10_bio->sector; | 523 | const sector_t this_sector = r10_bio->sector; |
511 | int disk, slot; | 524 | int disk, slot; |
512 | const int sectors = r10_bio->sectors; | 525 | int sectors = r10_bio->sectors; |
526 | int best_good_sectors; | ||
513 | sector_t new_distance, best_dist; | 527 | sector_t new_distance, best_dist; |
514 | mdk_rdev_t *rdev; | 528 | mdk_rdev_t *rdev; |
515 | int do_balance; | 529 | int do_balance; |
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
518 | raid10_find_phys(conf, r10_bio); | 532 | raid10_find_phys(conf, r10_bio); |
519 | rcu_read_lock(); | 533 | rcu_read_lock(); |
520 | retry: | 534 | retry: |
535 | sectors = r10_bio->sectors; | ||
521 | best_slot = -1; | 536 | best_slot = -1; |
522 | best_dist = MaxSector; | 537 | best_dist = MaxSector; |
538 | best_good_sectors = 0; | ||
523 | do_balance = 1; | 539 | do_balance = 1; |
524 | /* | 540 | /* |
525 | * Check if we can balance. We can balance on the whole | 541 | * Check if we can balance. We can balance on the whole |
@@ -532,6 +548,10 @@ retry: | |||
532 | do_balance = 0; | 548 | do_balance = 0; |
533 | 549 | ||
534 | for (slot = 0; slot < conf->copies ; slot++) { | 550 | for (slot = 0; slot < conf->copies ; slot++) { |
551 | sector_t first_bad; | ||
552 | int bad_sectors; | ||
553 | sector_t dev_sector; | ||
554 | |||
535 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 555 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
536 | continue; | 556 | continue; |
537 | disk = r10_bio->devs[slot].devnum; | 557 | disk = r10_bio->devs[slot].devnum; |
@@ -541,6 +561,37 @@ retry: | |||
541 | if (!test_bit(In_sync, &rdev->flags)) | 561 | if (!test_bit(In_sync, &rdev->flags)) |
542 | continue; | 562 | continue; |
543 | 563 | ||
564 | dev_sector = r10_bio->devs[slot].addr; | ||
565 | if (is_badblock(rdev, dev_sector, sectors, | ||
566 | &first_bad, &bad_sectors)) { | ||
567 | if (best_dist < MaxSector) | ||
568 | /* Already have a better slot */ | ||
569 | continue; | ||
570 | if (first_bad <= dev_sector) { | ||
571 | /* Cannot read here. If this is the | ||
572 | * 'primary' device, then we must not read | ||
573 | * beyond 'bad_sectors' from another device. | ||
574 | */ | ||
575 | bad_sectors -= (dev_sector - first_bad); | ||
576 | if (!do_balance && sectors > bad_sectors) | ||
577 | sectors = bad_sectors; | ||
578 | if (best_good_sectors > sectors) | ||
579 | best_good_sectors = sectors; | ||
580 | } else { | ||
581 | sector_t good_sectors = | ||
582 | first_bad - dev_sector; | ||
583 | if (good_sectors > best_good_sectors) { | ||
584 | best_good_sectors = good_sectors; | ||
585 | best_slot = slot; | ||
586 | } | ||
587 | if (!do_balance) | ||
588 | /* Must read from here */ | ||
589 | break; | ||
590 | } | ||
591 | continue; | ||
592 | } else | ||
593 | best_good_sectors = sectors; | ||
594 | |||
544 | if (!do_balance) | 595 | if (!do_balance) |
545 | break; | 596 | break; |
546 | 597 | ||
@@ -582,6 +633,7 @@ retry: | |||
582 | } else | 633 | } else |
583 | disk = -1; | 634 | disk = -1; |
584 | rcu_read_unlock(); | 635 | rcu_read_unlock(); |
636 | *max_sectors = best_good_sectors; | ||
585 | 637 | ||
586 | return disk; | 638 | return disk; |
587 | } | 639 | } |
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
829 | r10_bio->sector = bio->bi_sector; | 881 | r10_bio->sector = bio->bi_sector; |
830 | r10_bio->state = 0; | 882 | r10_bio->state = 0; |
831 | 883 | ||
884 | /* We might need to issue multiple reads to different | ||
885 | * devices if there are bad blocks around, so we keep | ||
886 | * track of the number of reads in bio->bi_phys_segments. | ||
887 | * If this is 0, there is only one r10_bio and no locking | ||
888 | * will be needed when the request completes. If it is | ||
889 | * non-zero, then it is the number of not-completed requests. | ||
890 | */ | ||
891 | bio->bi_phys_segments = 0; | ||
892 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
893 | |||
832 | if (rw == READ) { | 894 | if (rw == READ) { |
833 | /* | 895 | /* |
834 | * read balancing logic: | 896 | * read balancing logic: |
835 | */ | 897 | */ |
836 | int disk = read_balance(conf, r10_bio); | 898 | int max_sectors; |
837 | int slot = r10_bio->read_slot; | 899 | int disk; |
900 | int slot; | ||
901 | |||
902 | read_again: | ||
903 | disk = read_balance(conf, r10_bio, &max_sectors); | ||
904 | slot = r10_bio->read_slot; | ||
838 | if (disk < 0) { | 905 | if (disk < 0) { |
839 | raid_end_bio_io(r10_bio); | 906 | raid_end_bio_io(r10_bio); |
840 | return 0; | 907 | return 0; |
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
842 | mirror = conf->mirrors + disk; | 909 | mirror = conf->mirrors + disk; |
843 | 910 | ||
844 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 911 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
912 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | ||
913 | max_sectors); | ||
845 | 914 | ||
846 | r10_bio->devs[slot].bio = read_bio; | 915 | r10_bio->devs[slot].bio = read_bio; |
847 | 916 | ||
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
852 | read_bio->bi_rw = READ | do_sync; | 921 | read_bio->bi_rw = READ | do_sync; |
853 | read_bio->bi_private = r10_bio; | 922 | read_bio->bi_private = r10_bio; |
854 | 923 | ||
855 | generic_make_request(read_bio); | 924 | if (max_sectors < r10_bio->sectors) { |
925 | /* Could not read all from this device, so we will | ||
926 | * need another r10_bio. | ||
927 | */ | ||
928 | int sectors_handled; | ||
929 | |||
930 | sectors_handled = (r10_bio->sector + max_sectors | ||
931 | - bio->bi_sector); | ||
932 | r10_bio->sectors = max_sectors; | ||
933 | spin_lock_irq(&conf->device_lock); | ||
934 | if (bio->bi_phys_segments == 0) | ||
935 | bio->bi_phys_segments = 2; | ||
936 | else | ||
937 | bio->bi_phys_segments++; | ||
938 | spin_unlock_irq(&conf->device_lock); | ||
939 | /* Cannot call generic_make_request directly | ||
940 | * as that will be queued in __generic_make_request | ||
941 | * and subsequent mempool_alloc might block | ||
942 | * waiting for it, so hand bio over to raid10d. | ||
943 | */ | ||
944 | reschedule_retry(r10_bio); | ||
945 | |||
946 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
947 | |||
948 | r10_bio->master_bio = bio; | ||
949 | r10_bio->sectors = ((bio->bi_size >> 9) | ||
950 | - sectors_handled); | ||
951 | r10_bio->state = 0; | ||
952 | r10_bio->mddev = mddev; | ||
953 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
954 | goto read_again; | ||
955 | } else | ||
956 | generic_make_request(read_bio); | ||
856 | return 0; | 957 | return 0; |
857 | } | 958 | } |
858 | 959 | ||
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | |||
1627 | mdk_rdev_t *rdev; | 1728 | mdk_rdev_t *rdev; |
1628 | char b[BDEVNAME_SIZE]; | 1729 | char b[BDEVNAME_SIZE]; |
1629 | unsigned long do_sync; | 1730 | unsigned long do_sync; |
1731 | int max_sectors; | ||
1630 | 1732 | ||
1631 | /* we got a read error. Maybe the drive is bad. Maybe just | 1733 | /* we got a read error. Maybe the drive is bad. Maybe just |
1632 | * the block and we can fix it. | 1734 | * the block and we can fix it. |
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | |||
1646 | bio = r10_bio->devs[slot].bio; | 1748 | bio = r10_bio->devs[slot].bio; |
1647 | r10_bio->devs[slot].bio = | 1749 | r10_bio->devs[slot].bio = |
1648 | mddev->ro ? IO_BLOCKED : NULL; | 1750 | mddev->ro ? IO_BLOCKED : NULL; |
1649 | mirror = read_balance(conf, r10_bio); | 1751 | mirror = read_balance(conf, r10_bio, &max_sectors); |
1650 | if (mirror == -1) { | 1752 | if (mirror == -1 || max_sectors < r10_bio->sectors) { |
1651 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | 1753 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" |
1652 | " read error for block %llu\n", | 1754 | " read error for block %llu\n", |
1653 | mdname(mddev), | 1755 | mdname(mddev), |
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev) | |||
1712 | sync_request_write(mddev, r10_bio); | 1814 | sync_request_write(mddev, r10_bio); |
1713 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 1815 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1714 | recovery_request_write(mddev, r10_bio); | 1816 | recovery_request_write(mddev, r10_bio); |
1715 | else | 1817 | else if (test_bit(R10BIO_ReadError, &r10_bio->state)) |
1716 | handle_read_error(mddev, r10_bio); | 1818 | handle_read_error(mddev, r10_bio); |
1819 | else { | ||
1820 | /* just a partial read to be scheduled from a | ||
1821 | * separate context | ||
1822 | */ | ||
1823 | int slot = r10_bio->read_slot; | ||
1824 | generic_make_request(r10_bio->devs[slot].bio); | ||
1825 | } | ||
1717 | 1826 | ||
1718 | cond_resched(); | 1827 | cond_resched(); |
1719 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 1828 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index a485914c48c1..c646152ba4e4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -124,4 +124,8 @@ struct r10bio_s { | |||
124 | #define R10BIO_IsSync 1 | 124 | #define R10BIO_IsSync 1 |
125 | #define R10BIO_IsRecover 2 | 125 | #define R10BIO_IsRecover 2 |
126 | #define R10BIO_Degraded 3 | 126 | #define R10BIO_Degraded 3 |
127 | /* Set ReadError on bios that experience a read error | ||
128 | * so that raid10d knows what to do with them. | ||
129 | */ | ||
130 | #define R10BIO_ReadError 4 | ||
127 | #endif | 131 | #endif |