-rw-r--r--  drivers/md/raid10.c  141
-rw-r--r--  drivers/md/raid10.h    4
2 files changed, 129 insertions, 16 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f1b749c21717..872bf948f33a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
 {
 	conf_t *conf = r10_bio->mddev->private;
 
-	/*
-	 * Wake up any possible resync thread that waits for the device
-	 * to go idle.
-	 */
-	allow_barrier(conf);
-
 	put_all_bios(conf, r10_bio);
 	mempool_free(r10_bio, conf->r10bio_pool);
 }
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
 static void raid_end_bio_io(r10bio_t *r10_bio)
 {
 	struct bio *bio = r10_bio->master_bio;
+	int done;
+	conf_t *conf = r10_bio->mddev->private;
 
-	bio_endio(bio,
-		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
+	if (bio->bi_phys_segments) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio->bi_phys_segments--;
+		done = (bio->bi_phys_segments == 0);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	} else
+		done = 1;
+	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	if (done) {
+		bio_endio(bio, 0);
+		/*
+		 * Wake up any possible resync thread that waits for the device
+		 * to go idle.
+		 */
+		allow_barrier(conf);
+	}
 	free_r10bio(r10_bio);
 }
 
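
The rewritten raid_end_bio_io() above completes the master bio, and releases the resync barrier, only when the last outstanding sub-read finishes; bio->bi_phys_segments acts as the counter, with 0 meaning the common single-read case. A minimal userspace sketch of that decrement-and-test pattern, assuming a pthread mutex in place of conf->device_lock (struct request and sub_read_done are illustrative names, not kernel API):

	#include <pthread.h>
	#include <stdbool.h>

	struct request {
		pthread_mutex_t lock;
		int pending;	/* 0: single sub-read; >0: not-yet-finished sub-reads */
		bool uptodate;
	};

	/* Each finishing sub-read calls this; only the caller that drops
	 * the count to zero should complete the request as a whole. */
	static bool sub_read_done(struct request *req)
	{
		bool done;

		pthread_mutex_lock(&req->lock);
		if (req->pending) {
			req->pending--;
			done = (req->pending == 0);
		} else
			done = true;	/* single-read fast path */
		pthread_mutex_unlock(&req->lock);
		return done;
	}
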
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		       mdname(conf->mddev),
 		       bdevname(conf->mirrors[dev].rdev->bdev, b),
 		       (unsigned long long)r10_bio->sector);
+		set_bit(R10BIO_ReadError, &r10_bio->state);
 		reschedule_retry(r10_bio);
 	}
 }
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(conf_t *conf, r10bio_t *r10_bio)
+static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
 {
 	const sector_t this_sector = r10_bio->sector;
 	int disk, slot;
-	const int sectors = r10_bio->sectors;
+	int sectors = r10_bio->sectors;
+	int best_good_sectors;
 	sector_t new_distance, best_dist;
 	mdk_rdev_t *rdev;
 	int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
 retry:
+	sectors = r10_bio->sectors;
 	best_slot = -1;
 	best_dist = MaxSector;
+	best_good_sectors = 0;
 	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
 		do_balance = 0;
 
 	for (slot = 0; slot < conf->copies ; slot++) {
+		sector_t first_bad;
+		int bad_sectors;
+		sector_t dev_sector;
+
 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
 		if (!test_bit(In_sync, &rdev->flags))
 			continue;
 
+		dev_sector = r10_bio->devs[slot].addr;
+		if (is_badblock(rdev, dev_sector, sectors,
+				&first_bad, &bad_sectors)) {
+			if (best_dist < MaxSector)
+				/* Already have a better slot */
+				continue;
+			if (first_bad <= dev_sector) {
+				/* Cannot read here.  If this is the
+				 * 'primary' device, then we must not read
+				 * beyond 'bad_sectors' from another device.
+				 */
+				bad_sectors -= (dev_sector - first_bad);
+				if (!do_balance && sectors > bad_sectors)
+					sectors = bad_sectors;
+				if (best_good_sectors > sectors)
+					best_good_sectors = sectors;
+			} else {
+				sector_t good_sectors =
+					first_bad - dev_sector;
+				if (good_sectors > best_good_sectors) {
+					best_good_sectors = good_sectors;
+					best_slot = slot;
+				}
+				if (!do_balance)
+					/* Must read from here */
+					break;
+			}
+			continue;
+		} else
+			best_good_sectors = sectors;
+
 		if (!do_balance)
 			break;
 
@@ -582,6 +633,7 @@ retry:
 	} else
 		disk = -1;
 	rcu_read_unlock();
+	*max_sectors = best_good_sectors;
 
 	return disk;
 }
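
To make the new window arithmetic in read_balance() concrete: when a slot's bad range starts beyond the read position, only first_bad - dev_sector sectors are cleanly readable there, and that value competes for best_good_sectors. A standalone sketch under that interpretation (good_before_bad is an illustrative helper, not a kernel function):

	#include <assert.h>

	typedef unsigned long long sector_t;

	/* Sectors readable from dev_sector before hitting a bad range that
	 * starts at first_bad, for a read of "sectors" sectors. */
	static sector_t good_before_bad(sector_t dev_sector, sector_t sectors,
					sector_t first_bad)
	{
		if (first_bad <= dev_sector)
			return 0;		/* read starts at/inside the bad range */
		if (first_bad - dev_sector >= sectors)
			return sectors;		/* bad range lies beyond this read */
		return first_bad - dev_sector;	/* clean prefix only */
	}

	int main(void)
	{
		assert(good_before_bad(100, 64, 120) == 20);	/* partial window */
		assert(good_before_bad(100, 64, 100) == 0);	/* unreadable here */
		assert(good_before_bad(100, 64, 200) == 64);	/* fully readable */
		return 0;
	}
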
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	r10_bio->sector = bio->bi_sector;
 	r10_bio->state = 0;
 
+	/* We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r10_bio and no locking
+	 * will be needed when the request completes.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
 	if (rw == READ) {
 		/*
 		 * read balancing logic:
 		 */
-		int disk = read_balance(conf, r10_bio);
-		int slot = r10_bio->read_slot;
+		int max_sectors;
+		int disk;
+		int slot;
+
+read_again:
+		disk = read_balance(conf, r10_bio, &max_sectors);
+		slot = r10_bio->read_slot;
 		if (disk < 0) {
 			raid_end_bio_io(r10_bio);
 			return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mirror = conf->mirrors + disk;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
 
 		r10_bio->devs[slot].bio = read_bio;
 
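
The md_trim_bio() call restricts the clone, which initially covers the whole master bio, to the prefix this r10_bio will actually read: the offset is how far this r10_bio sits into the master bio, the size is max_sectors. A hedged sketch of just that bookkeeping on a simplified struct (the real helper also has to adjust the bio_vec array, which is omitted here):

	typedef unsigned long long sector_t;

	struct simple_bio {
		sector_t sector;	/* first sector covered */
		sector_t size;		/* length in sectors */
	};

	/* Keep only "size" sectors of *bio, starting "offset" sectors in. */
	static void trim_bio(struct simple_bio *bio, sector_t offset,
			     sector_t size)
	{
		bio->sector += offset;
		bio->size = size;
	}
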
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
 
-		generic_make_request(read_bio);
+		if (max_sectors < r10_bio->sectors) {
+			/* Could not read all from this device, so we will
+			 * need another r10_bio.
+			 */
+			int sectors_handled;
+
+			sectors_handled = (r10_bio->sectors + max_sectors
+					   - bio->bi_sector);
+			r10_bio->sectors = max_sectors;
+			spin_lock_irq(&conf->device_lock);
+			if (bio->bi_phys_segments == 0)
+				bio->bi_phys_segments = 2;
+			else
+				bio->bi_phys_segments++;
+			spin_unlock(&conf->device_lock);
+			/* Cannot call generic_make_request directly
+			 * as that will be queued in __generic_make_request
+			 * and subsequent mempool_alloc might block
+			 * waiting for it.  so hand bio over to raid10d.
+			 */
+			reschedule_retry(r10_bio);
+
+			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+			r10_bio->master_bio = bio;
+			r10_bio->sectors = ((bio->bi_size >> 9)
+					    - sectors_handled);
+			r10_bio->state = 0;
+			r10_bio->mddev = mddev;
+			r10_bio->sector = bio->bi_sector + sectors_handled;
+			goto read_again;
+		} else
+			generic_make_request(read_bio);
 		return 0;
 	}
 
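
The goto read_again loop above consumes the master bio in chunks: each pass reads as much as the chosen device can serve, then a fresh r10_bio picks up at bio->bi_sector + sectors_handled. A self-contained model of that loop, where max_readable() stands in for read_balance() and its max_sectors output (the hard-coded bad range starting at sector 120 is purely illustrative):

	#include <stdio.h>

	typedef unsigned long long sector_t;

	/* Stand-in for read_balance(): the first device cannot read past
	 * sector 120; a later pass would come from another mirror. */
	static sector_t max_readable(sector_t pos, sector_t want)
	{
		if (pos < 120 && pos + want > 120)
			return 120 - pos;	/* stop short of the bad range */
		return want;
	}

	int main(void)
	{
		sector_t pos = 100, remaining = 64;

		while (remaining) {		/* the "goto read_again" loop */
			sector_t chunk = max_readable(pos, remaining);
			printf("read %llu sectors at %llu\n", chunk, pos);
			pos += chunk;		/* pos advances by sectors_handled */
			remaining -= chunk;
		}
		return 0;
	}
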
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
 	mdk_rdev_t *rdev;
 	char b[BDEVNAME_SIZE];
 	unsigned long do_sync;
+	int max_sectors;
 
 	/* we got a read error. Maybe the drive is bad.  Maybe just
 	 * the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
 	bio = r10_bio->devs[slot].bio;
 	r10_bio->devs[slot].bio =
 		mddev->ro ? IO_BLOCKED : NULL;
-	mirror = read_balance(conf, r10_bio);
-	if (mirror == -1) {
+	mirror = read_balance(conf, r10_bio, &max_sectors);
+	if (mirror == -1 || max_sectors < r10_bio->sectors) {
 		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
 			sync_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-		else
+		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
 			handle_read_error(mddev, r10_bio);
+		else {
+			/* just a partial read to be scheduled from a
+			 * separate context
+			 */
+			int slot = r10_bio->read_slot;
+			generic_make_request(r10_bio->devs[slot].bio);
+		}
 
 		cond_resched();
 		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
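
The new raid10d branch is the receiving end of the hand-off made in make_request(): partial reads are queued via reschedule_retry() and submitted from the daemon's own context, since submitting directly from make_request() could block in mempool_alloc() waiting on a bio still queued in __generic_make_request. A minimal userspace sketch of that defer-to-daemon pattern, using a pthread condition variable for the wakeup (all names illustrative):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static int queued;	/* stands in for the conf->retry_list queue */

	/* Daemon thread: submits deferred reads from its own context,
	 * the way raid10d now does for partial reads. */
	static void *daemon_thread(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&lock);
		while (!queued)
			pthread_cond_wait(&cond, &lock);
		queued--;
		pthread_mutex_unlock(&lock);
		printf("daemon: submitting deferred read\n");
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, daemon_thread, NULL);

		/* make_request() context: queue and wake, do not submit. */
		pthread_mutex_lock(&lock);
		queued++;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);

		pthread_join(tid, NULL);
		return 0;
	}
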
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index a485914c48c1..c646152ba4e4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -124,4 +124,8 @@ struct r10bio_s {
 #define R10BIO_IsSync 1
 #define R10BIO_IsRecover 2
 #define R10BIO_Degraded 3
+/* Set ReadError on bios that experience a read error
+ * so that raid10d knows what to do with them.
+ */
+#define R10BIO_ReadError 4
 #endif