Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  164
1 files changed, 121 insertions, 43 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
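
The two markers added above are pointer-sized sentinels: the values 1 and 2 can
never be valid struct bio addresses, so BIO_SPECIAL() tells a "special" slot
apart from a real bio with one unsigned comparison (note it also matches NULL).
A minimal userspace sketch of the same pattern; the classify() helper and
main() are illustrative, not kernel code:

    #include <stdio.h>

    struct bio; /* opaque, as in the kernel */

    #define IO_BLOCKED   ((struct bio *)1)
    #define IO_MADE_GOOD ((struct bio *)2)
    #define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

    /* NULL (0), IO_BLOCKED (1) and IO_MADE_GOOD (2) all test as special;
     * any pointer to a real bio compares greater than 2. */
    static const char *classify(struct bio *bio)
    {
        if (bio == IO_BLOCKED)
            return "blocked: skip this device for this r1_bio";
        if (bio == IO_MADE_GOOD)
            return "made good: clear bad-block record from process context";
        if (!bio)
            return "no IO queued";
        return "real bio";
    }

    int main(void)
    {
        printf("%d %d\n", BIO_SPECIAL(IO_BLOCKED), BIO_SPECIAL(IO_MADE_GOOD));
        printf("%s\n", classify(IO_MADE_GOOD));
        return 0;
    }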
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int start_disk;
-	int best_disk;
-	int i;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
+	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync)) {
+	    (this_sector + sectors >= conf->next_resync))
 		choose_first = 1;
-		start_disk = 0;
-	} else {
+	else
 		choose_first = 0;
-		start_disk = conf->last_used;
-	}
 
-	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
-
-		int disk = start_disk + i;
-		if (disk >= conf->raid_disks * 2)
-			disk -= conf->raid_disks * 2;
+		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		if (choose_first) {
+			best_disk = disk;
+			break;
+		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
 			best_disk = disk;
 			break;
 		}
+
+		if (choose_next_idle)
+			continue;
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
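
Taken together, the rewritten loop is a three-tier policy: a sequential hit
stays on the same mirror (unless, on an SSD, the buffered run already exceeds
the device's optimal IO size and choose_next_idle hands the stream to an idle
disk), an idle disk wins outright, and otherwise the loop remembers both the
mirror with the fewest pending requests and the one with the shortest head
distance, picking between them at the end depending on whether any member is
non-rotational. A compressed userspace sketch of that decision order — the
choose_first fast path and the opt_iosize hand-off are elided, and disk_state
is a stand-in for the per-rdev fields the kernel reads under RCU:

    #include <limits.h>
    #include <stdbool.h>

    struct disk_state {
        unsigned int pending;   /* nr_pending: in-flight requests */
        long long dist;         /* |this_sector - head_position|  */
        bool nonrot;            /* blk_queue_nonrot(): SSD?       */
        bool sequential;        /* next_seq_sect == this_sector   */
    };

    /* Same priority order as the patched read_balance():
     * sequential hit > idle disk > fewest pending (if any SSD present)
     * or closest head (if all disks are rotational). */
    static int choose_disk(const struct disk_state *d, int n)
    {
        int best_dist_disk = -1, best_pending_disk = -1;
        long long best_dist = LLONG_MAX;
        unsigned int min_pending = UINT_MAX;
        bool has_nonrot = false;
        int i;

        for (i = 0; i < n; i++) {
            has_nonrot |= d[i].nonrot;
            if (d[i].sequential || d[i].dist == 0)
                return i;        /* keep the sequential stream */
            if (d[i].pending == 0)
                return i;        /* an idle disk wins outright */
            if (d[i].pending < min_pending) {
                min_pending = d[i].pending;
                best_pending_disk = i;
            }
            if (d[i].dist < best_dist) {
                best_dist = d[i].dist;
                best_dist_disk = i;
            }
        }
        return has_nonrot ? best_pending_disk : best_dist_disk;
    }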
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = best_disk;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
+		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
 	*max_sectors = sectors;
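
This replaces the single array-wide next_seq_sect/last_used pair with
per-mirror bookkeeping: seq_start marks where the current sequential run began
on that mirror, next_seq_sect is one past the last sector it served, and a
read is sequential exactly when it starts at next_seq_sect. The run length,
next_seq_sect - seq_start, is what the opt_iosize test above compares against
bdev_io_opt() >> 9. A small standalone model of the update rule (MaxSector is
approximated by a saturated value, matching the seq_start initialisation added
to setup_conf() further down):

    #include <assert.h>

    struct seq_window {
        unsigned long long seq_start;     /* first sector of current run  */
        unsigned long long next_seq_sect; /* one past the last sector read */
    };

    /* A read is "sequential" iff it starts exactly where the previous one
     * ended; otherwise the run is broken and a new one starts here. */
    static void account_read(struct seq_window *w,
                             unsigned long long sector, unsigned int sectors)
    {
        if (w->next_seq_sect != sector)
            w->seq_start = sector;
        w->next_seq_sect = sector + sectors;
    }

    int main(void)
    {
        struct seq_window w = { (unsigned long long)-1, 0 };
        account_read(&w, 1000, 64);  /* new run: seq_start = 1000       */
        account_read(&w, 1064, 64);  /* sequential: run is 128 sectors  */
        assert(w.next_seq_sect - w.seq_start == 128);
        account_read(&w, 5000, 8);   /* stream broken: seq_start resets */
        assert(w.seq_start == 5000);
        return 0;
    }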
@@ -873,7 +947,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
-	struct mirror_info *mirror;
+	struct raid1_info *mirror;
 	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	struct mirror_info *p;
+	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
-	struct mirror_info *p = conf->mirrors+ number;
+	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)
 		p = conf->mirrors + conf->raid_disks + number;
@@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 				bio->bi_rw = READ;
 				bio->bi_end_io = end_sync_read;
 				read_targets++;
+			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+				/*
+				 * The device is suitable for reading (InSync),
+				 * but has bad block(s) here. Let's try to correct them,
+				 * if we are doing resync or repair. Otherwise, leave
+				 * this device alone for this sync request.
+				 */
+				bio->bi_rw = WRITE;
+				bio->bi_end_io = end_sync_write;
+				write_targets++;
 			}
 		}
 		if (bio->bi_end_io) {
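
The new arm fires only for a resync or repair pass: a "check" sets
MD_RECOVERY_CHECK on top of MD_RECOVERY_SYNC and must stay read-only, and a
device that has already seen a write error is not trusted with corrective
writes. Assuming the usual meaning of the md recovery bits, the outcomes for
an InSync device with bad blocks in the sync window are:

    WriteErrorSeen | MD_RECOVERY_SYNC | MD_RECOVERY_CHECK | action
    ---------------+------------------+-------------------+------------------------
    set            | any              | any               | skipped
    clear          | set              | clear             | queued as write target
    clear          | set              | set               | skipped (check reads only)
    clear          | clear            | any               | skipped (recovery pass)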
@@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
-		sector_t rv = max_sector - sector_nr;
+		sector_t rv;
+		if (min_bad > 0)
+			max_sector = sector_nr + min_bad;
+		rv = max_sector - sector_nr;
 		*skipped = 1;
 		put_buf(r1_bio);
 		return rv;
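
The clamp changes how much of the array is written off when there is no write
target. Previously rv = max_sector - sector_nr skipped everything up to
max_sector; now, when the blockage comes from bad blocks (min_bad > 0, the
smallest bad-block extent found at this position), only that prefix is
skipped. For example, with sector_nr = 1000, max_sector = 2000 and
min_bad = 8, the call returns rv = 8 and the next sync_request() resumes at
sector 1008 instead of abandoning all 1000 remaining sectors.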
@@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
-	struct mirror_info *disk;
+	struct raid1_info *disk;
 	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
@@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				GFP_KERNEL);
 	if (!conf->mirrors)
@@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
@@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
-	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
@@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			if (disk->rdev &&
 			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
-		} else if (conf->last_used < 0)
-			/*
-			 * The first working device is used as a
-			 * starting point to read balancing.
-			 */
-			conf->last_used = i;
+		}
 	}
 
-	if (conf->last_used < 0) {
-		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-		       mdname(mddev));
-		goto abort;
-	}
 	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
@@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev)
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	struct mirror_info *newmirrors;
+	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
@@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
@@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);