Diffstat (limited to 'drivers/md/raid1.c')

 -rw-r--r--  drivers/md/raid1.c | 164

1 file changed, 121 insertions(+), 43 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
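The sentinels above work because no real struct bio can ever live at kernel address 1 or 2, so a small integer cast to a pointer is a safe out-of-band tag for a slot in r1_bio->bios[], and BIO_SPECIAL() screens such tags (and NULL, which is also <= 2) out before anything dereferences the slot. A minimal user-space sketch of the same sentinel-pointer idiom — slot_state() and the array below are illustrative, not kernel API:

    #include <stdio.h>

    struct bio;  /* opaque stand-in for the kernel's struct bio */

    /* Addresses 1 and 2 are never valid pointers, so they can tag slots. */
    #define IO_BLOCKED   ((struct bio *)1)
    #define IO_MADE_GOOD ((struct bio *)2)
    #define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

    static const char *slot_state(struct bio *b)
    {
            if (b == NULL)
                    return "no IO for this device";
            if (b == IO_BLOCKED)
                    return "bad region: do not retry on this device";
            if (b == IO_MADE_GOOD)
                    return "write over a known bad block succeeded";
            return "real in-flight bio";
    }

    int main(void)
    {
            struct bio *slots[] = { NULL, IO_BLOCKED, IO_MADE_GOOD };

            for (int i = 0; i < 3; i++)
                    printf("slot %d: %s\n", i, slot_state(slots[i]));
            return 0;
    }

The same trick appears in raid10.c; its only requirement is that the tag values stay below the lowest mappable kernel address.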
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int start_disk;
-	int best_disk;
-	int i;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
+	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync)) {
+	    (this_sector + sectors >= conf->next_resync))
 		choose_first = 1;
-		start_disk = 0;
-	} else {
+	else
 		choose_first = 0;
-		start_disk = conf->last_used;
-	}
 
-	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
-
-		int disk = start_disk + i;
-		if (disk >= conf->raid_disks * 2)
-			disk -= conf->raid_disks * 2;
+		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		if (choose_first) {
 			best_disk = disk;
 			break;
 		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
+			best_disk = disk;
+			break;
+		}
+
+		if (choose_next_idle)
+			continue;
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
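Taken together, the read_balance() hunks above replace the old round-robin start (conf->last_used) with a full scan: choose_first, a sequential hit, or a fully idle disk still exits the loop early, but otherwise the loop now records two fallback candidates — the disk with the fewest pending requests and the disk with the shortest seek distance — and the best_disk == -1 block picks between them depending on whether any member is non-rotational. A stripped-down sketch of that fallback policy, with a hypothetical mirror_view struct standing in for what the kernel reads from md_rdev and raid1_info under rcu_read_lock():

    #include <stdbool.h>
    #include <limits.h>

    /* Hypothetical flattened view of one mirror. */
    struct mirror_view {
            bool nonrot;                  /* blk_queue_nonrot() */
            unsigned int nr_pending;      /* atomic_read(&rdev->nr_pending) */
            unsigned long long dist;      /* |this_sector - head_position| */
    };

    /* What the tail of read_balance() does once no early exit
     * (choose_first, sequential hit, idle disk) has fired. */
    static int pick_fallback(const struct mirror_view *m, int n)
    {
            int best_pending_disk = -1, best_dist_disk = -1;
            unsigned int min_pending = UINT_MAX;
            unsigned long long best_dist = ULLONG_MAX;
            bool has_nonrot_disk = false;

            for (int disk = 0; disk < n; disk++) {
                    has_nonrot_disk |= m[disk].nonrot;
                    if (m[disk].nr_pending < min_pending) {
                            min_pending = m[disk].nr_pending;
                            best_pending_disk = disk;
                    }
                    if (m[disk].dist < best_dist) {
                            best_dist = m[disk].dist;
                            best_dist_disk = disk;
                    }
            }
            /* Any SSD in the set: queue depth beats seek distance. */
            return has_nonrot_disk ? best_pending_disk : best_dist_disk;
    }

The asymmetry is deliberate: seek distance means nothing on flash, so as soon as one member is non-rotational the pending-request count becomes the congestion signal for the whole set, even for its rotational members — which, as the in-code comment concedes, may or may not be optimal for mixed arrays.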
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = best_disk;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
+		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
 	*max_sectors = sectors;
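This hunk moves the sequential-read bookkeeping from the r1conf into the mirror that actually serviced the read: conf->next_seq_sect becomes conf->mirrors[...].next_seq_sect, conf->last_used disappears, and a new seq_start field (presumably added to struct raid1_info in the companion raid1.h change, which is not part of this diff) records where the current sequential run began. A sketch of the bookkeeping under that assumption:

    typedef unsigned long long sector_t;    /* assumed 64-bit, as in the kernel */
    #define MaxSector (~(sector_t)0)

    /* Assumed shape of the two per-mirror fields this patch relies on. */
    struct seq_track {
            sector_t seq_start;      /* first sector of the current sequential run */
            sector_t next_seq_sect;  /* sector right after the last read */
    };

    static void init_track(struct seq_track *t)
    {
            t->seq_start = MaxSector;       /* as in setup_conf(): no run yet */
            t->next_seq_sect = 0;
    }

    /* Mirrors the update at the end of read_balance(): a read that does not
     * continue the previous one starts a new run. */
    static void note_read(struct seq_track *t, sector_t sector, sector_t sectors)
    {
            if (t->next_seq_sect != sector)
                    t->seq_start = sector;          /* run broken: restart */
            t->next_seq_sect = sector + sectors;    /* where a sequel would land */
    }

The run length so far is next_seq_sect - seq_start, which is exactly the quantity the opt_iosize test in the earlier hunk compares against the device's optimal IO size before handing a long sequential stream over to an idle SSD.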
@@ -873,7 +947,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
-	struct mirror_info *mirror;
+	struct raid1_info *mirror;
 	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	struct mirror_info *p;
+	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
-	struct mirror_info *p = conf->mirrors+ number;
+	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)
 		p = conf->mirrors + conf->raid_disks + number;
@@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 			bio->bi_rw = READ;
 			bio->bi_end_io = end_sync_read;
 			read_targets++;
+		} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+			   test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+			   !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+			/*
+			 * The device is suitable for reading (InSync),
+			 * but has bad block(s) here. Let's try to correct them,
+			 * if we are doing resync or repair. Otherwise, leave
+			 * this device alone for this sync request.
+			 */
+			bio->bi_rw = WRITE;
+			bio->bi_end_io = end_sync_write;
+			write_targets++;
 		}
 	}
 	if (bio->bi_end_io) {
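The new else-if arm handles a device that is in sync but has bad blocks at the current position: it cannot serve reads here, yet during an actual resync or repair pass it may be written, and a successful write lets the bad-block record be cleared later through the IO_MADE_GOOD machinery. The test_bit() flags are the kernel's own; folding them into a predicate (the function name is illustrative) makes the decision easier to read:

    #include <stdbool.h>

    /* Should sync_request() turn an in-sync device with bad blocks at the
     * current position into a write target instead of skipping it? */
    static bool try_to_fix_bad_blocks(bool write_error_seen,
                                      bool recovery_sync, bool recovery_check)
    {
            return !write_error_seen    /* writes to the device still work */
                && recovery_sync        /* we are resyncing or repairing... */
                && !recovery_check;     /* ...not doing a read-only check */
    }

MD_RECOVERY_CHECK distinguishes a read-only "check" pass, which must not write, from "resync"/"repair" passes, which may.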
@@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
-		sector_t rv = max_sector - sector_nr;
+		sector_t rv;
+		if (min_bad > 0)
+			max_sector = sector_nr + min_bad;
+		rv = max_sector - sector_nr;
 		*skipped = 1;
 		put_buf(r1_bio);
 		return rv;
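Previously, "nowhere to write" skipped all the way to max_sector in one jump. With bad-block tracking, min_bad (computed earlier in sync_request(), outside this hunk, as the smallest extent of bad sectors found at sector_nr) caps the skip so only the unrepairable region is passed over. A small sketch of the clamped computation — skip_len() is a hypothetical stand-in:

    typedef unsigned long long sector_t;

    /* Sketch of the clamped skip when no sync write target exists. */
    static sector_t skip_len(sector_t sector_nr, sector_t max_sector, int min_bad)
    {
            if (min_bad > 0)                        /* bad block(s) start right here */
                    max_sector = sector_nr + min_bad;  /* only skip past them */
            return max_sector - sector_nr;          /* == min_bad when clamped */
    }

For instance, skip_len(1000, 5000, 8) returns 8 rather than 4000, so the sync cursor advances to sector 1008 and the rest of the array still gets resynced.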
@@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
-	struct mirror_info *disk;
+	struct raid1_info *disk;
 	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
@@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
@@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
@@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
-	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
@@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			if (disk->rdev &&
 			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
-		} else if (conf->last_used < 0)
-			/*
-			 * The first working device is used as a
-			 * starting point to read balancing.
-			 */
-			conf->last_used = i;
+		}
 	}
 
-	if (conf->last_used < 0) {
-		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-		       mdname(mddev));
-		goto abort;
-	}
 	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
@@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev)
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	struct mirror_info *newmirrors;
+	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
@@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
@@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
