 drivers/md/md.c     |   8
 drivers/md/raid1.c  | 164
 drivers/md/raid1.h  |  30
 drivers/md/raid10.c |  92
 drivers/md/raid10.h |  23
 drivers/md/raid5.c  | 205
 drivers/md/raid5.h  |   2
 7 files changed, 310 insertions(+), 214 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d5ab4493c8be..f6c46109b071 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3942,17 +3942,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		break;
 	case clear:
 		/* stopping an active array */
-		if (atomic_read(&mddev->openers) > 0)
-			return -EBUSY;
 		err = do_md_stop(mddev, 0, NULL);
 		break;
 	case inactive:
 		/* stopping an active array */
-		if (mddev->pers) {
-			if (atomic_read(&mddev->openers) > 0)
-				return -EBUSY;
+		if (mddev->pers)
 			err = do_md_stop(mddev, 2, NULL);
-		} else
+		else
 			err = 0; /* already inactive */
 		break;
 	case suspended:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
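The two macros above overload otherwise-unused bio pointer slots as status markers: addresses 1 and 2 can never be real struct bio pointers, so BIO_SPECIAL() cheaply separates markers from genuine bios without any extra per-slot field. A minimal user-space sketch of the same sentinel-pointer trick (illustrative only, not the kernel code):

#include <stdbool.h>

struct bio;	/* opaque here; stands in for the kernel's struct bio */

#define IO_BLOCKED   ((struct bio *)1)	/* slot is known-bad, skip it */
#define IO_MADE_GOOD ((struct bio *)2)	/* bad block fixed, clear marking later */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

/* Only a non-sentinel slot may be dereferenced; NULL (address 0) is
 * caught by the <= 2 test as well. */
static bool slot_usable(struct bio *slot)
{
	return !BIO_SPECIAL(slot);
}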
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int start_disk;
-	int best_disk;
-	int i;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
+	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync)) {
+	    (this_sector + sectors >= conf->next_resync))
 		choose_first = 1;
-		start_disk = 0;
-	} else {
+	else
 		choose_first = 0;
-		start_disk = conf->last_used;
-	}
 
-	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
-
-		int disk = start_disk + i;
-		if (disk >= conf->raid_disks * 2)
-			disk -= conf->raid_disks * 2;
+		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		if (choose_first) {
+			best_disk = disk;
+			break;
+		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
 			best_disk = disk;
 			break;
 		}
+
+		if (choose_next_idle)
+			continue;
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = best_disk;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
+		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
 	*max_sectors = sectors;
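The rewritten loop has three early exits plus a fallback: an idle disk wins immediately, a sequential stream sticks to its disk unless the SSD heuristic deliberately rotates to an idle one, and otherwise the loop records both the closest and the least-loaded candidates. A stand-alone user-space model of the fallback policy (names mirror the patch, but this is only a sketch, not the kernel code):

#include <limits.h>

struct cand {
	int dist;	/* |sector - head_position| */
	int pending;	/* in-flight requests on this member */
	int nonrot;	/* non-rotational (SSD)? */
};

static int pick_disk(const struct cand *c, int n)
{
	int best_dist_disk = -1, best_pending_disk = -1;
	int best_dist = INT_MAX, min_pending = INT_MAX;
	int has_nonrot = 0, i;

	for (i = 0; i < n; i++) {
		has_nonrot |= c[i].nonrot;
		if (c[i].pending == 0)
			return i;	/* idle disk: always take it */
		if (c[i].pending < min_pending) {
			min_pending = c[i].pending;
			best_pending_disk = i;
		}
		if (c[i].dist < best_dist) {
			best_dist = c[i].dist;
			best_dist_disk = i;
		}
	}
	/* Seek distance is meaningless for SSDs: balance by load instead. */
	return has_nonrot ? best_pending_disk : best_dist_disk;
}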
@@ -873,7 +947,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
-	struct mirror_info *mirror;
+	struct raid1_info *mirror;
 	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	struct mirror_info *p;
+	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
-	struct mirror_info *p = conf->mirrors+ number;
+	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)
 		p = conf->mirrors + conf->raid_disks + number;
@@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 			bio->bi_rw = READ;
 			bio->bi_end_io = end_sync_read;
 			read_targets++;
+		} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+			   test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+			   !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+			/*
+			 * The device is suitable for reading (InSync),
+			 * but has bad block(s) here. Let's try to correct them,
+			 * if we are doing resync or repair. Otherwise, leave
+			 * this device alone for this sync request.
+			 */
+			bio->bi_rw = WRITE;
+			bio->bi_end_io = end_sync_write;
+			write_targets++;
 		}
 	}
 	if (bio->bi_end_io) {
@@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
-		sector_t rv = max_sector - sector_nr;
+		sector_t rv;
+		if (min_bad > 0)
+			max_sector = sector_nr + min_bad;
+		rv = max_sector - sector_nr;
 		*skipped = 1;
 		put_buf(r1_bio);
 		return rv;
@@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
-	struct mirror_info *disk;
+	struct raid1_info *disk;
 	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
@@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				GFP_KERNEL);
 	if (!conf->mirrors)
@@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
@@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
-	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
@@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			if (disk->rdev &&
 			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
-		} else if (conf->last_used < 0)
-			/*
-			 * The first working device is used as a
-			 * starting point to read balancing.
-			 */
-			conf->last_used = i;
+		}
 	}
 
-	if (conf->last_used < 0) {
-		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-		       mdname(mddev));
-		goto abort;
-	}
 	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
@@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev)
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	struct mirror_info *newmirrors;
+	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
@@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
@@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded139314c..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,9 +1,15 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
-struct mirror_info {
+struct raid1_info {
 	struct md_rdev	*rdev;
 	sector_t	head_position;
+
+	/* When choose the best device for a read (read_balance())
+	 * we try to keep sequential reads one the same device
+	 */
+	sector_t	next_seq_sect;
+	sector_t	seq_start;
 };
 
 /*
@@ -24,17 +30,11 @@ struct pool_info {
 
 struct r1conf {
 	struct mddev		*mddev;
-	struct mirror_info	*mirrors;	/* twice 'raid_disks' to
+	struct raid1_info	*mirrors;	/* twice 'raid_disks' to
 						 * allow for replacements.
 						 */
 	int			raid_disks;
 
-	/* When choose the best device for a read (read_balance())
-	 * we try to keep sequential reads one the same device
-	 * using 'last_used' and 'next_seq_sect'
-	 */
-	int			last_used;
-	sector_t		next_seq_sect;
 	/* During resync, read_balancing is only allowed on the part
 	 * of the array that has been resynced.  'next_resync' tells us
 	 * where that is.
@@ -135,20 +135,6 @@ struct r1bio {
 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
 };
 
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting bios[n] to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
 /* bits for r1bio.state */
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
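The per-mirror next_seq_sect/seq_start pair replaces the array-wide last_used/next_seq_sect, so each mirror now tracks its own sequential stream. How the two fields evolve on every read routed to a mirror, paraphrased from the read_balance() update earlier in this patch (not a new kernel API):

/* Paraphrase of the bookkeeping in read_balance(). */
static void note_sequential(struct raid1_info *mirror,
			    sector_t this_sector, int sectors)
{
	if (mirror->next_seq_sect != this_sector)
		mirror->seq_start = this_sector;	/* stream broken: new run */
	mirror->next_seq_sect = this_sector + sectors;
	/* next_seq_sect - seq_start is the length of the current run,
	 * which the SSD heuristic compares against the optimal IO size. */
}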
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8da6282254c3..e2549deab7c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -60,7 +60,21 @@
  */
 #define NR_RAID10_BIOS 256
 
-/* When there are this many requests queue to be written by
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
+/* When there are this many requests queued to be written by
 * the raid10 thread, we become 'congested' to provide back-pressure
 * for writeback.
 */
@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *rdev, *best_rdev;
+	struct md_rdev *best_rdev, *rdev = NULL;
 	int do_balance;
 	int best_slot;
 	struct geom *geo = &conf->geo;
@@ -839,9 +853,8 @@ retry:
 	return rdev;
 }
 
-static int raid10_congested(void *data, int bits)
+int md_raid10_congested(struct mddev *mddev, int bits)
 {
-	struct mddev *mddev = data;
 	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits)
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
-	if (mddev_congested(mddev, bits))
-		return 1;
 	rcu_read_lock();
 	for (i = 0;
 	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits)
 	rcu_read_unlock();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(md_raid10_congested);
+
+static int raid10_congested(void *data, int bits)
+{
+	struct mddev *mddev = data;
+
+	return mddev_congested(mddev, bits) ||
+	       md_raid10_congested(mddev, bits);
+}
 
 static void flush_pending_writes(struct r10conf *conf)
 {
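Splitting the check this way lets a driver that embeds a raid10 mddev (dm-raid is the intended consumer of the export) ask about array congestion directly, while the mddev_congested() call, which only makes sense for a standalone md device, stays in the static wrapper. A hypothetical caller, with made-up target and field names purely for illustration:

/* Hypothetical embedding driver; 'struct my_target' and its 'mddev'
 * field are assumptions for illustration, not the real dm-raid code. */
static int my_target_congested(struct my_target *t, int bits)
{
	/* skip mddev_congested(): the outer driver owns the queue */
	return md_raid10_congested(t->mddev, bits);
}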
@@ -1546,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 static void print_conf(struct r10conf *conf)
 {
 	int i;
-	struct mirror_info *tmp;
+	struct raid10_info *tmp;
 
 	printk(KERN_DEBUG "RAID10 conf printout:\n");
 	if (!conf) {
@@ -1580,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev)
 {
 	int i;
 	struct r10conf *conf = mddev->private;
-	struct mirror_info *tmp;
+	struct raid10_info *tmp;
 	int count = 0;
 	unsigned long flags;
 
@@ -1655,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
-		struct mirror_info *p = &conf->mirrors[mirror];
+		struct raid10_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (p->rdev) {
@@ -1709,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct mirror_info *p = conf->mirrors + number;
+	struct raid10_info *p = conf->mirrors + number;
 
 	print_conf(conf);
 	if (rdev == p->rdev)
@@ -2876,7 +2896,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t sect;
 			int must_sync;
 			int any_working;
-			struct mirror_info *mirror = &conf->mirrors[i];
+			struct raid10_info *mirror = &conf->mirrors[i];
 
 			if ((mirror->rdev == NULL ||
 			     test_bit(In_sync, &mirror->rdev->flags))
@@ -3388,7 +3408,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		goto out;
 
 	/* FIXME calc properly */
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
+	conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
 							    max(0,mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
@@ -3452,7 +3472,7 @@ static int run(struct mddev *mddev)
 {
 	struct r10conf *conf;
 	int i, disk_idx, chunk_size;
-	struct mirror_info *disk;
+	struct raid10_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
 	sector_t min_offset_diff = 0;
@@ -3472,12 +3492,14 @@ static int run(struct mddev *mddev)
 	conf->thread = NULL;
 
 	chunk_size = mddev->chunk_sectors << 9;
-	blk_queue_io_min(mddev->queue, chunk_size);
-	if (conf->geo.raid_disks % conf->geo.near_copies)
-		blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-	else
-		blk_queue_io_opt(mddev->queue, chunk_size *
-				 (conf->geo.raid_disks / conf->geo.near_copies));
+	if (mddev->queue) {
+		blk_queue_io_min(mddev->queue, chunk_size);
+		if (conf->geo.raid_disks % conf->geo.near_copies)
+			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
+		else
+			blk_queue_io_opt(mddev->queue, chunk_size *
+					 (conf->geo.raid_disks / conf->geo.near_copies));
+	}
 
 	rdev_for_each(rdev, mddev) {
 		long long diff;
@@ -3511,8 +3533,9 @@ static int run(struct mddev *mddev)
 		if (first || diff < min_offset_diff)
 			min_offset_diff = diff;
 
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		disk->head_position = 0;
 	}
@@ -3575,22 +3598,22 @@ static int run(struct mddev *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
-	mddev->queue->backing_dev_info.congested_data = mddev;
-
-	/* Calculate max read-ahead size.
-	 * We need to readahead at least twice a whole stripe....
-	 * maybe...
-	 */
-	{
+	if (mddev->queue) {
 		int stripe = conf->geo.raid_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
+		mddev->queue->backing_dev_info.congested_fn = raid10_congested;
+		mddev->queue->backing_dev_info.congested_data = mddev;
+
+		/* Calculate max read-ahead size.
+		 * We need to readahead at least twice a whole stripe....
+		 * maybe...
+		 */
 		stripe /= conf->geo.near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 	}
 
-	blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 
 	if (md_integrity_register(mddev))
 		goto out_free_conf;
@@ -3641,7 +3664,10 @@ static int stop(struct mddev *mddev)
 	lower_barrier(conf);
 
 	md_unregister_thread(&mddev->thread);
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	if (mddev->queue)
+		/* the unplug fn references 'conf'*/
+		blk_sync_queue(mddev->queue);
+
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	kfree(conf->mirrors);
@@ -3805,7 +3831,7 @@ static int raid10_check_reshape(struct mddev *mddev)
 	if (mddev->delta_disks > 0) {
 		/* allocate new 'mirrors' list */
 		conf->mirrors_new = kzalloc(
-			sizeof(struct mirror_info)
+			sizeof(struct raid10_info)
 			*(mddev->raid_disks +
 			  mddev->delta_disks),
 			GFP_KERNEL);
@@ -3930,7 +3956,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 	spin_lock_irq(&conf->device_lock);
 	if (conf->mirrors_new) {
 		memcpy(conf->mirrors_new, conf->mirrors,
		       sizeof(struct raid10_info)*conf->prev.raid_disks);
 		smp_mb();
 		kfree(conf->mirrors_old); /* FIXME and elsewhere */
 		conf->mirrors_old = conf->mirrors;
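The run() and stop() hunks above all guard the same assumption: when raid10 is instantiated by a stacking driver such as dm-raid, mddev->queue and mddev->gendisk are NULL, so every queue or gendisk operation must become conditional. The recurring shape, as a sketch of the pattern rather than a specific hunk:

/* Pattern used throughout this series: tolerate a NULL queue/gendisk
 * when the personality is driven by dm-raid instead of standalone md. */
if (mddev->queue)
	blk_queue_io_min(mddev->queue, chunk_size);
if (mddev->gendisk)
	disk_stack_limits(mddev->gendisk, rdev->bdev,
			  rdev->data_offset << 9);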
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 135b1b0a1554..007c2c68dd83 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,7 +1,7 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
-struct mirror_info {
+struct raid10_info {
 	struct md_rdev	*rdev, *replacement;
 	sector_t	head_position;
 	int		recovery_disabled;	/* matches
@@ -13,8 +13,8 @@ struct mirror_info {
 
 struct r10conf {
 	struct mddev		*mddev;
-	struct mirror_info	*mirrors;
-	struct mirror_info	*mirrors_new, *mirrors_old;
+	struct raid10_info	*mirrors;
+	struct raid10_info	*mirrors_new, *mirrors_old;
 	spinlock_t		device_lock;
 
 	/* geometry */
@@ -123,20 +123,6 @@ struct r10bio {
 	} devs[0];
 };
 
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio*)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
 /* bits for r10bio.state */
 enum r10bio_state {
 	R10BIO_Uptodate,
@@ -159,4 +145,7 @@ enum r10bio_state {
 	 */
 	R10BIO_Previous,
 };
+
+extern int md_raid10_congested(struct mddev *mddev, int bits);
+
 #endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 04348d76bb30..259f519814ca 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
-static inline int raid5_bi_phys_segments(struct bio *bio)
+static inline int raid5_bi_processed_stripes(struct bio *bio)
 {
-	return bio->bi_phys_segments & 0xffff;
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	return (atomic_read(segments) >> 16) & 0xffff;
 }
 
-static inline int raid5_bi_hw_segments(struct bio *bio)
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
 {
-	return (bio->bi_phys_segments >> 16) & 0xffff;
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	return atomic_sub_return(1, segments) & 0xffff;
 }
 
-static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
 {
-	--bio->bi_phys_segments;
-	return raid5_bi_phys_segments(bio);
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	atomic_inc(segments);
 }
 
-static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+	unsigned int cnt)
 {
-	unsigned short val = raid5_bi_hw_segments(bio);
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	int old, new;
 
-	--val;
-	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
-	return val;
+	do {
+		old = atomic_read(segments);
+		new = (old & 0xffff) | (cnt << 16);
+	} while (atomic_cmpxchg(segments, old, new) != old);
 }
 
-static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
 {
-	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	atomic_set(segments, cnt);
 }
 
 /* Find first data disk in a raid6 stripe */
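bi_phys_segments is reinterpreted here as an atomic 32-bit word holding two 16-bit counters, so the per-bio accounting no longer needs device_lock. A user-space model of the packing, using C11 atomics instead of the kernel's atomic_t (illustrative only):

#include <stdatomic.h>

/* low 16 bits: active-stripe refcount; high 16 bits: processed count */
static inline int active_stripes(atomic_uint *seg)
{
	return atomic_load(seg) & 0xffff;
}

static inline int dec_active_stripes(atomic_uint *seg)
{
	/* returns the remaining active count, as the kernel helper does */
	return (atomic_fetch_sub(seg, 1) - 1) & 0xffff;
}

static inline void set_processed_stripes(atomic_uint *seg, unsigned int cnt)
{
	unsigned int old = atomic_load(seg);

	/* replace only the high half; retry if the low half moved */
	while (!atomic_compare_exchange_weak(seg, &old,
					     (old & 0xffff) | (cnt << 16)))
		;
}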
@@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
-	if (atomic_dec_and_test(&sh->count)) {
-		BUG_ON(!list_empty(&sh->lru));
-		BUG_ON(atomic_read(&conf->active_stripes)==0);
-		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state) &&
-			    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				list_add_tail(&sh->lru, &conf->delayed_list);
-			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-				   sh->bm_seq - conf->seq_write > 0)
-				list_add_tail(&sh->lru, &conf->bitmap_list);
-			else {
-				clear_bit(STRIPE_DELAYED, &sh->state);
-				clear_bit(STRIPE_BIT_DELAY, &sh->state);
-				list_add_tail(&sh->lru, &conf->handle_list);
-			}
-			md_wakeup_thread(conf->mddev->thread);
-		} else {
-			BUG_ON(stripe_operations_active(sh));
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				if (atomic_dec_return(&conf->preread_active_stripes)
-				    < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			atomic_dec(&conf->active_stripes);
-			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-				list_add_tail(&sh->lru, &conf->inactive_list);
-				wake_up(&conf->wait_for_stripe);
-				if (conf->retry_read_aligned)
-					md_wakeup_thread(conf->mddev->thread);
-			}
+	BUG_ON(!list_empty(&sh->lru));
+	BUG_ON(atomic_read(&conf->active_stripes)==0);
+	if (test_bit(STRIPE_HANDLE, &sh->state)) {
+		if (test_bit(STRIPE_DELAYED, &sh->state) &&
+		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			list_add_tail(&sh->lru, &conf->delayed_list);
+		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+			 sh->bm_seq - conf->seq_write > 0)
+			list_add_tail(&sh->lru, &conf->bitmap_list);
+		else {
+			clear_bit(STRIPE_DELAYED, &sh->state);
+			clear_bit(STRIPE_BIT_DELAY, &sh->state);
+			list_add_tail(&sh->lru, &conf->handle_list);
+		}
+		md_wakeup_thread(conf->mddev->thread);
+	} else {
+		BUG_ON(stripe_operations_active(sh));
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			if (atomic_dec_return(&conf->preread_active_stripes)
+			    < IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		atomic_dec(&conf->active_stripes);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			list_add_tail(&sh->lru, &conf->inactive_list);
+			wake_up(&conf->wait_for_stripe);
+			if (conf->retry_read_aligned)
+				md_wakeup_thread(conf->mddev->thread);
 		}
 	}
 }
 
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	if (atomic_dec_and_test(&sh->count))
+		do_release_stripe(conf, sh);
+}
+
 static void release_stripe(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
 
-	spin_lock_irqsave(&conf->device_lock, flags);
-	__release_stripe(conf, sh);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	local_irq_save(flags);
+	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+		do_release_stripe(conf, sh);
+		spin_unlock(&conf->device_lock);
+	}
+	local_irq_restore(flags);
 }
 
 static inline void remove_hash(struct stripe_head *sh)
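release_stripe() now relies on atomic_dec_and_lock(): the refcount is decremented lock-free, and device_lock is taken only on the 1-to-0 transition that actually needs the list manipulation in do_release_stripe(). The general shape of the primitive (a sketch; names other than atomic_dec_and_lock() are illustrative):

/* Fast/slow path split for refcounted teardown. */
static void put_object(struct my_obj *obj)
{
	unsigned long flags;

	local_irq_save(flags);
	/* lock-free decrement; takes obj->lock only on the 1 -> 0 drop */
	if (atomic_dec_and_lock(&obj->count, &obj->lock)) {
		do_release(obj);	/* hypothetical teardown helper */
		spin_unlock(&obj->lock);
	}
	local_irq_restore(flags);
}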
@@ -640,6 +653,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			else
 				bi->bi_sector = (sh->sector
 						 + rdev->data_offset);
+			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+				bi->bi_rw |= REQ_FLUSH;
+
 			bi->bi_flags = 1 << BIO_UPTODATE;
 			bi->bi_idx = 0;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -749,14 +765,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 	struct bio *return_bi = NULL;
-	struct r5conf *conf = sh->raid_conf;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	/* clear completed biofills */
-	spin_lock_irq(&conf->device_lock);
 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
@@ -774,7 +788,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_phys_segments(rbi)) {
+				if (!raid5_dec_bi_active_stripes(rbi)) {
 					rbi->bi_next = return_bi;
 					return_bi = rbi;
 				}
@@ -782,7 +796,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			}
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
 	return_io(return_bi);
@@ -794,7 +807,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 static void ops_run_biofill(struct stripe_head *sh)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
-	struct r5conf *conf = sh->raid_conf;
 	struct async_submit_ctl submit;
 	int i;
 
@@ -805,10 +817,10 @@ static void ops_run_biofill(struct stripe_head *sh)
 		struct r5dev *dev = &sh->dev[i];
 		if (test_bit(R5_Wantfill, &dev->flags)) {
 			struct bio *rbi;
-			spin_lock_irq(&conf->device_lock);
+			spin_lock_irq(&sh->stripe_lock);
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
-			spin_unlock_irq(&conf->device_lock);
+			spin_unlock_irq(&sh->stripe_lock);
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, dev->page,
@@ -1144,12 +1156,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
-			spin_lock_irq(&sh->raid_conf->device_lock);
+			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
-			spin_unlock_irq(&sh->raid_conf->device_lock);
+			spin_unlock_irq(&sh->stripe_lock);
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1454,6 +1466,8 @@ static int grow_one_stripe(struct r5conf *conf)
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 	#endif
 
+	spin_lock_init(&sh->stripe_lock);
+
 	if (grow_buffers(sh)) {
 		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
@@ -1739,7 +1753,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
-		}
+		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+
 		if (atomic_read(&rdev->read_errors))
 			atomic_set(&rdev->read_errors, 0);
 	} else {
@@ -1784,7 +1800,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else
 			retry = 1;
 		if (retry)
-			set_bit(R5_ReadError, &sh->dev[i].flags);
+			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+				set_bit(R5_ReadError, &sh->dev[i].flags);
+				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+			} else
+				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
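The retry logic is now a two-stage ladder: on the first failure R5_ReadNoMerge is set and the read is reissued, with ops_run_io() adding REQ_FLUSH so the retried bio is not merged or reordered; only a second failure falls back to the old R5_ReadError path. A condensed paraphrase of the hunk above, not a new API:

/* 1st failure: retry unmerged; 2nd failure: treat as a real read error. */
if (retry) {
	if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
		/* the unmerged retry also failed */
		set_bit(R5_ReadError, &sh->dev[i].flags);
		clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
	} else
		set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
}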
@@ -2340,11 +2360,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);
 
-
-	spin_lock_irq(&conf->device_lock);
+	/*
+	 * If several bio share a stripe. The bio bi_phys_segments acts as a
+	 * reference count to avoid race. The reference count should already be
+	 * increased before this function is called (for example, in
+	 * make_request()), so other bio sharing this stripe will not free the
+	 * stripe. If a stripe is owned by one stripe, the stripe lock will
+	 * protect it.
+	 */
+	spin_lock_irq(&sh->stripe_lock);
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
-		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+		if (*bip == NULL)
 			firstwrite = 1;
 	} else
 		bip = &sh->dev[dd_idx].toread;
@@ -2360,7 +2387,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	bi->bi_phys_segments++;
+	raid5_inc_bi_active_stripes(bi);
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -2375,7 +2402,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_sector,
@@ -2391,7 +2418,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 	return 0;
 }
 
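The comment's ordering rule, end to end: make_request() starts the bio with a biased active count of one, add_stripe_bio() takes one more reference per stripe that holds the bio (via raid5_inc_bi_active_stripes()), and completion paths drop references with raid5_dec_bi_active_stripes(). A condensed sketch of the submit side (simplified; the iteration helper is illustrative, not a real function, and error handling is omitted):

raid5_set_bi_stripes(bi, 1);		/* bias: submission in progress */
for_each_stripe_covered(sh, bi) {	/* illustrative iteration helper */
	/* takes sh->stripe_lock, chains bi onto the stripe, and calls
	 * raid5_inc_bi_active_stripes() internally */
	add_stripe_bio(sh, bi, dd_idx, rw);
}
remaining = raid5_dec_bi_active_stripes(bi);	/* drop the bias */
if (remaining == 0)
	bio_endio(bi, 0);		/* no stripe kept a reference */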
@@ -2441,10 +2468,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
-		spin_lock_irq(&conf->device_lock);
+		spin_lock_irq(&sh->stripe_lock);
 		/* fail all writes first */
 		bi = sh->dev[i].towrite;
 		sh->dev[i].towrite = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
 		if (bi) {
 			s->to_write--;
 			bitmap_end = 1;
@@ -2457,13 +2485,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (!raid5_dec_bi_phys_segments(bi)) {
+			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
 			}
 			bi = nextbi;
 		}
+		if (bitmap_end)
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+				STRIPE_SECTORS, 0, 0);
+		bitmap_end = 0;
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
@@ -2472,7 +2504,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (!raid5_dec_bi_phys_segments(bi)) {
+			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
@@ -2496,14 +2528,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (!raid5_dec_bi_phys_segments(bi)) {
+				if (!raid5_dec_bi_active_stripes(bi)) {
 					bi->bi_next = *return_bi;
 					*return_bi = bi;
 				}
 				bi = nextbi;
 			}
 		}
-		spin_unlock_irq(&conf->device_lock);
 		if (bitmap_end)
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS, 0, 0);
@@ -2707,30 +2738,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			    test_bit(R5_UPTODATE, &dev->flags)) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
-				int bitmap_end = 0;
 				pr_debug("Return write for disc %d\n", i);
-				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (!raid5_dec_bi_phys_segments(wbi)) {
+					if (!raid5_dec_bi_active_stripes(wbi)) {
 						md_write_end(conf->mddev);
 						wbi->bi_next = *return_bi;
 						*return_bi = wbi;
 					}
 					wbi = wbi2;
 				}
-				if (dev->towrite == NULL)
-					bitmap_end = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (bitmap_end)
-					bitmap_endwrite(conf->mddev->bitmap,
-							sh->sector,
-							STRIPE_SECTORS,
+				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
 			}
 		}
 
| @@ -3182,7 +3206,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 3182 | 3206 | ||
| 3183 | /* Now to look around and see what can be done */ | 3207 | /* Now to look around and see what can be done */ |
| 3184 | rcu_read_lock(); | 3208 | rcu_read_lock(); |
| 3185 | spin_lock_irq(&conf->device_lock); | ||
| 3186 | for (i=disks; i--; ) { | 3209 | for (i=disks; i--; ) { |
| 3187 | struct md_rdev *rdev; | 3210 | struct md_rdev *rdev; |
| 3188 | sector_t first_bad; | 3211 | sector_t first_bad; |
| @@ -3328,7 +3351,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 3328 | do_recovery = 1; | 3351 | do_recovery = 1; |
| 3329 | } | 3352 | } |
| 3330 | } | 3353 | } |
| 3331 | spin_unlock_irq(&conf->device_lock); | ||
| 3332 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | 3354 | if (test_bit(STRIPE_SYNCING, &sh->state)) { |
| 3333 | /* If there is a failed device being replaced, | 3355 | /* If there is a failed device being replaced, |
| 3334 | * we must be recovering. | 3356 | * we must be recovering. |
| @@ -3791,7 +3813,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) | |||
| 3791 | * this sets the active stripe count to 1 and the processed | 3813 | * this sets the active stripe count to 1 and the processed |
| 3792 | * stripe count to zero (upper 16 bits) | 3814 | * stripe count to zero (upper 16 bits) |
| 3793 | */ | 3815 | */ |
| 3794 | bi->bi_phys_segments = 1; /* biased count of active stripes */ | 3816 | raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ |
| 3795 | } | 3817 | } |
| 3796 | 3818 | ||
| 3797 | return bi; | 3819 | return bi; |
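The raid5_set_bi_stripes()/raid5_dec_bi_active_stripes()/raid5_bi_processed_stripes() helpers used in these hunks treat the overloaded ->bi_phys_segments word as a single atomic value: an active-stripe count in the low 16 bits and a processed-stripe cursor in the high 16 bits. That is what lets the conf->device_lock acquisitions around the old raid5_dec_bi_phys_segments() calls go away. A userspace C11 sketch of that packing; the helper bodies here are an assumption for illustration, not quoted from the patch:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint segments;	/* stands in for bio->bi_phys_segments */

static void set_stripes(unsigned cnt)	/* cf. raid5_set_bi_stripes() */
{
	atomic_store(&segments, cnt);
}

static void inc_active(void)		/* one stripe attached to the bio */
{
	atomic_fetch_add(&segments, 1);
}

static unsigned dec_active(void)	/* cf. raid5_dec_bi_active_stripes() */
{
	return (atomic_fetch_sub(&segments, 1) - 1) & 0xffffu;
}

static unsigned processed(void)		/* cf. raid5_bi_processed_stripes() */
{
	return (atomic_load(&segments) >> 16) & 0xffffu;
}

static void set_processed(unsigned cnt)	/* cf. raid5_set_bi_processed_stripes() */
{
	unsigned old = atomic_load(&segments);
	unsigned new;

	do {	/* cmpxchg loop: replace the high half, keep the low half */
		new = (old & 0xffffu) | (cnt << 16);
	} while (!atomic_compare_exchange_weak(&segments, &old, new));
}

int main(void)
{
	set_stripes(1);			/* biased count, as the comment above says */
	inc_active();			/* a stripe takes a reference */
	if (dec_active() == 0)		/* stripe completes: still biased, not zero */
		puts("unexpected");
	set_processed(3);		/* remember a resume point */
	printf("processed=%u\n", processed());
	if (dec_active() == 0)		/* dropping the bias releases the bio */
		puts("bio would be completed here");
	return 0;
}

The count starts biased at 1 so the bio cannot hit zero, and so complete, while stripes are still being attached; the final decrement of the bias is what allows completion.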
| @@ -4113,7 +4135,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4113 | finish_wait(&conf->wait_for_overlap, &w); | 4135 | finish_wait(&conf->wait_for_overlap, &w); |
| 4114 | set_bit(STRIPE_HANDLE, &sh->state); | 4136 | set_bit(STRIPE_HANDLE, &sh->state); |
| 4115 | clear_bit(STRIPE_DELAYED, &sh->state); | 4137 | clear_bit(STRIPE_DELAYED, &sh->state); |
| 4116 | if ((bi->bi_rw & REQ_SYNC) && | 4138 | if ((bi->bi_rw & REQ_NOIDLE) && |
| 4117 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 4139 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 4118 | atomic_inc(&conf->preread_active_stripes); | 4140 | atomic_inc(&conf->preread_active_stripes); |
| 4119 | mddev_check_plugged(mddev); | 4141 | mddev_check_plugged(mddev); |
| @@ -4126,9 +4148,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4126 | } | 4148 | } |
| 4127 | } | 4149 | } |
| 4128 | 4150 | ||
| 4129 | spin_lock_irq(&conf->device_lock); | 4151 | remaining = raid5_dec_bi_active_stripes(bi); |
| 4130 | remaining = raid5_dec_bi_phys_segments(bi); | ||
| 4131 | spin_unlock_irq(&conf->device_lock); | ||
| 4132 | if (remaining == 0) { | 4152 | if (remaining == 0) { |
| 4133 | 4153 | ||
| 4134 | if ( rw == WRITE ) | 4154 | if ( rw == WRITE ) |
| @@ -4484,7 +4504,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 4484 | sector += STRIPE_SECTORS, | 4504 | sector += STRIPE_SECTORS, |
| 4485 | scnt++) { | 4505 | scnt++) { |
| 4486 | 4506 | ||
| 4487 | if (scnt < raid5_bi_hw_segments(raid_bio)) | 4507 | if (scnt < raid5_bi_processed_stripes(raid_bio)) |
| 4488 | /* already done this stripe */ | 4508 | /* already done this stripe */ |
| 4489 | continue; | 4509 | continue; |
| 4490 | 4510 | ||
| @@ -4492,25 +4512,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 4492 | 4512 | ||
| 4493 | if (!sh) { | 4513 | if (!sh) { |
| 4494 | /* failed to get a stripe - must wait */ | 4514 | /* failed to get a stripe - must wait */ |
| 4495 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4515 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
| 4496 | conf->retry_read_aligned = raid_bio; | 4516 | conf->retry_read_aligned = raid_bio; |
| 4497 | return handled; | 4517 | return handled; |
| 4498 | } | 4518 | } |
| 4499 | 4519 | ||
| 4500 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 4520 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { |
| 4501 | release_stripe(sh); | 4521 | release_stripe(sh); |
| 4502 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4522 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
| 4503 | conf->retry_read_aligned = raid_bio; | 4523 | conf->retry_read_aligned = raid_bio; |
| 4504 | return handled; | 4524 | return handled; |
| 4505 | } | 4525 | } |
| 4506 | 4526 | ||
| 4527 | set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); | ||
| 4507 | handle_stripe(sh); | 4528 | handle_stripe(sh); |
| 4508 | release_stripe(sh); | 4529 | release_stripe(sh); |
| 4509 | handled++; | 4530 | handled++; |
| 4510 | } | 4531 | } |
| 4511 | spin_lock_irq(&conf->device_lock); | 4532 | remaining = raid5_dec_bi_active_stripes(raid_bio); |
| 4512 | remaining = raid5_dec_bi_phys_segments(raid_bio); | ||
| 4513 | spin_unlock_irq(&conf->device_lock); | ||
| 4514 | if (remaining == 0) | 4533 | if (remaining == 0) |
| 4515 | bio_endio(raid_bio, 0); | 4534 | bio_endio(raid_bio, 0); |
| 4516 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 4535 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
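retry_aligned_read() records how far it got with raid5_set_bi_processed_stripes() before requeueing the bio, then skips past that cursor on the next pass. A self-contained sketch of the resume-from-cursor control flow; the failure injection and all names are invented for illustration:

#include <stdio.h>

static unsigned processed_cursor;	/* high half of the packed counter */

static int try_stripe(unsigned scnt)
{
	/* pretend stripe 2 cannot get a stripe_head on the first pass */
	return scnt != 2 || processed_cursor != 0;
}

static int retry_read(unsigned nr_stripes)
{
	for (unsigned scnt = 0; scnt < nr_stripes; scnt++) {
		if (scnt < processed_cursor)
			continue;		/* already done this stripe */
		if (!try_stripe(scnt)) {
			processed_cursor = scnt; /* resume point */
			return 1;		/* requeue the whole bio */
		}
	}
	return 0;	/* all stripes handled; bio can complete */
}

int main(void)
{
	while (retry_read(4))
		puts("requeued, will retry");
	puts("all stripes handled");
	return 0;
}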
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2164021f3b5f..61dbb615c30b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -210,6 +210,7 @@ struct stripe_head { | |||
| 210 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
| 211 | enum check_states check_state; | 211 | enum check_states check_state; |
| 212 | enum reconstruct_states reconstruct_state; | 212 | enum reconstruct_states reconstruct_state; |
| 213 | spinlock_t stripe_lock; | ||
| 213 | /** | 214 | /** |
| 214 | * struct stripe_operations | 215 | * struct stripe_operations |
| 215 | * @target - STRIPE_OP_COMPUTE_BLK target | 216 | * @target - STRIPE_OP_COMPUTE_BLK target |
| @@ -273,6 +274,7 @@ enum r5dev_flags { | |||
| 273 | R5_Wantwrite, | 274 | R5_Wantwrite, |
| 274 | R5_Overlap, /* There is a pending overlapping request | 275 | R5_Overlap, /* There is a pending overlapping request |
| 275 | * on this block */ | 276 | * on this block */ |
| 277 | R5_ReadNoMerge, /* prevent bio from merging in block-layer */ | ||
| 276 | R5_ReadError, /* seen a read error here recently */ | 278 | R5_ReadError, /* seen a read error here recently */ |
| 277 | R5_ReWrite, /* have tried to over-write the readerror */ | 279 | R5_ReWrite, /* have tried to over-write the readerror */ |
| 278 | 280 | ||
