Diffstat (limited to 'drivers/md/raid5.c'):

 -rw-r--r--  drivers/md/raid5.c | 282
 1 file changed, 116 insertions(+), 166 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..b72edf35ec54 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
  *
  * We group bitmap updates into batches. Each batch has a number.
  * We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
  * new additions.
  * When we discover that we will need to write to any block in a stripe
  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
  * When we are ready to do a write, if that batch hasn't been written yet,
  * we plug the array and queue the stripe for later.
  * When an unplug happens, we increment bm_flush, thus closing the current
@@ -129,7 +129,7 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio)
 
 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 {
-        bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
+        bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
 }
 
 /* Find first data disk in a raid6 stripe */
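The one-character change above matters because raid5 packs two 16-bit counters into bio->bi_phys_segments (the low half is over-loaded to count active stripes, the high half carries the "hw segments" count used by these helpers). A logical OR collapses the whole word to 0 or 1; only a bitwise OR preserves both halves. A minimal, self-contained sketch of that packing (plain C with invented names, not the kernel helpers):

    #include <assert.h>
    #include <stdio.h>

    /* Two 16-bit counters packed into one 32-bit word, roughly as the
     * raid5 helpers treat bio->bi_phys_segments.  Names are invented. */
    static unsigned int set_high_half(unsigned int word, unsigned int cnt)
    {
            return (word & 0xffff) | (cnt << 16);   /* bitwise OR keeps both halves */
    }

    static unsigned int set_high_half_buggy(unsigned int word, unsigned int cnt)
    {
            return (word & 0xffff) || (cnt << 16);  /* logical OR yields only 0 or 1 */
    }

    int main(void)
    {
            unsigned int word = 3;                          /* low half: 3 active stripes */

            assert(set_high_half(word, 5) == 0x00050003);   /* both counters intact */
            assert(set_high_half_buggy(word, 5) == 1);      /* packed value destroyed */
            printf("0x%08x vs 0x%08x\n",
                   set_high_half(word, 5), set_high_half_buggy(word, 5));
            return 0;
    }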
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                 BUG_ON(!list_empty(&sh->lru));
                 BUG_ON(atomic_read(&conf->active_stripes)==0);
                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                        if (test_bit(STRIPE_DELAYED, &sh->state)) {
+                        if (test_bit(STRIPE_DELAYED, &sh->state))
                                 list_add_tail(&sh->lru, &conf->delayed_list);
-                                plugger_set_plug(&conf->plug);
-                        } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                   sh->bm_seq - conf->seq_write > 0) {
+                        else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                 sh->bm_seq - conf->seq_write > 0)
                                 list_add_tail(&sh->lru, &conf->bitmap_list);
-                                plugger_set_plug(&conf->plug);
-                        } else {
+                        else {
                                 clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                 list_add_tail(&sh->lru, &conf->handle_list);
                         }
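The STRIPE_BIT_DELAY branch above uses the batch numbering described in the comment block at the top of this diff: a stripe records the batch it will belong to (seq_flush + 1) in sh->bm_seq, and it may only be written once conf->seq_write has caught up with that batch. A toy model of that bookkeeping (illustrative names and types, not the kernel's structures):

    #include <stdio.h>

    /* Toy model of the bitmap-batch bookkeeping: seq_flush numbers the
     * batch currently accepting updates, seq_write the last batch known
     * to be safely on disk.  Names here are illustrative only. */
    struct toy_conf   { int seq_flush, seq_write; };
    struct toy_stripe { int bm_seq; };

    static void record_write(struct toy_conf *conf, struct toy_stripe *sh)
    {
            sh->bm_seq = conf->seq_flush + 1;       /* stripe lands in the next batch */
    }

    static int must_wait_for_bitmap(struct toy_conf *conf, struct toy_stripe *sh)
    {
            /* signed difference, as in __release_stripe() above */
            return sh->bm_seq - conf->seq_write > 0;
    }

    int main(void)
    {
            struct toy_conf conf = { .seq_flush = 4, .seq_write = 4 };
            struct toy_stripe sh;

            record_write(&conf, &sh);               /* sh->bm_seq == 5 */
            printf("wait: %d\n", must_wait_for_bitmap(&conf, &sh)); /* 1: batch 5 unwritten */

            conf.seq_flush++;                       /* batch closed ...    */
            conf.seq_write = conf.seq_flush;        /* ... and written out */
            printf("wait: %d\n", must_wait_for_bitmap(&conf, &sh)); /* 0: safe to write */
            return 0;
    }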
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf)
         return 0;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
                   int previous, int noblock, int noquiesce)
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
                                     < (conf->max_nr_stripes *3/4)
                                     || !conf->inactive_blocked),
                                    conf->device_lock,
-                                   md_raid5_unplug_device(conf)
-                        );
+                                   );
                         conf->inactive_blocked = 0;
                 } else
                         init_stripe(sh, sector, previous);
@@ -506,9 +501,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                 int rw;
                 struct bio *bi;
                 mdk_rdev_t *rdev;
-                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                        rw = WRITE;
-                else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+                        if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+                                rw = WRITE_FUA;
+                        else
+                                rw = WRITE;
+                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                         rw = READ;
                 else
                         continue;
@@ -516,7 +514,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                 bi = &sh->dev[i].req;
 
                 bi->bi_rw = rw;
-                if (rw == WRITE)
+                if (rw & WRITE)
                         bi->bi_end_io = raid5_end_write_request;
                 else
                         bi->bi_end_io = raid5_end_read_request;
@@ -550,13 +548,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                         bi->bi_io_vec[0].bv_offset = 0;
                         bi->bi_size = STRIPE_SIZE;
                         bi->bi_next = NULL;
-                        if (rw == WRITE &&
+                        if ((rw & WRITE) &&
                             test_bit(R5_ReWrite, &sh->dev[i].flags))
                                 atomic_add(STRIPE_SECTORS,
                                         &rdev->corrected_errors);
                         generic_make_request(bi);
                 } else {
-                        if (rw == WRITE)
+                        if (rw & WRITE)
                                 set_bit(STRIPE_DEGRADED, &sh->state);
                         pr_debug("skip op %ld on disc %d for sector %llu\n",
                                 bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -587,7 +585,7 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
         init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
 
         bio_for_each_segment(bvl, bio, i) {
-                int len = bio_iovec_idx(bio, i)->bv_len;
+                int len = bvl->bv_len;
                 int clen;
                 int b_offset = 0;
 
@@ -603,8 +601,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
                         clen = len;
 
                 if (clen > 0) {
-                        b_offset += bio_iovec_idx(bio, i)->bv_offset;
-                        bio_page = bio_iovec_idx(bio, i)->bv_page;
+                        b_offset += bvl->bv_offset;
+                        bio_page = bvl->bv_page;
                         if (frombio)
                                 tx = async_memcpy(page, bio_page, page_offset,
                                                   b_offset, clen, &submit);
@@ -1031,6 +1029,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
                         while (wbi && wbi->bi_sector <
                                 dev->sector + STRIPE_SECTORS) {
+                                if (wbi->bi_rw & REQ_FUA)
+                                        set_bit(R5_WantFUA, &dev->flags);
                                 tx = async_copy_data(1, wbi, dev->page,
                                         dev->sector, tx);
                                 wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1048,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
         int pd_idx = sh->pd_idx;
         int qd_idx = sh->qd_idx;
         int i;
+        bool fua = false;
 
         pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
 
+        for (i = disks; i--; )
+                fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+
         for (i = disks; i--; ) {
                 struct r5dev *dev = &sh->dev[i];
 
-                if (dev->written || i == pd_idx || i == qd_idx)
+                if (dev->written || i == pd_idx || i == qd_idx) {
                         set_bit(R5_UPTODATE, &dev->flags);
+                        if (fua)
+                                set_bit(R5_WantFUA, &dev->flags);
+                }
         }
 
         if (sh->reconstruct_state == reconstruct_state_drain_run)
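The hunks above implement FUA pass-through: ops_run_biodrain() marks a device with R5_WantFUA when the bio being drained carried REQ_FUA, and ops_complete_reconstruct() then spreads the flag to every block that will be written, parity included, so ops_run_io() issues those writes as WRITE_FUA. A toy model of that propagation step (illustrative names, not the kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of R5_WantFUA propagation: if any bio drained into the
     * stripe requested FUA, the parity block must be written with FUA
     * as well, so the flag is spread to every written device. */
    enum { TOY_DISKS = 5 };

    struct toy_dev { bool written, want_fua; };

    static void propagate_fua(struct toy_dev dev[TOY_DISKS], int pd_idx)
    {
            bool fua = false;
            int i;

            for (i = 0; i < TOY_DISKS; i++)
                    fua |= dev[i].want_fua;         /* any drained bio had REQ_FUA? */

            for (i = 0; i < TOY_DISKS; i++)
                    if (dev[i].written || i == pd_idx)
                            dev[i].want_fua = fua;  /* parity inherits the requirement */
    }

    int main(void)
    {
            struct toy_dev dev[TOY_DISKS] = {
                    [1] = { .written = true, .want_fua = true },    /* FUA write to disk 1 */
                    [2] = { .written = true },                      /* plain write to disk 2 */
            };
            int pd_idx = 4;                         /* parity lives on disk 4 */

            propagate_fua(dev, pd_idx);
            printf("parity FUA: %d, disk2 FUA: %d\n",
                   dev[pd_idx].want_fua, dev[2].want_fua);          /* both 1 */
            return 0;
    }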
@@ -1461,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
                 wait_event_lock_irq(conf->wait_for_stripe,
                                     !list_empty(&conf->inactive_list),
                                     conf->device_lock,
-                                    unplug_slaves(conf->mddev)
-                        );
+                                    );
                 osh = get_free_stripe(conf);
                 spin_unlock_irq(&conf->device_lock);
                 atomic_set(&nsh->count, 1);
@@ -1694,28 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
         raid5_conf_t *conf = mddev->private;
         pr_debug("raid456: error called\n");
 
-        if (!test_bit(Faulty, &rdev->flags)) {
-                set_bit(MD_CHANGE_DEVS, &mddev->flags);
-                if (test_and_clear_bit(In_sync, &rdev->flags)) {
-                        unsigned long flags;
-                        spin_lock_irqsave(&conf->device_lock, flags);
-                        mddev->degraded++;
-                        spin_unlock_irqrestore(&conf->device_lock, flags);
-                        /*
-                         * if recovery was running, make sure it aborts.
-                         */
-                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                }
-                set_bit(Faulty, &rdev->flags);
-                printk(KERN_ALERT
-                       "md/raid:%s: Disk failure on %s, disabling device.\n"
-                       KERN_ALERT
-                       "md/raid:%s: Operation continuing on %d devices.\n",
-                       mdname(mddev),
-                       bdevname(rdev->bdev, b),
-                       mdname(mddev),
-                       conf->raid_disks - mddev->degraded);
+        if (test_and_clear_bit(In_sync, &rdev->flags)) {
+                unsigned long flags;
+                spin_lock_irqsave(&conf->device_lock, flags);
+                mddev->degraded++;
+                spin_unlock_irqrestore(&conf->device_lock, flags);
+                /*
+                 * if recovery was running, make sure it aborts.
+                 */
+                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
         }
+        set_bit(Faulty, &rdev->flags);
+        set_bit(MD_CHANGE_DEVS, &mddev->flags);
+        printk(KERN_ALERT
+               "md/raid:%s: Disk failure on %s, disabling device.\n"
+               "md/raid:%s: Operation continuing on %d devices.\n",
+               mdname(mddev),
+               bdevname(rdev->bdev, b),
+               mdname(mddev),
+               conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -3281,7 +3284,7 @@ static void handle_stripe5(struct stripe_head *sh)
 
         if (dec_preread_active) {
                 /* We delay this until after ops_run_io so that if make_request
-                 * is waiting on a barrier, it won't continue until the writes
+                 * is waiting on a flush, it won't continue until the writes
                  * have actually been submitted.
                  */
                 atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3586,7 @@ static void handle_stripe6(struct stripe_head *sh)
 
         if (dec_preread_active) {
                 /* We delay this until after ops_run_io so that if make_request
-                 * is waiting on a barrier, it won't continue until the writes
+                 * is waiting on a flush, it won't continue until the writes
                  * have actually been submitted.
                  */
                 atomic_dec(&conf->preread_active_stripes);
@@ -3616,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
                                 atomic_inc(&conf->preread_active_stripes);
                         list_add_tail(&sh->lru, &conf->hold_list);
                 }
-        } else
-                plugger_set_plug(&conf->plug);
+        }
 }
 
 static void activate_bit_delay(raid5_conf_t *conf)
@@ -3634,60 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
         }
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-        raid5_conf_t *conf = mddev->private;
-        int i;
-        int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-        rcu_read_lock();
-        for (i = 0; i < devs; i++) {
-                mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-                if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-                        struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-                        atomic_inc(&rdev->nr_pending);
-                        rcu_read_unlock();
-
-                        blk_unplug(r_queue);
-
-                        rdev_dec_pending(rdev, mddev);
-                        rcu_read_lock();
-                }
-        }
-        rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
-        unsigned long flags;
-
-        spin_lock_irqsave(&conf->device_lock, flags);
-
-        if (plugger_remove_plug(&conf->plug)) {
-                conf->seq_flush++;
-                raid5_activate_delayed(conf);
-        }
-        md_wakeup_thread(conf->mddev->thread);
-
-        spin_unlock_irqrestore(&conf->device_lock, flags);
-
-        unplug_slaves(conf->mddev);
-}
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
-
-static void raid5_unplug(struct plug_handle *plug)
-{
-        raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-        md_raid5_unplug_device(conf);
-}
-
-static void raid5_unplug_queue(struct request_queue *q)
-{
-        mddev_t *mddev = q->queuedata;
-        md_raid5_unplug_device(mddev->private);
-}
-
 int md_raid5_congested(mddev_t *mddev, int bits)
 {
         raid5_conf_t *conf = mddev->private;
@@ -3864,9 +3812,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
                 return 0;
         }
         /*
-         * use bio_clone to make a copy of the bio
+         * use bio_clone_mddev to make a copy of the bio
          */
-        align_bi = bio_clone(raid_bio, GFP_NOIO);
+        align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
         if (!align_bi)
                 return 0;
         /*
@@ -3977,15 +3925,10 @@ static int make_request(mddev_t *mddev, struct bio * bi)
         struct stripe_head *sh;
         const int rw = bio_data_dir(bi);
         int remaining;
+        int plugged;
 
-        if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
-                /* Drain all pending writes. We only really need
-                 * to ensure they have been submitted, but this is
-                 * easier.
-                 */
-                mddev->pers->quiesce(mddev, 1);
-                mddev->pers->quiesce(mddev, 0);
-                md_barrier_request(mddev, bi);
+        if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+                md_flush_request(mddev, bi);
                 return 0;
         }
 
@@ -4001,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
         bi->bi_next = NULL;
         bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
 
+        plugged = mddev_check_plugged(mddev);
         for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
                 DEFINE_WAIT(w);
                 int disks, data_disks;
@@ -4014,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                         /* spinlock is needed as reshape_progress may be
                          * 64bit on a 32bit platform, and so it might be
                          * possible to see a half-updated value
-                         * Ofcourse reshape_progress could change after
+                         * Of course reshape_progress could change after
                          * the lock is dropped, so once we get a reference
                          * to the stripe that we think it is, we will have
                          * to check again.
@@ -4095,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                                  * add failed due to overlap. Flush everything
                                  * and wait a while
                                  */
-                                md_raid5_unplug_device(conf);
+                                md_wakeup_thread(mddev->thread);
                                 release_stripe(sh);
                                 schedule();
                                 goto retry;
@@ -4103,7 +4047,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                         finish_wait(&conf->wait_for_overlap, &w);
                         set_bit(STRIPE_HANDLE, &sh->state);
                         clear_bit(STRIPE_DELAYED, &sh->state);
-                        if (mddev->barrier &&
+                        if ((bi->bi_rw & REQ_SYNC) &&
                             !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                 atomic_inc(&conf->preread_active_stripes);
                         release_stripe(sh);
@@ -4115,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                 }
 
         }
+        if (!plugged)
+                md_wakeup_thread(mddev->thread);
+
         spin_lock_irq(&conf->device_lock);
         remaining = raid5_dec_bi_phys_segments(bi);
         spin_unlock_irq(&conf->device_lock);
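Together with the earlier hunk that records plugged = mddev_check_plugged(mddev), the three added lines above form a simple submit-side pattern: sample whether the submitter is currently plugged before queueing any stripes, and if it was not, wake the raid5d thread once everything is queued, since no later unplug will arrive to do it. A toy model of that pattern (illustrative names, not the md API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model: note whether the caller is batching ("plugged") before
     * queueing work; if not, kick the worker immediately, otherwise rely
     * on the later unplug to do it.  Names are illustrative only. */
    struct toy_ctx {
            bool plugged;
            int queued;
            int wakeups;
    };

    static bool check_plugged(struct toy_ctx *ctx)
    {
            return ctx->plugged;
    }

    static void wake_worker(struct toy_ctx *ctx)
    {
            ctx->wakeups++;
    }

    static void submit(struct toy_ctx *ctx, int nr_stripes)
    {
            bool plugged = check_plugged(ctx);      /* sample once, up front */
            int i;

            for (i = 0; i < nr_stripes; i++)
                    ctx->queued++;                  /* stand-in for release_stripe() */

            if (!plugged)                           /* no unplug coming: kick now */
                    wake_worker(ctx);
    }

    int main(void)
    {
            struct toy_ctx ctx = { .plugged = false };

            submit(&ctx, 8);
            printf("queued=%d wakeups=%d\n", ctx.queued, ctx.wakeups);  /* 8, 1 */

            ctx.plugged = true;                     /* batching: unplug will wake later */
            submit(&ctx, 8);
            printf("queued=%d wakeups=%d\n", ctx.queued, ctx.wakeups);  /* 16, 1 */
            return 0;
    }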
@@ -4126,13 +4073,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                 bio_endio(bi, 0);
         }
 
-        if (mddev->barrier) {
-                /* We need to wait for the stripes to all be handled.
-                 * So: wait for preread_active_stripes to drop to 0.
-                 */
-                wait_event(mddev->thread->wqueue,
-                           atomic_read(&conf->preread_active_stripes) == 0);
-        }
         return 0;
 }
 
@@ -4238,7 +4178,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                 wait_event(conf->wait_for_overlap,
                            atomic_read(&conf->reshape_stripes)==0);
                 mddev->reshape_position = conf->reshape_progress;
-                mddev->curr_resync_completed = mddev->curr_resync;
+                mddev->curr_resync_completed = sector_nr;
                 conf->reshape_checkpoint = jiffies;
                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
                 md_wakeup_thread(mddev->thread);
@@ -4339,7 +4279,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                 wait_event(conf->wait_for_overlap,
                            atomic_read(&conf->reshape_stripes) == 0);
                 mddev->reshape_position = conf->reshape_progress;
-                mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+                mddev->curr_resync_completed = sector_nr;
                 conf->reshape_checkpoint = jiffies;
                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
                 md_wakeup_thread(mddev->thread);
@@ -4361,13 +4301,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         raid5_conf_t *conf = mddev->private;
         struct stripe_head *sh;
         sector_t max_sector = mddev->dev_sectors;
-        int sync_blocks;
+        sector_t sync_blocks;
         int still_degraded = 0;
         int i;
 
         if (sector_nr >= max_sector) {
                 /* just being told to finish up .. nothing much to do */
-                unplug_slaves(mddev);
 
                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
                         end_reshape(conf);
@@ -4524,24 +4463,30 @@ static void raid5d(mddev_t *mddev)
         struct stripe_head *sh;
         raid5_conf_t *conf = mddev->private;
         int handled;
+        struct blk_plug plug;
 
         pr_debug("+++ raid5d active\n");
 
         md_check_recovery(mddev);
 
+        blk_start_plug(&plug);
         handled = 0;
         spin_lock_irq(&conf->device_lock);
         while (1) {
                 struct bio *bio;
 
-                if (conf->seq_flush != conf->seq_write) {
-                        int seq = conf->seq_flush;
+                if (atomic_read(&mddev->plug_cnt) == 0 &&
+                    !list_empty(&conf->bitmap_list)) {
+                        /* Now is a good time to flush some bitmap updates */
+                        conf->seq_flush++;
                         spin_unlock_irq(&conf->device_lock);
                         bitmap_unplug(mddev->bitmap);
                         spin_lock_irq(&conf->device_lock);
-                        conf->seq_write = seq;
+                        conf->seq_write = conf->seq_flush;
                         activate_bit_delay(conf);
                 }
+                if (atomic_read(&mddev->plug_cnt) == 0)
+                        raid5_activate_delayed(conf);
 
                 while ((bio = remove_bio_from_retry(conf))) {
                         int ok;
@@ -4571,7 +4516,7 @@ static void raid5d(mddev_t *mddev)
         spin_unlock_irq(&conf->device_lock);
 
         async_tx_issue_pending_all();
-        unplug_slaves(mddev);
+        blk_finish_plug(&plug);
 
         pr_debug("--- raid5d inactive\n");
 }
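raid5d now brackets its whole pass with a block-layer plug: work issued while the plug is held is batched per task and roughly speaking submitted together when the plug is finished, which is what previously required the per-device unplug_slaves() walk removed above. A toy model of the plug idea (a sketch only, not the block layer's struct blk_plug):

    #include <stdio.h>

    /* Toy model of per-thread plugging: submissions made while the plug
     * is held are only collected; they are issued in one batch when the
     * plug is finished.  Names are illustrative only. */
    enum { TOY_MAX = 16 };

    struct toy_plug { int pending[TOY_MAX]; int nr; };

    static void toy_start_plug(struct toy_plug *plug)
    {
            plug->nr = 0;
    }

    static void toy_submit(struct toy_plug *plug, int req)
    {
            plug->pending[plug->nr++] = req;        /* defer, don't issue yet */
    }

    static void toy_finish_plug(struct toy_plug *plug)
    {
            int i;

            for (i = 0; i < plug->nr; i++)          /* issue everything in one go */
                    printf("issuing request %d\n", plug->pending[i]);
            plug->nr = 0;
    }

    int main(void)
    {
            struct toy_plug plug;
            int i;

            toy_start_plug(&plug);
            for (i = 0; i < 4; i++)
                    toy_submit(&plug, i);           /* batched while "raid5d" works */
            toy_finish_plug(&plug);                 /* flushed on the way out */
            return 0;
    }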
@@ -4913,7 +4858,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                         printk(KERN_INFO "md/raid:%s: device %s operational as raid"
                                " disk %d\n",
                                mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
-                } else
+                } else if (rdev->saved_raid_disk != raid_disk)
                         /* Cannot rely on bitmap to complete recovery */
                         conf->fullsync = 1;
         }
@@ -5188,8 +5133,6 @@ static int run(mddev_t *mddev)
                                                         mdname(mddev));
         md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
-        plugger_init(&conf->plug, raid5_unplug);
-        mddev->plug = &conf->plug;
         if (mddev->queue) {
                 int chunk_size;
                 /* read-ahead size must cover two whole stripes, which
@@ -5206,8 +5149,6 @@ static int run(mddev_t *mddev)
 
                 mddev->queue->backing_dev_info.congested_data = mddev;
                 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-                mddev->queue->queue_lock = &conf->device_lock;
-                mddev->queue->unplug_fn = raid5_unplug_queue;
 
                 chunk_size = mddev->chunk_sectors << 9;
                 blk_queue_io_min(mddev->queue, chunk_size);
@@ -5240,7 +5181,6 @@ static int stop(mddev_t *mddev)
         mddev->thread = NULL;
         if (mddev->queue)
                 mddev->queue->backing_dev_info.congested_fn = NULL;
-        plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
         free_conf(conf);
         mddev->private = NULL;
         mddev->to_remove = &raid5_attrs_group;
@@ -5340,7 +5280,7 @@ static int raid5_spare_active(mddev_t *mddev)
                     && !test_bit(Faulty, &tmp->rdev->flags)
                     && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
                         count++;
-                        sysfs_notify_dirent(tmp->rdev->sysfs_state);
+                        sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
                 }
         }
         spin_lock_irqsave(&conf->device_lock, flags);
@@ -5449,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
                 return -EINVAL;
         set_capacity(mddev->gendisk, mddev->array_sectors);
         revalidate_disk(mddev->gendisk);
-        if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+        if (sectors > mddev->dev_sectors &&
+            mddev->recovery_cp > mddev->dev_sectors) {
                 mddev->recovery_cp = mddev->dev_sectors;
                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         }
@@ -5519,7 +5460,6 @@ static int raid5_start_reshape(mddev_t *mddev)
         raid5_conf_t *conf = mddev->private;
         mdk_rdev_t *rdev;
         int spares = 0;
-        int added_devices = 0;
         unsigned long flags;
 
         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -5529,8 +5469,8 @@ static int raid5_start_reshape(mddev_t *mddev)
                 return -ENOSPC;
 
         list_for_each_entry(rdev, &mddev->disks, same_set)
-                if (rdev->raid_disk < 0 &&
-                    !test_bit(Faulty, &rdev->flags))
+                if (!test_bit(In_sync, &rdev->flags)
+                    && !test_bit(Faulty, &rdev->flags))
                         spares++;
 
         if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5573,29 +5513,35 @@ static int raid5_start_reshape(mddev_t *mddev)
          * to correctly record the "partially reconstructed" state of
          * such devices during the reshape and confusion could result.
          */
-        if (mddev->delta_disks >= 0)
-                list_for_each_entry(rdev, &mddev->disks, same_set)
-                        if (rdev->raid_disk < 0 &&
-                            !test_bit(Faulty, &rdev->flags)) {
-                                if (raid5_add_disk(mddev, rdev) == 0) {
-                                        char nm[20];
-                                        if (rdev->raid_disk >= conf->previous_raid_disks) {
-                                                set_bit(In_sync, &rdev->flags);
-                                                added_devices++;
-                                        } else
-                                                rdev->recovery_offset = 0;
-                                        sprintf(nm, "rd%d", rdev->raid_disk);
-                                        if (sysfs_create_link(&mddev->kobj,
-                                                              &rdev->kobj, nm))
-                                                /* Failure here is OK */;
-                                } else
-                                        break;
-                        }
+        if (mddev->delta_disks >= 0) {
+                int added_devices = 0;
+                list_for_each_entry(rdev, &mddev->disks, same_set)
+                        if (rdev->raid_disk < 0 &&
+                            !test_bit(Faulty, &rdev->flags)) {
+                                if (raid5_add_disk(mddev, rdev) == 0) {
+                                        char nm[20];
+                                        if (rdev->raid_disk
+                                            >= conf->previous_raid_disks) {
+                                                set_bit(In_sync, &rdev->flags);
+                                                added_devices++;
+                                        } else
+                                                rdev->recovery_offset = 0;
+                                        sprintf(nm, "rd%d", rdev->raid_disk);
+                                        if (sysfs_create_link(&mddev->kobj,
+                                                              &rdev->kobj, nm))
+                                                /* Failure here is OK */;
+                                }
+                        } else if (rdev->raid_disk >= conf->previous_raid_disks
+                                   && !test_bit(Faulty, &rdev->flags)) {
+                                /* This is a spare that was manually added */
+                                set_bit(In_sync, &rdev->flags);
+                                added_devices++;
+                        }
 
-        /* When a reshape changes the number of devices, ->degraded
-         * is measured against the larger of the pre and post number of
-         * devices.*/
-        if (mddev->delta_disks > 0) {
+                /* When a reshape changes the number of devices,
+                 * ->degraded is measured against the larger of the
+                 * pre and post number of devices.
+                 */
                 spin_lock_irqsave(&conf->device_lock, flags);
                 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
                         - added_devices;
@@ -5731,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 {
         struct raid0_private_data *raid0_priv = mddev->private;
+        sector_t sectors;
 
         /* for raid0 takeover only one zone is supported */
         if (raid0_priv->nr_strip_zones > 1) {
@@ -5739,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
                 return ERR_PTR(-EINVAL);
         }
 
+        sectors = raid0_priv->strip_zone[0].zone_end;
+        sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+        mddev->dev_sectors = sectors;
         mddev->new_level = level;
         mddev->new_layout = ALGORITHM_PARITY_N;
         mddev->new_chunk_sectors = mddev->chunk_sectors;