Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c  282
1 files changed, 116 insertions, 166 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..b72edf35ec54 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
  *
  * We group bitmap updates into batches. Each batch has a number.
  * We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
  * new additions.
  * When we discover that we will need to write to any block in a stripe
  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
  * When we are ready to do a write, if that batch hasn't been written yet,
  * we plug the array and queue the stripe for later.
  * When an unplug happens, we increment bm_flush, thus closing the current
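The scheme above boils down to two monotonically increasing counters on the array plus a per-stripe sequence number. A minimal sketch of that bookkeeping, with a hypothetical batch_state struct standing in for the relevant raid5_conf_t fields (locking and the delayed/bitmap lists are omitted):

/* Sketch only: mirrors the seq_flush/seq_write bookkeeping described in
 * the comment above; batch_state is a stand-in for raid5_conf_t. */
struct batch_state {
	unsigned int seq_flush;	/* last batch closed to new additions */
	unsigned int seq_write;	/* last batch successfully written */
};

/* add_stripe_bio(): the stripe joins the batch that is still open. */
static inline unsigned int current_batch(const struct batch_state *b)
{
	return b->seq_flush + 1;		/* recorded in sh->bm_seq */
}

/* __release_stripe(): the stripe may be handled only once its batch has
 * reached stable storage; otherwise it waits on the bitmap list. */
static inline int batch_is_written(const struct batch_state *b,
				   unsigned int bm_seq)
{
	return (int)(bm_seq - b->seq_write) <= 0;	/* wrap-safe compare */
}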
@@ -129,7 +129,7 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio)
 
 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 {
-	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
+	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
 }
 
 /* Find first data disk in a raid6 stripe */
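bi_phys_segments is over-loaded in raid5: the low 16 bits count active stripes and the high 16 bits carry the hw-segment count, so the setter has to OR the shifted count into the existing value. The logical OR in the old line collapsed the whole expression to 0 or 1. A small sketch of the intended packing, with hypothetical helper names:

/* Sketch of the 16/16 packing implied by the code above; pack_counts,
 * low_count and high_count are illustrative names, not raid5 helpers. */
static inline unsigned int pack_counts(unsigned int low, unsigned int high)
{
	return (low & 0xffff) | (high << 16);	/* bitwise OR, not || */
}

static inline unsigned int low_count(unsigned int packed)
{
	return packed & 0xffff;			/* active stripe count */
}

static inline unsigned int high_count(unsigned int packed)
{
	return packed >> 16;			/* hw segment count */
}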
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		BUG_ON(!list_empty(&sh->lru));
 		BUG_ON(atomic_read(&conf->active_stripes)==0);
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state)) {
+			if (test_bit(STRIPE_DELAYED, &sh->state))
 				list_add_tail(&sh->lru, &conf->delayed_list);
-				plugger_set_plug(&conf->plug);
-			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-				   sh->bm_seq - conf->seq_write > 0) {
+			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+				   sh->bm_seq - conf->seq_write > 0)
 				list_add_tail(&sh->lru, &conf->bitmap_list);
-				plugger_set_plug(&conf->plug);
-			} else {
+			else {
 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 				list_add_tail(&sh->lru, &conf->handle_list);
 			}
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf)
 	return 0;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 				    < (conf->max_nr_stripes *3/4)
 				    || !conf->inactive_blocked),
 				    conf->device_lock,
-				    md_raid5_unplug_device(conf)
-	);
+				    );
 			conf->inactive_blocked = 0;
 		} else
 			init_stripe(sh, sector, previous);
@@ -506,9 +501,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		int rw;
 		struct bio *bi;
 		mdk_rdev_t *rdev;
-		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = WRITE;
-		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+				rw = WRITE_FUA;
+			else
+				rw = WRITE;
+		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
 		else
 			continue;
@@ -516,7 +514,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		bi = &sh->dev[i].req;
 
 		bi->bi_rw = rw;
-		if (rw == WRITE)
+		if (rw & WRITE)
 			bi->bi_end_io = raid5_end_write_request;
 		else
 			bi->bi_end_io = raid5_end_read_request;
@@ -550,13 +548,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
-			if (rw == WRITE &&
+			if ((rw & WRITE) &&
 			    test_bit(R5_ReWrite, &sh->dev[i].flags))
 				atomic_add(STRIPE_SECTORS,
 					&rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
-			if (rw == WRITE)
+			if (rw & WRITE)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -587,7 +585,7 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
 
 	bio_for_each_segment(bvl, bio, i) {
-		int len = bio_iovec_idx(bio, i)->bv_len;
+		int len = bvl->bv_len;
 		int clen;
 		int b_offset = 0;
 
@@ -603,8 +601,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 			clen = len;
 
 		if (clen > 0) {
-			b_offset += bio_iovec_idx(bio, i)->bv_offset;
-			bio_page = bio_iovec_idx(bio, i)->bv_page;
+			b_offset += bvl->bv_offset;
+			bio_page = bvl->bv_page;
 			if (frombio)
 				tx = async_memcpy(page, bio_page, page_offset,
 						  b_offset, clen, &submit);
@@ -1031,6 +1029,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
+				if (wbi->bi_rw & REQ_FUA)
+					set_bit(R5_WantFUA, &dev->flags);
 				tx = async_copy_data(1, wbi, dev->page,
 					dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1048,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
+	bool fua = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	for (i = disks; i--; )
+		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
-		if (dev->written || i == pd_idx || i == qd_idx)
+		if (dev->written || i == pd_idx || i == qd_idx) {
 			set_bit(R5_UPTODATE, &dev->flags);
+			if (fua)
+				set_bit(R5_WantFUA, &dev->flags);
+		}
 	}
 
 	if (sh->reconstruct_state == reconstruct_state_drain_run)
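Taken together, the FUA hunks above thread the flag through the stripe cache in three steps: ops_run_biodrain notes REQ_FUA on each device it drains data into, ops_complete_reconstruct spreads the flag to the parity blocks once any data block wants it, and ops_run_io finally issues those devices' writes as WRITE_FUA. A condensed, self-contained sketch of that flow (the types and names below are simplified stand-ins, not the real stripe_head):

/* Sketch only: models the REQ_FUA propagation added above with toy types. */
#define TOY_REQ_FUA	(1u << 0)
#define TOY_WRITE	(1u << 1)
#define TOY_WRITE_FUA	(TOY_WRITE | TOY_REQ_FUA)

struct toy_dev { int want_fua; };

/* ops_run_biodrain(): remember the FUA request per device */
static void toy_drain(struct toy_dev *dev, unsigned int bio_rw)
{
	if (bio_rw & TOY_REQ_FUA)
		dev->want_fua = 1;
}

/* ops_complete_reconstruct(): if any device wants FUA, parity must too */
static void toy_complete(struct toy_dev *devs, int disks, int pd_idx)
{
	int i, fua = 0;

	for (i = 0; i < disks; i++)
		fua |= devs[i].want_fua;
	if (fua)
		devs[pd_idx].want_fua = 1;
}

/* ops_run_io(): choose the request type for each device's write */
static unsigned int toy_rw(struct toy_dev *dev)
{
	return dev->want_fua ? TOY_WRITE_FUA : TOY_WRITE;
}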
@@ -1461,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    !list_empty(&conf->inactive_list),
 				    conf->device_lock,
-				    unplug_slaves(conf->mddev)
-			);
+				    );
 		osh = get_free_stripe(conf);
 		spin_unlock_irq(&conf->device_lock);
 		atomic_set(&nsh->count, 1);
@@ -1694,28 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	raid5_conf_t *conf = mddev->private;
 	pr_debug("raid456: error called\n");
 
-	if (!test_bit(Faulty, &rdev->flags)) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		if (test_and_clear_bit(In_sync, &rdev->flags)) {
-			unsigned long flags;
-			spin_lock_irqsave(&conf->device_lock, flags);
-			mddev->degraded++;
-			spin_unlock_irqrestore(&conf->device_lock, flags);
-			/*
-			 * if recovery was running, make sure it aborts.
-			 */
-			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-		}
-		set_bit(Faulty, &rdev->flags);
-		printk(KERN_ALERT
-		       "md/raid:%s: Disk failure on %s, disabling device.\n"
-		       KERN_ALERT
-		       "md/raid:%s: Operation continuing on %d devices.\n",
-		       mdname(mddev),
-		       bdevname(rdev->bdev, b),
-		       mdname(mddev),
-		       conf->raid_disks - mddev->degraded);
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		mddev->degraded++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * if recovery was running, make sure it aborts.
+		 */
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
+	set_bit(Faulty, &rdev->flags);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	printk(KERN_ALERT
+	       "md/raid:%s: Disk failure on %s, disabling device.\n"
+	       "md/raid:%s: Operation continuing on %d devices.\n",
+	       mdname(mddev),
+	       bdevname(rdev->bdev, b),
+	       mdname(mddev),
+	       conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -3281,7 +3284,7 @@ static void handle_stripe5(struct stripe_head *sh)
 
 	if (dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
-		 * is waiting on a barrier, it won't continue until the writes
+		 * is waiting on a flush, it won't continue until the writes
 		 * have actually been submitted.
 		 */
 		atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3586,7 @@ static void handle_stripe6(struct stripe_head *sh)
 
 	if (dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
-		 * is waiting on a barrier, it won't continue until the writes
+		 * is waiting on a flush, it won't continue until the writes
 		 * have actually been submitted.
 		 */
 		atomic_dec(&conf->preread_active_stripes);
@@ -3616,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
 			atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
 		}
-	} else
-		plugger_set_plug(&conf->plug);
+	}
 }
 
 static void activate_bit_delay(raid5_conf_t *conf)
@@ -3634,60 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
 	}
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev->private;
-	int i;
-	int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-	rcu_read_lock();
-	for (i = 0; i < devs; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-
-	if (plugger_remove_plug(&conf->plug)) {
-		conf->seq_flush++;
-		raid5_activate_delayed(conf);
-	}
-	md_wakeup_thread(conf->mddev->thread);
-
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	unplug_slaves(conf->mddev);
-}
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
-
-static void raid5_unplug(struct plug_handle *plug)
-{
-	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-	md_raid5_unplug_device(conf);
-}
-
-static void raid5_unplug_queue(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	md_raid5_unplug_device(mddev->private);
-}
-
 int md_raid5_congested(mddev_t *mddev, int bits)
 {
 	raid5_conf_t *conf = mddev->private;
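All of the md-private unplug machinery removed above is superseded by the block layer's per-task plugging: the submitting task opens a plug on its own stack, queues its requests, and everything is flushed when the plug is closed (or the task sleeps). A minimal sketch of the replacement pattern, as raid5d uses it further down; the actual work loop is elided:

#include <linux/blkdev.h>

/* Sketch only: per-task plugging that replaces unplug_slaves() and
 * md_raid5_unplug_device(); the daemon body here is a placeholder. */
static void plugged_daemon_pass(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);	/* batch requests submitted by this task */
	/* ... submit stripe I/O, retry queued bios, ... */
	blk_finish_plug(&plug);	/* flush everything queued under the plug */
}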
@@ -3864,9 +3812,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
 		return 0;
 	}
 	/*
-	 * use bio_clone to make a copy of the bio
+	 * use bio_clone_mddev to make a copy of the bio
 	 */
-	align_bi = bio_clone(raid_bio, GFP_NOIO);
+	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
 	if (!align_bi)
 		return 0;
 	/*
@@ -3977,15 +3925,10 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
+	int plugged;
 
-	if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
-		/* Drain all pending writes. We only really need
-		 * to ensure they have been submitted, but this is
-		 * easier.
-		 */
-		mddev->pers->quiesce(mddev, 1);
-		mddev->pers->quiesce(mddev, 0);
-		md_barrier_request(mddev, bi);
+	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bi);
 		return 0;
 	}
 
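With barriers gone, a REQ_FLUSH bio no longer needs the quiesce/resume dance: the personality simply hands it to md_flush_request(), which issues the flush to the member devices on md's behalf, and returns. A sketch of that calling convention (make_request_sketch is an illustrative name, not the real entry point):

/* Sketch of the flush hand-off used above, not the full make_request(). */
static int make_request_sketch(mddev_t *mddev, struct bio *bi)
{
	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bi);
		return 0;
	}
	/* ... normal striped read/write path ... */
	return 0;
}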
@@ -4001,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
+	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks, data_disks;
@@ -4014,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 			/* spinlock is needed as reshape_progress may be
 			 * 64bit on a 32bit platform, and so it might be
 			 * possible to see a half-updated value
-			 * Ofcourse reshape_progress could change after
+			 * Of course reshape_progress could change after
 			 * the lock is dropped, so once we get a reference
 			 * to the stripe that we think it is, we will have
 			 * to check again.
@@ -4095,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				 * add failed due to overlap. Flush everything
 				 * and wait a while
 				 */
-				md_raid5_unplug_device(conf);
+				md_wakeup_thread(mddev->thread);
 				release_stripe(sh);
 				schedule();
 				goto retry;
@@ -4103,7 +4047,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
-			if (mddev->barrier &&
+			if ((bi->bi_rw & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe(sh);
@@ -4115,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 		}
 
 	}
+	if (!plugged)
+		md_wakeup_thread(mddev->thread);
+
 	spin_lock_irq(&conf->device_lock);
 	remaining = raid5_dec_bi_phys_segments(bi);
 	spin_unlock_irq(&conf->device_lock);
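The plugged flag closes a small gap left by removing the old unplug hooks: if an md plug is active when the bio arrives, the eventual unplug will wake the raid5d thread, but if not, nothing else would, so make_request must wake it explicitly once all stripes are queued. A compact sketch of that rule (the per-stripe loop above is reduced to a comment):

/* Sketch of the wakeup rule added above. */
static void submit_sketch(mddev_t *mddev, struct bio *bi)
{
	int plugged = mddev_check_plugged(mddev);	/* plug active for this task? */

	/* ... split bi into stripes and queue them, as the loop above does ... */

	if (!plugged)
		md_wakeup_thread(mddev->thread);	/* no unplug will do it for us */
}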
@@ -4126,13 +4073,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 		bio_endio(bi, 0);
 	}
 
-	if (mddev->barrier) {
-		/* We need to wait for the stripes to all be handled.
-		 * So: wait for preread_active_stripes to drop to 0.
-		 */
-		wait_event(mddev->thread->wqueue,
-			   atomic_read(&conf->preread_active_stripes) == 0);
-	}
 	return 0;
 }
 
@@ -4238,7 +4178,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes)==0);
 		mddev->reshape_position = conf->reshape_progress;
-		mddev->curr_resync_completed = mddev->curr_resync;
+		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
@@ -4339,7 +4279,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
 		mddev->reshape_position = conf->reshape_progress;
-		mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
@@ -4361,13 +4301,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	raid5_conf_t *conf = mddev->private;
 	struct stripe_head *sh;
 	sector_t max_sector = mddev->dev_sectors;
-	int sync_blocks;
+	sector_t sync_blocks;
 	int still_degraded = 0;
 	int i;
 
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
-		unplug_slaves(mddev);
 
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
@@ -4524,24 +4463,30 @@ static void raid5d(mddev_t *mddev)
 	struct stripe_head *sh;
 	raid5_conf_t *conf = mddev->private;
 	int handled;
+	struct blk_plug plug;
 
 	pr_debug("+++ raid5d active\n");
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct bio *bio;
 
-		if (conf->seq_flush != conf->seq_write) {
-			int seq = conf->seq_flush;
+		if (atomic_read(&mddev->plug_cnt) == 0 &&
+		    !list_empty(&conf->bitmap_list)) {
+			/* Now is a good time to flush some bitmap updates */
+			conf->seq_flush++;
 			spin_unlock_irq(&conf->device_lock);
 			bitmap_unplug(mddev->bitmap);
 			spin_lock_irq(&conf->device_lock);
-			conf->seq_write = seq;
+			conf->seq_write = conf->seq_flush;
 			activate_bit_delay(conf);
 		}
+		if (atomic_read(&mddev->plug_cnt) == 0)
+			raid5_activate_delayed(conf);
 
 		while ((bio = remove_bio_from_retry(conf))) {
 			int ok;
@@ -4571,7 +4516,7 @@ static void raid5d(mddev_t *mddev)
 	spin_unlock_irq(&conf->device_lock);
 
 	async_tx_issue_pending_all();
-	unplug_slaves(mddev);
+	blk_finish_plug(&plug);
 
 	pr_debug("--- raid5d inactive\n");
 }
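The daemon now gates its batch handling on mddev->plug_cnt: while some task still holds an md plug, more bitmap updates may join the current batch, so raid5d neither closes the batch nor releases delayed stripes until the count drops to zero. A sketch of that decision, using the same seq_flush/seq_write counters as the header comment (simplified, without device_lock):

/* Sketch only: the batch-closing rule raid5d applies above. */
static void flush_batches_sketch(int plug_cnt, int bitmap_list_empty,
				 unsigned int *seq_flush, unsigned int *seq_write)
{
	if (plug_cnt == 0 && !bitmap_list_empty) {
		(*seq_flush)++;			/* close the open batch */
		/* bitmap_unplug() runs here, with device_lock dropped */
		*seq_write = *seq_flush;	/* batch is now on stable storage */
		/* activate_bit_delay() then releases the waiting stripes */
	}
}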
@@ -4913,7 +4858,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
 			       " disk %d\n",
 			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
-		} else
+		} else if (rdev->saved_raid_disk != raid_disk)
 			/* Cannot rely on bitmap to complete recovery */
 			conf->fullsync = 1;
 	}
@@ -5188,8 +5133,6 @@ static int run(mddev_t *mddev)
 			mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
-	plugger_init(&conf->plug, raid5_unplug);
-	mddev->plug = &conf->plug;
 	if (mddev->queue) {
 		int chunk_size;
 		/* read-ahead size must cover two whole stripes, which
@@ -5206,8 +5149,6 @@ static int run(mddev_t *mddev)
 
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-		mddev->queue->queue_lock = &conf->device_lock;
-		mddev->queue->unplug_fn = raid5_unplug_queue;
 
 		chunk_size = mddev->chunk_sectors << 9;
 		blk_queue_io_min(mddev->queue, chunk_size);
@@ -5240,7 +5181,6 @@ static int stop(mddev_t *mddev)
 	mddev->thread = NULL;
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
-	plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
 	free_conf(conf);
 	mddev->private = NULL;
 	mddev->to_remove = &raid5_attrs_group;
@@ -5340,7 +5280,7 @@ static int raid5_spare_active(mddev_t *mddev)
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
-			sysfs_notify_dirent(tmp->rdev->sysfs_state);
+			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
 		}
 	}
 	spin_lock_irqsave(&conf->device_lock, flags);
@@ -5449,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
-	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+	if (sectors > mddev->dev_sectors &&
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
@@ -5519,7 +5460,6 @@ static int raid5_start_reshape(mddev_t *mddev)
 	raid5_conf_t *conf = mddev->private;
 	mdk_rdev_t *rdev;
 	int spares = 0;
-	int added_devices = 0;
 	unsigned long flags;
 
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -5529,8 +5469,8 @@ static int raid5_start_reshape(mddev_t *mddev)
 		return -ENOSPC;
 
 	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk < 0 &&
-		    !test_bit(Faulty, &rdev->flags))
+		if (!test_bit(In_sync, &rdev->flags)
+		    && !test_bit(Faulty, &rdev->flags))
 			spares++;
 
 	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5573,29 +5513,35 @@ static int raid5_start_reshape(mddev_t *mddev)
 	 * to correctly record the "partially reconstructed" state of
 	 * such devices during the reshape and confusion could result.
 	 */
-	if (mddev->delta_disks >= 0)
-		list_for_each_entry(rdev, &mddev->disks, same_set)
-			if (rdev->raid_disk < 0 &&
-			    !test_bit(Faulty, &rdev->flags)) {
-				if (raid5_add_disk(mddev, rdev) == 0) {
-					char nm[20];
-					if (rdev->raid_disk >= conf->previous_raid_disks) {
-						set_bit(In_sync, &rdev->flags);
-						added_devices++;
-					} else
-						rdev->recovery_offset = 0;
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					if (sysfs_create_link(&mddev->kobj,
-							      &rdev->kobj, nm))
-						/* Failure here is OK */;
-				} else
-					break;
-			}
+	if (mddev->delta_disks >= 0) {
+		int added_devices = 0;
+		list_for_each_entry(rdev, &mddev->disks, same_set)
+			if (rdev->raid_disk < 0 &&
+			    !test_bit(Faulty, &rdev->flags)) {
+				if (raid5_add_disk(mddev, rdev) == 0) {
+					char nm[20];
+					if (rdev->raid_disk
+					    >= conf->previous_raid_disks) {
+						set_bit(In_sync, &rdev->flags);
+						added_devices++;
+					} else
+						rdev->recovery_offset = 0;
+					sprintf(nm, "rd%d", rdev->raid_disk);
+					if (sysfs_create_link(&mddev->kobj,
+							      &rdev->kobj, nm))
+						/* Failure here is OK */;
+				}
+			} else if (rdev->raid_disk >= conf->previous_raid_disks
+				   && !test_bit(Faulty, &rdev->flags)) {
+				/* This is a spare that was manually added */
+				set_bit(In_sync, &rdev->flags);
+				added_devices++;
+			}
 
-	/* When a reshape changes the number of devices, ->degraded
-	 * is measured against the larger of the pre and post number of
-	 * devices.*/
-	if (mddev->delta_disks > 0) {
+		/* When a reshape changes the number of devices,
+		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
+		 */
 		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
 			- added_devices;
@@ -5731,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 {
 	struct raid0_private_data *raid0_priv = mddev->private;
+	sector_t sectors;
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_priv->nr_strip_zones > 1) {
@@ -5739,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 		return ERR_PTR(-EINVAL);
 	}
 
+	sectors = raid0_priv->strip_zone[0].zone_end;
+	sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+	mddev->dev_sectors = sectors;
 	mddev->new_level = level;
 	mddev->new_layout = ALGORITHM_PARITY_N;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
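The added size calculation assumes the single raid0 zone spreads evenly over its members: zone_end is the zone's extent in array sectors, so dividing by nb_dev yields the sectors each device contributes, which becomes dev_sectors for the converted array. A worked example with hypothetical numbers:

/* Worked example (hypothetical values) of the calculation added above. */
sector_t sectors = 2000000;		/* strip_zone[0].zone_end: whole zone  */
unsigned int nb_dev = 2;		/* devices backing that zone           */
sector_div(sectors, nb_dev);		/* sectors == 1000000 per device       */
/* mddev->dev_sectors = sectors; */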