author     NeilBrown <neilb@suse.de>  2012-05-21 23:53:47 -0400
committer  NeilBrown <neilb@suse.de>  2012-05-21 23:53:47 -0400
commit     3ea7daa5d7fde47cd41f4d56c2deb949114da9d6 (patch)
tree       8b88c2f7451219cd32f32753100ffc62cbda9c60 /drivers/md
parent     deb200d08590622d987718135a1e6323f83154aa (diff)
md/raid10: add reshape support
A 'near' or 'offset' layout RAID10 array can be reshaped to a different 'near' or 'offset' layout, a different chunk size, and a different number of devices. However, the number of copies cannot change.

Unlike RAID5/6, we do not support having user-space backup data that is relocated during a 'critical section'. Rather, the data_offset of each device must change so that when writing any block to a new location, it will not over-write any data that is still 'live'. This means that RAID10 reshape is not supportable on v0.90 metadata.

The difference between the old data_offset and the new data_offset must be at least the larger of the chunksize multiplied by offset_copies for each of the old and new layouts (for 'near' mode, offset_copies == 1). A larger difference of around 64M seems useful for in-place reshapes, as more data can be moved between metadata updates. Very large differences (e.g. 512M) seem to slow the process down due to lots of long seeks (on oldish consumer-grade devices at least).

Metadata needs to be updated whenever the place we are about to write to is considered - by the current metadata - to still contain data in the old layout.

[unbalanced locking fix from Dan Carpenter <dan.carpenter@oracle.com>]

Signed-off-by: NeilBrown <neilb@suse.de>
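For illustration, a minimal user-space sketch (not the kernel code itself) of the two constraints described above: the required per-device data_offset gap, and the point at which metadata must be checkpointed during a forward reshape. The names geom_sketch, offset_gap_is_enough and must_update_metadata are invented for this sketch; chunk_shift, far_copies and offset_diff mirror fields used by the patch below.

/*
 * Sketch only: mirrors the "offset gap" and "need to flush metadata"
 * conditions from the patch, in a standalone, compilable form.
 */
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct geom_sketch {
	int chunk_shift;	/* chunk size == 1 << chunk_shift sectors */
	int far_copies;		/* 1 for 'near' layouts, >1 for 'offset' */
};

/*
 * The smallest old->new data_offset gap across all devices must cover a
 * whole chunk's worth of copies in both the old and the new layout, so a
 * reshape write can never land on data that is still live.
 */
static bool offset_gap_is_enough(const struct geom_sketch *prev,
				 const struct geom_sketch *next,
				 sector_t min_offset_diff)
{
	sector_t before = ((sector_t)1 << prev->chunk_shift) * prev->far_copies;
	sector_t after  = ((sector_t)1 << next->chunk_shift) * next->far_copies;

	return min_offset_diff >= (before > after ? before : after);
}

/*
 * Forward reshape: the metadata must be written out before issuing a
 * write once the highest device address we may write in the new layout
 * ('next') gets past the lowest address the old layout might still read
 * after a crash ('safe'), beyond the slack given by the data_offset gap.
 */
static bool must_update_metadata(sector_t next, sector_t safe,
				 sector_t offset_diff)
{
	return next > safe + offset_diff;
}

These correspond, respectively, to the max(before_length, after_length) > min_offset_diff tests in run() and raid10_start_reshape(), and to the need_flush test in reshape_request() in the patch below.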
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/raid10.c  890
-rw-r--r--  drivers/md/raid10.h    5
2 files changed, 872 insertions(+), 23 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f102e88fc785..ec271ae4318f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
27#include "md.h" 28#include "md.h"
28#include "raid10.h" 29#include "raid10.h"
29#include "raid0.h" 30#include "raid0.h"
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
68static void allow_barrier(struct r10conf *conf); 69static void allow_barrier(struct r10conf *conf);
69static void lower_barrier(struct r10conf *conf); 70static void lower_barrier(struct r10conf *conf);
70static int enough(struct r10conf *conf, int ignore); 71static int enough(struct r10conf *conf, int ignore);
72static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
73 int *skipped);
74static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
75static void end_reshape_write(struct bio *bio, int error);
76static void end_reshape(struct r10conf *conf);
71 77
72static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 78static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
73{ 79{
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
112 if (!r10_bio) 118 if (!r10_bio)
113 return NULL; 119 return NULL;
114 120
115 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) 121 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
122 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
116 nalloc = conf->copies; /* resync */ 123 nalloc = conf->copies; /* resync */
117 else 124 else
118 nalloc = 2; /* recovery */ 125 nalloc = 2; /* recovery */
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
140 struct bio *rbio = r10_bio->devs[j].repl_bio; 147 struct bio *rbio = r10_bio->devs[j].repl_bio;
141 bio = r10_bio->devs[j].bio; 148 bio = r10_bio->devs[j].bio;
142 for (i = 0; i < RESYNC_PAGES; i++) { 149 for (i = 0; i < RESYNC_PAGES; i++) {
143 if (j == 1 && !test_bit(MD_RECOVERY_SYNC, 150 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
144 &conf->mddev->recovery)) { 151 &conf->mddev->recovery)) {
145 /* we can share bv_page's during recovery */ 152 /* we can share bv_page's during recovery
153 * and reshape */
146 struct bio *rbio = r10_bio->devs[0].bio; 154 struct bio *rbio = r10_bio->devs[0].bio;
147 page = rbio->bi_io_vec[i].bv_page; 155 page = rbio->bi_io_vec[i].bv_page;
148 get_page(page); 156 get_page(page);
@@ -614,10 +622,11 @@ static int raid10_mergeable_bvec(struct request_queue *q,
614 struct r10conf *conf = mddev->private; 622 struct r10conf *conf = mddev->private;
615 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 623 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
616 int max; 624 int max;
617 unsigned int chunk_sectors = mddev->chunk_sectors; 625 unsigned int chunk_sectors;
618 unsigned int bio_sectors = bvm->bi_size >> 9; 626 unsigned int bio_sectors = bvm->bi_size >> 9;
619 struct geom *geo = &conf->geo; 627 struct geom *geo = &conf->geo;
620 628
629 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
621 if (conf->reshape_progress != MaxSector && 630 if (conf->reshape_progress != MaxSector &&
622 ((sector >= conf->reshape_progress) != 631 ((sector >= conf->reshape_progress) !=
623 conf->mddev->reshape_backwards)) 632 conf->mddev->reshape_backwards))
@@ -1032,6 +1041,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1032 int plugged; 1041 int plugged;
1033 int sectors_handled; 1042 int sectors_handled;
1034 int max_sectors; 1043 int max_sectors;
1044 int sectors;
1035 1045
1036 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 1046 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1037 md_flush_request(mddev, bio); 1047 md_flush_request(mddev, bio);
@@ -1096,10 +1106,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1096 */ 1106 */
1097 wait_barrier(conf); 1107 wait_barrier(conf);
1098 1108
1109 sectors = bio->bi_size >> 9;
1110 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1111 bio->bi_sector < conf->reshape_progress &&
1112 bio->bi_sector + sectors > conf->reshape_progress) {
1113 /* IO spans the reshape position. Need to wait for
1114 * reshape to pass
1115 */
1116 allow_barrier(conf);
1117 wait_event(conf->wait_barrier,
1118 conf->reshape_progress <= bio->bi_sector ||
1119 conf->reshape_progress >= bio->bi_sector + sectors);
1120 wait_barrier(conf);
1121 }
1122 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1123 bio_data_dir(bio) == WRITE &&
1124 (mddev->reshape_backwards
1125 ? (bio->bi_sector < conf->reshape_safe &&
1126 bio->bi_sector + sectors > conf->reshape_progress)
1127 : (bio->bi_sector + sectors > conf->reshape_safe &&
1128 bio->bi_sector < conf->reshape_progress))) {
1129 /* Need to update reshape_position in metadata */
1130 mddev->reshape_position = conf->reshape_progress;
1131 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1132 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1133 md_wakeup_thread(mddev->thread);
1134 wait_event(mddev->sb_wait,
1135 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1136
1137 conf->reshape_safe = mddev->reshape_position;
1138 }
1139
1099 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1140 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1100 1141
1101 r10_bio->master_bio = bio; 1142 r10_bio->master_bio = bio;
1102 r10_bio->sectors = bio->bi_size >> 9; 1143 r10_bio->sectors = sectors;
1103 1144
1104 r10_bio->mddev = mddev; 1145 r10_bio->mddev = mddev;
1105 r10_bio->sector = bio->bi_sector; 1146 r10_bio->sector = bio->bi_sector;
@@ -1730,7 +1771,11 @@ static void end_sync_read(struct bio *bio, int error)
1730 struct r10conf *conf = r10_bio->mddev->private; 1771 struct r10conf *conf = r10_bio->mddev->private;
1731 int d; 1772 int d;
1732 1773
1733 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1774 if (bio == r10_bio->master_bio) {
1775 /* this is a reshape read */
1776 d = r10_bio->read_slot; /* really the read dev */
1777 } else
1778 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1734 1779
1735 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1780 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1736 set_bit(R10BIO_Uptodate, &r10_bio->state); 1781 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -2631,6 +2676,8 @@ static void raid10d(struct mddev *mddev)
2631 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2676 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2632 test_bit(R10BIO_WriteError, &r10_bio->state)) 2677 test_bit(R10BIO_WriteError, &r10_bio->state))
2633 handle_write_completed(conf, r10_bio); 2678 handle_write_completed(conf, r10_bio);
2679 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2680 reshape_request_write(mddev, r10_bio);
2634 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2681 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2635 sync_request_write(mddev, r10_bio); 2682 sync_request_write(mddev, r10_bio);
2636 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2683 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2723,7 +2770,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2723 2770
2724 skipped: 2771 skipped:
2725 max_sector = mddev->dev_sectors; 2772 max_sector = mddev->dev_sectors;
2726 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2773 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2774 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2727 max_sector = mddev->resync_max_sectors; 2775 max_sector = mddev->resync_max_sectors;
2728 if (sector_nr >= max_sector) { 2776 if (sector_nr >= max_sector) {
2729 /* If we aborted, we need to abort the 2777 /* If we aborted, we need to abort the
@@ -2735,6 +2783,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2735 * we need to convert that to several 2783 * we need to convert that to several
2736 * virtual addresses. 2784 * virtual addresses.
2737 */ 2785 */
2786 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2787 end_reshape(conf);
2788 return 0;
2789 }
2790
2738 if (mddev->curr_resync < max_sector) { /* aborted */ 2791 if (mddev->curr_resync < max_sector) { /* aborted */
2739 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2792 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2740 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2793 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -2766,6 +2819,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2766 *skipped = 1; 2819 *skipped = 1;
2767 return sectors_skipped; 2820 return sectors_skipped;
2768 } 2821 }
2822
2823 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2824 return reshape_request(mddev, sector_nr, skipped);
2825
2769 if (chunks_skipped >= conf->geo.raid_disks) { 2826 if (chunks_skipped >= conf->geo.raid_disks) {
2770 /* if there has been nothing to do on any drive, 2827 /* if there has been nothing to do on any drive,
2771 * then there is nothing to do at all.. 2828 * then there is nothing to do at all..
@@ -3211,7 +3268,8 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3211 struct r10conf *conf = mddev->private; 3268 struct r10conf *conf = mddev->private;
3212 3269
3213 if (!raid_disks) 3270 if (!raid_disks)
3214 raid_disks = conf->geo.raid_disks; 3271 raid_disks = min(conf->geo.raid_disks,
3272 conf->prev.raid_disks);
3215 if (!sectors) 3273 if (!sectors)
3216 sectors = conf->dev_sectors; 3274 sectors = conf->dev_sectors;
3217 3275
@@ -3321,7 +3379,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3321 if (!conf) 3379 if (!conf)
3322 goto out; 3380 goto out;
3323 3381
3324 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 3382 /* FIXME calc properly */
3383 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
3384 max(0,mddev->delta_disks)),
3325 GFP_KERNEL); 3385 GFP_KERNEL);
3326 if (!conf->mirrors) 3386 if (!conf->mirrors)
3327 goto out; 3387 goto out;
@@ -3338,9 +3398,21 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3338 goto out; 3398 goto out;
3339 3399
3340 calc_sectors(conf, mddev->dev_sectors); 3400 calc_sectors(conf, mddev->dev_sectors);
3341 conf->prev = conf->geo; 3401 if (mddev->reshape_position == MaxSector) {
3342 conf->reshape_progress = MaxSector; 3402 conf->prev = conf->geo;
3343 3403 conf->reshape_progress = MaxSector;
3404 } else {
3405 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3406 err = -EINVAL;
3407 goto out;
3408 }
3409 conf->reshape_progress = mddev->reshape_position;
3410 if (conf->prev.far_offset)
3411 conf->prev.stride = 1 << conf->prev.chunk_shift;
3412 else
3413 /* far_copies must be 1 */
3414 conf->prev.stride = conf->dev_sectors;
3415 }
3344 spin_lock_init(&conf->device_lock); 3416 spin_lock_init(&conf->device_lock);
3345 INIT_LIST_HEAD(&conf->retry_list); 3417 INIT_LIST_HEAD(&conf->retry_list);
3346 3418
@@ -3355,8 +3427,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3355 return conf; 3427 return conf;
3356 3428
3357 out: 3429 out:
3358 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3430 if (err == -ENOMEM)
3359 mdname(mddev)); 3431 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3432 mdname(mddev));
3360 if (conf) { 3433 if (conf) {
3361 if (conf->r10bio_pool) 3434 if (conf->r10bio_pool)
3362 mempool_destroy(conf->r10bio_pool); 3435 mempool_destroy(conf->r10bio_pool);
@@ -3374,12 +3447,8 @@ static int run(struct mddev *mddev)
3374 struct mirror_info *disk; 3447 struct mirror_info *disk;
3375 struct md_rdev *rdev; 3448 struct md_rdev *rdev;
3376 sector_t size; 3449 sector_t size;
3377 3450 sector_t min_offset_diff = 0;
3378 /* 3451 int first = 1;
3379 * copy the already verified devices into our private RAID10
3380 * bookkeeping area. [whatever we allocate in run(),
3381 * should be freed in stop()]
3382 */
3383 3452
3384 if (mddev->private == NULL) { 3453 if (mddev->private == NULL) {
3385 conf = setup_conf(mddev); 3454 conf = setup_conf(mddev);
@@ -3403,6 +3472,7 @@ static int run(struct mddev *mddev)
3403 (conf->geo.raid_disks / conf->geo.near_copies)); 3472 (conf->geo.raid_disks / conf->geo.near_copies));
3404 3473
3405 rdev_for_each(rdev, mddev) { 3474 rdev_for_each(rdev, mddev) {
3475 long long diff;
3406 3476
3407 disk_idx = rdev->raid_disk; 3477 disk_idx = rdev->raid_disk;
3408 if (disk_idx < 0) 3478 if (disk_idx < 0)
@@ -3421,12 +3491,20 @@ static int run(struct mddev *mddev)
3421 goto out_free_conf; 3491 goto out_free_conf;
3422 disk->rdev = rdev; 3492 disk->rdev = rdev;
3423 } 3493 }
3494 diff = (rdev->new_data_offset - rdev->data_offset);
3495 if (!mddev->reshape_backwards)
3496 diff = -diff;
3497 if (diff < 0)
3498 diff = 0;
3499 if (first || diff < min_offset_diff)
3500 min_offset_diff = diff;
3424 3501
3425 disk_stack_limits(mddev->gendisk, rdev->bdev, 3502 disk_stack_limits(mddev->gendisk, rdev->bdev,
3426 rdev->data_offset << 9); 3503 rdev->data_offset << 9);
3427 3504
3428 disk->head_position = 0; 3505 disk->head_position = 0;
3429 } 3506 }
3507
3430 /* need to check that every block has at least one working mirror */ 3508 /* need to check that every block has at least one working mirror */
3431 if (!enough(conf, -1)) { 3509 if (!enough(conf, -1)) {
3432 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 3510 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
@@ -3434,6 +3512,16 @@ static int run(struct mddev *mddev)
3434 goto out_free_conf; 3512 goto out_free_conf;
3435 } 3513 }
3436 3514
3515 if (conf->reshape_progress != MaxSector) {
3516 /* must ensure that shape change is supported */
3517 if (conf->geo.far_copies != 1 &&
3518 conf->geo.far_offset == 0)
3519 goto out_free_conf;
3520 if (conf->prev.far_copies != 1 &&
3521 conf->geo.far_offset == 0)
3522 goto out_free_conf;
3523 }
3524
3437 mddev->degraded = 0; 3525 mddev->degraded = 0;
3438 for (i = 0; 3526 for (i = 0;
3439 i < conf->geo.raid_disks 3527 i < conf->geo.raid_disks
@@ -3486,8 +3574,8 @@ static int run(struct mddev *mddev)
3486 int stripe = conf->geo.raid_disks * 3574 int stripe = conf->geo.raid_disks *
3487 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3575 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3488 stripe /= conf->geo.near_copies; 3576 stripe /= conf->geo.near_copies;
3489 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 3577 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3490 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3578 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3491 } 3579 }
3492 3580
3493 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3581 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
@@ -3495,6 +3583,30 @@ static int run(struct mddev *mddev)
3495 if (md_integrity_register(mddev)) 3583 if (md_integrity_register(mddev))
3496 goto out_free_conf; 3584 goto out_free_conf;
3497 3585
3586 if (conf->reshape_progress != MaxSector) {
3587 unsigned long before_length, after_length;
3588
3589 before_length = ((1 << conf->prev.chunk_shift) *
3590 conf->prev.far_copies);
3591 after_length = ((1 << conf->geo.chunk_shift) *
3592 conf->geo.far_copies);
3593
3594 if (max(before_length, after_length) > min_offset_diff) {
3595 /* This cannot work */
3596 printk("md/raid10: offset difference not enough to continue reshape\n");
3597 goto out_free_conf;
3598 }
3599 conf->offset_diff = min_offset_diff;
3600
3601 conf->reshape_safe = conf->reshape_progress;
3602 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3603 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3604 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3605 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3606 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3607 "reshape");
3608 }
3609
3498 return 0; 3610 return 0;
3499 3611
3500out_free_conf: 3612out_free_conf:
@@ -3634,6 +3746,735 @@ static void *raid10_takeover(struct mddev *mddev)
3634 return ERR_PTR(-EINVAL); 3746 return ERR_PTR(-EINVAL);
3635} 3747}
3636 3748
3749static int raid10_check_reshape(struct mddev *mddev)
3750{
3751 /* Called when there is a request to change
3752 * - layout (to ->new_layout)
3753 * - chunk size (to ->new_chunk_sectors)
3754 * - raid_disks (by delta_disks)
3755 * or when trying to restart a reshape that was ongoing.
3756 *
3757 * We need to validate the request and possibly allocate
3758 * space if that might be an issue later.
3759 *
3760 * Currently we reject any reshape of a 'far' mode array,
3761 * allow chunk size to change if new is generally acceptable,
3762 * allow raid_disks to increase, and allow
3763 * a switch between 'near' mode and 'offset' mode.
3764 */
3765 struct r10conf *conf = mddev->private;
3766 struct geom geo;
3767
3768 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3769 return -EINVAL;
3770
3771 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3772 /* mustn't change number of copies */
3773 return -EINVAL;
3774 if (geo.far_copies > 1 && !geo.far_offset)
3775 /* Cannot switch to 'far' mode */
3776 return -EINVAL;
3777
3778 if (mddev->array_sectors & geo.chunk_mask)
3779 /* not factor of array size */
3780 return -EINVAL;
3781
3782 if (mddev->bitmap)
3783 return -EBUSY;
3784 if (!enough(conf, -1))
3785 return -EINVAL;
3786
3787 kfree(conf->mirrors_new);
3788 conf->mirrors_new = NULL;
3789 if (mddev->delta_disks > 0) {
3790 /* allocate new 'mirrors' list */
3791 conf->mirrors_new = kzalloc(
3792 sizeof(struct mirror_info)
3793 *(mddev->raid_disks +
3794 mddev->delta_disks),
3795 GFP_KERNEL);
3796 if (!conf->mirrors_new)
3797 return -ENOMEM;
3798 }
3799 return 0;
3800}
3801
3802/*
3803 * Need to check if array has failed when deciding whether to:
3804 * - start an array
3805 * - remove non-faulty devices
3806 * - add a spare
3807 * - allow a reshape
3808 * This determination is simple when no reshape is happening.
3809 * However if there is a reshape, we need to carefully check
3810 * both the before and after sections.
3811 * This is because some failed devices may only affect one
3812 * of the two sections, and some non-in_sync devices may
3813 * be insync in the section most affected by failed devices.
3814 */
3815static int calc_degraded(struct r10conf *conf)
3816{
3817 int degraded, degraded2;
3818 int i;
3819
3820 rcu_read_lock();
3821 degraded = 0;
3822 /* 'prev' section first */
3823 for (i = 0; i < conf->prev.raid_disks; i++) {
3824 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3825 if (!rdev || test_bit(Faulty, &rdev->flags))
3826 degraded++;
3827 else if (!test_bit(In_sync, &rdev->flags))
3828 /* When we can reduce the number of devices in
3829 * an array, this might not contribute to
3830 * 'degraded'. It does now.
3831 */
3832 degraded++;
3833 }
3834 rcu_read_unlock();
3835 if (conf->geo.raid_disks == conf->prev.raid_disks)
3836 return degraded;
3837 rcu_read_lock();
3838 degraded2 = 0;
3839 for (i = 0; i < conf->geo.raid_disks; i++) {
3840 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3841 if (!rdev || test_bit(Faulty, &rdev->flags))
3842 degraded2++;
3843 else if (!test_bit(In_sync, &rdev->flags)) {
3844 /* If reshape is increasing the number of devices,
3845 * this section has already been recovered, so
3846 * it doesn't contribute to degraded.
3847 * else it does.
3848 */
3849 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3850 degraded2++;
3851 }
3852 }
3853 rcu_read_unlock();
3854 if (degraded2 > degraded)
3855 return degraded2;
3856 return degraded;
3857}
3858
3859static int raid10_start_reshape(struct mddev *mddev)
3860{
3861 /* A 'reshape' has been requested. This commits
3862 * the various 'new' fields and sets MD_RECOVER_RESHAPE
3863 * This also checks if there are enough spares and adds them
3864 * to the array.
3865 * We currently require enough spares to make the final
3866 * array non-degraded. We also require that the difference
3867 * between old and new data_offset - on each device - is
3868 * enough that we never risk over-writing.
3869 */
3870
3871 unsigned long before_length, after_length;
3872 sector_t min_offset_diff = 0;
3873 int first = 1;
3874 struct geom new;
3875 struct r10conf *conf = mddev->private;
3876 struct md_rdev *rdev;
3877 int spares = 0;
3878
3879 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3880 return -EBUSY;
3881
3882 if (setup_geo(&new, mddev, geo_start) != conf->copies)
3883 return -EINVAL;
3884
3885 before_length = ((1 << conf->prev.chunk_shift) *
3886 conf->prev.far_copies);
3887 after_length = ((1 << conf->geo.chunk_shift) *
3888 conf->geo.far_copies);
3889
3890 rdev_for_each(rdev, mddev) {
3891 if (!test_bit(In_sync, &rdev->flags)
3892 && !test_bit(Faulty, &rdev->flags))
3893 spares++;
3894 if (rdev->raid_disk >= 0) {
3895 long long diff = (rdev->new_data_offset
3896 - rdev->data_offset);
3897 if (!mddev->reshape_backwards)
3898 diff = -diff;
3899 if (diff < 0)
3900 diff = 0;
3901 if (first || diff < min_offset_diff)
3902 min_offset_diff = diff;
3903 }
3904 }
3905
3906 if (max(before_length, after_length) > min_offset_diff)
3907 return -EINVAL;
3908
3909 if (spares < mddev->delta_disks)
3910 return -EINVAL;
3911
3912 conf->offset_diff = min_offset_diff;
3913 spin_lock_irq(&conf->device_lock);
3914 if (conf->mirrors_new) {
3915 memcpy(conf->mirrors_new, conf->mirrors,
3916 sizeof(struct mirror_info)*conf->prev.raid_disks);
3917 smp_mb();
3918 kfree(conf->mirrors_old); /* FIXME and elsewhere */
3919 conf->mirrors_old = conf->mirrors;
3920 conf->mirrors = conf->mirrors_new;
3921 conf->mirrors_new = NULL;
3922 }
3923 setup_geo(&conf->geo, mddev, geo_start);
3924 smp_mb();
3925 if (mddev->reshape_backwards) {
3926 sector_t size = raid10_size(mddev, 0, 0);
3927 if (size < mddev->array_sectors) {
3928 spin_unlock_irq(&conf->device_lock);
3929 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
3930 mdname(mddev));
3931 return -EINVAL;
3932 }
3933 mddev->resync_max_sectors = size;
3934 conf->reshape_progress = size;
3935 } else
3936 conf->reshape_progress = 0;
3937 spin_unlock_irq(&conf->device_lock);
3938
3939 if (mddev->delta_disks > 0) {
3940 rdev_for_each(rdev, mddev)
3941 if (rdev->raid_disk < 0 &&
3942 !test_bit(Faulty, &rdev->flags)) {
3943 if (raid10_add_disk(mddev, rdev) == 0) {
3944 if (rdev->raid_disk >=
3945 conf->prev.raid_disks)
3946 set_bit(In_sync, &rdev->flags);
3947 else
3948 rdev->recovery_offset = 0;
3949
3950 if (sysfs_link_rdev(mddev, rdev))
3951 /* Failure here is OK */;
3952 }
3953 } else if (rdev->raid_disk >= conf->prev.raid_disks
3954 && !test_bit(Faulty, &rdev->flags)) {
3955 /* This is a spare that was manually added */
3956 set_bit(In_sync, &rdev->flags);
3957 }
3958 }
3959 /* When a reshape changes the number of devices,
3960 * ->degraded is measured against the larger of the
3961 * pre and post numbers.
3962 */
3963 spin_lock_irq(&conf->device_lock);
3964 mddev->degraded = calc_degraded(conf);
3965 spin_unlock_irq(&conf->device_lock);
3966 mddev->raid_disks = conf->geo.raid_disks;
3967 mddev->reshape_position = conf->reshape_progress;
3968 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3969
3970 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3971 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3972 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3973 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3974
3975 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3976 "reshape");
3977 if (!mddev->sync_thread) {
3978 mddev->recovery = 0;
3979 spin_lock_irq(&conf->device_lock);
3980 conf->geo = conf->prev;
3981 mddev->raid_disks = conf->geo.raid_disks;
3982 rdev_for_each(rdev, mddev)
3983 rdev->new_data_offset = rdev->data_offset;
3984 smp_wmb();
3985 conf->reshape_progress = MaxSector;
3986 mddev->reshape_position = MaxSector;
3987 spin_unlock_irq(&conf->device_lock);
3988 return -EAGAIN;
3989 }
3990 conf->reshape_checkpoint = jiffies;
3991 md_wakeup_thread(mddev->sync_thread);
3992 md_new_event(mddev);
3993 return 0;
3994}
3995
3996/* Calculate the last device-address that could contain
3997 * any block from the chunk that includes the array-address 's'
3998 * and report the next address.
3999 * i.e. the address returned will be chunk-aligned and after
4000 * any data that is in the chunk containing 's'.
4001 */
4002static sector_t last_dev_address(sector_t s, struct geom *geo)
4003{
4004 s = (s | geo->chunk_mask) + 1;
4005 s >>= geo->chunk_shift;
4006 s *= geo->near_copies;
4007 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4008 s *= geo->far_copies;
4009 s <<= geo->chunk_shift;
4010 return s;
4011}
4012
4013/* Calculate the first device-address that could contain
4014 * any block from the chunk that includes the array-address 's'.
4015 * This too will be the start of a chunk
4016 */
4017static sector_t first_dev_address(sector_t s, struct geom *geo)
4018{
4019 s >>= geo->chunk_shift;
4020 s *= geo->near_copies;
4021 sector_div(s, geo->raid_disks);
4022 s *= geo->far_copies;
4023 s <<= geo->chunk_shift;
4024 return s;
4025}
4026
4027static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4028 int *skipped)
4029{
4030 /* We simply copy at most one chunk (smallest of old and new)
4031 * at a time, possibly less if that exceeds RESYNC_PAGES,
4032 * or we hit a bad block or something.
4033 * This might mean we pause for normal IO in the middle of
4034 * a chunk, but that is not a problem as mddev->reshape_position
4035 * can record any location.
4036 *
4037 * If we will want to write to a location that isn't
4038 * yet recorded as 'safe' (i.e. in metadata on disk) then
4039 * we need to flush all reshape requests and update the metadata.
4040 *
4041 * When reshaping forwards (e.g. to more devices), we interpret
4042 * 'safe' as the earliest block which might not have been copied
4043 * down yet. We divide this by previous stripe size and multiply
4044 * by previous stripe length to get lowest device offset that we
4045 * cannot write to yet.
4046 * We interpret 'sector_nr' as an address that we want to write to.
4047 * From this we use last_device_address() to find where we might
4048 * write to, and first_device_address on the 'safe' position.
4049 * If this 'next' write position is after the 'safe' position,
4050 * we must update the metadata to increase the 'safe' position.
4051 *
4052 * When reshaping backwards, we round in the opposite direction
4053 * and perform the reverse test: next write position must not be
4054 * less than current safe position.
4055 *
4056 * In all this the minimum difference in data offsets
4057 * (conf->offset_diff - always positive) allows a bit of slack,
4058 * so next can be after 'safe', but not by more than offset_disk
4059 *
4060 * We need to prepare all the bios here before we start any IO
4061 * to ensure the size we choose is acceptable to all devices.
4062 * The means one for each copy for write-out and an extra one for
4063 * read-in.
4064 * We store the read-in bio in ->master_bio and the others in
4065 * ->devs[x].bio and ->devs[x].repl_bio.
4066 */
4067 struct r10conf *conf = mddev->private;
4068 struct r10bio *r10_bio;
4069 sector_t next, safe, last;
4070 int max_sectors;
4071 int nr_sectors;
4072 int s;
4073 struct md_rdev *rdev;
4074 int need_flush = 0;
4075 struct bio *blist;
4076 struct bio *bio, *read_bio;
4077 int sectors_done = 0;
4078
4079 if (sector_nr == 0) {
4080 /* If restarting in the middle, skip the initial sectors */
4081 if (mddev->reshape_backwards &&
4082 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4083 sector_nr = (raid10_size(mddev, 0, 0)
4084 - conf->reshape_progress);
4085 } else if (!mddev->reshape_backwards &&
4086 conf->reshape_progress > 0)
4087 sector_nr = conf->reshape_progress;
4088 if (sector_nr) {
4089 mddev->curr_resync_completed = sector_nr;
4090 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4091 *skipped = 1;
4092 return sector_nr;
4093 }
4094 }
4095
4096 /* We don't use sector_nr to track where we are up to
4097 * as that doesn't work well for ->reshape_backwards.
4098 * So just use ->reshape_progress.
4099 */
4100 if (mddev->reshape_backwards) {
4101 /* 'next' is the earliest device address that we might
4102 * write to for this chunk in the new layout
4103 */
4104 next = first_dev_address(conf->reshape_progress - 1,
4105 &conf->geo);
4106
4107 /* 'safe' is the last device address that we might read from
4108 * in the old layout after a restart
4109 */
4110 safe = last_dev_address(conf->reshape_safe - 1,
4111 &conf->prev);
4112
4113 if (next + conf->offset_diff < safe)
4114 need_flush = 1;
4115
4116 last = conf->reshape_progress - 1;
4117 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4118 & conf->prev.chunk_mask);
4119 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4120 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4121 } else {
4122 /* 'next' is after the last device address that we
4123 * might write to for this chunk in the new layout
4124 */
4125 next = last_dev_address(conf->reshape_progress, &conf->geo);
4126
4127 /* 'safe' is the earliest device address that we might
4128 * read from in the old layout after a restart
4129 */
4130 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4131
4132 /* Need to update metadata if 'next' might be beyond 'safe'
4133 * as that would possibly corrupt data
4134 */
4135 if (next > safe + conf->offset_diff)
4136 need_flush = 1;
4137
4138 sector_nr = conf->reshape_progress;
4139 last = sector_nr | (conf->geo.chunk_mask
4140 & conf->prev.chunk_mask);
4141
4142 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4143 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4144 }
4145
4146 if (need_flush ||
4147 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4148 /* Need to update reshape_position in metadata */
4149 wait_barrier(conf);
4150 mddev->reshape_position = conf->reshape_progress;
4151 if (mddev->reshape_backwards)
4152 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4153 - conf->reshape_progress;
4154 else
4155 mddev->curr_resync_completed = conf->reshape_progress;
4156 conf->reshape_checkpoint = jiffies;
4157 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4158 md_wakeup_thread(mddev->thread);
4159 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4160 kthread_should_stop());
4161 conf->reshape_safe = mddev->reshape_position;
4162 allow_barrier(conf);
4163 }
4164
4165read_more:
4166 /* Now schedule reads for blocks from sector_nr to last */
4167 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4168 raise_barrier(conf, sectors_done != 0);
4169 atomic_set(&r10_bio->remaining, 0);
4170 r10_bio->mddev = mddev;
4171 r10_bio->sector = sector_nr;
4172 set_bit(R10BIO_IsReshape, &r10_bio->state);
4173 r10_bio->sectors = last - sector_nr + 1;
4174 rdev = read_balance(conf, r10_bio, &max_sectors);
4175 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4176
4177 if (!rdev) {
4178 /* Cannot read from here, so need to record bad blocks
4179 * on all the target devices.
4180 */
4181 // FIXME
4182 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4183 return sectors_done;
4184 }
4185
4186 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4187
4188 read_bio->bi_bdev = rdev->bdev;
4189 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4190 + rdev->data_offset);
4191 read_bio->bi_private = r10_bio;
4192 read_bio->bi_end_io = end_sync_read;
4193 read_bio->bi_rw = READ;
4194 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4195 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4196 read_bio->bi_vcnt = 0;
4197 read_bio->bi_idx = 0;
4198 read_bio->bi_size = 0;
4199 r10_bio->master_bio = read_bio;
4200 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4201
4202 /* Now find the locations in the new layout */
4203 __raid10_find_phys(&conf->geo, r10_bio);
4204
4205 blist = read_bio;
4206 read_bio->bi_next = NULL;
4207
4208 for (s = 0; s < conf->copies*2; s++) {
4209 struct bio *b;
4210 int d = r10_bio->devs[s/2].devnum;
4211 struct md_rdev *rdev2;
4212 if (s&1) {
4213 rdev2 = conf->mirrors[d].replacement;
4214 b = r10_bio->devs[s/2].repl_bio;
4215 } else {
4216 rdev2 = conf->mirrors[d].rdev;
4217 b = r10_bio->devs[s/2].bio;
4218 }
4219 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4220 continue;
4221 b->bi_bdev = rdev2->bdev;
4222 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4223 b->bi_private = r10_bio;
4224 b->bi_end_io = end_reshape_write;
4225 b->bi_rw = WRITE;
4226 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4227 b->bi_flags |= 1 << BIO_UPTODATE;
4228 b->bi_next = blist;
4229 b->bi_vcnt = 0;
4230 b->bi_idx = 0;
4231 b->bi_size = 0;
4232 blist = b;
4233 }
4234
4235 /* Now add as many pages as possible to all of these bios. */
4236
4237 nr_sectors = 0;
4238 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4239 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4240 int len = (max_sectors - s) << 9;
4241 if (len > PAGE_SIZE)
4242 len = PAGE_SIZE;
4243 for (bio = blist; bio ; bio = bio->bi_next) {
4244 struct bio *bio2;
4245 if (bio_add_page(bio, page, len, 0))
4246 continue;
4247
4248 /* Didn't fit, must stop */
4249 for (bio2 = blist;
4250 bio2 && bio2 != bio;
4251 bio2 = bio2->bi_next) {
4252 /* Remove last page from this bio */
4253 bio2->bi_vcnt--;
4254 bio2->bi_size -= len;
4255 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4256 }
4257 goto bio_full;
4258 }
4259 sector_nr += len >> 9;
4260 nr_sectors += len >> 9;
4261 }
4262bio_full:
4263 r10_bio->sectors = nr_sectors;
4264
4265 /* Now submit the read */
4266 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4267 atomic_inc(&r10_bio->remaining);
4268 read_bio->bi_next = NULL;
4269 generic_make_request(read_bio);
4270 sector_nr += nr_sectors;
4271 sectors_done += nr_sectors;
4272 if (sector_nr <= last)
4273 goto read_more;
4274
4275 /* Now that we have done the whole section we can
4276 * update reshape_progress
4277 */
4278 if (mddev->reshape_backwards)
4279 conf->reshape_progress -= sectors_done;
4280 else
4281 conf->reshape_progress += sectors_done;
4282
4283 return sectors_done;
4284}
4285
4286static void end_reshape_request(struct r10bio *r10_bio);
4287static int handle_reshape_read_error(struct mddev *mddev,
4288 struct r10bio *r10_bio);
4289static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4290{
4291 /* Reshape read completed. Hopefully we have a block
4292 * to write out.
4293 * If we got a read error then we do sync 1-page reads from
4294 * elsewhere until we find the data - or give up.
4295 */
4296 struct r10conf *conf = mddev->private;
4297 int s;
4298
4299 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4300 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4301 /* Reshape has been aborted */
4302 md_done_sync(mddev, r10_bio->sectors, 0);
4303 return;
4304 }
4305
4306 /* We definitely have the data in the pages, schedule the
4307 * writes.
4308 */
4309 atomic_set(&r10_bio->remaining, 1);
4310 for (s = 0; s < conf->copies*2; s++) {
4311 struct bio *b;
4312 int d = r10_bio->devs[s/2].devnum;
4313 struct md_rdev *rdev;
4314 if (s&1) {
4315 rdev = conf->mirrors[d].replacement;
4316 b = r10_bio->devs[s/2].repl_bio;
4317 } else {
4318 rdev = conf->mirrors[d].rdev;
4319 b = r10_bio->devs[s/2].bio;
4320 }
4321 if (!rdev || test_bit(Faulty, &rdev->flags))
4322 continue;
4323 atomic_inc(&rdev->nr_pending);
4324 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4325 atomic_inc(&r10_bio->remaining);
4326 b->bi_next = NULL;
4327 generic_make_request(b);
4328 }
4329 end_reshape_request(r10_bio);
4330}
4331
4332static void end_reshape(struct r10conf *conf)
4333{
4334 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4335 return;
4336
4337 spin_lock_irq(&conf->device_lock);
4338 conf->prev = conf->geo;
4339 md_finish_reshape(conf->mddev);
4340 smp_wmb();
4341 conf->reshape_progress = MaxSector;
4342 spin_unlock_irq(&conf->device_lock);
4343
4344 /* read-ahead size must cover two whole stripes, which is
4345 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4346 */
4347 if (conf->mddev->queue) {
4348 int stripe = conf->geo.raid_disks *
4349 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4350 stripe /= conf->geo.near_copies;
4351 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4352 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4353 }
4354 conf->fullsync = 0;
4355}
4356
4357
4358static int handle_reshape_read_error(struct mddev *mddev,
4359 struct r10bio *r10_bio)
4360{
4361 /* Use sync reads to get the blocks from somewhere else */
4362 int sectors = r10_bio->sectors;
4363 struct r10bio r10b;
4364 struct r10conf *conf = mddev->private;
4365 int slot = 0;
4366 int idx = 0;
4367 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4368
4369 r10b.sector = r10_bio->sector;
4370 __raid10_find_phys(&conf->prev, &r10b);
4371
4372 while (sectors) {
4373 int s = sectors;
4374 int success = 0;
4375 int first_slot = slot;
4376
4377 if (s > (PAGE_SIZE >> 9))
4378 s = PAGE_SIZE >> 9;
4379
4380 while (!success) {
4381 int d = r10b.devs[slot].devnum;
4382 struct md_rdev *rdev = conf->mirrors[d].rdev;
4383 sector_t addr;
4384 if (rdev == NULL ||
4385 test_bit(Faulty, &rdev->flags) ||
4386 !test_bit(In_sync, &rdev->flags))
4387 goto failed;
4388
4389 addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
4390 success = sync_page_io(rdev,
4391 addr,
4392 s << 9,
4393 bvec[idx].bv_page,
4394 READ, false);
4395 if (success)
4396 break;
4397 failed:
4398 slot++;
4399 if (slot >= conf->copies)
4400 slot = 0;
4401 if (slot == first_slot)
4402 break;
4403 }
4404 if (!success) {
4405 /* couldn't read this block, must give up */
4406 set_bit(MD_RECOVERY_INTR,
4407 &mddev->recovery);
4408 return -EIO;
4409 }
4410 sectors -= s;
4411 idx++;
4412 }
4413 return 0;
4414}
4415
4416static void end_reshape_write(struct bio *bio, int error)
4417{
4418 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4419 struct r10bio *r10_bio = bio->bi_private;
4420 struct mddev *mddev = r10_bio->mddev;
4421 struct r10conf *conf = mddev->private;
4422 int d;
4423 int slot;
4424 int repl;
4425 struct md_rdev *rdev = NULL;
4426
4427 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4428 if (repl)
4429 rdev = conf->mirrors[d].replacement;
4430 if (!rdev) {
4431 smp_mb();
4432 rdev = conf->mirrors[d].rdev;
4433 }
4434
4435 if (!uptodate) {
4436 /* FIXME should record badblock */
4437 md_error(mddev, rdev);
4438 }
4439
4440 rdev_dec_pending(rdev, mddev);
4441 end_reshape_request(r10_bio);
4442}
4443
4444static void end_reshape_request(struct r10bio *r10_bio)
4445{
4446 if (!atomic_dec_and_test(&r10_bio->remaining))
4447 return;
4448 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4449 bio_put(r10_bio->master_bio);
4450 put_buf(r10_bio);
4451}
4452
4453static void raid10_finish_reshape(struct mddev *mddev)
4454{
4455 struct r10conf *conf = mddev->private;
4456
4457 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4458 return;
4459
4460 if (mddev->delta_disks > 0) {
4461 sector_t size = raid10_size(mddev, 0, 0);
4462 md_set_array_sectors(mddev, size);
4463 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4464 mddev->recovery_cp = mddev->resync_max_sectors;
4465 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4466 }
4467 mddev->resync_max_sectors = size;
4468 set_capacity(mddev->gendisk, mddev->array_sectors);
4469 revalidate_disk(mddev->gendisk);
4470 }
4471 mddev->layout = mddev->new_layout;
4472 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4473 mddev->reshape_position = MaxSector;
4474 mddev->delta_disks = 0;
4475 mddev->reshape_backwards = 0;
4476}
4477
3637static struct md_personality raid10_personality = 4478static struct md_personality raid10_personality =
3638{ 4479{
3639 .name = "raid10", 4480 .name = "raid10",
@@ -3652,6 +4493,9 @@ static struct md_personality raid10_personality =
3652 .size = raid10_size, 4493 .size = raid10_size,
3653 .resize = raid10_resize, 4494 .resize = raid10_resize,
3654 .takeover = raid10_takeover, 4495 .takeover = raid10_takeover,
4496 .check_reshape = raid10_check_reshape,
4497 .start_reshape = raid10_start_reshape,
4498 .finish_reshape = raid10_finish_reshape,
3655}; 4499};
3656 4500
3657static int __init raid_init(void) 4501static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 37509d7134aa..135b1b0a1554 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -14,6 +14,7 @@ struct mirror_info {
14struct r10conf { 14struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct mirror_info *mirrors; 16 struct mirror_info *mirrors;
17 struct mirror_info *mirrors_new, *mirrors_old;
17 spinlock_t device_lock; 18 spinlock_t device_lock;
18 19
19 /* geometry */ 20 /* geometry */
@@ -42,6 +43,9 @@ struct r10conf {
42 sector_t dev_sectors; /* temp copy of 43 sector_t dev_sectors; /* temp copy of
43 * mddev->dev_sectors */ 44 * mddev->dev_sectors */
44 sector_t reshape_progress; 45 sector_t reshape_progress;
46 sector_t reshape_safe;
47 unsigned long reshape_checkpoint;
48 sector_t offset_diff;
45 49
46 struct list_head retry_list; 50 struct list_head retry_list;
47 /* queue pending writes and submit them on unplug */ 51 /* queue pending writes and submit them on unplug */
@@ -138,6 +142,7 @@ enum r10bio_state {
138 R10BIO_Uptodate, 142 R10BIO_Uptodate,
139 R10BIO_IsSync, 143 R10BIO_IsSync,
140 R10BIO_IsRecover, 144 R10BIO_IsRecover,
145 R10BIO_IsReshape,
141 R10BIO_Degraded, 146 R10BIO_Degraded,
142/* Set ReadError on bios that experience a read error 147/* Set ReadError on bios that experience a read error
143 * so that raid10d knows what to do with them. 148 * so that raid10d knows what to do with them.