-rw-r--r--	drivers/md/raid10.c	890
-rw-r--r--	drivers/md/raid10.h	5
2 files changed, 872 insertions, 23 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f102e88fc785..ec271ae4318f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int enough(struct r10conf *conf, int ignore);
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+				int *skipped);
+static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
+static void end_reshape_write(struct bio *bio, int error);
+static void end_reshape(struct r10conf *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	if (!r10_bio)
 		return NULL;
 
-	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
+	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
 	else
 		nalloc = 2; /* recovery */
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		struct bio *rbio = r10_bio->devs[j].repl_bio;
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
-			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
-						&conf->mddev->recovery)) {
-				/* we can share bv_page's during recovery */
+			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
+					       &conf->mddev->recovery)) {
+				/* we can share bv_page's during recovery
+				 * and reshape */
 				struct bio *rbio = r10_bio->devs[0].bio;
 				page = rbio->bi_io_vec[i].bv_page;
 				get_page(page);
@@ -614,10 +622,11 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	struct r10conf *conf = mddev->private;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_sectors;
+	unsigned int chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 	struct geom *geo = &conf->geo;
 
+	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
 	if (conf->reshape_progress != MaxSector &&
 	    ((sector >= conf->reshape_progress) !=
 	     conf->mddev->reshape_backwards))
@@ -1032,6 +1041,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int plugged;
 	int sectors_handled;
 	int max_sectors;
+	int sectors;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -1096,10 +1106,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	 */
 	wait_barrier(conf);
 
+	sectors = bio->bi_size >> 9;
+	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio->bi_sector < conf->reshape_progress &&
+	    bio->bi_sector + sectors > conf->reshape_progress) {
+		/* IO spans the reshape position.  Need to wait for
+		 * reshape to pass
+		 */
+		allow_barrier(conf);
+		wait_event(conf->wait_barrier,
+			   conf->reshape_progress <= bio->bi_sector ||
+			   conf->reshape_progress >= bio->bi_sector + sectors);
+		wait_barrier(conf);
+	}
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio_data_dir(bio) == WRITE &&
+	    (mddev->reshape_backwards
+	     ? (bio->bi_sector < conf->reshape_safe &&
+		bio->bi_sector + sectors > conf->reshape_progress)
+	     : (bio->bi_sector + sectors > conf->reshape_safe &&
+		bio->bi_sector < conf->reshape_progress))) {
+		/* Need to update reshape_position in metadata */
+		mddev->reshape_position = conf->reshape_progress;
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+
+		conf->reshape_safe = mddev->reshape_position;
+	}
+
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 	r10_bio->master_bio = bio;
-	r10_bio->sectors = bio->bi_size >> 9;
+	r10_bio->sectors = sectors;
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
@@ -1730,7 +1771,11 @@ static void end_sync_read(struct bio *bio, int error)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 
-	d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+	if (bio == r10_bio->master_bio) {
+		/* this is a reshape read */
+		d = r10_bio->read_slot; /* really the read dev */
+	} else
+		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -2631,6 +2676,8 @@ static void raid10d(struct mddev *mddev)
 		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
 		    test_bit(R10BIO_WriteError, &r10_bio->state))
 			handle_write_completed(conf, r10_bio);
+		else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
+			reshape_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2723,7 +2770,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
  skipped:
 	max_sector = mddev->dev_sectors;
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
 		/* If we aborted, we need to abort the
@@ -2735,6 +2783,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		 * we need to convert that to several
 		 * virtual addresses.
 		 */
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+			end_reshape(conf);
+			return 0;
+		}
+
 		if (mddev->curr_resync < max_sector) { /* aborted */
 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -2766,6 +2819,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		*skipped = 1;
 		return sectors_skipped;
 	}
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return reshape_request(mddev, sector_nr, skipped);
+
 	if (chunks_skipped >= conf->geo.raid_disks) {
 		/* if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
@@ -3211,7 +3268,8 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 	struct r10conf *conf = mddev->private;
 
 	if (!raid_disks)
-		raid_disks = conf->geo.raid_disks;
+		raid_disks = min(conf->geo.raid_disks,
+				 conf->prev.raid_disks);
 	if (!sectors)
 		sectors = conf->dev_sectors;
 
@@ -3321,7 +3379,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto out;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+	/* FIXME calc properly */
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
+							    max(0,mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
@@ -3338,9 +3398,21 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		goto out;
 
 	calc_sectors(conf, mddev->dev_sectors);
-	conf->prev = conf->geo;
-	conf->reshape_progress = MaxSector;
-
+	if (mddev->reshape_position == MaxSector) {
+		conf->prev = conf->geo;
+		conf->reshape_progress = MaxSector;
+	} else {
+		if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
+			err = -EINVAL;
+			goto out;
+		}
+		conf->reshape_progress = mddev->reshape_position;
+		if (conf->prev.far_offset)
+			conf->prev.stride = 1 << conf->prev.chunk_shift;
+		else
+			/* far_copies must be 1 */
+			conf->prev.stride = conf->dev_sectors;
+	}
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
@@ -3355,8 +3427,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	return conf;
 
  out:
-	printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
-	       mdname(mddev));
+	if (err == -ENOMEM)
+		printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
+		       mdname(mddev));
 	if (conf) {
 		if (conf->r10bio_pool)
 			mempool_destroy(conf->r10bio_pool);
@@ -3374,12 +3447,8 @@ static int run(struct mddev *mddev)
 	struct mirror_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
-
-	/*
-	 * copy the already verified devices into our private RAID10
-	 * bookkeeping area. [whatever we allocate in run(),
-	 * should be freed in stop()]
-	 */
+	sector_t min_offset_diff = 0;
+	int first = 1;
 
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
@@ -3403,6 +3472,7 @@ static int run(struct mddev *mddev)
 			 (conf->geo.raid_disks / conf->geo.near_copies));
 
 	rdev_for_each(rdev, mddev) {
+		long long diff;
 
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0)
@@ -3421,12 +3491,20 @@ static int run(struct mddev *mddev)
 				goto out_free_conf;
 			disk->rdev = rdev;
 		}
+		diff = (rdev->new_data_offset - rdev->data_offset);
+		if (!mddev->reshape_backwards)
+			diff = -diff;
+		if (diff < 0)
+			diff = 0;
+		if (first || diff < min_offset_diff)
+			min_offset_diff = diff;
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
 
 		disk->head_position = 0;
 	}
+
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
 		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
@@ -3434,6 +3512,16 @@ static int run(struct mddev *mddev)
 		goto out_free_conf;
 	}
 
+	if (conf->reshape_progress != MaxSector) {
+		/* must ensure that shape change is supported */
+		if (conf->geo.far_copies != 1 &&
+		    conf->geo.far_offset == 0)
+			goto out_free_conf;
+		if (conf->prev.far_copies != 1 &&
+		    conf->geo.far_offset == 0)
+			goto out_free_conf;
+	}
+
 	mddev->degraded = 0;
 	for (i = 0;
 	     i < conf->geo.raid_disks
@@ -3486,8 +3574,8 @@ static int run(struct mddev *mddev)
 		int stripe = conf->geo.raid_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
 
 	blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
@@ -3495,6 +3583,30 @@ static int run(struct mddev *mddev)
 	if (md_integrity_register(mddev))
 		goto out_free_conf;
 
+	if (conf->reshape_progress != MaxSector) {
+		unsigned long before_length, after_length;
+
+		before_length = ((1 << conf->prev.chunk_shift) *
+				 conf->prev.far_copies);
+		after_length = ((1 << conf->geo.chunk_shift) *
+				 conf->geo.far_copies);
+
+		if (max(before_length, after_length) > min_offset_diff) {
+			/* This cannot work */
+			printk("md/raid10: offset difference not enough to continue reshape\n");
+			goto out_free_conf;
+		}
+		conf->offset_diff = min_offset_diff;
+
+		conf->reshape_safe = conf->reshape_progress;
+		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+							"reshape");
+	}
+
 	return 0;
 
 out_free_conf:
@@ -3634,6 +3746,735 @@ static void *raid10_takeover(struct mddev *mddev)
 	return ERR_PTR(-EINVAL);
 }
 
+static int raid10_check_reshape(struct mddev *mddev)
+{
+	/* Called when there is a request to change
+	 * - layout (to ->new_layout)
+	 * - chunk size (to ->new_chunk_sectors)
+	 * - raid_disks (by delta_disks)
+	 * or when trying to restart a reshape that was ongoing.
+	 *
+	 * We need to validate the request and possibly allocate
+	 * space if that might be an issue later.
+	 *
+	 * Currently we reject any reshape of a 'far' mode array,
+	 * allow chunk size to change if new is generally acceptable,
+	 * allow raid_disks to increase, and allow
+	 * a switch between 'near' mode and 'offset' mode.
+	 */
+	struct r10conf *conf = mddev->private;
+	struct geom geo;
+
+	if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
+		return -EINVAL;
+
+	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
+		/* mustn't change number of copies */
+		return -EINVAL;
+	if (geo.far_copies > 1 && !geo.far_offset)
+		/* Cannot switch to 'far' mode */
+		return -EINVAL;
+
+	if (mddev->array_sectors & geo.chunk_mask)
+		/* not factor of array size */
+		return -EINVAL;
+
+	if (mddev->bitmap)
+		return -EBUSY;
+	if (!enough(conf, -1))
+		return -EINVAL;
+
+	kfree(conf->mirrors_new);
+	conf->mirrors_new = NULL;
+	if (mddev->delta_disks > 0) {
+		/* allocate new 'mirrors' list */
+		conf->mirrors_new = kzalloc(
+			sizeof(struct mirror_info)
+			*(mddev->raid_disks +
+			  mddev->delta_disks),
+			GFP_KERNEL);
+		if (!conf->mirrors_new)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int calc_degraded(struct r10conf *conf)
+{
+	int degraded, degraded2;
+	int i;
+
+	rcu_read_lock();
+	degraded = 0;
+	/* 'prev' section first */
+	for (i = 0; i < conf->prev.raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (!test_bit(In_sync, &rdev->flags))
+			/* When we can reduce the number of devices in
+			 * an array, this might not contribute to
+			 * 'degraded'.  It does now.
+			 */
+			degraded++;
+	}
+	rcu_read_unlock();
+	if (conf->geo.raid_disks == conf->prev.raid_disks)
+		return degraded;
+	rcu_read_lock();
+	degraded2 = 0;
+	for (i = 0; i < conf->geo.raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded2++;
+		else if (!test_bit(In_sync, &rdev->flags)) {
+			/* If reshape is increasing the number of devices,
+			 * this section has already been recovered, so
+			 * it doesn't contribute to degraded.
+			 * else it does.
+			 */
+			if (conf->geo.raid_disks <= conf->prev.raid_disks)
+				degraded2++;
+		}
+	}
+	rcu_read_unlock();
+	if (degraded2 > degraded)
+		return degraded2;
+	return degraded;
+}
+
+static int raid10_start_reshape(struct mddev *mddev)
+{
+	/* A 'reshape' has been requested. This commits
+	 * the various 'new' fields and sets MD_RECOVER_RESHAPE
+	 * This also checks if there are enough spares and adds them
+	 * to the array.
+	 * We currently require enough spares to make the final
+	 * array non-degraded.  We also require that the difference
+	 * between old and new data_offset - on each device - is
+	 * enough that we never risk over-writing.
+	 */
+
+	unsigned long before_length, after_length;
+	sector_t min_offset_diff = 0;
+	int first = 1;
+	struct geom new;
+	struct r10conf *conf = mddev->private;
+	struct md_rdev *rdev;
+	int spares = 0;
+
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	if (setup_geo(&new, mddev, geo_start) != conf->copies)
+		return -EINVAL;
+
+	before_length = ((1 << conf->prev.chunk_shift) *
+			 conf->prev.far_copies);
+	after_length = ((1 << conf->geo.chunk_shift) *
+			conf->geo.far_copies);
+
+	rdev_for_each(rdev, mddev) {
+		if (!test_bit(In_sync, &rdev->flags)
+		    && !test_bit(Faulty, &rdev->flags))
+			spares++;
+		if (rdev->raid_disk >= 0) {
+			long long diff = (rdev->new_data_offset
+					  - rdev->data_offset);
+			if (!mddev->reshape_backwards)
+				diff = -diff;
+			if (diff < 0)
+				diff = 0;
+			if (first || diff < min_offset_diff)
+				min_offset_diff = diff;
+		}
+	}
+
+	if (max(before_length, after_length) > min_offset_diff)
+		return -EINVAL;
+
+	if (spares < mddev->delta_disks)
+		return -EINVAL;
+
+	conf->offset_diff = min_offset_diff;
+	spin_lock_irq(&conf->device_lock);
+	if (conf->mirrors_new) {
+		memcpy(conf->mirrors_new, conf->mirrors,
+		       sizeof(struct mirror_info)*conf->prev.raid_disks);
+		smp_mb();
+		kfree(conf->mirrors_old); /* FIXME and elsewhere */
+		conf->mirrors_old = conf->mirrors;
+		conf->mirrors = conf->mirrors_new;
+		conf->mirrors_new = NULL;
+	}
+	setup_geo(&conf->geo, mddev, geo_start);
+	smp_mb();
+	if (mddev->reshape_backwards) {
+		sector_t size = raid10_size(mddev, 0, 0);
+		if (size < mddev->array_sectors) {
+			spin_unlock_irq(&conf->device_lock);
+			printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		mddev->resync_max_sectors = size;
+		conf->reshape_progress = size;
+	} else
+		conf->reshape_progress = 0;
+	spin_unlock_irq(&conf->device_lock);
+
+	if (mddev->delta_disks > 0) {
+		rdev_for_each(rdev, mddev)
+			if (rdev->raid_disk < 0 &&
+			    !test_bit(Faulty, &rdev->flags)) {
+				if (raid10_add_disk(mddev, rdev) == 0) {
+					if (rdev->raid_disk >=
+					    conf->prev.raid_disks)
+						set_bit(In_sync, &rdev->flags);
+					else
+						rdev->recovery_offset = 0;
+
+					if (sysfs_link_rdev(mddev, rdev))
+						/* Failure here is OK */;
+				}
+			} else if (rdev->raid_disk >= conf->prev.raid_disks
+				   && !test_bit(Faulty, &rdev->flags)) {
+				/* This is a spare that was manually added */
+				set_bit(In_sync, &rdev->flags);
+			}
+	}
+	/* When a reshape changes the number of devices,
+	 * ->degraded is measured against the larger of the
+	 * pre and post numbers.
+	 */
+	spin_lock_irq(&conf->device_lock);
+	mddev->degraded = calc_degraded(conf);
+	spin_unlock_irq(&conf->device_lock);
+	mddev->raid_disks = conf->geo.raid_disks;
+	mddev->reshape_position = conf->reshape_progress;
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+
+	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+						"reshape");
+	if (!mddev->sync_thread) {
+		mddev->recovery = 0;
+		spin_lock_irq(&conf->device_lock);
+		conf->geo = conf->prev;
+		mddev->raid_disks = conf->geo.raid_disks;
+		rdev_for_each(rdev, mddev)
+			rdev->new_data_offset = rdev->data_offset;
+		smp_wmb();
+		conf->reshape_progress = MaxSector;
+		mddev->reshape_position = MaxSector;
+		spin_unlock_irq(&conf->device_lock);
+		return -EAGAIN;
+	}
+	conf->reshape_checkpoint = jiffies;
+	md_wakeup_thread(mddev->sync_thread);
+	md_new_event(mddev);
+	return 0;
+}
+
+/* Calculate the last device-address that could contain
+ * any block from the chunk that includes the array-address 's'
+ * and report the next address.
+ * i.e. the address returned will be chunk-aligned and after
+ * any data that is in the chunk containing 's'.
+ */
+static sector_t last_dev_address(sector_t s, struct geom *geo)
+{
+	s = (s | geo->chunk_mask) + 1;
+	s >>= geo->chunk_shift;
+	s *= geo->near_copies;
+	s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
+	s *= geo->far_copies;
+	s <<= geo->chunk_shift;
+	return s;
+}
+
+/* Calculate the first device-address that could contain
+ * any block from the chunk that includes the array-address 's'.
+ * This too will be the start of a chunk
+ */
+static sector_t first_dev_address(sector_t s, struct geom *geo)
+{
+	s >>= geo->chunk_shift;
+	s *= geo->near_copies;
+	sector_div(s, geo->raid_disks);
+	s *= geo->far_copies;
+	s <<= geo->chunk_shift;
+	return s;
+}
+
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+				int *skipped)
+{
+	/* We simply copy at most one chunk (smallest of old and new)
+	 * at a time, possibly less if that exceeds RESYNC_PAGES,
+	 * or we hit a bad block or something.
+	 * This might mean we pause for normal IO in the middle of
+	 * a chunk, but that is not a problem was mddev->reshape_position
+	 * can record any location.
+	 *
+	 * If we will want to write to a location that isn't
+	 * yet recorded as 'safe' (i.e. in metadata on disk) then
+	 * we need to flush all reshape requests and update the metadata.
+	 *
+	 * When reshaping forwards (e.g. to more devices), we interpret
+	 * 'safe' as the earliest block which might not have been copied
+	 * down yet.  We divide this by previous stripe size and multiply
+	 * by previous stripe length to get lowest device offset that we
+	 * cannot write to yet.
+	 * We interpret 'sector_nr' as an address that we want to write to.
+	 * From this we use last_device_address() to find where we might
+	 * write to, and first_device_address on the 'safe' position.
+	 * If this 'next' write position is after the 'safe' position,
+	 * we must update the metadata to increase the 'safe' position.
+	 *
+	 * When reshaping backwards, we round in the opposite direction
+	 * and perform the reverse test: next write position must not be
+	 * less than current safe position.
+	 *
+	 * In all this the minimum difference in data offsets
+	 * (conf->offset_diff - always positive) allows a bit of slack,
+	 * so next can be after 'safe', but not by more than offset_disk
+	 *
+	 * We need to prepare all the bios here before we start any IO
+	 * to ensure the size we choose is acceptable to all devices.
+	 * The means one for each copy for write-out and an extra one for
+	 * read-in.
+	 * We store the read-in bio in ->master_bio and the others in
+	 * ->devs[x].bio and ->devs[x].repl_bio.
+	 */
+	struct r10conf *conf = mddev->private;
+	struct r10bio *r10_bio;
+	sector_t next, safe, last;
+	int max_sectors;
+	int nr_sectors;
+	int s;
+	struct md_rdev *rdev;
+	int need_flush = 0;
+	struct bio *blist;
+	struct bio *bio, *read_bio;
+	int sectors_done = 0;
+
+	if (sector_nr == 0) {
+		/* If restarting in the middle, skip the initial sectors */
+		if (mddev->reshape_backwards &&
+		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
+			sector_nr = (raid10_size(mddev, 0, 0)
+				     - conf->reshape_progress);
+		} else if (!mddev->reshape_backwards &&
+			   conf->reshape_progress > 0)
+			sector_nr = conf->reshape_progress;
+		if (sector_nr) {
+			mddev->curr_resync_completed = sector_nr;
+			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+			*skipped = 1;
+			return sector_nr;
+		}
+	}
+
+	/* We don't use sector_nr to track where we are up to
+	 * as that doesn't work well for ->reshape_backwards.
+	 * So just use ->reshape_progress.
+	 */
+	if (mddev->reshape_backwards) {
+		/* 'next' is the earliest device address that we might
+		 * write to for this chunk in the new layout
+		 */
+		next = first_dev_address(conf->reshape_progress - 1,
+					 &conf->geo);
+
+		/* 'safe' is the last device address that we might read from
+		 * in the old layout after a restart
+		 */
+		safe = last_dev_address(conf->reshape_safe - 1,
+					&conf->prev);
+
+		if (next + conf->offset_diff < safe)
+			need_flush = 1;
+
+		last = conf->reshape_progress - 1;
+		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
+					       & conf->prev.chunk_mask);
+		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
+			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+	} else {
+		/* 'next' is after the last device address that we
+		 * might write to for this chunk in the new layout
+		 */
+		next = last_dev_address(conf->reshape_progress, &conf->geo);
+
+		/* 'safe' is the earliest device address that we might
+		 * read from in the old layout after a restart
+		 */
+		safe = first_dev_address(conf->reshape_safe, &conf->prev);
+
+		/* Need to update metadata if 'next' might be beyond 'safe'
+		 * as that would possibly corrupt data
+		 */
+		if (next > safe + conf->offset_diff)
+			need_flush = 1;
+
+		sector_nr = conf->reshape_progress;
+		last = sector_nr | (conf->geo.chunk_mask
+				    & conf->prev.chunk_mask);
+
+		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
+			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+	}
+
+	if (need_flush ||
+	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
+		/* Need to update reshape_position in metadata */
+		wait_barrier(conf);
+		mddev->reshape_position = conf->reshape_progress;
+		if (mddev->reshape_backwards)
+			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
+				- conf->reshape_progress;
+		else
+			mddev->curr_resync_completed = conf->reshape_progress;
+		conf->reshape_checkpoint = jiffies;
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait, mddev->flags == 0 ||
+			   kthread_should_stop());
+		conf->reshape_safe = mddev->reshape_position;
+		allow_barrier(conf);
+	}
+
+read_more:
+	/* Now schedule reads for blocks from sector_nr to last */
+	r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+	raise_barrier(conf, sectors_done != 0);
+	atomic_set(&r10_bio->remaining, 0);
+	r10_bio->mddev = mddev;
+	r10_bio->sector = sector_nr;
+	set_bit(R10BIO_IsReshape, &r10_bio->state);
+	r10_bio->sectors = last - sector_nr + 1;
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
+
+	if (!rdev) {
+		/* Cannot read from here, so need to record bad blocks
+		 * on all the target devices.
+		 */
+		// FIXME
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		return sectors_done;
+	}
+
+	read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
+
+	read_bio->bi_bdev = rdev->bdev;
+	read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+			       + rdev->data_offset);
+	read_bio->bi_private = r10_bio;
+	read_bio->bi_end_io = end_sync_read;
+	read_bio->bi_rw = READ;
+	read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+	read_bio->bi_flags |= 1 << BIO_UPTODATE;
+	read_bio->bi_vcnt = 0;
+	read_bio->bi_idx = 0;
+	read_bio->bi_size = 0;
+	r10_bio->master_bio = read_bio;
+	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+
+	/* Now find the locations in the new layout */
+	__raid10_find_phys(&conf->geo, r10_bio);
+
+	blist = read_bio;
+	read_bio->bi_next = NULL;
+
+	for (s = 0; s < conf->copies*2; s++) {
+		struct bio *b;
+		int d = r10_bio->devs[s/2].devnum;
+		struct md_rdev *rdev2;
+		if (s&1) {
+			rdev2 = conf->mirrors[d].replacement;
+			b = r10_bio->devs[s/2].repl_bio;
+		} else {
+			rdev2 = conf->mirrors[d].rdev;
+			b = r10_bio->devs[s/2].bio;
+		}
+		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
+			continue;
+		b->bi_bdev = rdev2->bdev;
+		b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
+		b->bi_private = r10_bio;
+		b->bi_end_io = end_reshape_write;
+		b->bi_rw = WRITE;
+		b->bi_flags &= ~(BIO_POOL_MASK - 1);
+		b->bi_flags |= 1 << BIO_UPTODATE;
+		b->bi_next = blist;
+		b->bi_vcnt = 0;
+		b->bi_idx = 0;
+		b->bi_size = 0;
+		blist = b;
+	}
+
+	/* Now add as many pages as possible to all of these bios. */
+
+	nr_sectors = 0;
+	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
+		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+		int len = (max_sectors - s) << 9;
+		if (len > PAGE_SIZE)
+			len = PAGE_SIZE;
+		for (bio = blist; bio ; bio = bio->bi_next) {
+			struct bio *bio2;
+			if (bio_add_page(bio, page, len, 0))
+				continue;
+
+			/* Didn't fit, must stop */
+			for (bio2 = blist;
+			     bio2 && bio2 != bio;
+			     bio2 = bio2->bi_next) {
+				/* Remove last page from this bio */
+				bio2->bi_vcnt--;
+				bio2->bi_size -= len;
+				bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
+			}
+			goto bio_full;
+		}
+		sector_nr += len >> 9;
+		nr_sectors += len >> 9;
+	}
+bio_full:
+	r10_bio->sectors = nr_sectors;
+
+	/* Now submit the read */
+	md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
+	atomic_inc(&r10_bio->remaining);
+	read_bio->bi_next = NULL;
+	generic_make_request(read_bio);
+	sector_nr += nr_sectors;
+	sectors_done += nr_sectors;
+	if (sector_nr <= last)
+		goto read_more;
+
+	/* Now that we have done the whole section we can
+	 * update reshape_progress
+	 */
+	if (mddev->reshape_backwards)
+		conf->reshape_progress -= sectors_done;
+	else
+		conf->reshape_progress += sectors_done;
+
+	return sectors_done;
+}
+
+static void end_reshape_request(struct r10bio *r10_bio);
+static int handle_reshape_read_error(struct mddev *mddev,
+				     struct r10bio *r10_bio);
+static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+{
+	/* Reshape read completed.  Hopefully we have a block
+	 * to write out.
+	 * If we got a read error then we do sync 1-page reads from
+	 * elsewhere until we find the data - or give up.
+	 */
+	struct r10conf *conf = mddev->private;
+	int s;
+
+	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+		if (handle_reshape_read_error(mddev, r10_bio) < 0) {
+			/* Reshape has been aborted */
+			md_done_sync(mddev, r10_bio->sectors, 0);
+			return;
+		}
+
+	/* We definitely have the data in the pages, schedule the
+	 * writes.
+	 */
+	atomic_set(&r10_bio->remaining, 1);
+	for (s = 0; s < conf->copies*2; s++) {
+		struct bio *b;
+		int d = r10_bio->devs[s/2].devnum;
+		struct md_rdev *rdev;
+		if (s&1) {
+			rdev = conf->mirrors[d].replacement;
+			b = r10_bio->devs[s/2].repl_bio;
+		} else {
+			rdev = conf->mirrors[d].rdev;
+			b = r10_bio->devs[s/2].bio;
+		}
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			continue;
+		atomic_inc(&rdev->nr_pending);
+		md_sync_acct(b->bi_bdev, r10_bio->sectors);
+		atomic_inc(&r10_bio->remaining);
+		b->bi_next = NULL;
+		generic_make_request(b);
+	}
+	end_reshape_request(r10_bio);
+}
+
+static void end_reshape(struct r10conf *conf)
+{
+	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
+		return;
+
+	spin_lock_irq(&conf->device_lock);
+	conf->prev = conf->geo;
+	md_finish_reshape(conf->mddev);
+	smp_wmb();
+	conf->reshape_progress = MaxSector;
+	spin_unlock_irq(&conf->device_lock);
+
+	/* read-ahead size must cover two whole stripes, which is
+	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
+	 */
+	if (conf->mddev->queue) {
+		int stripe = conf->geo.raid_disks *
+			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
+		stripe /= conf->geo.near_copies;
+		if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+			conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+	}
+	conf->fullsync = 0;
+}
+
+
+static int handle_reshape_read_error(struct mddev *mddev,
+				     struct r10bio *r10_bio)
+{
+	/* Use sync reads to get the blocks from somewhere else */
+	int sectors = r10_bio->sectors;
+	struct r10bio r10b;
+	struct r10conf *conf = mddev->private;
+	int slot = 0;
+	int idx = 0;
+	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+
+	r10b.sector = r10_bio->sector;
+	__raid10_find_phys(&conf->prev, &r10b);
+
+	while (sectors) {
+		int s = sectors;
+		int success = 0;
+		int first_slot = slot;
+
+		if (s > (PAGE_SIZE >> 9))
+			s = PAGE_SIZE >> 9;
+
+		while (!success) {
+			int d = r10b.devs[slot].devnum;
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
+			sector_t addr;
+			if (rdev == NULL ||
+			    test_bit(Faulty, &rdev->flags) ||
+			    !test_bit(In_sync, &rdev->flags))
+				goto failed;
+
+			addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
+			success = sync_page_io(rdev,
+					       addr,
+					       s << 9,
+					       bvec[idx].bv_page,
+					       READ, false);
+			if (success)
+				break;
+		failed:
+			slot++;
+			if (slot >= conf->copies)
+				slot = 0;
+			if (slot == first_slot)
+				break;
+		}
+		if (!success) {
+			/* couldn't read this block, must give up */
+			set_bit(MD_RECOVERY_INTR,
+				&mddev->recovery);
+			return -EIO;
+		}
+		sectors -= s;
+		idx++;
+	}
+	return 0;
+}
+
+static void end_reshape_write(struct bio *bio, int error)
+{
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct r10bio *r10_bio = bio->bi_private;
+	struct mddev *mddev = r10_bio->mddev;
+	struct r10conf *conf = mddev->private;
+	int d;
+	int slot;
+	int repl;
+	struct md_rdev *rdev = NULL;
+
+	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[d].replacement;
+	if (!rdev) {
+		smp_mb();
+		rdev = conf->mirrors[d].rdev;
+	}
+
+	if (!uptodate) {
+		/* FIXME should record badblock */
+		md_error(mddev, rdev);
+	}
+
+	rdev_dec_pending(rdev, mddev);
+	end_reshape_request(r10_bio);
+}
+
+static void end_reshape_request(struct r10bio *r10_bio)
+{
+	if (!atomic_dec_and_test(&r10_bio->remaining))
+		return;
+	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
+	bio_put(r10_bio->master_bio);
+	put_buf(r10_bio);
+}
+
+static void raid10_finish_reshape(struct mddev *mddev)
+{
+	struct r10conf *conf = mddev->private;
+
+	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+		return;
+
+	if (mddev->delta_disks > 0) {
+		sector_t size = raid10_size(mddev, 0, 0);
+		md_set_array_sectors(mddev, size);
+		if (mddev->recovery_cp > mddev->resync_max_sectors) {
+			mddev->recovery_cp = mddev->resync_max_sectors;
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		}
+		mddev->resync_max_sectors = size;
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+	}
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
+	mddev->reshape_position = MaxSector;
+	mddev->delta_disks = 0;
+	mddev->reshape_backwards = 0;
+}
+
 static struct md_personality raid10_personality =
 {
 	.name		= "raid10",
@@ -3652,6 +4493,9 @@ static struct md_personality raid10_personality =
 	.size		= raid10_size,
 	.resize		= raid10_resize,
 	.takeover	= raid10_takeover,
+	.check_reshape	= raid10_check_reshape,
+	.start_reshape	= raid10_start_reshape,
+	.finish_reshape	= raid10_finish_reshape,
 };
 
 static int __init raid_init(void)
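
Note on the two helpers added in this file, last_dev_address() and first_dev_address(): they do the chunk rounding that reshape_request() uses to decide when the on-disk notion of 'safe' must be refreshed before more chunks are copied. The stand-alone C sketch below mirrors that arithmetic so it can be experimented with in user space; the geometry values (128-sector chunks, 4 vs 6 disks, near_copies=2) and the reshape_progress/reshape_safe numbers in main() are invented for illustration and are not taken from the patch.

/* Illustrative user-space sketch, not part of the patch. */
#include <stdio.h>
#include <stdint.h>

struct geom_ex {
	uint64_t chunk_mask;	/* chunk size in sectors, minus 1 */
	unsigned chunk_shift;	/* log2(chunk size in sectors) */
	unsigned near_copies, far_copies, raid_disks;
};

/* chunk-aligned device address just past any data of the chunk holding s */
static uint64_t last_dev_address(uint64_t s, const struct geom_ex *geo)
{
	s = (s | geo->chunk_mask) + 1;		/* round array address up */
	s >>= geo->chunk_shift;			/* -> chunk number */
	s *= geo->near_copies;			/* near copies sit side by side */
	s = (s + geo->raid_disks - 1) / geo->raid_disks; /* chunks per device, rounded up */
	s *= geo->far_copies;
	return s << geo->chunk_shift;		/* back to sectors */
}

/* chunk-aligned device address of the first block of the chunk holding s */
static uint64_t first_dev_address(uint64_t s, const struct geom_ex *geo)
{
	s >>= geo->chunk_shift;			/* round array address down */
	s *= geo->near_copies;
	s /= geo->raid_disks;			/* chunks per device, rounded down */
	s *= geo->far_copies;
	return s << geo->chunk_shift;
}

int main(void)
{
	struct geom_ex new_g  = { 127, 7, 2, 1, 6 };	/* after adding disks (example) */
	struct geom_ex prev_g = { 127, 7, 2, 1, 4 };	/* before the reshape (example) */
	uint64_t reshape_progress = 4096, reshape_safe = 3072, offset_diff = 0;

	/* Forward reshape: writing past what the metadata calls 'safe'
	 * would risk overwriting not-yet-copied data, so flush first. */
	uint64_t next = last_dev_address(reshape_progress, &new_g);
	uint64_t safe = first_dev_address(reshape_safe, &prev_g);
	int need_flush = next > safe + offset_diff;

	printf("next=%llu safe=%llu -> need_flush=%d\n",
	       (unsigned long long)next, (unsigned long long)safe, need_flush);
	return 0;
}
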
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 37509d7134aa..135b1b0a1554 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -14,6 +14,7 @@ struct mirror_info {
 struct r10conf {
 	struct mddev		*mddev;
 	struct mirror_info	*mirrors;
+	struct mirror_info	*mirrors_new, *mirrors_old;
 	spinlock_t		device_lock;
 
 	/* geometry */
@@ -42,6 +43,9 @@ struct r10conf {
 	sector_t		dev_sectors;  /* temp copy of
 					       * mddev->dev_sectors */
 	sector_t		reshape_progress;
+	sector_t		reshape_safe;
+	unsigned long		reshape_checkpoint;
+	sector_t		offset_diff;
 
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
@@ -138,6 +142,7 @@ enum r10bio_state {
 	R10BIO_Uptodate,
 	R10BIO_IsSync,
 	R10BIO_IsRecover,
+	R10BIO_IsReshape,
 	R10BIO_Degraded,
 /* Set ReadError on bios that experience a read error
  * so that raid10d knows what to do with them.
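
A closing note on the offset-difference check: both run() (when restarting an interrupted reshape) and raid10_start_reshape() refuse to proceed unless the smallest per-device gap between data_offset and new_data_offset, oriented by the reshape direction, is at least max(before_length, after_length) - one chunk times far_copies in whichever layout is larger. The sketch below is a stand-alone restatement of that test for experimentation, not kernel code; the device offsets and geometry in main() are made-up example values.

/* Illustrative user-space sketch, not part of the patch. */
#include <stdio.h>
#include <stdint.h>

struct dev_ex {
	uint64_t data_offset, new_data_offset;	/* in sectors */
};

static int reshape_offset_ok(const struct dev_ex *devs, int ndevs,
			     int reshape_backwards,
			     unsigned prev_chunk_shift, unsigned prev_far_copies,
			     unsigned new_chunk_shift, unsigned new_far_copies)
{
	uint64_t before_length = ((uint64_t)1 << prev_chunk_shift) * prev_far_copies;
	uint64_t after_length  = ((uint64_t)1 << new_chunk_shift) * new_far_copies;
	uint64_t min_offset_diff = 0;
	int first = 1;

	for (int i = 0; i < ndevs; i++) {
		/* signed difference, oriented by the direction of the copy */
		int64_t diff = (int64_t)devs[i].new_data_offset
			     - (int64_t)devs[i].data_offset;
		if (!reshape_backwards)
			diff = -diff;
		if (diff < 0)
			diff = 0;
		if (first || (uint64_t)diff < min_offset_diff) {
			min_offset_diff = (uint64_t)diff;
			first = 0;
		}
	}
	/* need room for at least the larger of the old and new chunk spans */
	uint64_t need = before_length > after_length ? before_length : after_length;
	return min_offset_diff >= need;
}

int main(void)
{
	/* forward reshape: data_offset shrinks by 1024 sectors on each device */
	struct dev_ex devs[] = { { 2048, 1024 }, { 2048, 1024 } };

	printf("ok=%d\n", reshape_offset_ok(devs, 2, 0, 7, 1, 7, 1));
	return 0;
}
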