Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--	drivers/md/raid10.c	1281
1 file changed, 1128 insertions, 153 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3f91c2e1dfe7..987db37cb875 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
27#include "md.h" 28#include "md.h"
28#include "raid10.h" 29#include "raid10.h"
29#include "raid0.h" 30#include "raid0.h"
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
68static void allow_barrier(struct r10conf *conf); 69static void allow_barrier(struct r10conf *conf);
69static void lower_barrier(struct r10conf *conf); 70static void lower_barrier(struct r10conf *conf);
70static int enough(struct r10conf *conf, int ignore); 71static int enough(struct r10conf *conf, int ignore);
72static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
73 int *skipped);
74static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
75static void end_reshape_write(struct bio *bio, int error);
76static void end_reshape(struct r10conf *conf);
71 77
72static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 78static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
73{ 79{
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
112 if (!r10_bio) 118 if (!r10_bio)
113 return NULL; 119 return NULL;
114 120
115 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) 121 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
122 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
116 nalloc = conf->copies; /* resync */ 123 nalloc = conf->copies; /* resync */
117 else 124 else
118 nalloc = 2; /* recovery */ 125 nalloc = 2; /* recovery */
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
140 struct bio *rbio = r10_bio->devs[j].repl_bio; 147 struct bio *rbio = r10_bio->devs[j].repl_bio;
141 bio = r10_bio->devs[j].bio; 148 bio = r10_bio->devs[j].bio;
142 for (i = 0; i < RESYNC_PAGES; i++) { 149 for (i = 0; i < RESYNC_PAGES; i++) {
143 if (j == 1 && !test_bit(MD_RECOVERY_SYNC, 150 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
144 &conf->mddev->recovery)) { 151 &conf->mddev->recovery)) {
145 /* we can share bv_page's during recovery */ 152 /* we can share bv_page's during recovery
153 * and reshape */
146 struct bio *rbio = r10_bio->devs[0].bio; 154 struct bio *rbio = r10_bio->devs[0].bio;
147 page = rbio->bi_io_vec[i].bv_page; 155 page = rbio->bi_io_vec[i].bv_page;
148 get_page(page); 156 get_page(page);
@@ -165,10 +173,11 @@ out_free_pages:
165 while (j--) 173 while (j--)
166 for (i = 0; i < RESYNC_PAGES ; i++) 174 for (i = 0; i < RESYNC_PAGES ; i++)
167 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 175 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
168 j = -1; 176 j = 0;
169out_free_bio: 177out_free_bio:
170 while (++j < nalloc) { 178 for ( ; j < nalloc; j++) {
171 bio_put(r10_bio->devs[j].bio); 179 if (r10_bio->devs[j].bio)
180 bio_put(r10_bio->devs[j].bio);
172 if (r10_bio->devs[j].repl_bio) 181 if (r10_bio->devs[j].repl_bio)
173 bio_put(r10_bio->devs[j].repl_bio); 182 bio_put(r10_bio->devs[j].repl_bio);
174 } 183 }
@@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error)
504 * sector offset to a virtual address 513 * sector offset to a virtual address
505 */ 514 */
506 515
507static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 516static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
508{ 517{
509 int n,f; 518 int n,f;
510 sector_t sector; 519 sector_t sector;
511 sector_t chunk; 520 sector_t chunk;
512 sector_t stripe; 521 sector_t stripe;
513 int dev; 522 int dev;
514
515 int slot = 0; 523 int slot = 0;
516 524
517 /* now calculate first sector/dev */ 525 /* now calculate first sector/dev */
518 chunk = r10bio->sector >> conf->chunk_shift; 526 chunk = r10bio->sector >> geo->chunk_shift;
519 sector = r10bio->sector & conf->chunk_mask; 527 sector = r10bio->sector & geo->chunk_mask;
520 528
521 chunk *= conf->near_copies; 529 chunk *= geo->near_copies;
522 stripe = chunk; 530 stripe = chunk;
523 dev = sector_div(stripe, conf->raid_disks); 531 dev = sector_div(stripe, geo->raid_disks);
524 if (conf->far_offset) 532 if (geo->far_offset)
525 stripe *= conf->far_copies; 533 stripe *= geo->far_copies;
526 534
527 sector += stripe << conf->chunk_shift; 535 sector += stripe << geo->chunk_shift;
528 536
529 /* and calculate all the others */ 537 /* and calculate all the others */
530 for (n=0; n < conf->near_copies; n++) { 538 for (n = 0; n < geo->near_copies; n++) {
531 int d = dev; 539 int d = dev;
532 sector_t s = sector; 540 sector_t s = sector;
533 r10bio->devs[slot].addr = sector; 541 r10bio->devs[slot].addr = sector;
534 r10bio->devs[slot].devnum = d; 542 r10bio->devs[slot].devnum = d;
535 slot++; 543 slot++;
536 544
537 for (f = 1; f < conf->far_copies; f++) { 545 for (f = 1; f < geo->far_copies; f++) {
538 d += conf->near_copies; 546 d += geo->near_copies;
539 if (d >= conf->raid_disks) 547 if (d >= geo->raid_disks)
540 d -= conf->raid_disks; 548 d -= geo->raid_disks;
541 s += conf->stride; 549 s += geo->stride;
542 r10bio->devs[slot].devnum = d; 550 r10bio->devs[slot].devnum = d;
543 r10bio->devs[slot].addr = s; 551 r10bio->devs[slot].addr = s;
544 slot++; 552 slot++;
545 } 553 }
546 dev++; 554 dev++;
547 if (dev >= conf->raid_disks) { 555 if (dev >= geo->raid_disks) {
548 dev = 0; 556 dev = 0;
549 sector += (conf->chunk_mask + 1); 557 sector += (geo->chunk_mask + 1);
550 } 558 }
551 } 559 }
552 BUG_ON(slot != conf->copies); 560}
561
562static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
563{
564 struct geom *geo = &conf->geo;
565
566 if (conf->reshape_progress != MaxSector &&
567 ((r10bio->sector >= conf->reshape_progress) !=
568 conf->mddev->reshape_backwards)) {
569 set_bit(R10BIO_Previous, &r10bio->state);
570 geo = &conf->prev;
571 } else
572 clear_bit(R10BIO_Previous, &r10bio->state);
573
574 __raid10_find_phys(geo, r10bio);
553} 575}
554 576
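/* Editorial sketch (not part of the patch): a minimal user-space model of the
 * address arithmetic in __raid10_find_phys() above, assuming a simplified
 * geometry struct and plain 64-bit division in place of sector_div().
 * All names and figures here are illustrative, not from the patch.
 */
#include <stdint.h>
#include <stdio.h>

struct geom_sketch {
	int raid_disks, near_copies, far_copies, far_offset;
	uint64_t stride;	/* sectors between far copies (when !far_offset) */
	uint64_t chunk_mask;	/* chunk_sectors - 1 */
	int chunk_shift;	/* log2(chunk_sectors) */
};

/* Map a logical array sector to (device, device-sector) for every copy. */
static void find_phys_sketch(const struct geom_sketch *geo, uint64_t lsect)
{
	uint64_t chunk = lsect >> geo->chunk_shift;
	uint64_t sector = lsect & geo->chunk_mask;
	uint64_t stripe;
	int dev, n, f;

	chunk *= geo->near_copies;
	stripe = chunk / geo->raid_disks;
	dev = chunk % geo->raid_disks;
	if (geo->far_offset)
		stripe *= geo->far_copies;
	sector += stripe << geo->chunk_shift;

	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		uint64_t s = sector;

		printf("copy: dev %d sector %llu\n", d, (unsigned long long)s);
		for (f = 1; f < geo->far_copies; f++) {
			d += geo->near_copies;
			if (d >= geo->raid_disks)
				d -= geo->raid_disks;
			s += geo->stride;
			printf("copy: dev %d sector %llu\n", d, (unsigned long long)s);
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += geo->chunk_mask + 1;
		}
	}
}

int main(void)
{
	/* 4 disks, 2 near copies, 128-sector (64K) chunks: a classic 'n2' layout */
	struct geom_sketch g = { 4, 2, 1, 0, 0, 127, 7 };

	find_phys_sketch(&g, 300);	/* chunk 2, offset 44 within the chunk */
	return 0;			/* prints dev 0 and dev 1, both at sector 172 */
}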
555static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 577static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
556{ 578{
557 sector_t offset, chunk, vchunk; 579 sector_t offset, chunk, vchunk;
580 /* Never use conf->prev as this is only called during resync
581 * or recovery, so reshape isn't happening
582 */
583 struct geom *geo = &conf->geo;
558 584
559 offset = sector & conf->chunk_mask; 585 offset = sector & geo->chunk_mask;
560 if (conf->far_offset) { 586 if (geo->far_offset) {
561 int fc; 587 int fc;
562 chunk = sector >> conf->chunk_shift; 588 chunk = sector >> geo->chunk_shift;
563 fc = sector_div(chunk, conf->far_copies); 589 fc = sector_div(chunk, geo->far_copies);
564 dev -= fc * conf->near_copies; 590 dev -= fc * geo->near_copies;
565 if (dev < 0) 591 if (dev < 0)
566 dev += conf->raid_disks; 592 dev += geo->raid_disks;
567 } else { 593 } else {
568 while (sector >= conf->stride) { 594 while (sector >= geo->stride) {
569 sector -= conf->stride; 595 sector -= geo->stride;
570 if (dev < conf->near_copies) 596 if (dev < geo->near_copies)
571 dev += conf->raid_disks - conf->near_copies; 597 dev += geo->raid_disks - geo->near_copies;
572 else 598 else
573 dev -= conf->near_copies; 599 dev -= geo->near_copies;
574 } 600 }
575 chunk = sector >> conf->chunk_shift; 601 chunk = sector >> geo->chunk_shift;
576 } 602 }
577 vchunk = chunk * conf->raid_disks + dev; 603 vchunk = chunk * geo->raid_disks + dev;
578 sector_div(vchunk, conf->near_copies); 604 sector_div(vchunk, geo->near_copies);
579 return (vchunk << conf->chunk_shift) + offset; 605 return (vchunk << geo->chunk_shift) + offset;
580} 606}
581 607
582/** 608/**
@@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q,
597 struct r10conf *conf = mddev->private; 623 struct r10conf *conf = mddev->private;
598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
599 int max; 625 int max;
600 unsigned int chunk_sectors = mddev->chunk_sectors; 626 unsigned int chunk_sectors;
601 unsigned int bio_sectors = bvm->bi_size >> 9; 627 unsigned int bio_sectors = bvm->bi_size >> 9;
628 struct geom *geo = &conf->geo;
629
630 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
631 if (conf->reshape_progress != MaxSector &&
632 ((sector >= conf->reshape_progress) !=
633 conf->mddev->reshape_backwards))
634 geo = &conf->prev;
602 635
603 if (conf->near_copies < conf->raid_disks) { 636 if (geo->near_copies < geo->raid_disks) {
604 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) 637 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
605 + bio_sectors)) << 9; 638 + bio_sectors)) << 9;
606 if (max < 0) 639 if (max < 0)
@@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
614 if (mddev->merge_check_needed) { 647 if (mddev->merge_check_needed) {
615 struct r10bio r10_bio; 648 struct r10bio r10_bio;
616 int s; 649 int s;
650 if (conf->reshape_progress != MaxSector) {
651 /* Cannot give any guidance during reshape */
652 if (max <= biovec->bv_len && bio_sectors == 0)
653 return biovec->bv_len;
654 return 0;
655 }
617 r10_bio.sector = sector; 656 r10_bio.sector = sector;
618 raid10_find_phys(conf, &r10_bio); 657 raid10_find_phys(conf, &r10_bio);
619 rcu_read_lock(); 658 rcu_read_lock();
@@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
681 struct md_rdev *rdev, *best_rdev; 720 struct md_rdev *rdev, *best_rdev;
682 int do_balance; 721 int do_balance;
683 int best_slot; 722 int best_slot;
723 struct geom *geo = &conf->geo;
684 724
685 raid10_find_phys(conf, r10_bio); 725 raid10_find_phys(conf, r10_bio);
686 rcu_read_lock(); 726 rcu_read_lock();
@@ -761,11 +801,11 @@ retry:
761 * sequential read speed for 'far copies' arrays. So only 801 * sequential read speed for 'far copies' arrays. So only
762 * keep it for 'near' arrays, and review those later. 802 * keep it for 'near' arrays, and review those later.
763 */ 803 */
764 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 804 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
765 break; 805 break;
766 806
767 /* for far > 1 always use the lowest address */ 807 /* for far > 1 always use the lowest address */
768 if (conf->far_copies > 1) 808 if (geo->far_copies > 1)
769 new_distance = r10_bio->devs[slot].addr; 809 new_distance = r10_bio->devs[slot].addr;
770 else 810 else
771 new_distance = abs(r10_bio->devs[slot].addr - 811 new_distance = abs(r10_bio->devs[slot].addr -
@@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits)
812 if (mddev_congested(mddev, bits)) 852 if (mddev_congested(mddev, bits))
813 return 1; 853 return 1;
814 rcu_read_lock(); 854 rcu_read_lock();
815 for (i = 0; i < conf->raid_disks && ret == 0; i++) { 855 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
857 && ret == 0;
858 i++) {
816 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 859 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
817 if (rdev && !test_bit(Faulty, &rdev->flags)) { 860 if (rdev && !test_bit(Faulty, &rdev->flags)) {
818 struct request_queue *q = bdev_get_queue(rdev->bdev); 861 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf)
973 spin_unlock_irq(&conf->resync_lock); 1016 spin_unlock_irq(&conf->resync_lock);
974} 1017}
975 1018
1019static sector_t choose_data_offset(struct r10bio *r10_bio,
1020 struct md_rdev *rdev)
1021{
1022 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1023 test_bit(R10BIO_Previous, &r10_bio->state))
1024 return rdev->data_offset;
1025 else
1026 return rdev->new_data_offset;
1027}
1028
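/* Editorial sketch (not part of the patch): the rule choose_data_offset()
 * above implements, restated standalone.  While a reshape is running, any
 * r10_bio that was mapped with the *previous* geometry (R10BIO_Previous set
 * in raid10_find_phys()) still lives at the old rdev->data_offset; anything
 * mapped with the new geometry uses rdev->new_data_offset.  Simplified,
 * hypothetical types; for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>

struct rdev_sketch { uint64_t data_offset, new_data_offset; };

uint64_t choose_offset_sketch(const struct rdev_sketch *rdev,
			      bool reshape_running, bool prev_geometry)
{
	if (!reshape_running || prev_geometry)
		return rdev->data_offset;	/* not reshaped yet, or no reshape */
	return rdev->new_data_offset;		/* already in the new layout */
}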
976static void make_request(struct mddev *mddev, struct bio * bio) 1029static void make_request(struct mddev *mddev, struct bio * bio)
977{ 1030{
978 struct r10conf *conf = mddev->private; 1031 struct r10conf *conf = mddev->private;
979 struct r10bio *r10_bio; 1032 struct r10bio *r10_bio;
980 struct bio *read_bio; 1033 struct bio *read_bio;
981 int i; 1034 int i;
982 int chunk_sects = conf->chunk_mask + 1; 1035 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1036 int chunk_sects = chunk_mask + 1;
983 const int rw = bio_data_dir(bio); 1037 const int rw = bio_data_dir(bio);
984 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1038 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
985 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1039 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
@@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
988 int plugged; 1042 int plugged;
989 int sectors_handled; 1043 int sectors_handled;
990 int max_sectors; 1044 int max_sectors;
1045 int sectors;
991 1046
992 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 1047 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
993 md_flush_request(mddev, bio); 1048 md_flush_request(mddev, bio);
@@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
997 /* If this request crosses a chunk boundary, we need to 1052 /* If this request crosses a chunk boundary, we need to
998 * split it. This will only happen for 1 PAGE (or less) requests. 1053 * split it. This will only happen for 1 PAGE (or less) requests.
999 */ 1054 */
1000 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) 1055 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1001 > chunk_sects && 1056 > chunk_sects
1002 conf->near_copies < conf->raid_disks)) { 1057 && (conf->geo.near_copies < conf->geo.raid_disks
1058 || conf->prev.near_copies < conf->prev.raid_disks))) {
1003 struct bio_pair *bp; 1059 struct bio_pair *bp;
1004 /* Sanity check -- queue functions should prevent this happening */ 1060 /* Sanity check -- queue functions should prevent this happening */
1005 if (bio->bi_vcnt != 1 || 1061 if (bio->bi_vcnt != 1 ||
@@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1051 */ 1107 */
1052 wait_barrier(conf); 1108 wait_barrier(conf);
1053 1109
1110 sectors = bio->bi_size >> 9;
1111 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1112 bio->bi_sector < conf->reshape_progress &&
1113 bio->bi_sector + sectors > conf->reshape_progress) {
1114 /* IO spans the reshape position. Need to wait for
1115 * reshape to pass
1116 */
1117 allow_barrier(conf);
1118 wait_event(conf->wait_barrier,
1119 conf->reshape_progress <= bio->bi_sector ||
1120 conf->reshape_progress >= bio->bi_sector + sectors);
1121 wait_barrier(conf);
1122 }
1123 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1124 bio_data_dir(bio) == WRITE &&
1125 (mddev->reshape_backwards
1126 ? (bio->bi_sector < conf->reshape_safe &&
1127 bio->bi_sector + sectors > conf->reshape_progress)
1128 : (bio->bi_sector + sectors > conf->reshape_safe &&
1129 bio->bi_sector < conf->reshape_progress))) {
1130 /* Need to update reshape_position in metadata */
1131 mddev->reshape_position = conf->reshape_progress;
1132 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1133 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1134 md_wakeup_thread(mddev->thread);
1135 wait_event(mddev->sb_wait,
1136 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1137
1138 conf->reshape_safe = mddev->reshape_position;
1139 }
1140
1054 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1141 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1055 1142
1056 r10_bio->master_bio = bio; 1143 r10_bio->master_bio = bio;
1057 r10_bio->sectors = bio->bi_size >> 9; 1144 r10_bio->sectors = sectors;
1058 1145
1059 r10_bio->mddev = mddev; 1146 r10_bio->mddev = mddev;
1060 r10_bio->sector = bio->bi_sector; 1147 r10_bio->sector = bio->bi_sector;
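/* Editorial sketch (not part of the patch): the two conditions added to
 * make_request() above, restated as standalone predicates.  'progress' is
 * conf->reshape_progress and 'safe' is conf->reshape_safe (the value last
 * committed to the metadata); the forward and backward cases follow the
 * patch.  Hypothetical helper names, for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>

/* The request straddles the point the reshape is currently working on,
 * so it must wait until the reshape has moved past it. */
bool spans_reshape_point(uint64_t bi_sector, uint64_t sectors, uint64_t progress)
{
	return bi_sector < progress && bi_sector + sectors > progress;
}

/* A write lands in the window that has already been reshaped but whose new
 * position has not yet been recorded in the superblock; reshape_position
 * must be pushed forward in the metadata before the write may proceed. */
bool write_needs_metadata_update(uint64_t bi_sector, uint64_t sectors,
				 uint64_t progress, uint64_t safe,
				 bool reshape_backwards)
{
	if (reshape_backwards)
		return bi_sector < safe && bi_sector + sectors > progress;
	return bi_sector + sectors > safe && bi_sector < progress;
}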
@@ -1093,7 +1180,7 @@ read_again:
1093 r10_bio->devs[slot].rdev = rdev; 1180 r10_bio->devs[slot].rdev = rdev;
1094 1181
1095 read_bio->bi_sector = r10_bio->devs[slot].addr + 1182 read_bio->bi_sector = r10_bio->devs[slot].addr +
1096 rdev->data_offset; 1183 choose_data_offset(r10_bio, rdev);
1097 read_bio->bi_bdev = rdev->bdev; 1184 read_bio->bi_bdev = rdev->bdev;
1098 read_bio->bi_end_io = raid10_end_read_request; 1185 read_bio->bi_end_io = raid10_end_read_request;
1099 read_bio->bi_rw = READ | do_sync; 1186 read_bio->bi_rw = READ | do_sync;
@@ -1297,7 +1384,8 @@ retry_write:
1297 r10_bio->devs[i].bio = mbio; 1384 r10_bio->devs[i].bio = mbio;
1298 1385
1299 mbio->bi_sector = (r10_bio->devs[i].addr+ 1386 mbio->bi_sector = (r10_bio->devs[i].addr+
1300 conf->mirrors[d].rdev->data_offset); 1387 choose_data_offset(r10_bio,
1388 conf->mirrors[d].rdev));
1301 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1389 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1302 mbio->bi_end_io = raid10_end_write_request; 1390 mbio->bi_end_io = raid10_end_write_request;
1303 mbio->bi_rw = WRITE | do_sync | do_fua; 1391 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1321,8 +1409,10 @@ retry_write:
1321 * so it cannot disappear, so the replacement cannot 1409 * so it cannot disappear, so the replacement cannot
1322 * become NULL here 1410 * become NULL here
1323 */ 1411 */
1324 mbio->bi_sector = (r10_bio->devs[i].addr+ 1412 mbio->bi_sector = (r10_bio->devs[i].addr +
1325 conf->mirrors[d].replacement->data_offset); 1413 choose_data_offset(
1414 r10_bio,
1415 conf->mirrors[d].replacement));
1326 mbio->bi_bdev = conf->mirrors[d].replacement->bdev; 1416 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1327 mbio->bi_end_io = raid10_end_write_request; 1417 mbio->bi_end_io = raid10_end_write_request;
1328 mbio->bi_rw = WRITE | do_sync | do_fua; 1418 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1368 struct r10conf *conf = mddev->private; 1458 struct r10conf *conf = mddev->private;
1369 int i; 1459 int i;
1370 1460
1371 if (conf->near_copies < conf->raid_disks) 1461 if (conf->geo.near_copies < conf->geo.raid_disks)
1372 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1462 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1373 if (conf->near_copies > 1) 1463 if (conf->geo.near_copies > 1)
1374 seq_printf(seq, " %d near-copies", conf->near_copies); 1464 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1375 if (conf->far_copies > 1) { 1465 if (conf->geo.far_copies > 1) {
1376 if (conf->far_offset) 1466 if (conf->geo.far_offset)
1377 seq_printf(seq, " %d offset-copies", conf->far_copies); 1467 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1378 else 1468 else
1379 seq_printf(seq, " %d far-copies", conf->far_copies); 1469 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1380 } 1470 }
1381 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 1471 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1382 conf->raid_disks - mddev->degraded); 1472 conf->geo.raid_disks - mddev->degraded);
1383 for (i = 0; i < conf->raid_disks; i++) 1473 for (i = 0; i < conf->geo.raid_disks; i++)
1384 seq_printf(seq, "%s", 1474 seq_printf(seq, "%s",
1385 conf->mirrors[i].rdev && 1475 conf->mirrors[i].rdev &&
1386 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 1476 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
@@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1392 * Don't consider the device numbered 'ignore' 1482 * Don't consider the device numbered 'ignore'
1393 * as we might be about to remove it. 1483 * as we might be about to remove it.
1394 */ 1484 */
1395static int enough(struct r10conf *conf, int ignore) 1485static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1396{ 1486{
1397 int first = 0; 1487 int first = 0;
1398 1488
@@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore)
1403 if (conf->mirrors[first].rdev && 1493 if (conf->mirrors[first].rdev &&
1404 first != ignore) 1494 first != ignore)
1405 cnt++; 1495 cnt++;
1406 first = (first+1) % conf->raid_disks; 1496 first = (first+1) % geo->raid_disks;
1407 } 1497 }
1408 if (cnt == 0) 1498 if (cnt == 0)
1409 return 0; 1499 return 0;
@@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore)
1411 return 1; 1501 return 1;
1412} 1502}
1413 1503
1504static int enough(struct r10conf *conf, int ignore)
1505{
1506 return _enough(conf, &conf->geo, ignore) &&
1507 _enough(conf, &conf->prev, ignore);
1508}
1509
1414static void error(struct mddev *mddev, struct md_rdev *rdev) 1510static void error(struct mddev *mddev, struct md_rdev *rdev)
1415{ 1511{
1416 char b[BDEVNAME_SIZE]; 1512 char b[BDEVNAME_SIZE];
@@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1445 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1541 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1446 "md/raid10:%s: Operation continuing on %d devices.\n", 1542 "md/raid10:%s: Operation continuing on %d devices.\n",
1447 mdname(mddev), bdevname(rdev->bdev, b), 1543 mdname(mddev), bdevname(rdev->bdev, b),
1448 mdname(mddev), conf->raid_disks - mddev->degraded); 1544 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1449} 1545}
1450 1546
1451static void print_conf(struct r10conf *conf) 1547static void print_conf(struct r10conf *conf)
@@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf)
1458 printk(KERN_DEBUG "(!conf)\n"); 1554 printk(KERN_DEBUG "(!conf)\n");
1459 return; 1555 return;
1460 } 1556 }
1461 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1557 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1462 conf->raid_disks); 1558 conf->geo.raid_disks);
1463 1559
1464 for (i = 0; i < conf->raid_disks; i++) { 1560 for (i = 0; i < conf->geo.raid_disks; i++) {
1465 char b[BDEVNAME_SIZE]; 1561 char b[BDEVNAME_SIZE];
1466 tmp = conf->mirrors + i; 1562 tmp = conf->mirrors + i;
1467 if (tmp->rdev) 1563 if (tmp->rdev)
@@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev)
1493 * Find all non-in_sync disks within the RAID10 configuration 1589 * Find all non-in_sync disks within the RAID10 configuration
1494 * and mark them in_sync 1590 * and mark them in_sync
1495 */ 1591 */
1496 for (i = 0; i < conf->raid_disks; i++) { 1592 for (i = 0; i < conf->geo.raid_disks; i++) {
1497 tmp = conf->mirrors + i; 1593 tmp = conf->mirrors + i;
1498 if (tmp->replacement 1594 if (tmp->replacement
1499 && tmp->replacement->recovery_offset == MaxSector 1595 && tmp->replacement->recovery_offset == MaxSector
@@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1535 int err = -EEXIST; 1631 int err = -EEXIST;
1536 int mirror; 1632 int mirror;
1537 int first = 0; 1633 int first = 0;
1538 int last = conf->raid_disks - 1; 1634 int last = conf->geo.raid_disks - 1;
1539 struct request_queue *q = bdev_get_queue(rdev->bdev); 1635 struct request_queue *q = bdev_get_queue(rdev->bdev);
1540 1636
1541 if (mddev->recovery_cp < MaxSector) 1637 if (mddev->recovery_cp < MaxSector)
@@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1543 * very different from resync 1639 * very different from resync
1544 */ 1640 */
1545 return -EBUSY; 1641 return -EBUSY;
1546 if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) 1642 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1547 return -EINVAL; 1643 return -EINVAL;
1548 1644
1549 if (rdev->raid_disk >= 0) 1645 if (rdev->raid_disk >= 0)
@@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1635 if (!test_bit(Faulty, &rdev->flags) && 1731 if (!test_bit(Faulty, &rdev->flags) &&
1636 mddev->recovery_disabled != p->recovery_disabled && 1732 mddev->recovery_disabled != p->recovery_disabled &&
1637 (!p->replacement || p->replacement == rdev) && 1733 (!p->replacement || p->replacement == rdev) &&
1734 number < conf->geo.raid_disks &&
1638 enough(conf, -1)) { 1735 enough(conf, -1)) {
1639 err = -EBUSY; 1736 err = -EBUSY;
1640 goto abort; 1737 goto abort;
@@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error)
1676 struct r10conf *conf = r10_bio->mddev->private; 1773 struct r10conf *conf = r10_bio->mddev->private;
1677 int d; 1774 int d;
1678 1775
1679 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1776 if (bio == r10_bio->master_bio) {
1777 /* this is a reshape read */
1778 d = r10_bio->read_slot; /* really the read dev */
1779 } else
1780 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1680 1781
1681 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1782 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1682 set_bit(R10BIO_Uptodate, &r10_bio->state); 1783 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2218 " (%d sectors at %llu on %s)\n", 2319 " (%d sectors at %llu on %s)\n",
2219 mdname(mddev), s, 2320 mdname(mddev), s,
2220 (unsigned long long)( 2321 (unsigned long long)(
2221 sect + rdev->data_offset), 2322 sect +
2323 choose_data_offset(r10_bio,
2324 rdev)),
2222 bdevname(rdev->bdev, b)); 2325 bdevname(rdev->bdev, b));
2223 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2326 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2224 "drive\n", 2327 "drive\n",
@@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2256 " (%d sectors at %llu on %s)\n", 2359 " (%d sectors at %llu on %s)\n",
2257 mdname(mddev), s, 2360 mdname(mddev), s,
2258 (unsigned long long)( 2361 (unsigned long long)(
2259 sect + rdev->data_offset), 2362 sect +
2363 choose_data_offset(r10_bio, rdev)),
2260 bdevname(rdev->bdev, b)); 2364 bdevname(rdev->bdev, b));
2261 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2365 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2262 "drive\n", 2366 "drive\n",
@@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2269 " (%d sectors at %llu on %s)\n", 2373 " (%d sectors at %llu on %s)\n",
2270 mdname(mddev), s, 2374 mdname(mddev), s,
2271 (unsigned long long)( 2375 (unsigned long long)(
2272 sect + rdev->data_offset), 2376 sect +
2377 choose_data_offset(r10_bio, rdev)),
2273 bdevname(rdev->bdev, b)); 2378 bdevname(rdev->bdev, b));
2274 atomic_add(s, &rdev->corrected_errors); 2379 atomic_add(s, &rdev->corrected_errors);
2275 } 2380 }
@@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2343 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2448 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2344 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2449 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2345 wbio->bi_sector = (r10_bio->devs[i].addr+ 2450 wbio->bi_sector = (r10_bio->devs[i].addr+
2346 rdev->data_offset+ 2451 choose_data_offset(r10_bio, rdev) +
2347 (sector - r10_bio->sector)); 2452 (sector - r10_bio->sector));
2348 wbio->bi_bdev = rdev->bdev; 2453 wbio->bi_bdev = rdev->bdev;
2349 if (submit_bio_wait(WRITE, wbio) == 0) 2454 if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2420,7 +2525,7 @@ read_more:
2420 r10_bio->devs[slot].bio = bio; 2525 r10_bio->devs[slot].bio = bio;
2421 r10_bio->devs[slot].rdev = rdev; 2526 r10_bio->devs[slot].rdev = rdev;
2422 bio->bi_sector = r10_bio->devs[slot].addr 2527 bio->bi_sector = r10_bio->devs[slot].addr
2423 + rdev->data_offset; 2528 + choose_data_offset(r10_bio, rdev);
2424 bio->bi_bdev = rdev->bdev; 2529 bio->bi_bdev = rdev->bdev;
2425 bio->bi_rw = READ | do_sync; 2530 bio->bi_rw = READ | do_sync;
2426 bio->bi_private = r10_bio; 2531 bio->bi_private = r10_bio;
@@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2480 rdev_clear_badblocks( 2585 rdev_clear_badblocks(
2481 rdev, 2586 rdev,
2482 r10_bio->devs[m].addr, 2587 r10_bio->devs[m].addr,
2483 r10_bio->sectors); 2588 r10_bio->sectors, 0);
2484 } else { 2589 } else {
2485 if (!rdev_set_badblocks( 2590 if (!rdev_set_badblocks(
2486 rdev, 2591 rdev,
@@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2496 rdev_clear_badblocks( 2601 rdev_clear_badblocks(
2497 rdev, 2602 rdev,
2498 r10_bio->devs[m].addr, 2603 r10_bio->devs[m].addr,
2499 r10_bio->sectors); 2604 r10_bio->sectors, 0);
2500 } else { 2605 } else {
2501 if (!rdev_set_badblocks( 2606 if (!rdev_set_badblocks(
2502 rdev, 2607 rdev,
@@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2515 rdev_clear_badblocks( 2620 rdev_clear_badblocks(
2516 rdev, 2621 rdev,
2517 r10_bio->devs[m].addr, 2622 r10_bio->devs[m].addr,
2518 r10_bio->sectors); 2623 r10_bio->sectors, 0);
2519 rdev_dec_pending(rdev, conf->mddev); 2624 rdev_dec_pending(rdev, conf->mddev);
2520 } else if (bio != NULL && 2625 } else if (bio != NULL &&
2521 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2626 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2532 rdev_clear_badblocks( 2637 rdev_clear_badblocks(
2533 rdev, 2638 rdev,
2534 r10_bio->devs[m].addr, 2639 r10_bio->devs[m].addr,
2535 r10_bio->sectors); 2640 r10_bio->sectors, 0);
2536 rdev_dec_pending(rdev, conf->mddev); 2641 rdev_dec_pending(rdev, conf->mddev);
2537 } 2642 }
2538 } 2643 }
@@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev)
2573 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2678 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2574 test_bit(R10BIO_WriteError, &r10_bio->state)) 2679 test_bit(R10BIO_WriteError, &r10_bio->state))
2575 handle_write_completed(conf, r10_bio); 2680 handle_write_completed(conf, r10_bio);
2681 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2682 reshape_request_write(mddev, r10_bio);
2576 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2683 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2577 sync_request_write(mddev, r10_bio); 2684 sync_request_write(mddev, r10_bio);
2578 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2685 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf)
2603 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2710 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2604 BUG_ON(conf->r10buf_pool); 2711 BUG_ON(conf->r10buf_pool);
2605 conf->have_replacement = 0; 2712 conf->have_replacement = 0;
2606 for (i = 0; i < conf->raid_disks; i++) 2713 for (i = 0; i < conf->geo.raid_disks; i++)
2607 if (conf->mirrors[i].replacement) 2714 if (conf->mirrors[i].replacement)
2608 conf->have_replacement = 1; 2715 conf->have_replacement = 1;
2609 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2716 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
@@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2657 sector_t sync_blocks; 2764 sector_t sync_blocks;
2658 sector_t sectors_skipped = 0; 2765 sector_t sectors_skipped = 0;
2659 int chunks_skipped = 0; 2766 int chunks_skipped = 0;
2767 sector_t chunk_mask = conf->geo.chunk_mask;
2660 2768
2661 if (!conf->r10buf_pool) 2769 if (!conf->r10buf_pool)
2662 if (init_resync(conf)) 2770 if (init_resync(conf))
@@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2664 2772
2665 skipped: 2773 skipped:
2666 max_sector = mddev->dev_sectors; 2774 max_sector = mddev->dev_sectors;
2667 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2775 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2776 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2668 max_sector = mddev->resync_max_sectors; 2777 max_sector = mddev->resync_max_sectors;
2669 if (sector_nr >= max_sector) { 2778 if (sector_nr >= max_sector) {
2670 /* If we aborted, we need to abort the 2779 /* If we aborted, we need to abort the
@@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2676 * we need to convert that to several 2785 * we need to convert that to several
2677 * virtual addresses. 2786 * virtual addresses.
2678 */ 2787 */
2788 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2789 end_reshape(conf);
2790 return 0;
2791 }
2792
2679 if (mddev->curr_resync < max_sector) { /* aborted */ 2793 if (mddev->curr_resync < max_sector) { /* aborted */
2680 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2794 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2681 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2795 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2682 &sync_blocks, 1); 2796 &sync_blocks, 1);
2683 else for (i=0; i<conf->raid_disks; i++) { 2797 else for (i = 0; i < conf->geo.raid_disks; i++) {
2684 sector_t sect = 2798 sector_t sect =
2685 raid10_find_virt(conf, mddev->curr_resync, i); 2799 raid10_find_virt(conf, mddev->curr_resync, i);
2686 bitmap_end_sync(mddev->bitmap, sect, 2800 bitmap_end_sync(mddev->bitmap, sect,
@@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2694 /* Completed a full sync so the replacements 2808 /* Completed a full sync so the replacements
2695 * are now fully recovered. 2809 * are now fully recovered.
2696 */ 2810 */
2697 for (i = 0; i < conf->raid_disks; i++) 2811 for (i = 0; i < conf->geo.raid_disks; i++)
2698 if (conf->mirrors[i].replacement) 2812 if (conf->mirrors[i].replacement)
2699 conf->mirrors[i].replacement 2813 conf->mirrors[i].replacement
2700 ->recovery_offset 2814 ->recovery_offset
@@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2707 *skipped = 1; 2821 *skipped = 1;
2708 return sectors_skipped; 2822 return sectors_skipped;
2709 } 2823 }
2710 if (chunks_skipped >= conf->raid_disks) { 2824
2825 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2826 return reshape_request(mddev, sector_nr, skipped);
2827
2828 if (chunks_skipped >= conf->geo.raid_disks) {
2711 /* if there has been nothing to do on any drive, 2829 /* if there has been nothing to do on any drive,
2712 * then there is nothing to do at all.. 2830 * then there is nothing to do at all..
2713 */ 2831 */
@@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2721 /* make sure whole request will fit in a chunk - if chunks 2839 /* make sure whole request will fit in a chunk - if chunks
2722 * are meaningful 2840 * are meaningful
2723 */ 2841 */
2724 if (conf->near_copies < conf->raid_disks && 2842 if (conf->geo.near_copies < conf->geo.raid_disks &&
2725 max_sector > (sector_nr | conf->chunk_mask)) 2843 max_sector > (sector_nr | chunk_mask))
2726 max_sector = (sector_nr | conf->chunk_mask) + 1; 2844 max_sector = (sector_nr | chunk_mask) + 1;
2727 /* 2845 /*
2728 * If there is non-resync activity waiting for us then 2846 * If there is non-resync activity waiting for us then
2729 * put in a delay to throttle resync. 2847 * put in a delay to throttle resync.
@@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2752 int j; 2870 int j;
2753 r10_bio = NULL; 2871 r10_bio = NULL;
2754 2872
2755 for (i=0 ; i<conf->raid_disks; i++) { 2873 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2756 int still_degraded; 2874 int still_degraded;
2757 struct r10bio *rb2; 2875 struct r10bio *rb2;
2758 sector_t sect; 2876 sector_t sect;
@@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2806 /* Need to check if the array will still be 2924 /* Need to check if the array will still be
2807 * degraded 2925 * degraded
2808 */ 2926 */
2809 for (j=0; j<conf->raid_disks; j++) 2927 for (j = 0; j < conf->geo.raid_disks; j++)
2810 if (conf->mirrors[j].rdev == NULL || 2928 if (conf->mirrors[j].rdev == NULL ||
2811 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 2929 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2812 still_degraded = 1; 2930 still_degraded = 1;
@@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2984 r10_bio->sector = sector_nr; 3102 r10_bio->sector = sector_nr;
2985 set_bit(R10BIO_IsSync, &r10_bio->state); 3103 set_bit(R10BIO_IsSync, &r10_bio->state);
2986 raid10_find_phys(conf, r10_bio); 3104 raid10_find_phys(conf, r10_bio);
2987 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; 3105 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
2988 3106
2989 for (i=0; i<conf->copies; i++) { 3107 for (i = 0; i < conf->copies; i++) {
2990 int d = r10_bio->devs[i].devnum; 3108 int d = r10_bio->devs[i].devnum;
2991 sector_t first_bad, sector; 3109 sector_t first_bad, sector;
2992 int bad_sectors; 3110 int bad_sectors;
@@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3152 struct r10conf *conf = mddev->private; 3270 struct r10conf *conf = mddev->private;
3153 3271
3154 if (!raid_disks) 3272 if (!raid_disks)
3155 raid_disks = conf->raid_disks; 3273 raid_disks = min(conf->geo.raid_disks,
3274 conf->prev.raid_disks);
3156 if (!sectors) 3275 if (!sectors)
3157 sectors = conf->dev_sectors; 3276 sectors = conf->dev_sectors;
3158 3277
3159 size = sectors >> conf->chunk_shift; 3278 size = sectors >> conf->geo.chunk_shift;
3160 sector_div(size, conf->far_copies); 3279 sector_div(size, conf->geo.far_copies);
3161 size = size * raid_disks; 3280 size = size * raid_disks;
3162 sector_div(size, conf->near_copies); 3281 sector_div(size, conf->geo.near_copies);
3163 3282
3164 return size << conf->chunk_shift; 3283 return size << conf->geo.chunk_shift;
3165} 3284}
3166 3285
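/* Editorial sketch (not part of the patch): the capacity arithmetic used by
 * raid10_size() above, with one worked example.  All figures are
 * illustrative assumptions, not values from the patch.
 *
 *   e.g. 4 devices of 1048576 sectors (512 MiB), 128-sector chunks,
 *   near_copies = 2, far_copies = 1:
 *     1048576 >> 7 = 8192  chunks per device
 *     8192 / 1 * 4 = 32768 chunks across the array
 *     32768 / 2    = 16384 data chunks after mirroring
 *     16384 << 7   = 2097152 sectors (1 GiB) of array capacity
 */
#include <stdint.h>

uint64_t raid10_size_sketch(uint64_t dev_sectors, int raid_disks,
			    int near_copies, int far_copies, int chunk_shift)
{
	uint64_t size = dev_sectors >> chunk_shift;

	size /= far_copies;	/* each device holds far_copies copies of its data */
	size *= raid_disks;
	size /= near_copies;	/* near copies mirror across adjacent devices */
	return size << chunk_shift;
}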
3167static void calc_sectors(struct r10conf *conf, sector_t size) 3286static void calc_sectors(struct r10conf *conf, sector_t size)
@@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
3171 * conf->stride 3290 * conf->stride
3172 */ 3291 */
3173 3292
3174 size = size >> conf->chunk_shift; 3293 size = size >> conf->geo.chunk_shift;
3175 sector_div(size, conf->far_copies); 3294 sector_div(size, conf->geo.far_copies);
3176 size = size * conf->raid_disks; 3295 size = size * conf->geo.raid_disks;
3177 sector_div(size, conf->near_copies); 3296 sector_div(size, conf->geo.near_copies);
3178 /* 'size' is now the number of chunks in the array */ 3297 /* 'size' is now the number of chunks in the array */
3179 /* calculate "used chunks per device" */ 3298 /* calculate "used chunks per device" */
3180 size = size * conf->copies; 3299 size = size * conf->copies;
@@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
3182 /* We need to round up when dividing by raid_disks to 3301 /* We need to round up when dividing by raid_disks to
3183 * get the stride size. 3302 * get the stride size.
3184 */ 3303 */
3185 size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); 3304 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3186 3305
3187 conf->dev_sectors = size << conf->chunk_shift; 3306 conf->dev_sectors = size << conf->geo.chunk_shift;
3188 3307
3189 if (conf->far_offset) 3308 if (conf->geo.far_offset)
3190 conf->stride = 1 << conf->chunk_shift; 3309 conf->geo.stride = 1 << conf->geo.chunk_shift;
3191 else { 3310 else {
3192 sector_div(size, conf->far_copies); 3311 sector_div(size, conf->geo.far_copies);
3193 conf->stride = size << conf->chunk_shift; 3312 conf->geo.stride = size << conf->geo.chunk_shift;
3194 } 3313 }
3195} 3314}
3196 3315
3316enum geo_type {geo_new, geo_old, geo_start};
3317static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3318{
3319 int nc, fc, fo;
3320 int layout, chunk, disks;
3321 switch (new) {
3322 case geo_old:
3323 layout = mddev->layout;
3324 chunk = mddev->chunk_sectors;
3325 disks = mddev->raid_disks - mddev->delta_disks;
3326 break;
3327 case geo_new:
3328 layout = mddev->new_layout;
3329 chunk = mddev->new_chunk_sectors;
3330 disks = mddev->raid_disks;
3331 break;
3332 default: /* avoid 'may be unused' warnings */
3333 case geo_start: /* new when starting reshape - raid_disks not
3334 * updated yet. */
3335 layout = mddev->new_layout;
3336 chunk = mddev->new_chunk_sectors;
3337 disks = mddev->raid_disks + mddev->delta_disks;
3338 break;
3339 }
3340 if (layout >> 17)
3341 return -1;
3342 if (chunk < (PAGE_SIZE >> 9) ||
3343 !is_power_of_2(chunk))
3344 return -2;
3345 nc = layout & 255;
3346 fc = (layout >> 8) & 255;
3347 fo = layout & (1<<16);
3348 geo->raid_disks = disks;
3349 geo->near_copies = nc;
3350 geo->far_copies = fc;
3351 geo->far_offset = fo;
3352 geo->chunk_mask = chunk - 1;
3353 geo->chunk_shift = ffz(~chunk);
3354 return nc*fc;
3355}
3356
3197static struct r10conf *setup_conf(struct mddev *mddev) 3357static struct r10conf *setup_conf(struct mddev *mddev)
3198{ 3358{
3199 struct r10conf *conf = NULL; 3359 struct r10conf *conf = NULL;
3200 int nc, fc, fo;
3201 int err = -EINVAL; 3360 int err = -EINVAL;
3361 struct geom geo;
3362 int copies;
3363
3364 copies = setup_geo(&geo, mddev, geo_new);
3202 3365
3203 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || 3366 if (copies == -2) {
3204 !is_power_of_2(mddev->new_chunk_sectors)) {
3205 printk(KERN_ERR "md/raid10:%s: chunk size must be " 3367 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3206 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 3368 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3207 mdname(mddev), PAGE_SIZE); 3369 mdname(mddev), PAGE_SIZE);
3208 goto out; 3370 goto out;
3209 } 3371 }
3210 3372
3211 nc = mddev->new_layout & 255; 3373 if (copies < 2 || copies > mddev->raid_disks) {
3212 fc = (mddev->new_layout >> 8) & 255;
3213 fo = mddev->new_layout & (1<<16);
3214
3215 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
3216 (mddev->new_layout >> 17)) {
3217 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3374 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3218 mdname(mddev), mddev->new_layout); 3375 mdname(mddev), mddev->new_layout);
3219 goto out; 3376 goto out;
@@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3224 if (!conf) 3381 if (!conf)
3225 goto out; 3382 goto out;
3226 3383
3227 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 3384 /* FIXME calc properly */
3385 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
3386 max(0,mddev->delta_disks)),
3228 GFP_KERNEL); 3387 GFP_KERNEL);
3229 if (!conf->mirrors) 3388 if (!conf->mirrors)
3230 goto out; 3389 goto out;
@@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3233 if (!conf->tmppage) 3392 if (!conf->tmppage)
3234 goto out; 3393 goto out;
3235 3394
3236 3395 conf->geo = geo;
3237 conf->raid_disks = mddev->raid_disks; 3396 conf->copies = copies;
3238 conf->near_copies = nc;
3239 conf->far_copies = fc;
3240 conf->copies = nc*fc;
3241 conf->far_offset = fo;
3242 conf->chunk_mask = mddev->new_chunk_sectors - 1;
3243 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
3244
3245 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 3397 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3246 r10bio_pool_free, conf); 3398 r10bio_pool_free, conf);
3247 if (!conf->r10bio_pool) 3399 if (!conf->r10bio_pool)
3248 goto out; 3400 goto out;
3249 3401
3250 calc_sectors(conf, mddev->dev_sectors); 3402 calc_sectors(conf, mddev->dev_sectors);
3251 3403 if (mddev->reshape_position == MaxSector) {
3404 conf->prev = conf->geo;
3405 conf->reshape_progress = MaxSector;
3406 } else {
3407 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3408 err = -EINVAL;
3409 goto out;
3410 }
3411 conf->reshape_progress = mddev->reshape_position;
3412 if (conf->prev.far_offset)
3413 conf->prev.stride = 1 << conf->prev.chunk_shift;
3414 else
3415 /* far_copies must be 1 */
3416 conf->prev.stride = conf->dev_sectors;
3417 }
3252 spin_lock_init(&conf->device_lock); 3418 spin_lock_init(&conf->device_lock);
3253 INIT_LIST_HEAD(&conf->retry_list); 3419 INIT_LIST_HEAD(&conf->retry_list);
3254 3420
@@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3263 return conf; 3429 return conf;
3264 3430
3265 out: 3431 out:
3266 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3432 if (err == -ENOMEM)
3267 mdname(mddev)); 3433 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3434 mdname(mddev));
3268 if (conf) { 3435 if (conf) {
3269 if (conf->r10bio_pool) 3436 if (conf->r10bio_pool)
3270 mempool_destroy(conf->r10bio_pool); 3437 mempool_destroy(conf->r10bio_pool);
@@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev)
3282 struct mirror_info *disk; 3449 struct mirror_info *disk;
3283 struct md_rdev *rdev; 3450 struct md_rdev *rdev;
3284 sector_t size; 3451 sector_t size;
3285 3452 sector_t min_offset_diff = 0;
3286 /* 3453 int first = 1;
3287 * copy the already verified devices into our private RAID10
3288 * bookkeeping area. [whatever we allocate in run(),
3289 * should be freed in stop()]
3290 */
3291 3454
3292 if (mddev->private == NULL) { 3455 if (mddev->private == NULL) {
3293 conf = setup_conf(mddev); 3456 conf = setup_conf(mddev);
@@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev)
3304 3467
3305 chunk_size = mddev->chunk_sectors << 9; 3468 chunk_size = mddev->chunk_sectors << 9;
3306 blk_queue_io_min(mddev->queue, chunk_size); 3469 blk_queue_io_min(mddev->queue, chunk_size);
3307 if (conf->raid_disks % conf->near_copies) 3470 if (conf->geo.raid_disks % conf->geo.near_copies)
3308 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); 3471 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3309 else 3472 else
3310 blk_queue_io_opt(mddev->queue, chunk_size * 3473 blk_queue_io_opt(mddev->queue, chunk_size *
3311 (conf->raid_disks / conf->near_copies)); 3474 (conf->geo.raid_disks / conf->geo.near_copies));
3312 3475
3313 rdev_for_each(rdev, mddev) { 3476 rdev_for_each(rdev, mddev) {
3477 long long diff;
3314 3478
3315 disk_idx = rdev->raid_disk; 3479 disk_idx = rdev->raid_disk;
3316 if (disk_idx >= conf->raid_disks 3480 if (disk_idx < 0)
3317 || disk_idx < 0) 3481 continue;
3482 if (disk_idx >= conf->geo.raid_disks &&
3483 disk_idx >= conf->prev.raid_disks)
3318 continue; 3484 continue;
3319 disk = conf->mirrors + disk_idx; 3485 disk = conf->mirrors + disk_idx;
3320 3486
@@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev)
3327 goto out_free_conf; 3493 goto out_free_conf;
3328 disk->rdev = rdev; 3494 disk->rdev = rdev;
3329 } 3495 }
3496 diff = (rdev->new_data_offset - rdev->data_offset);
3497 if (!mddev->reshape_backwards)
3498 diff = -diff;
3499 if (diff < 0)
3500 diff = 0;
3501 if (first || diff < min_offset_diff)
3502 min_offset_diff = diff;
3330 3503
3331 disk_stack_limits(mddev->gendisk, rdev->bdev, 3504 disk_stack_limits(mddev->gendisk, rdev->bdev,
3332 rdev->data_offset << 9); 3505 rdev->data_offset << 9);
3333 3506
3334 disk->head_position = 0; 3507 disk->head_position = 0;
3335 } 3508 }
3509
3336 /* need to check that every block has at least one working mirror */ 3510 /* need to check that every block has at least one working mirror */
3337 if (!enough(conf, -1)) { 3511 if (!enough(conf, -1)) {
3338 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 3512 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
@@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev)
3340 goto out_free_conf; 3514 goto out_free_conf;
3341 } 3515 }
3342 3516
3517 if (conf->reshape_progress != MaxSector) {
3518 /* must ensure that shape change is supported */
3519 if (conf->geo.far_copies != 1 &&
3520 conf->geo.far_offset == 0)
3521 goto out_free_conf;
3522 if (conf->prev.far_copies != 1 &&
3523 conf->geo.far_offset == 0)
3524 goto out_free_conf;
3525 }
3526
3343 mddev->degraded = 0; 3527 mddev->degraded = 0;
3344 for (i = 0; i < conf->raid_disks; i++) { 3528 for (i = 0;
3529 i < conf->geo.raid_disks
3530 || i < conf->prev.raid_disks;
3531 i++) {
3345 3532
3346 disk = conf->mirrors + i; 3533 disk = conf->mirrors + i;
3347 3534
@@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev)
3368 mdname(mddev)); 3555 mdname(mddev));
3369 printk(KERN_INFO 3556 printk(KERN_INFO
3370 "md/raid10:%s: active with %d out of %d devices\n", 3557 "md/raid10:%s: active with %d out of %d devices\n",
3371 mdname(mddev), conf->raid_disks - mddev->degraded, 3558 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3372 conf->raid_disks); 3559 conf->geo.raid_disks);
3373 /* 3560 /*
3374 * Ok, everything is just fine now 3561 * Ok, everything is just fine now
3375 */ 3562 */
@@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev)
3386 * maybe... 3573 * maybe...
3387 */ 3574 */
3388 { 3575 {
3389 int stripe = conf->raid_disks * 3576 int stripe = conf->geo.raid_disks *
3390 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3577 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3391 stripe /= conf->near_copies; 3578 stripe /= conf->geo.near_copies;
3392 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 3579 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3393 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3580 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3394 } 3581 }
3395 3582
3396 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3583 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
@@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev)
3398 if (md_integrity_register(mddev)) 3585 if (md_integrity_register(mddev))
3399 goto out_free_conf; 3586 goto out_free_conf;
3400 3587
3588 if (conf->reshape_progress != MaxSector) {
3589 unsigned long before_length, after_length;
3590
3591 before_length = ((1 << conf->prev.chunk_shift) *
3592 conf->prev.far_copies);
3593 after_length = ((1 << conf->geo.chunk_shift) *
3594 conf->geo.far_copies);
3595
3596 if (max(before_length, after_length) > min_offset_diff) {
3597 /* This cannot work */
3598 printk("md/raid10: offset difference not enough to continue reshape\n");
3599 goto out_free_conf;
3600 }
3601 conf->offset_diff = min_offset_diff;
3602
3603 conf->reshape_safe = conf->reshape_progress;
3604 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3605 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3606 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3607 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3608 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3609 "reshape");
3610 }
3611
3401 return 0; 3612 return 0;
3402 3613
3403out_free_conf: 3614out_free_conf:
@@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
3460 struct r10conf *conf = mddev->private; 3671 struct r10conf *conf = mddev->private;
3461 sector_t oldsize, size; 3672 sector_t oldsize, size;
3462 3673
3463 if (conf->far_copies > 1 && !conf->far_offset) 3674 if (mddev->reshape_position != MaxSector)
3675 return -EBUSY;
3676
3677 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3464 return -EINVAL; 3678 return -EINVAL;
3465 3679
3466 oldsize = raid10_size(mddev, 0, 0); 3680 oldsize = raid10_size(mddev, 0, 0);
3467 size = raid10_size(mddev, sectors, 0); 3681 size = raid10_size(mddev, sectors, 0);
3468 md_set_array_sectors(mddev, size); 3682 if (mddev->external_size &&
3469 if (mddev->array_sectors > size) 3683 mddev->array_sectors > size)
3470 return -EINVAL; 3684 return -EINVAL;
3685 if (mddev->bitmap) {
3686 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3687 if (ret)
3688 return ret;
3689 }
3690 md_set_array_sectors(mddev, size);
3471 set_capacity(mddev->gendisk, mddev->array_sectors); 3691 set_capacity(mddev->gendisk, mddev->array_sectors);
3472 revalidate_disk(mddev->gendisk); 3692 revalidate_disk(mddev->gendisk);
3473 if (sectors > mddev->dev_sectors && 3693 if (sectors > mddev->dev_sectors &&
@@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev)
3534 return ERR_PTR(-EINVAL); 3754 return ERR_PTR(-EINVAL);
3535} 3755}
3536 3756
3757static int raid10_check_reshape(struct mddev *mddev)
3758{
3759 /* Called when there is a request to change
3760 * - layout (to ->new_layout)
3761 * - chunk size (to ->new_chunk_sectors)
3762 * - raid_disks (by delta_disks)
3763 * or when trying to restart a reshape that was ongoing.
3764 *
3765 * We need to validate the request and possibly allocate
3766 * space if that might be an issue later.
3767 *
3768 * Currently we reject any reshape of a 'far' mode array,
3769 * allow chunk size to change if new is generally acceptable,
3770 * allow raid_disks to increase, and allow
3771 * a switch between 'near' mode and 'offset' mode.
3772 */
3773 struct r10conf *conf = mddev->private;
3774 struct geom geo;
3775
3776 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3777 return -EINVAL;
3778
3779 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3780 /* mustn't change number of copies */
3781 return -EINVAL;
3782 if (geo.far_copies > 1 && !geo.far_offset)
3783 /* Cannot switch to 'far' mode */
3784 return -EINVAL;
3785
3786 if (mddev->array_sectors & geo.chunk_mask)
3787 /* not factor of array size */
3788 return -EINVAL;
3789
3790 if (!enough(conf, -1))
3791 return -EINVAL;
3792
3793 kfree(conf->mirrors_new);
3794 conf->mirrors_new = NULL;
3795 if (mddev->delta_disks > 0) {
3796 /* allocate new 'mirrors' list */
3797 conf->mirrors_new = kzalloc(
3798 sizeof(struct mirror_info)
3799 *(mddev->raid_disks +
3800 mddev->delta_disks),
3801 GFP_KERNEL);
3802 if (!conf->mirrors_new)
3803 return -ENOMEM;
3804 }
3805 return 0;
3806}
3807
3808/*
3809 * Need to check if array has failed when deciding whether to:
3810 * - start an array
3811 * - remove non-faulty devices
3812 * - add a spare
3813 * - allow a reshape
3814 * This determination is simple when no reshape is happening.
3815 * However if there is a reshape, we need to carefully check
3816 * both the before and after sections.
3817 * This is because some failed devices may only affect one
3818 * of the two sections, and some non-in_sync devices may
3819 * be insync in the section most affected by failed devices.
3820 */
3821static int calc_degraded(struct r10conf *conf)
3822{
3823 int degraded, degraded2;
3824 int i;
3825
3826 rcu_read_lock();
3827 degraded = 0;
3828 /* 'prev' section first */
3829 for (i = 0; i < conf->prev.raid_disks; i++) {
3830 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3831 if (!rdev || test_bit(Faulty, &rdev->flags))
3832 degraded++;
3833 else if (!test_bit(In_sync, &rdev->flags))
3834 /* When we can reduce the number of devices in
3835 * an array, this might not contribute to
3836 * 'degraded'. It does now.
3837 */
3838 degraded++;
3839 }
3840 rcu_read_unlock();
3841 if (conf->geo.raid_disks == conf->prev.raid_disks)
3842 return degraded;
3843 rcu_read_lock();
3844 degraded2 = 0;
3845 for (i = 0; i < conf->geo.raid_disks; i++) {
3846 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3847 if (!rdev || test_bit(Faulty, &rdev->flags))
3848 degraded2++;
3849 else if (!test_bit(In_sync, &rdev->flags)) {
3850 /* If reshape is increasing the number of devices,
3851 * this section has already been recovered, so
3852 * it doesn't contribute to degraded.
3853 * else it does.
3854 */
3855 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3856 degraded2++;
3857 }
3858 }
3859 rcu_read_unlock();
3860 if (degraded2 > degraded)
3861 return degraded2;
3862 return degraded;
3863}
3864
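/* Editorial sketch (not part of the patch): calc_degraded() above counts
 * missing or failed members against the old ('prev') and new ('geo') disk
 * counts separately and reports the worse of the two.  A device that is
 * present but not yet in_sync only counts against the new geometry when the
 * disk count is not growing, since a growing reshape fills those new slots
 * itself.  Simplified standalone model with hypothetical types.
 */
#include <stdbool.h>
#include <stdio.h>

struct member_sketch { bool present, faulty, in_sync; };

static int calc_degraded_sketch(const struct member_sketch *m,
				int prev_disks, int new_disks)
{
	int d_prev = 0, d_new = 0, i;

	for (i = 0; i < prev_disks; i++)
		if (!m[i].present || m[i].faulty || !m[i].in_sync)
			d_prev++;		/* old section: any gap counts */
	if (new_disks == prev_disks)
		return d_prev;
	for (i = 0; i < new_disks; i++) {
		if (!m[i].present || m[i].faulty)
			d_new++;
		else if (!m[i].in_sync && new_disks <= prev_disks)
			d_new++;		/* only when not growing */
	}
	return d_new > d_prev ? d_new : d_prev;
}

int main(void)
{
	/* grow 2 -> 4: the two new slots are present but not yet in_sync */
	struct member_sketch m[4] = {
		{ true, false, true }, { true, false, true },
		{ true, false, false }, { true, false, false },
	};

	printf("degraded = %d\n", calc_degraded_sketch(m, 2, 4)); /* prints 0 */
	return 0;
}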
3865static int raid10_start_reshape(struct mddev *mddev)
3866{
3867 /* A 'reshape' has been requested. This commits
3868 * the various 'new' fields and sets MD_RECOVER_RESHAPE
3869 * This also checks if there are enough spares and adds them
3870 * to the array.
3871 * We currently require enough spares to make the final
3872 * array non-degraded. We also require that the difference
3873 * between old and new data_offset - on each device - is
3874 * enough that we never risk over-writing.
3875 */
3876
3877 unsigned long before_length, after_length;
3878 sector_t min_offset_diff = 0;
3879 int first = 1;
3880 struct geom new;
3881 struct r10conf *conf = mddev->private;
3882 struct md_rdev *rdev;
3883 int spares = 0;
3884 int ret;
3885
3886 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3887 return -EBUSY;
3888
3889 if (setup_geo(&new, mddev, geo_start) != conf->copies)
3890 return -EINVAL;
3891
3892 before_length = ((1 << conf->prev.chunk_shift) *
3893 conf->prev.far_copies);
3894 after_length = ((1 << conf->geo.chunk_shift) *
3895 conf->geo.far_copies);
3896
3897 rdev_for_each(rdev, mddev) {
3898 if (!test_bit(In_sync, &rdev->flags)
3899 && !test_bit(Faulty, &rdev->flags))
3900 spares++;
3901 if (rdev->raid_disk >= 0) {
3902 long long diff = (rdev->new_data_offset
3903 - rdev->data_offset);
3904 if (!mddev->reshape_backwards)
3905 diff = -diff;
3906 if (diff < 0)
3907 diff = 0;
3908 if (first || diff < min_offset_diff)
3909 min_offset_diff = diff;
3910 }
3911 }
3912
3913 if (max(before_length, after_length) > min_offset_diff)
3914 return -EINVAL;
3915
3916 if (spares < mddev->delta_disks)
3917 return -EINVAL;
3918
3919 conf->offset_diff = min_offset_diff;
3920 spin_lock_irq(&conf->device_lock);
3921 if (conf->mirrors_new) {
3922 memcpy(conf->mirrors_new, conf->mirrors,
3923 sizeof(struct mirror_info)*conf->prev.raid_disks);
3924 smp_mb();
3925 kfree(conf->mirrors_old); /* FIXME and elsewhere */
3926 conf->mirrors_old = conf->mirrors;
3927 conf->mirrors = conf->mirrors_new;
3928 conf->mirrors_new = NULL;
3929 }
3930 setup_geo(&conf->geo, mddev, geo_start);
3931 smp_mb();
3932 if (mddev->reshape_backwards) {
3933 sector_t size = raid10_size(mddev, 0, 0);
3934 if (size < mddev->array_sectors) {
3935 spin_unlock_irq(&conf->device_lock);
3936			printk(KERN_ERR "md/raid10:%s: array size must be reduced before the number of disks\n",
3937 mdname(mddev));
3938 return -EINVAL;
3939 }
3940 mddev->resync_max_sectors = size;
3941 conf->reshape_progress = size;
3942 } else
3943 conf->reshape_progress = 0;
3944 spin_unlock_irq(&conf->device_lock);
3945
3946 if (mddev->delta_disks && mddev->bitmap) {
3947 ret = bitmap_resize(mddev->bitmap,
3948 raid10_size(mddev, 0,
3949 conf->geo.raid_disks),
3950 0, 0);
3951 if (ret)
3952 goto abort;
3953 }
3954 if (mddev->delta_disks > 0) {
3955 rdev_for_each(rdev, mddev)
3956 if (rdev->raid_disk < 0 &&
3957 !test_bit(Faulty, &rdev->flags)) {
3958 if (raid10_add_disk(mddev, rdev) == 0) {
3959 if (rdev->raid_disk >=
3960 conf->prev.raid_disks)
3961 set_bit(In_sync, &rdev->flags);
3962 else
3963 rdev->recovery_offset = 0;
3964
3965 if (sysfs_link_rdev(mddev, rdev))
3966 /* Failure here is OK */;
3967 }
3968 } else if (rdev->raid_disk >= conf->prev.raid_disks
3969 && !test_bit(Faulty, &rdev->flags)) {
3970 /* This is a spare that was manually added */
3971 set_bit(In_sync, &rdev->flags);
3972 }
3973 }
3974 /* When a reshape changes the number of devices,
3975 * ->degraded is measured against the larger of the
3976 * pre and post numbers.
3977 */
3978 spin_lock_irq(&conf->device_lock);
3979 mddev->degraded = calc_degraded(conf);
3980 spin_unlock_irq(&conf->device_lock);
3981 mddev->raid_disks = conf->geo.raid_disks;
3982 mddev->reshape_position = conf->reshape_progress;
3983 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3984
3985 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3986 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3987 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3988 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3989
3990 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3991 "reshape");
3992 if (!mddev->sync_thread) {
3993 ret = -EAGAIN;
3994 goto abort;
3995 }
3996 conf->reshape_checkpoint = jiffies;
3997 md_wakeup_thread(mddev->sync_thread);
3998 md_new_event(mddev);
3999 return 0;
4000
4001abort:
4002 mddev->recovery = 0;
4003 spin_lock_irq(&conf->device_lock);
4004 conf->geo = conf->prev;
4005 mddev->raid_disks = conf->geo.raid_disks;
4006 rdev_for_each(rdev, mddev)
4007 rdev->new_data_offset = rdev->data_offset;
4008 smp_wmb();
4009 conf->reshape_progress = MaxSector;
4010 mddev->reshape_position = MaxSector;
4011 spin_unlock_irq(&conf->device_lock);
4012 return ret;
4013}
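
/*
 * Illustrative sketch (not part of raid10.c): the overwrite-safety test
 * applied near the top of raid10_start_reshape().  A reshape is only
 * allowed when every member device moved its data_offset by at least
 * the larger of the old and new "chunk * far_copies" lengths; otherwise
 * blocks still waiting to be read could be overwritten by blocks
 * already relocated.  Struct and function names are invented here.
 */
#include <stdint.h>

struct layout_ex {
	unsigned int chunk_sectors;	/* chunk size in 512-byte sectors */
	unsigned int far_copies;
};

struct member_ex {
	uint64_t data_offset;		/* data area before the reshape */
	uint64_t new_data_offset;	/* data area after the reshape */
};

static int reshape_offsets_safe(const struct layout_ex *before,
				const struct layout_ex *after,
				const struct member_ex *devs, int ndevs,
				int backwards)
{
	uint64_t need_before = (uint64_t)before->chunk_sectors * before->far_copies;
	uint64_t need_after = (uint64_t)after->chunk_sectors * after->far_copies;
	uint64_t need = need_before > need_after ? need_before : need_after;
	uint64_t min_diff = UINT64_MAX;
	int i;

	for (i = 0; i < ndevs; i++) {
		int64_t diff = (int64_t)(devs[i].new_data_offset -
					 devs[i].data_offset);
		if (!backwards)
			diff = -diff;	/* forward reshape: offset shrinks */
		if (diff < 0)
			diff = 0;
		if ((uint64_t)diff < min_diff)
			min_diff = (uint64_t)diff;
	}
	return ndevs > 0 && need <= min_diff;
}
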
4014
4015/* Calculate the last device-address that could contain
4016 * any block from the chunk that includes the array-address 's'
4017 * and report the next address.
4018 * i.e. the address returned will be chunk-aligned and after
4019 * any data that is in the chunk containing 's'.
4020 */
4021static sector_t last_dev_address(sector_t s, struct geom *geo)
4022{
4023 s = (s | geo->chunk_mask) + 1;
4024 s >>= geo->chunk_shift;
4025 s *= geo->near_copies;
4026 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4027 s *= geo->far_copies;
4028 s <<= geo->chunk_shift;
4029 return s;
4030}
4031
4032/* Calculate the first device-address that could contain
4033 * any block from the chunk that includes the array-address 's'.
4034 * This too will be the start of a chunk
4035 */
4036static sector_t first_dev_address(sector_t s, struct geom *geo)
4037{
4038 s >>= geo->chunk_shift;
4039 s *= geo->near_copies;
4040 sector_div(s, geo->raid_disks);
4041 s *= geo->far_copies;
4042 s <<= geo->chunk_shift;
4043 return s;
4044}
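
/*
 * Illustrative sketch (not part of raid10.c): the chunk-address
 * arithmetic behind first_dev_address()/last_dev_address() in plain
 * user-space C.  The geo_example fields mirror the kernel struct geom,
 * but the type and helper names are invented for the example.
 */
#include <stdint.h>

struct geo_example {
	uint64_t chunk_mask;	/* chunk_sectors - 1 */
	int chunk_shift;	/* log2(chunk_sectors) */
	int near_copies;
	int far_copies;
	int raid_disks;
};

/* First device sector that could hold data from the chunk containing s. */
static uint64_t first_dev_addr(uint64_t s, const struct geo_example *g)
{
	s >>= g->chunk_shift;		/* array chunk number */
	s *= g->near_copies;		/* adjacent copies of each chunk */
	s /= g->raid_disks;		/* chunk row on each device */
	s *= g->far_copies;		/* each row repeated per far copy */
	return s << g->chunk_shift;	/* back to sectors */
}

/* First chunk-aligned device sector after all data from that chunk. */
static uint64_t last_dev_addr(uint64_t s, const struct geo_example *g)
{
	s = (s | g->chunk_mask) + 1;	/* round up to the next chunk */
	s >>= g->chunk_shift;
	s *= g->near_copies;
	s = (s + g->raid_disks - 1) / g->raid_disks;	/* DIV_ROUND_UP */
	s *= g->far_copies;
	return s << g->chunk_shift;
}
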
4045
4046static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4047 int *skipped)
4048{
4049 /* We simply copy at most one chunk (smallest of old and new)
4050 * at a time, possibly less if that exceeds RESYNC_PAGES,
4051 * or we hit a bad block or something.
4052 * This might mean we pause for normal IO in the middle of
4053 * a chunk, but that is not a problem was mddev->reshape_position
4054	 * a chunk, but that is not a problem as mddev->reshape_position
4055 *
4056 * If we will want to write to a location that isn't
4057 * yet recorded as 'safe' (i.e. in metadata on disk) then
4058 * we need to flush all reshape requests and update the metadata.
4059 *
4060 * When reshaping forwards (e.g. to more devices), we interpret
4061 * 'safe' as the earliest block which might not have been copied
4062 * down yet. We divide this by previous stripe size and multiply
4063 * by previous stripe length to get lowest device offset that we
4064 * cannot write to yet.
4065 * We interpret 'sector_nr' as an address that we want to write to.
4066	 * From this we use last_dev_address() to find where we might
4067	 * write to, and first_dev_address() on the 'safe' position.
4068 * If this 'next' write position is after the 'safe' position,
4069 * we must update the metadata to increase the 'safe' position.
4070 *
4071 * When reshaping backwards, we round in the opposite direction
4072 * and perform the reverse test: next write position must not be
4073 * less than current safe position.
4074 *
4075 * In all this the minimum difference in data offsets
4076 * (conf->offset_diff - always positive) allows a bit of slack,
4077	 * so next can be after 'safe', but not by more than offset_diff.
4078 *
4079 * We need to prepare all the bios here before we start any IO
4080 * to ensure the size we choose is acceptable to all devices.
4081	 * That means one for each copy for write-out and an extra one for
4082 * read-in.
4083 * We store the read-in bio in ->master_bio and the others in
4084 * ->devs[x].bio and ->devs[x].repl_bio.
4085 */
4086 struct r10conf *conf = mddev->private;
4087 struct r10bio *r10_bio;
4088 sector_t next, safe, last;
4089 int max_sectors;
4090 int nr_sectors;
4091 int s;
4092 struct md_rdev *rdev;
4093 int need_flush = 0;
4094 struct bio *blist;
4095 struct bio *bio, *read_bio;
4096 int sectors_done = 0;
4097
4098 if (sector_nr == 0) {
4099 /* If restarting in the middle, skip the initial sectors */
4100 if (mddev->reshape_backwards &&
4101 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4102 sector_nr = (raid10_size(mddev, 0, 0)
4103 - conf->reshape_progress);
4104 } else if (!mddev->reshape_backwards &&
4105 conf->reshape_progress > 0)
4106 sector_nr = conf->reshape_progress;
4107 if (sector_nr) {
4108 mddev->curr_resync_completed = sector_nr;
4109 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4110 *skipped = 1;
4111 return sector_nr;
4112 }
4113 }
4114
4115 /* We don't use sector_nr to track where we are up to
4116 * as that doesn't work well for ->reshape_backwards.
4117 * So just use ->reshape_progress.
4118 */
4119 if (mddev->reshape_backwards) {
4120 /* 'next' is the earliest device address that we might
4121 * write to for this chunk in the new layout
4122 */
4123 next = first_dev_address(conf->reshape_progress - 1,
4124 &conf->geo);
4125
4126 /* 'safe' is the last device address that we might read from
4127 * in the old layout after a restart
4128 */
4129 safe = last_dev_address(conf->reshape_safe - 1,
4130 &conf->prev);
4131
4132 if (next + conf->offset_diff < safe)
4133 need_flush = 1;
4134
4135 last = conf->reshape_progress - 1;
4136 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4137 & conf->prev.chunk_mask);
4138 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4139 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4140 } else {
4141 /* 'next' is after the last device address that we
4142 * might write to for this chunk in the new layout
4143 */
4144 next = last_dev_address(conf->reshape_progress, &conf->geo);
4145
4146 /* 'safe' is the earliest device address that we might
4147 * read from in the old layout after a restart
4148 */
4149 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4150
4151 /* Need to update metadata if 'next' might be beyond 'safe'
4152 * as that would possibly corrupt data
4153 */
4154 if (next > safe + conf->offset_diff)
4155 need_flush = 1;
4156
4157 sector_nr = conf->reshape_progress;
4158 last = sector_nr | (conf->geo.chunk_mask
4159 & conf->prev.chunk_mask);
4160
4161 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4162 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4163 }
4164
4165 if (need_flush ||
4166 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4167 /* Need to update reshape_position in metadata */
4168 wait_barrier(conf);
4169 mddev->reshape_position = conf->reshape_progress;
4170 if (mddev->reshape_backwards)
4171 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4172 - conf->reshape_progress;
4173 else
4174 mddev->curr_resync_completed = conf->reshape_progress;
4175 conf->reshape_checkpoint = jiffies;
4176 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4177 md_wakeup_thread(mddev->thread);
4178 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4179 kthread_should_stop());
4180 conf->reshape_safe = mddev->reshape_position;
4181 allow_barrier(conf);
4182 }
4183
4184read_more:
4185 /* Now schedule reads for blocks from sector_nr to last */
4186 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4187 raise_barrier(conf, sectors_done != 0);
4188 atomic_set(&r10_bio->remaining, 0);
4189 r10_bio->mddev = mddev;
4190 r10_bio->sector = sector_nr;
4191 set_bit(R10BIO_IsReshape, &r10_bio->state);
4192 r10_bio->sectors = last - sector_nr + 1;
4193 rdev = read_balance(conf, r10_bio, &max_sectors);
4194 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4195
4196 if (!rdev) {
4197 /* Cannot read from here, so need to record bad blocks
4198 * on all the target devices.
4199 */
4200 // FIXME
4201 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4202 return sectors_done;
4203 }
4204
4205 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4206
4207 read_bio->bi_bdev = rdev->bdev;
4208 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4209 + rdev->data_offset);
4210 read_bio->bi_private = r10_bio;
4211 read_bio->bi_end_io = end_sync_read;
4212 read_bio->bi_rw = READ;
4213 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4214 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4215 read_bio->bi_vcnt = 0;
4216 read_bio->bi_idx = 0;
4217 read_bio->bi_size = 0;
4218 r10_bio->master_bio = read_bio;
4219 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4220
4221 /* Now find the locations in the new layout */
4222 __raid10_find_phys(&conf->geo, r10_bio);
4223
4224 blist = read_bio;
4225 read_bio->bi_next = NULL;
4226
4227 for (s = 0; s < conf->copies*2; s++) {
4228 struct bio *b;
4229 int d = r10_bio->devs[s/2].devnum;
4230 struct md_rdev *rdev2;
4231 if (s&1) {
4232 rdev2 = conf->mirrors[d].replacement;
4233 b = r10_bio->devs[s/2].repl_bio;
4234 } else {
4235 rdev2 = conf->mirrors[d].rdev;
4236 b = r10_bio->devs[s/2].bio;
4237 }
4238 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4239 continue;
4240 b->bi_bdev = rdev2->bdev;
4241 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4242 b->bi_private = r10_bio;
4243 b->bi_end_io = end_reshape_write;
4244 b->bi_rw = WRITE;
4245 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4246 b->bi_flags |= 1 << BIO_UPTODATE;
4247 b->bi_next = blist;
4248 b->bi_vcnt = 0;
4249 b->bi_idx = 0;
4250 b->bi_size = 0;
4251 blist = b;
4252 }
4253
4254 /* Now add as many pages as possible to all of these bios. */
4255
4256 nr_sectors = 0;
4257 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4258 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4259 int len = (max_sectors - s) << 9;
4260 if (len > PAGE_SIZE)
4261 len = PAGE_SIZE;
4262 for (bio = blist; bio ; bio = bio->bi_next) {
4263 struct bio *bio2;
4264 if (bio_add_page(bio, page, len, 0))
4265 continue;
4266
4267 /* Didn't fit, must stop */
4268 for (bio2 = blist;
4269 bio2 && bio2 != bio;
4270 bio2 = bio2->bi_next) {
4271 /* Remove last page from this bio */
4272 bio2->bi_vcnt--;
4273 bio2->bi_size -= len;
4274 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4275 }
4276 goto bio_full;
4277 }
4278 sector_nr += len >> 9;
4279 nr_sectors += len >> 9;
4280 }
4281bio_full:
4282 r10_bio->sectors = nr_sectors;
4283
4284 /* Now submit the read */
4285 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4286 atomic_inc(&r10_bio->remaining);
4287 read_bio->bi_next = NULL;
4288 generic_make_request(read_bio);
4289 sector_nr += nr_sectors;
4290 sectors_done += nr_sectors;
4291 if (sector_nr <= last)
4292 goto read_more;
4293
4294 /* Now that we have done the whole section we can
4295 * update reshape_progress
4296 */
4297 if (mddev->reshape_backwards)
4298 conf->reshape_progress -= sectors_done;
4299 else
4300 conf->reshape_progress += sectors_done;
4301
4302 return sectors_done;
4303}
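
/*
 * Illustrative sketch (not part of raid10.c): the decision made near
 * the top of reshape_request() about when the superblock must be
 * updated before more chunks are relocated.  The variable names follow
 * the kernel code loosely, but this helper itself is invented; the
 * separate 10-second periodic checkpoint is not shown.
 */
#include <stdint.h>

static int must_flush_metadata(uint64_t next, uint64_t safe,
			       uint64_t offset_diff, int backwards)
{
	if (backwards)
		/* Writes move toward lower addresses: the next write
		 * may not dip below 'safe' by more than the slack. */
		return next + offset_diff < safe;
	/* Forward reshape: the next write may not pass 'safe' by more
	 * than the slack provided by the data_offset change. */
	return next > safe + offset_diff;
}
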
4304
4305static void end_reshape_request(struct r10bio *r10_bio);
4306static int handle_reshape_read_error(struct mddev *mddev,
4307 struct r10bio *r10_bio);
4308static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4309{
4310 /* Reshape read completed. Hopefully we have a block
4311 * to write out.
4312 * If we got a read error then we do sync 1-page reads from
4313 * elsewhere until we find the data - or give up.
4314 */
4315 struct r10conf *conf = mddev->private;
4316 int s;
4317
4318 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4319 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4320 /* Reshape has been aborted */
4321 md_done_sync(mddev, r10_bio->sectors, 0);
4322 return;
4323 }
4324
4325 /* We definitely have the data in the pages, schedule the
4326 * writes.
4327 */
4328 atomic_set(&r10_bio->remaining, 1);
4329 for (s = 0; s < conf->copies*2; s++) {
4330 struct bio *b;
4331 int d = r10_bio->devs[s/2].devnum;
4332 struct md_rdev *rdev;
4333 if (s&1) {
4334 rdev = conf->mirrors[d].replacement;
4335 b = r10_bio->devs[s/2].repl_bio;
4336 } else {
4337 rdev = conf->mirrors[d].rdev;
4338 b = r10_bio->devs[s/2].bio;
4339 }
4340 if (!rdev || test_bit(Faulty, &rdev->flags))
4341 continue;
4342 atomic_inc(&rdev->nr_pending);
4343 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4344 atomic_inc(&r10_bio->remaining);
4345 b->bi_next = NULL;
4346 generic_make_request(b);
4347 }
4348 end_reshape_request(r10_bio);
4349}
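
/*
 * Illustrative sketch (not part of raid10.c): the completion-counting
 * pattern used by reshape_request_write() - start the counter at 1 so
 * the request cannot complete while writes are still being issued, take
 * one reference per submitted write, and drop the initial reference
 * last.  Names are invented; C11 atomics stand in for atomic_t.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct demo_request {
	atomic_int remaining;
};

static void demo_complete(struct demo_request *req)
{
	/* ...signal whoever is waiting on the request... */
	(void)req;
}

static void demo_write_done(struct demo_request *req)
{
	if (atomic_fetch_sub(&req->remaining, 1) == 1)
		demo_complete(req);	/* last reference dropped */
}

static void demo_submit_writes(struct demo_request *req,
			       int ncopies, bool (*submit)(int copy))
{
	int i;

	atomic_store(&req->remaining, 1);	/* bias: issuing in progress */
	for (i = 0; i < ncopies; i++)
		if (submit(i))			/* skip missing/faulty copies */
			atomic_fetch_add(&req->remaining, 1);
	demo_write_done(req);			/* drop the bias */
}
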
4350
4351static void end_reshape(struct r10conf *conf)
4352{
4353 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4354 return;
4355
4356 spin_lock_irq(&conf->device_lock);
4357 conf->prev = conf->geo;
4358 md_finish_reshape(conf->mddev);
4359 smp_wmb();
4360 conf->reshape_progress = MaxSector;
4361 spin_unlock_irq(&conf->device_lock);
4362
4363 /* read-ahead size must cover two whole stripes, which is
4364	 * 2 * (data disks) * chunk size, where data disks = raid_disks / near_copies
4365 */
4366 if (conf->mddev->queue) {
4367 int stripe = conf->geo.raid_disks *
4368 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4369 stripe /= conf->geo.near_copies;
4370 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4371 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4372 }
4373 conf->fullsync = 0;
4374}
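
/*
 * Illustrative sketch (not part of raid10.c): the read-ahead sizing
 * rule applied in end_reshape().  Read-ahead should cover two full
 * stripes of the new layout; with near_copies > 1 a stripe spans fewer
 * distinct data chunks, so the figure is scaled down accordingly.  The
 * helper name and the fixed page size are assumptions for the example.
 */
#define EXAMPLE_PAGE_SIZE 4096UL

static unsigned long reshape_ra_pages(int raid_disks, int near_copies,
				      unsigned long chunk_sectors)
{
	unsigned long stripe_pages =
		raid_disks * ((chunk_sectors << 9) / EXAMPLE_PAGE_SIZE);

	stripe_pages /= near_copies;
	return 2 * stripe_pages;	/* two whole stripes */
}
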
4375
4376
4377static int handle_reshape_read_error(struct mddev *mddev,
4378 struct r10bio *r10_bio)
4379{
4380 /* Use sync reads to get the blocks from somewhere else */
4381 int sectors = r10_bio->sectors;
4382 struct r10bio r10b;
4383 struct r10conf *conf = mddev->private;
4384 int slot = 0;
4385 int idx = 0;
4386 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4387
4388 r10b.sector = r10_bio->sector;
4389 __raid10_find_phys(&conf->prev, &r10b);
4390
4391 while (sectors) {
4392 int s = sectors;
4393 int success = 0;
4394 int first_slot = slot;
4395
4396 if (s > (PAGE_SIZE >> 9))
4397 s = PAGE_SIZE >> 9;
4398
4399 while (!success) {
4400 int d = r10b.devs[slot].devnum;
4401 struct md_rdev *rdev = conf->mirrors[d].rdev;
4402 sector_t addr;
4403 if (rdev == NULL ||
4404 test_bit(Faulty, &rdev->flags) ||
4405 !test_bit(In_sync, &rdev->flags))
4406 goto failed;
4407
4408 addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
4409 success = sync_page_io(rdev,
4410 addr,
4411 s << 9,
4412 bvec[idx].bv_page,
4413 READ, false);
4414 if (success)
4415 break;
4416 failed:
4417 slot++;
4418 if (slot >= conf->copies)
4419 slot = 0;
4420 if (slot == first_slot)
4421 break;
4422 }
4423 if (!success) {
4424 /* couldn't read this block, must give up */
4425 set_bit(MD_RECOVERY_INTR,
4426 &mddev->recovery);
4427 return -EIO;
4428 }
4429 sectors -= s;
4430 idx++;
4431 }
4432 return 0;
4433}
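
/*
 * Illustrative sketch (not part of raid10.c): the retry order used by
 * handle_reshape_read_error() - try each copy of a block in turn,
 * wrapping around, and give up once the starting slot comes back
 * around.  The read callback and its meaning are invented here.
 */
#include <stdbool.h>

static bool read_any_copy(int ncopies, int start_slot,
			  bool (*try_read)(int slot))
{
	int slot = start_slot;

	do {
		if (try_read(slot))
			return true;	/* got the data from this copy */
		slot = (slot + 1) % ncopies;
	} while (slot != start_slot);

	return false;			/* every copy failed: abort reshape */
}
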
4434
4435static void end_reshape_write(struct bio *bio, int error)
4436{
4437 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4438 struct r10bio *r10_bio = bio->bi_private;
4439 struct mddev *mddev = r10_bio->mddev;
4440 struct r10conf *conf = mddev->private;
4441 int d;
4442 int slot;
4443 int repl;
4444 struct md_rdev *rdev = NULL;
4445
4446 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4447 if (repl)
4448 rdev = conf->mirrors[d].replacement;
4449 if (!rdev) {
4450 smp_mb();
4451 rdev = conf->mirrors[d].rdev;
4452 }
4453
4454 if (!uptodate) {
4455 /* FIXME should record badblock */
4456 md_error(mddev, rdev);
4457 }
4458
4459 rdev_dec_pending(rdev, mddev);
4460 end_reshape_request(r10_bio);
4461}
4462
4463static void end_reshape_request(struct r10bio *r10_bio)
4464{
4465 if (!atomic_dec_and_test(&r10_bio->remaining))
4466 return;
4467 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4468 bio_put(r10_bio->master_bio);
4469 put_buf(r10_bio);
4470}
4471
4472static void raid10_finish_reshape(struct mddev *mddev)
4473{
4474 struct r10conf *conf = mddev->private;
4475
4476 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4477 return;
4478
4479 if (mddev->delta_disks > 0) {
4480 sector_t size = raid10_size(mddev, 0, 0);
4481 md_set_array_sectors(mddev, size);
4482 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4483 mddev->recovery_cp = mddev->resync_max_sectors;
4484 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4485 }
4486 mddev->resync_max_sectors = size;
4487 set_capacity(mddev->gendisk, mddev->array_sectors);
4488 revalidate_disk(mddev->gendisk);
4489 } else {
4490 int d;
4491 for (d = conf->geo.raid_disks ;
4492 d < conf->geo.raid_disks - mddev->delta_disks;
4493 d++) {
4494 struct md_rdev *rdev = conf->mirrors[d].rdev;
4495 if (rdev)
4496 clear_bit(In_sync, &rdev->flags);
4497 rdev = conf->mirrors[d].replacement;
4498 if (rdev)
4499 clear_bit(In_sync, &rdev->flags);
4500 }
4501 }
4502 mddev->layout = mddev->new_layout;
4503 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4504 mddev->reshape_position = MaxSector;
4505 mddev->delta_disks = 0;
4506 mddev->reshape_backwards = 0;
4507}
4508
3537static struct md_personality raid10_personality = 4509static struct md_personality raid10_personality =
3538{ 4510{
3539 .name = "raid10", 4511 .name = "raid10",
@@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality =
3552 .size = raid10_size, 4524 .size = raid10_size,
3553 .resize = raid10_resize, 4525 .resize = raid10_resize,
3554 .takeover = raid10_takeover, 4526 .takeover = raid10_takeover,
4527 .check_reshape = raid10_check_reshape,
4528 .start_reshape = raid10_start_reshape,
4529 .finish_reshape = raid10_finish_reshape,
3555}; 4530};
3556 4531
3557static int __init raid_init(void) 4532static int __init raid_init(void)