Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 1281 |
1 file changed, 1128 insertions, 153 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3f91c2e1dfe7..987db37cb875 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | ||
27 | #include "md.h" | 28 | #include "md.h" |
28 | #include "raid10.h" | 29 | #include "raid10.h" |
29 | #include "raid0.h" | 30 | #include "raid0.h" |
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024; | |||
68 | static void allow_barrier(struct r10conf *conf); | 69 | static void allow_barrier(struct r10conf *conf); |
69 | static void lower_barrier(struct r10conf *conf); | 70 | static void lower_barrier(struct r10conf *conf); |
70 | static int enough(struct r10conf *conf, int ignore); | 71 | static int enough(struct r10conf *conf, int ignore); |
72 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||
73 | int *skipped); | ||
74 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); | ||
75 | static void end_reshape_write(struct bio *bio, int error); | ||
76 | static void end_reshape(struct r10conf *conf); | ||
71 | 77 | ||
72 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 78 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
73 | { | 79 | { |
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
112 | if (!r10_bio) | 118 | if (!r10_bio) |
113 | return NULL; | 119 | return NULL; |
114 | 120 | ||
115 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | 121 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || |
122 | test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) | ||
116 | nalloc = conf->copies; /* resync */ | 123 | nalloc = conf->copies; /* resync */ |
117 | else | 124 | else |
118 | nalloc = 2; /* recovery */ | 125 | nalloc = 2; /* recovery */ |
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
140 | struct bio *rbio = r10_bio->devs[j].repl_bio; | 147 | struct bio *rbio = r10_bio->devs[j].repl_bio; |
141 | bio = r10_bio->devs[j].bio; | 148 | bio = r10_bio->devs[j].bio; |
142 | for (i = 0; i < RESYNC_PAGES; i++) { | 149 | for (i = 0; i < RESYNC_PAGES; i++) { |
143 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, | 150 | if (j > 0 && !test_bit(MD_RECOVERY_SYNC, |
144 | &conf->mddev->recovery)) { | 151 | &conf->mddev->recovery)) { |
145 | /* we can share bv_page's during recovery */ | 152 | /* we can share bv_page's during recovery |
153 | * and reshape */ | ||
146 | struct bio *rbio = r10_bio->devs[0].bio; | 154 | struct bio *rbio = r10_bio->devs[0].bio; |
147 | page = rbio->bi_io_vec[i].bv_page; | 155 | page = rbio->bi_io_vec[i].bv_page; |
148 | get_page(page); | 156 | get_page(page); |
@@ -165,10 +173,11 @@ out_free_pages: | |||
165 | while (j--) | 173 | while (j--) |
166 | for (i = 0; i < RESYNC_PAGES ; i++) | 174 | for (i = 0; i < RESYNC_PAGES ; i++) |
167 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); | 175 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); |
168 | j = -1; | 176 | j = 0; |
169 | out_free_bio: | 177 | out_free_bio: |
170 | while (++j < nalloc) { | 178 | for ( ; j < nalloc; j++) { |
171 | bio_put(r10_bio->devs[j].bio); | 179 | if (r10_bio->devs[j].bio) |
180 | bio_put(r10_bio->devs[j].bio); | ||
172 | if (r10_bio->devs[j].repl_bio) | 181 | if (r10_bio->devs[j].repl_bio) |
173 | bio_put(r10_bio->devs[j].repl_bio); | 182 | bio_put(r10_bio->devs[j].repl_bio); |
174 | } | 183 | } |
@@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
504 | * sector offset to a virtual address | 513 | * sector offset to a virtual address |
505 | */ | 514 | */ |
506 | 515 | ||
507 | static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) | 516 | static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) |
508 | { | 517 | { |
509 | int n,f; | 518 | int n,f; |
510 | sector_t sector; | 519 | sector_t sector; |
511 | sector_t chunk; | 520 | sector_t chunk; |
512 | sector_t stripe; | 521 | sector_t stripe; |
513 | int dev; | 522 | int dev; |
514 | |||
515 | int slot = 0; | 523 | int slot = 0; |
516 | 524 | ||
517 | /* now calculate first sector/dev */ | 525 | /* now calculate first sector/dev */ |
518 | chunk = r10bio->sector >> conf->chunk_shift; | 526 | chunk = r10bio->sector >> geo->chunk_shift; |
519 | sector = r10bio->sector & conf->chunk_mask; | 527 | sector = r10bio->sector & geo->chunk_mask; |
520 | 528 | ||
521 | chunk *= conf->near_copies; | 529 | chunk *= geo->near_copies; |
522 | stripe = chunk; | 530 | stripe = chunk; |
523 | dev = sector_div(stripe, conf->raid_disks); | 531 | dev = sector_div(stripe, geo->raid_disks); |
524 | if (conf->far_offset) | 532 | if (geo->far_offset) |
525 | stripe *= conf->far_copies; | 533 | stripe *= geo->far_copies; |
526 | 534 | ||
527 | sector += stripe << conf->chunk_shift; | 535 | sector += stripe << geo->chunk_shift; |
528 | 536 | ||
529 | /* and calculate all the others */ | 537 | /* and calculate all the others */ |
530 | for (n=0; n < conf->near_copies; n++) { | 538 | for (n = 0; n < geo->near_copies; n++) { |
531 | int d = dev; | 539 | int d = dev; |
532 | sector_t s = sector; | 540 | sector_t s = sector; |
533 | r10bio->devs[slot].addr = sector; | 541 | r10bio->devs[slot].addr = sector; |
534 | r10bio->devs[slot].devnum = d; | 542 | r10bio->devs[slot].devnum = d; |
535 | slot++; | 543 | slot++; |
536 | 544 | ||
537 | for (f = 1; f < conf->far_copies; f++) { | 545 | for (f = 1; f < geo->far_copies; f++) { |
538 | d += conf->near_copies; | 546 | d += geo->near_copies; |
539 | if (d >= conf->raid_disks) | 547 | if (d >= geo->raid_disks) |
540 | d -= conf->raid_disks; | 548 | d -= geo->raid_disks; |
541 | s += conf->stride; | 549 | s += geo->stride; |
542 | r10bio->devs[slot].devnum = d; | 550 | r10bio->devs[slot].devnum = d; |
543 | r10bio->devs[slot].addr = s; | 551 | r10bio->devs[slot].addr = s; |
544 | slot++; | 552 | slot++; |
545 | } | 553 | } |
546 | dev++; | 554 | dev++; |
547 | if (dev >= conf->raid_disks) { | 555 | if (dev >= geo->raid_disks) { |
548 | dev = 0; | 556 | dev = 0; |
549 | sector += (conf->chunk_mask + 1); | 557 | sector += (geo->chunk_mask + 1); |
550 | } | 558 | } |
551 | } | 559 | } |
552 | BUG_ON(slot != conf->copies); | 560 | } |
561 | |||
562 | static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) | ||
563 | { | ||
564 | struct geom *geo = &conf->geo; | ||
565 | |||
566 | if (conf->reshape_progress != MaxSector && | ||
567 | ((r10bio->sector >= conf->reshape_progress) != | ||
568 | conf->mddev->reshape_backwards)) { | ||
569 | set_bit(R10BIO_Previous, &r10bio->state); | ||
570 | geo = &conf->prev; | ||
571 | } else | ||
572 | clear_bit(R10BIO_Previous, &r10bio->state); | ||
573 | |||
574 | __raid10_find_phys(geo, r10bio); | ||
553 | } | 575 | } |
554 | 576 | ||
555 | static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | 577 | static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) |
556 | { | 578 | { |
557 | sector_t offset, chunk, vchunk; | 579 | sector_t offset, chunk, vchunk; |
580 | /* Never use conf->prev as this is only called during resync | ||
581 | * or recovery, so reshape isn't happening | ||
582 | */ | ||
583 | struct geom *geo = &conf->geo; | ||
558 | 584 | ||
559 | offset = sector & conf->chunk_mask; | 585 | offset = sector & geo->chunk_mask; |
560 | if (conf->far_offset) { | 586 | if (geo->far_offset) { |
561 | int fc; | 587 | int fc; |
562 | chunk = sector >> conf->chunk_shift; | 588 | chunk = sector >> geo->chunk_shift; |
563 | fc = sector_div(chunk, conf->far_copies); | 589 | fc = sector_div(chunk, geo->far_copies); |
564 | dev -= fc * conf->near_copies; | 590 | dev -= fc * geo->near_copies; |
565 | if (dev < 0) | 591 | if (dev < 0) |
566 | dev += conf->raid_disks; | 592 | dev += geo->raid_disks; |
567 | } else { | 593 | } else { |
568 | while (sector >= conf->stride) { | 594 | while (sector >= geo->stride) { |
569 | sector -= conf->stride; | 595 | sector -= geo->stride; |
570 | if (dev < conf->near_copies) | 596 | if (dev < geo->near_copies) |
571 | dev += conf->raid_disks - conf->near_copies; | 597 | dev += geo->raid_disks - geo->near_copies; |
572 | else | 598 | else |
573 | dev -= conf->near_copies; | 599 | dev -= geo->near_copies; |
574 | } | 600 | } |
575 | chunk = sector >> conf->chunk_shift; | 601 | chunk = sector >> geo->chunk_shift; |
576 | } | 602 | } |
577 | vchunk = chunk * conf->raid_disks + dev; | 603 | vchunk = chunk * geo->raid_disks + dev; |
578 | sector_div(vchunk, conf->near_copies); | 604 | sector_div(vchunk, geo->near_copies); |
579 | return (vchunk << conf->chunk_shift) + offset; | 605 | return (vchunk << geo->chunk_shift) + offset; |
580 | } | 606 | } |
581 | 607 | ||
582 | /** | 608 | /** |
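Illustrative note (not part of the patch): __raid10_find_phys() maps one logical array sector to a (device, sector) pair for each copy, now using the geometry passed in rather than fields read directly from conf. A minimal userspace sketch of the near-copy path only, assuming a hypothetical geometry of 4 disks, near_copies=2, far_copies=1 and 64-sector chunks:

#include <stdio.h>

int main(void)
{
	unsigned long long lsector = 100;        /* logical array sector */
	int raid_disks = 4, near_copies = 2;     /* assumed geometry */
	int chunk_shift = 6;                     /* 64-sector chunks */
	unsigned long long chunk_mask = 63;

	unsigned long long chunk  = lsector >> chunk_shift;   /* 1 */
	unsigned long long sector = lsector & chunk_mask;     /* 36 */
	unsigned long long stripe;
	int dev, n;

	chunk *= near_copies;                    /* 2 */
	stripe = chunk / raid_disks;             /* 0 */
	dev = chunk % raid_disks;                /* 2 */
	sector += stripe << chunk_shift;         /* still 36 */

	for (n = 0; n < near_copies; n++)
		printf("copy %d: dev %d, sector %llu\n",
		       n, (dev + n) % raid_disks, sector);
	/* prints: copy 0: dev 2, sector 36
	 *         copy 1: dev 3, sector 36 */
	return 0;
}

The far-copy handling and the sector bump when the device index wraps past raid_disks are omitted here; the real function above covers both.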
@@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
597 | struct r10conf *conf = mddev->private; | 623 | struct r10conf *conf = mddev->private; |
598 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 624 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
599 | int max; | 625 | int max; |
600 | unsigned int chunk_sectors = mddev->chunk_sectors; | 626 | unsigned int chunk_sectors; |
601 | unsigned int bio_sectors = bvm->bi_size >> 9; | 627 | unsigned int bio_sectors = bvm->bi_size >> 9; |
628 | struct geom *geo = &conf->geo; | ||
629 | |||
630 | chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; | ||
631 | if (conf->reshape_progress != MaxSector && | ||
632 | ((sector >= conf->reshape_progress) != | ||
633 | conf->mddev->reshape_backwards)) | ||
634 | geo = &conf->prev; | ||
602 | 635 | ||
603 | if (conf->near_copies < conf->raid_disks) { | 636 | if (geo->near_copies < geo->raid_disks) { |
604 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) | 637 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) |
605 | + bio_sectors)) << 9; | 638 | + bio_sectors)) << 9; |
606 | if (max < 0) | 639 | if (max < 0) |
@@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
614 | if (mddev->merge_check_needed) { | 647 | if (mddev->merge_check_needed) { |
615 | struct r10bio r10_bio; | 648 | struct r10bio r10_bio; |
616 | int s; | 649 | int s; |
650 | if (conf->reshape_progress != MaxSector) { | ||
651 | /* Cannot give any guidance during reshape */ | ||
652 | if (max <= biovec->bv_len && bio_sectors == 0) | ||
653 | return biovec->bv_len; | ||
654 | return 0; | ||
655 | } | ||
617 | r10_bio.sector = sector; | 656 | r10_bio.sector = sector; |
618 | raid10_find_phys(conf, &r10_bio); | 657 | raid10_find_phys(conf, &r10_bio); |
619 | rcu_read_lock(); | 658 | rcu_read_lock(); |
@@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
681 | struct md_rdev *rdev, *best_rdev; | 720 | struct md_rdev *rdev, *best_rdev; |
682 | int do_balance; | 721 | int do_balance; |
683 | int best_slot; | 722 | int best_slot; |
723 | struct geom *geo = &conf->geo; | ||
684 | 724 | ||
685 | raid10_find_phys(conf, r10_bio); | 725 | raid10_find_phys(conf, r10_bio); |
686 | rcu_read_lock(); | 726 | rcu_read_lock(); |
@@ -761,11 +801,11 @@ retry: | |||
761 | * sequential read speed for 'far copies' arrays. So only | 801 | * sequential read speed for 'far copies' arrays. So only |
762 | * keep it for 'near' arrays, and review those later. | 802 | * keep it for 'near' arrays, and review those later. |
763 | */ | 803 | */ |
764 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) | 804 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
765 | break; | 805 | break; |
766 | 806 | ||
767 | /* for far > 1 always use the lowest address */ | 807 | /* for far > 1 always use the lowest address */ |
768 | if (conf->far_copies > 1) | 808 | if (geo->far_copies > 1) |
769 | new_distance = r10_bio->devs[slot].addr; | 809 | new_distance = r10_bio->devs[slot].addr; |
770 | else | 810 | else |
771 | new_distance = abs(r10_bio->devs[slot].addr - | 811 | new_distance = abs(r10_bio->devs[slot].addr - |
@@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits) | |||
812 | if (mddev_congested(mddev, bits)) | 852 | if (mddev_congested(mddev, bits)) |
813 | return 1; | 853 | return 1; |
814 | rcu_read_lock(); | 854 | rcu_read_lock(); |
815 | for (i = 0; i < conf->raid_disks && ret == 0; i++) { | 855 | for (i = 0; |
856 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) | ||
857 | && ret == 0; | ||
858 | i++) { | ||
816 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 859 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
817 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 860 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
818 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 861 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
@@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf) | |||
973 | spin_unlock_irq(&conf->resync_lock); | 1016 | spin_unlock_irq(&conf->resync_lock); |
974 | } | 1017 | } |
975 | 1018 | ||
1019 | static sector_t choose_data_offset(struct r10bio *r10_bio, | ||
1020 | struct md_rdev *rdev) | ||
1021 | { | ||
1022 | if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || | ||
1023 | test_bit(R10BIO_Previous, &r10_bio->state)) | ||
1024 | return rdev->data_offset; | ||
1025 | else | ||
1026 | return rdev->new_data_offset; | ||
1027 | } | ||
1028 | |||
976 | static void make_request(struct mddev *mddev, struct bio * bio) | 1029 | static void make_request(struct mddev *mddev, struct bio * bio) |
977 | { | 1030 | { |
978 | struct r10conf *conf = mddev->private; | 1031 | struct r10conf *conf = mddev->private; |
979 | struct r10bio *r10_bio; | 1032 | struct r10bio *r10_bio; |
980 | struct bio *read_bio; | 1033 | struct bio *read_bio; |
981 | int i; | 1034 | int i; |
982 | int chunk_sects = conf->chunk_mask + 1; | 1035 | sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); |
1036 | int chunk_sects = chunk_mask + 1; | ||
983 | const int rw = bio_data_dir(bio); | 1037 | const int rw = bio_data_dir(bio); |
984 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 1038 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
985 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1039 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
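Illustrative note (not from the patch): while a reshape is active each member device effectively has two valid data offsets, and choose_data_offset() picks between them based on which side of conf->reshape_progress the bio falls on (raid10_find_phys() records that in R10BIO_Previous). A standalone sketch of the forward-reshape case, using a hypothetical helper name and made-up values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Forward reshape assumed: sectors the reshape has not reached yet are
 * still in the old geometry and are addressed at the old data_offset;
 * sectors below reshape_progress already use the new geometry and the
 * new_data_offset. */
static bool uses_old_layout(sector_t sector, sector_t reshape_progress)
{
	return sector >= reshape_progress;
}

int main(void)
{
	sector_t progress = 1024 * 1024;	/* example reshape position */

	printf("%d %d\n",
	       uses_old_layout(100, progress),              /* 0: already reshaped */
	       uses_old_layout(4096 * 1024, progress));     /* 1: still old layout */
	return 0;
}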
@@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
988 | int plugged; | 1042 | int plugged; |
989 | int sectors_handled; | 1043 | int sectors_handled; |
990 | int max_sectors; | 1044 | int max_sectors; |
1045 | int sectors; | ||
991 | 1046 | ||
992 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 1047 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
993 | md_flush_request(mddev, bio); | 1048 | md_flush_request(mddev, bio); |
@@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
997 | /* If this request crosses a chunk boundary, we need to | 1052 | /* If this request crosses a chunk boundary, we need to |
998 | * split it. This will only happen for 1 PAGE (or less) requests. | 1053 | * split it. This will only happen for 1 PAGE (or less) requests. |
999 | */ | 1054 | */ |
1000 | if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) | 1055 | if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) |
1001 | > chunk_sects && | 1056 | > chunk_sects |
1002 | conf->near_copies < conf->raid_disks)) { | 1057 | && (conf->geo.near_copies < conf->geo.raid_disks |
1058 | || conf->prev.near_copies < conf->prev.raid_disks))) { | ||
1003 | struct bio_pair *bp; | 1059 | struct bio_pair *bp; |
1004 | /* Sanity check -- queue functions should prevent this happening */ | 1060 | /* Sanity check -- queue functions should prevent this happening */ |
1005 | if (bio->bi_vcnt != 1 || | 1061 | if (bio->bi_vcnt != 1 || |
@@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1051 | */ | 1107 | */ |
1052 | wait_barrier(conf); | 1108 | wait_barrier(conf); |
1053 | 1109 | ||
1110 | sectors = bio->bi_size >> 9; | ||
1111 | while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
1112 | bio->bi_sector < conf->reshape_progress && | ||
1113 | bio->bi_sector + sectors > conf->reshape_progress) { | ||
1114 | /* IO spans the reshape position. Need to wait for | ||
1115 | * reshape to pass | ||
1116 | */ | ||
1117 | allow_barrier(conf); | ||
1118 | wait_event(conf->wait_barrier, | ||
1119 | conf->reshape_progress <= bio->bi_sector || | ||
1120 | conf->reshape_progress >= bio->bi_sector + sectors); | ||
1121 | wait_barrier(conf); | ||
1122 | } | ||
1123 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
1124 | bio_data_dir(bio) == WRITE && | ||
1125 | (mddev->reshape_backwards | ||
1126 | ? (bio->bi_sector < conf->reshape_safe && | ||
1127 | bio->bi_sector + sectors > conf->reshape_progress) | ||
1128 | : (bio->bi_sector + sectors > conf->reshape_safe && | ||
1129 | bio->bi_sector < conf->reshape_progress))) { | ||
1130 | /* Need to update reshape_position in metadata */ | ||
1131 | mddev->reshape_position = conf->reshape_progress; | ||
1132 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1133 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | ||
1134 | md_wakeup_thread(mddev->thread); | ||
1135 | wait_event(mddev->sb_wait, | ||
1136 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
1137 | |||
1138 | conf->reshape_safe = mddev->reshape_position; | ||
1139 | } | ||
1140 | |||
1054 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | 1141 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); |
1055 | 1142 | ||
1056 | r10_bio->master_bio = bio; | 1143 | r10_bio->master_bio = bio; |
1057 | r10_bio->sectors = bio->bi_size >> 9; | 1144 | r10_bio->sectors = sectors; |
1058 | 1145 | ||
1059 | r10_bio->mddev = mddev; | 1146 | r10_bio->mddev = mddev; |
1060 | r10_bio->sector = bio->bi_sector; | 1147 | r10_bio->sector = bio->bi_sector; |
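Illustrative only: the new code in make_request() stalls any I/O whose range straddles conf->reshape_progress, because such a request would need the old geometry for one part and the new geometry for the other. The condition it waits out amounts to the following (hypothetical name, userspace sketch):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* True when [start, start + sectors) overlaps the reshape position,
 * i.e. part of the request lies in the old layout and part in the new. */
static bool io_spans_reshape(sector_t start, sector_t sectors,
			     sector_t reshape_progress)
{
	return start < reshape_progress &&
	       start + sectors > reshape_progress;
}

int main(void)
{
	return io_spans_reshape(1000, 16, 1008) ? 0 : 1;  /* spans: exits 0 */
}

The second new block handles writes landing in the region between reshape_safe and reshape_progress: it forces reshape_position out to the metadata first, so a crash cannot restart the reshape over blocks that have already received writes in the new layout.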
@@ -1093,7 +1180,7 @@ read_again: | |||
1093 | r10_bio->devs[slot].rdev = rdev; | 1180 | r10_bio->devs[slot].rdev = rdev; |
1094 | 1181 | ||
1095 | read_bio->bi_sector = r10_bio->devs[slot].addr + | 1182 | read_bio->bi_sector = r10_bio->devs[slot].addr + |
1096 | rdev->data_offset; | 1183 | choose_data_offset(r10_bio, rdev); |
1097 | read_bio->bi_bdev = rdev->bdev; | 1184 | read_bio->bi_bdev = rdev->bdev; |
1098 | read_bio->bi_end_io = raid10_end_read_request; | 1185 | read_bio->bi_end_io = raid10_end_read_request; |
1099 | read_bio->bi_rw = READ | do_sync; | 1186 | read_bio->bi_rw = READ | do_sync; |
@@ -1297,7 +1384,8 @@ retry_write: | |||
1297 | r10_bio->devs[i].bio = mbio; | 1384 | r10_bio->devs[i].bio = mbio; |
1298 | 1385 | ||
1299 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1386 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
1300 | conf->mirrors[d].rdev->data_offset); | 1387 | choose_data_offset(r10_bio, |
1388 | conf->mirrors[d].rdev)); | ||
1301 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1389 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1302 | mbio->bi_end_io = raid10_end_write_request; | 1390 | mbio->bi_end_io = raid10_end_write_request; |
1303 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1391 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -1321,8 +1409,10 @@ retry_write: | |||
1321 | * so it cannot disappear, so the replacement cannot | 1409 | * so it cannot disappear, so the replacement cannot |
1322 | * become NULL here | 1410 | * become NULL here |
1323 | */ | 1411 | */ |
1324 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1412 | mbio->bi_sector = (r10_bio->devs[i].addr + |
1325 | conf->mirrors[d].replacement->data_offset); | 1413 | choose_data_offset( |
1414 | r10_bio, | ||
1415 | conf->mirrors[d].replacement)); | ||
1326 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; | 1416 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; |
1327 | mbio->bi_end_io = raid10_end_write_request; | 1417 | mbio->bi_end_io = raid10_end_write_request; |
1328 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1418 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev) | |||
1368 | struct r10conf *conf = mddev->private; | 1458 | struct r10conf *conf = mddev->private; |
1369 | int i; | 1459 | int i; |
1370 | 1460 | ||
1371 | if (conf->near_copies < conf->raid_disks) | 1461 | if (conf->geo.near_copies < conf->geo.raid_disks) |
1372 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); | 1462 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); |
1373 | if (conf->near_copies > 1) | 1463 | if (conf->geo.near_copies > 1) |
1374 | seq_printf(seq, " %d near-copies", conf->near_copies); | 1464 | seq_printf(seq, " %d near-copies", conf->geo.near_copies); |
1375 | if (conf->far_copies > 1) { | 1465 | if (conf->geo.far_copies > 1) { |
1376 | if (conf->far_offset) | 1466 | if (conf->geo.far_offset) |
1377 | seq_printf(seq, " %d offset-copies", conf->far_copies); | 1467 | seq_printf(seq, " %d offset-copies", conf->geo.far_copies); |
1378 | else | 1468 | else |
1379 | seq_printf(seq, " %d far-copies", conf->far_copies); | 1469 | seq_printf(seq, " %d far-copies", conf->geo.far_copies); |
1380 | } | 1470 | } |
1381 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | 1471 | seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, |
1382 | conf->raid_disks - mddev->degraded); | 1472 | conf->geo.raid_disks - mddev->degraded); |
1383 | for (i = 0; i < conf->raid_disks; i++) | 1473 | for (i = 0; i < conf->geo.raid_disks; i++) |
1384 | seq_printf(seq, "%s", | 1474 | seq_printf(seq, "%s", |
1385 | conf->mirrors[i].rdev && | 1475 | conf->mirrors[i].rdev && |
1386 | test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); | 1476 | test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); |
@@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) | |||
1392 | * Don't consider the device numbered 'ignore' | 1482 | * Don't consider the device numbered 'ignore' |
1393 | * as we might be about to remove it. | 1483 | * as we might be about to remove it. |
1394 | */ | 1484 | */ |
1395 | static int enough(struct r10conf *conf, int ignore) | 1485 | static int _enough(struct r10conf *conf, struct geom *geo, int ignore) |
1396 | { | 1486 | { |
1397 | int first = 0; | 1487 | int first = 0; |
1398 | 1488 | ||
@@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore) | |||
1403 | if (conf->mirrors[first].rdev && | 1493 | if (conf->mirrors[first].rdev && |
1404 | first != ignore) | 1494 | first != ignore) |
1405 | cnt++; | 1495 | cnt++; |
1406 | first = (first+1) % conf->raid_disks; | 1496 | first = (first+1) % geo->raid_disks; |
1407 | } | 1497 | } |
1408 | if (cnt == 0) | 1498 | if (cnt == 0) |
1409 | return 0; | 1499 | return 0; |
@@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore) | |||
1411 | return 1; | 1501 | return 1; |
1412 | } | 1502 | } |
1413 | 1503 | ||
1504 | static int enough(struct r10conf *conf, int ignore) | ||
1505 | { | ||
1506 | return _enough(conf, &conf->geo, ignore) && | ||
1507 | _enough(conf, &conf->prev, ignore); | ||
1508 | } | ||
1509 | |||
1414 | static void error(struct mddev *mddev, struct md_rdev *rdev) | 1510 | static void error(struct mddev *mddev, struct md_rdev *rdev) |
1415 | { | 1511 | { |
1416 | char b[BDEVNAME_SIZE]; | 1512 | char b[BDEVNAME_SIZE]; |
@@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
1445 | "md/raid10:%s: Disk failure on %s, disabling device.\n" | 1541 | "md/raid10:%s: Disk failure on %s, disabling device.\n" |
1446 | "md/raid10:%s: Operation continuing on %d devices.\n", | 1542 | "md/raid10:%s: Operation continuing on %d devices.\n", |
1447 | mdname(mddev), bdevname(rdev->bdev, b), | 1543 | mdname(mddev), bdevname(rdev->bdev, b), |
1448 | mdname(mddev), conf->raid_disks - mddev->degraded); | 1544 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); |
1449 | } | 1545 | } |
1450 | 1546 | ||
1451 | static void print_conf(struct r10conf *conf) | 1547 | static void print_conf(struct r10conf *conf) |
@@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf) | |||
1458 | printk(KERN_DEBUG "(!conf)\n"); | 1554 | printk(KERN_DEBUG "(!conf)\n"); |
1459 | return; | 1555 | return; |
1460 | } | 1556 | } |
1461 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1557 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, |
1462 | conf->raid_disks); | 1558 | conf->geo.raid_disks); |
1463 | 1559 | ||
1464 | for (i = 0; i < conf->raid_disks; i++) { | 1560 | for (i = 0; i < conf->geo.raid_disks; i++) { |
1465 | char b[BDEVNAME_SIZE]; | 1561 | char b[BDEVNAME_SIZE]; |
1466 | tmp = conf->mirrors + i; | 1562 | tmp = conf->mirrors + i; |
1467 | if (tmp->rdev) | 1563 | if (tmp->rdev) |
@@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev) | |||
1493 | * Find all non-in_sync disks within the RAID10 configuration | 1589 | * Find all non-in_sync disks within the RAID10 configuration |
1494 | * and mark them in_sync | 1590 | * and mark them in_sync |
1495 | */ | 1591 | */ |
1496 | for (i = 0; i < conf->raid_disks; i++) { | 1592 | for (i = 0; i < conf->geo.raid_disks; i++) { |
1497 | tmp = conf->mirrors + i; | 1593 | tmp = conf->mirrors + i; |
1498 | if (tmp->replacement | 1594 | if (tmp->replacement |
1499 | && tmp->replacement->recovery_offset == MaxSector | 1595 | && tmp->replacement->recovery_offset == MaxSector |
@@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1535 | int err = -EEXIST; | 1631 | int err = -EEXIST; |
1536 | int mirror; | 1632 | int mirror; |
1537 | int first = 0; | 1633 | int first = 0; |
1538 | int last = conf->raid_disks - 1; | 1634 | int last = conf->geo.raid_disks - 1; |
1539 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 1635 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
1540 | 1636 | ||
1541 | if (mddev->recovery_cp < MaxSector) | 1637 | if (mddev->recovery_cp < MaxSector) |
@@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1543 | * very different from resync | 1639 | * very different from resync |
1544 | */ | 1640 | */ |
1545 | return -EBUSY; | 1641 | return -EBUSY; |
1546 | if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) | 1642 | if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) |
1547 | return -EINVAL; | 1643 | return -EINVAL; |
1548 | 1644 | ||
1549 | if (rdev->raid_disk >= 0) | 1645 | if (rdev->raid_disk >= 0) |
@@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1635 | if (!test_bit(Faulty, &rdev->flags) && | 1731 | if (!test_bit(Faulty, &rdev->flags) && |
1636 | mddev->recovery_disabled != p->recovery_disabled && | 1732 | mddev->recovery_disabled != p->recovery_disabled && |
1637 | (!p->replacement || p->replacement == rdev) && | 1733 | (!p->replacement || p->replacement == rdev) && |
1734 | number < conf->geo.raid_disks && | ||
1638 | enough(conf, -1)) { | 1735 | enough(conf, -1)) { |
1639 | err = -EBUSY; | 1736 | err = -EBUSY; |
1640 | goto abort; | 1737 | goto abort; |
@@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error) | |||
1676 | struct r10conf *conf = r10_bio->mddev->private; | 1773 | struct r10conf *conf = r10_bio->mddev->private; |
1677 | int d; | 1774 | int d; |
1678 | 1775 | ||
1679 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | 1776 | if (bio == r10_bio->master_bio) { |
1777 | /* this is a reshape read */ | ||
1778 | d = r10_bio->read_slot; /* really the read dev */ | ||
1779 | } else | ||
1780 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | ||
1680 | 1781 | ||
1681 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1782 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1682 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1783 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
@@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2218 | " (%d sectors at %llu on %s)\n", | 2319 | " (%d sectors at %llu on %s)\n", |
2219 | mdname(mddev), s, | 2320 | mdname(mddev), s, |
2220 | (unsigned long long)( | 2321 | (unsigned long long)( |
2221 | sect + rdev->data_offset), | 2322 | sect + |
2323 | choose_data_offset(r10_bio, | ||
2324 | rdev)), | ||
2222 | bdevname(rdev->bdev, b)); | 2325 | bdevname(rdev->bdev, b)); |
2223 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2326 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
2224 | "drive\n", | 2327 | "drive\n", |
@@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2256 | " (%d sectors at %llu on %s)\n", | 2359 | " (%d sectors at %llu on %s)\n", |
2257 | mdname(mddev), s, | 2360 | mdname(mddev), s, |
2258 | (unsigned long long)( | 2361 | (unsigned long long)( |
2259 | sect + rdev->data_offset), | 2362 | sect + |
2363 | choose_data_offset(r10_bio, rdev)), | ||
2260 | bdevname(rdev->bdev, b)); | 2364 | bdevname(rdev->bdev, b)); |
2261 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2365 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
2262 | "drive\n", | 2366 | "drive\n", |
@@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2269 | " (%d sectors at %llu on %s)\n", | 2373 | " (%d sectors at %llu on %s)\n", |
2270 | mdname(mddev), s, | 2374 | mdname(mddev), s, |
2271 | (unsigned long long)( | 2375 | (unsigned long long)( |
2272 | sect + rdev->data_offset), | 2376 | sect + |
2377 | choose_data_offset(r10_bio, rdev)), | ||
2273 | bdevname(rdev->bdev, b)); | 2378 | bdevname(rdev->bdev, b)); |
2274 | atomic_add(s, &rdev->corrected_errors); | 2379 | atomic_add(s, &rdev->corrected_errors); |
2275 | } | 2380 | } |
@@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) | |||
2343 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 2448 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
2344 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | 2449 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); |
2345 | wbio->bi_sector = (r10_bio->devs[i].addr+ | 2450 | wbio->bi_sector = (r10_bio->devs[i].addr+ |
2346 | rdev->data_offset+ | 2451 | choose_data_offset(r10_bio, rdev) + |
2347 | (sector - r10_bio->sector)); | 2452 | (sector - r10_bio->sector)); |
2348 | wbio->bi_bdev = rdev->bdev; | 2453 | wbio->bi_bdev = rdev->bdev; |
2349 | if (submit_bio_wait(WRITE, wbio) == 0) | 2454 | if (submit_bio_wait(WRITE, wbio) == 0) |
@@ -2420,7 +2525,7 @@ read_more: | |||
2420 | r10_bio->devs[slot].bio = bio; | 2525 | r10_bio->devs[slot].bio = bio; |
2421 | r10_bio->devs[slot].rdev = rdev; | 2526 | r10_bio->devs[slot].rdev = rdev; |
2422 | bio->bi_sector = r10_bio->devs[slot].addr | 2527 | bio->bi_sector = r10_bio->devs[slot].addr |
2423 | + rdev->data_offset; | 2528 | + choose_data_offset(r10_bio, rdev); |
2424 | bio->bi_bdev = rdev->bdev; | 2529 | bio->bi_bdev = rdev->bdev; |
2425 | bio->bi_rw = READ | do_sync; | 2530 | bio->bi_rw = READ | do_sync; |
2426 | bio->bi_private = r10_bio; | 2531 | bio->bi_private = r10_bio; |
@@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2480 | rdev_clear_badblocks( | 2585 | rdev_clear_badblocks( |
2481 | rdev, | 2586 | rdev, |
2482 | r10_bio->devs[m].addr, | 2587 | r10_bio->devs[m].addr, |
2483 | r10_bio->sectors); | 2588 | r10_bio->sectors, 0); |
2484 | } else { | 2589 | } else { |
2485 | if (!rdev_set_badblocks( | 2590 | if (!rdev_set_badblocks( |
2486 | rdev, | 2591 | rdev, |
@@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2496 | rdev_clear_badblocks( | 2601 | rdev_clear_badblocks( |
2497 | rdev, | 2602 | rdev, |
2498 | r10_bio->devs[m].addr, | 2603 | r10_bio->devs[m].addr, |
2499 | r10_bio->sectors); | 2604 | r10_bio->sectors, 0); |
2500 | } else { | 2605 | } else { |
2501 | if (!rdev_set_badblocks( | 2606 | if (!rdev_set_badblocks( |
2502 | rdev, | 2607 | rdev, |
@@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2515 | rdev_clear_badblocks( | 2620 | rdev_clear_badblocks( |
2516 | rdev, | 2621 | rdev, |
2517 | r10_bio->devs[m].addr, | 2622 | r10_bio->devs[m].addr, |
2518 | r10_bio->sectors); | 2623 | r10_bio->sectors, 0); |
2519 | rdev_dec_pending(rdev, conf->mddev); | 2624 | rdev_dec_pending(rdev, conf->mddev); |
2520 | } else if (bio != NULL && | 2625 | } else if (bio != NULL && |
2521 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | 2626 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
@@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2532 | rdev_clear_badblocks( | 2637 | rdev_clear_badblocks( |
2533 | rdev, | 2638 | rdev, |
2534 | r10_bio->devs[m].addr, | 2639 | r10_bio->devs[m].addr, |
2535 | r10_bio->sectors); | 2640 | r10_bio->sectors, 0); |
2536 | rdev_dec_pending(rdev, conf->mddev); | 2641 | rdev_dec_pending(rdev, conf->mddev); |
2537 | } | 2642 | } |
2538 | } | 2643 | } |
@@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev) | |||
2573 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || | 2678 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
2574 | test_bit(R10BIO_WriteError, &r10_bio->state)) | 2679 | test_bit(R10BIO_WriteError, &r10_bio->state)) |
2575 | handle_write_completed(conf, r10_bio); | 2680 | handle_write_completed(conf, r10_bio); |
2681 | else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) | ||
2682 | reshape_request_write(mddev, r10_bio); | ||
2576 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2683 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) |
2577 | sync_request_write(mddev, r10_bio); | 2684 | sync_request_write(mddev, r10_bio); |
2578 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2685 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
@@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf) | |||
2603 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | 2710 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; |
2604 | BUG_ON(conf->r10buf_pool); | 2711 | BUG_ON(conf->r10buf_pool); |
2605 | conf->have_replacement = 0; | 2712 | conf->have_replacement = 0; |
2606 | for (i = 0; i < conf->raid_disks; i++) | 2713 | for (i = 0; i < conf->geo.raid_disks; i++) |
2607 | if (conf->mirrors[i].replacement) | 2714 | if (conf->mirrors[i].replacement) |
2608 | conf->have_replacement = 1; | 2715 | conf->have_replacement = 1; |
2609 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); | 2716 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); |
@@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2657 | sector_t sync_blocks; | 2764 | sector_t sync_blocks; |
2658 | sector_t sectors_skipped = 0; | 2765 | sector_t sectors_skipped = 0; |
2659 | int chunks_skipped = 0; | 2766 | int chunks_skipped = 0; |
2767 | sector_t chunk_mask = conf->geo.chunk_mask; | ||
2660 | 2768 | ||
2661 | if (!conf->r10buf_pool) | 2769 | if (!conf->r10buf_pool) |
2662 | if (init_resync(conf)) | 2770 | if (init_resync(conf)) |
@@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2664 | 2772 | ||
2665 | skipped: | 2773 | skipped: |
2666 | max_sector = mddev->dev_sectors; | 2774 | max_sector = mddev->dev_sectors; |
2667 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 2775 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
2776 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
2668 | max_sector = mddev->resync_max_sectors; | 2777 | max_sector = mddev->resync_max_sectors; |
2669 | if (sector_nr >= max_sector) { | 2778 | if (sector_nr >= max_sector) { |
2670 | /* If we aborted, we need to abort the | 2779 | /* If we aborted, we need to abort the |
@@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2676 | * we need to convert that to several | 2785 | * we need to convert that to several |
2677 | * virtual addresses. | 2786 | * virtual addresses. |
2678 | */ | 2787 | */ |
2788 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | ||
2789 | end_reshape(conf); | ||
2790 | return 0; | ||
2791 | } | ||
2792 | |||
2679 | if (mddev->curr_resync < max_sector) { /* aborted */ | 2793 | if (mddev->curr_resync < max_sector) { /* aborted */ |
2680 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 2794 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
2681 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | 2795 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, |
2682 | &sync_blocks, 1); | 2796 | &sync_blocks, 1); |
2683 | else for (i=0; i<conf->raid_disks; i++) { | 2797 | else for (i = 0; i < conf->geo.raid_disks; i++) { |
2684 | sector_t sect = | 2798 | sector_t sect = |
2685 | raid10_find_virt(conf, mddev->curr_resync, i); | 2799 | raid10_find_virt(conf, mddev->curr_resync, i); |
2686 | bitmap_end_sync(mddev->bitmap, sect, | 2800 | bitmap_end_sync(mddev->bitmap, sect, |
@@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2694 | /* Completed a full sync so the replacements | 2808 | /* Completed a full sync so the replacements |
2695 | * are now fully recovered. | 2809 | * are now fully recovered. |
2696 | */ | 2810 | */ |
2697 | for (i = 0; i < conf->raid_disks; i++) | 2811 | for (i = 0; i < conf->geo.raid_disks; i++) |
2698 | if (conf->mirrors[i].replacement) | 2812 | if (conf->mirrors[i].replacement) |
2699 | conf->mirrors[i].replacement | 2813 | conf->mirrors[i].replacement |
2700 | ->recovery_offset | 2814 | ->recovery_offset |
@@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2707 | *skipped = 1; | 2821 | *skipped = 1; |
2708 | return sectors_skipped; | 2822 | return sectors_skipped; |
2709 | } | 2823 | } |
2710 | if (chunks_skipped >= conf->raid_disks) { | 2824 | |
2825 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
2826 | return reshape_request(mddev, sector_nr, skipped); | ||
2827 | |||
2828 | if (chunks_skipped >= conf->geo.raid_disks) { | ||
2711 | /* if there has been nothing to do on any drive, | 2829 | /* if there has been nothing to do on any drive, |
2712 | * then there is nothing to do at all.. | 2830 | * then there is nothing to do at all.. |
2713 | */ | 2831 | */ |
@@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2721 | /* make sure whole request will fit in a chunk - if chunks | 2839 | /* make sure whole request will fit in a chunk - if chunks |
2722 | * are meaningful | 2840 | * are meaningful |
2723 | */ | 2841 | */ |
2724 | if (conf->near_copies < conf->raid_disks && | 2842 | if (conf->geo.near_copies < conf->geo.raid_disks && |
2725 | max_sector > (sector_nr | conf->chunk_mask)) | 2843 | max_sector > (sector_nr | chunk_mask)) |
2726 | max_sector = (sector_nr | conf->chunk_mask) + 1; | 2844 | max_sector = (sector_nr | chunk_mask) + 1; |
2727 | /* | 2845 | /* |
2728 | * If there is non-resync activity waiting for us then | 2846 | * If there is non-resync activity waiting for us then |
2729 | * put in a delay to throttle resync. | 2847 | * put in a delay to throttle resync. |
@@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2752 | int j; | 2870 | int j; |
2753 | r10_bio = NULL; | 2871 | r10_bio = NULL; |
2754 | 2872 | ||
2755 | for (i=0 ; i<conf->raid_disks; i++) { | 2873 | for (i = 0 ; i < conf->geo.raid_disks; i++) { |
2756 | int still_degraded; | 2874 | int still_degraded; |
2757 | struct r10bio *rb2; | 2875 | struct r10bio *rb2; |
2758 | sector_t sect; | 2876 | sector_t sect; |
@@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2806 | /* Need to check if the array will still be | 2924 | /* Need to check if the array will still be |
2807 | * degraded | 2925 | * degraded |
2808 | */ | 2926 | */ |
2809 | for (j=0; j<conf->raid_disks; j++) | 2927 | for (j = 0; j < conf->geo.raid_disks; j++) |
2810 | if (conf->mirrors[j].rdev == NULL || | 2928 | if (conf->mirrors[j].rdev == NULL || |
2811 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { | 2929 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { |
2812 | still_degraded = 1; | 2930 | still_degraded = 1; |
@@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2984 | r10_bio->sector = sector_nr; | 3102 | r10_bio->sector = sector_nr; |
2985 | set_bit(R10BIO_IsSync, &r10_bio->state); | 3103 | set_bit(R10BIO_IsSync, &r10_bio->state); |
2986 | raid10_find_phys(conf, r10_bio); | 3104 | raid10_find_phys(conf, r10_bio); |
2987 | r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; | 3105 | r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; |
2988 | 3106 | ||
2989 | for (i=0; i<conf->copies; i++) { | 3107 | for (i = 0; i < conf->copies; i++) { |
2990 | int d = r10_bio->devs[i].devnum; | 3108 | int d = r10_bio->devs[i].devnum; |
2991 | sector_t first_bad, sector; | 3109 | sector_t first_bad, sector; |
2992 | int bad_sectors; | 3110 | int bad_sectors; |
@@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
3152 | struct r10conf *conf = mddev->private; | 3270 | struct r10conf *conf = mddev->private; |
3153 | 3271 | ||
3154 | if (!raid_disks) | 3272 | if (!raid_disks) |
3155 | raid_disks = conf->raid_disks; | 3273 | raid_disks = min(conf->geo.raid_disks, |
3274 | conf->prev.raid_disks); | ||
3156 | if (!sectors) | 3275 | if (!sectors) |
3157 | sectors = conf->dev_sectors; | 3276 | sectors = conf->dev_sectors; |
3158 | 3277 | ||
3159 | size = sectors >> conf->chunk_shift; | 3278 | size = sectors >> conf->geo.chunk_shift; |
3160 | sector_div(size, conf->far_copies); | 3279 | sector_div(size, conf->geo.far_copies); |
3161 | size = size * raid_disks; | 3280 | size = size * raid_disks; |
3162 | sector_div(size, conf->near_copies); | 3281 | sector_div(size, conf->geo.near_copies); |
3163 | 3282 | ||
3164 | return size << conf->chunk_shift; | 3283 | return size << conf->geo.chunk_shift; |
3165 | } | 3284 | } |
3166 | 3285 | ||
3167 | static void calc_sectors(struct r10conf *conf, sector_t size) | 3286 | static void calc_sectors(struct r10conf *conf, sector_t size) |
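Worked example (values invented for illustration): raid10_size() converts per-device sectors to array capacity by chunking, dividing by far_copies, scaling by raid_disks and dividing by near_copies. For 4 devices of 1 GiB (2097152 sectors), 512 KiB chunks (chunk_shift = 10), near_copies = 2, far_copies = 1:

#include <stdio.h>

int main(void)
{
	unsigned long long sectors = 2097152ULL;  /* per-device size: 1 GiB */
	int raid_disks = 4, near_copies = 2, far_copies = 1;
	int chunk_shift = 10;                     /* 512 KiB chunks */

	unsigned long long size = sectors >> chunk_shift;   /* 2048 chunks */
	size /= far_copies;                                  /* 2048 */
	size *= raid_disks;                                  /* 8192 */
	size /= near_copies;                                 /* 4096 */
	printf("%llu\n", size << chunk_shift);  /* 4194304 sectors = 2 GiB */
	return 0;
}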
@@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size) | |||
3171 | * conf->stride | 3290 | * conf->stride |
3172 | */ | 3291 | */ |
3173 | 3292 | ||
3174 | size = size >> conf->chunk_shift; | 3293 | size = size >> conf->geo.chunk_shift; |
3175 | sector_div(size, conf->far_copies); | 3294 | sector_div(size, conf->geo.far_copies); |
3176 | size = size * conf->raid_disks; | 3295 | size = size * conf->geo.raid_disks; |
3177 | sector_div(size, conf->near_copies); | 3296 | sector_div(size, conf->geo.near_copies); |
3178 | /* 'size' is now the number of chunks in the array */ | 3297 | /* 'size' is now the number of chunks in the array */ |
3179 | /* calculate "used chunks per device" */ | 3298 | /* calculate "used chunks per device" */ |
3180 | size = size * conf->copies; | 3299 | size = size * conf->copies; |
@@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size) | |||
3182 | /* We need to round up when dividing by raid_disks to | 3301 | /* We need to round up when dividing by raid_disks to |
3183 | * get the stride size. | 3302 | * get the stride size. |
3184 | */ | 3303 | */ |
3185 | size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); | 3304 | size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); |
3186 | 3305 | ||
3187 | conf->dev_sectors = size << conf->chunk_shift; | 3306 | conf->dev_sectors = size << conf->geo.chunk_shift; |
3188 | 3307 | ||
3189 | if (conf->far_offset) | 3308 | if (conf->geo.far_offset) |
3190 | conf->stride = 1 << conf->chunk_shift; | 3309 | conf->geo.stride = 1 << conf->geo.chunk_shift; |
3191 | else { | 3310 | else { |
3192 | sector_div(size, conf->far_copies); | 3311 | sector_div(size, conf->geo.far_copies); |
3193 | conf->stride = size << conf->chunk_shift; | 3312 | conf->geo.stride = size << conf->geo.chunk_shift; |
3194 | } | 3313 | } |
3195 | } | 3314 | } |
3196 | 3315 | ||
3316 | enum geo_type {geo_new, geo_old, geo_start}; | ||
3317 | static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | ||
3318 | { | ||
3319 | int nc, fc, fo; | ||
3320 | int layout, chunk, disks; | ||
3321 | switch (new) { | ||
3322 | case geo_old: | ||
3323 | layout = mddev->layout; | ||
3324 | chunk = mddev->chunk_sectors; | ||
3325 | disks = mddev->raid_disks - mddev->delta_disks; | ||
3326 | break; | ||
3327 | case geo_new: | ||
3328 | layout = mddev->new_layout; | ||
3329 | chunk = mddev->new_chunk_sectors; | ||
3330 | disks = mddev->raid_disks; | ||
3331 | break; | ||
3332 | default: /* avoid 'may be unused' warnings */ | ||
3333 | case geo_start: /* new when starting reshape - raid_disks not | ||
3334 | * updated yet. */ | ||
3335 | layout = mddev->new_layout; | ||
3336 | chunk = mddev->new_chunk_sectors; | ||
3337 | disks = mddev->raid_disks + mddev->delta_disks; | ||
3338 | break; | ||
3339 | } | ||
3340 | if (layout >> 17) | ||
3341 | return -1; | ||
3342 | if (chunk < (PAGE_SIZE >> 9) || | ||
3343 | !is_power_of_2(chunk)) | ||
3344 | return -2; | ||
3345 | nc = layout & 255; | ||
3346 | fc = (layout >> 8) & 255; | ||
3347 | fo = layout & (1<<16); | ||
3348 | geo->raid_disks = disks; | ||
3349 | geo->near_copies = nc; | ||
3350 | geo->far_copies = fc; | ||
3351 | geo->far_offset = fo; | ||
3352 | geo->chunk_mask = chunk - 1; | ||
3353 | geo->chunk_shift = ffz(~chunk); | ||
3354 | return nc*fc; | ||
3355 | } | ||
3356 | |||
3197 | static struct r10conf *setup_conf(struct mddev *mddev) | 3357 | static struct r10conf *setup_conf(struct mddev *mddev) |
3198 | { | 3358 | { |
3199 | struct r10conf *conf = NULL; | 3359 | struct r10conf *conf = NULL; |
3200 | int nc, fc, fo; | ||
3201 | int err = -EINVAL; | 3360 | int err = -EINVAL; |
3361 | struct geom geo; | ||
3362 | int copies; | ||
3363 | |||
3364 | copies = setup_geo(&geo, mddev, geo_new); | ||
3202 | 3365 | ||
3203 | if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || | 3366 | if (copies == -2) { |
3204 | !is_power_of_2(mddev->new_chunk_sectors)) { | ||
3205 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 3367 | printk(KERN_ERR "md/raid10:%s: chunk size must be " |
3206 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 3368 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", |
3207 | mdname(mddev), PAGE_SIZE); | 3369 | mdname(mddev), PAGE_SIZE); |
3208 | goto out; | 3370 | goto out; |
3209 | } | 3371 | } |
3210 | 3372 | ||
3211 | nc = mddev->new_layout & 255; | 3373 | if (copies < 2 || copies > mddev->raid_disks) { |
3212 | fc = (mddev->new_layout >> 8) & 255; | ||
3213 | fo = mddev->new_layout & (1<<16); | ||
3214 | |||
3215 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | ||
3216 | (mddev->new_layout >> 17)) { | ||
3217 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 3374 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
3218 | mdname(mddev), mddev->new_layout); | 3375 | mdname(mddev), mddev->new_layout); |
3219 | goto out; | 3376 | goto out; |
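For reference (not part of the patch): setup_geo() decodes the raid10 layout word the same way the old inline code in setup_conf() did: near copies in bits 0-7, far copies in bits 8-15, and the 'offset' flag in bit 16. A userspace sketch for the common "n2" layout value 0x102:

#include <stdio.h>

int main(void)
{
	int layout = 0x102;           /* typical 2-near-copy layout */
	int nc = layout & 255;
	int fc = (layout >> 8) & 255;
	int fo = layout & (1 << 16);

	printf("near=%d far=%d offset=%d copies=%d\n", nc, fc, !!fo, nc * fc);
	/* prints: near=2 far=1 offset=0 copies=2 */
	return 0;
}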
@@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3224 | if (!conf) | 3381 | if (!conf) |
3225 | goto out; | 3382 | goto out; |
3226 | 3383 | ||
3227 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 3384 | /* FIXME calc properly */ |
3385 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + | ||
3386 | max(0,mddev->delta_disks)), | ||
3228 | GFP_KERNEL); | 3387 | GFP_KERNEL); |
3229 | if (!conf->mirrors) | 3388 | if (!conf->mirrors) |
3230 | goto out; | 3389 | goto out; |
@@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3233 | if (!conf->tmppage) | 3392 | if (!conf->tmppage) |
3234 | goto out; | 3393 | goto out; |
3235 | 3394 | ||
3236 | 3395 | conf->geo = geo; | |
3237 | conf->raid_disks = mddev->raid_disks; | 3396 | conf->copies = copies; |
3238 | conf->near_copies = nc; | ||
3239 | conf->far_copies = fc; | ||
3240 | conf->copies = nc*fc; | ||
3241 | conf->far_offset = fo; | ||
3242 | conf->chunk_mask = mddev->new_chunk_sectors - 1; | ||
3243 | conf->chunk_shift = ffz(~mddev->new_chunk_sectors); | ||
3244 | |||
3245 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | 3397 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, |
3246 | r10bio_pool_free, conf); | 3398 | r10bio_pool_free, conf); |
3247 | if (!conf->r10bio_pool) | 3399 | if (!conf->r10bio_pool) |
3248 | goto out; | 3400 | goto out; |
3249 | 3401 | ||
3250 | calc_sectors(conf, mddev->dev_sectors); | 3402 | calc_sectors(conf, mddev->dev_sectors); |
3251 | 3403 | if (mddev->reshape_position == MaxSector) { | |
3404 | conf->prev = conf->geo; | ||
3405 | conf->reshape_progress = MaxSector; | ||
3406 | } else { | ||
3407 | if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { | ||
3408 | err = -EINVAL; | ||
3409 | goto out; | ||
3410 | } | ||
3411 | conf->reshape_progress = mddev->reshape_position; | ||
3412 | if (conf->prev.far_offset) | ||
3413 | conf->prev.stride = 1 << conf->prev.chunk_shift; | ||
3414 | else | ||
3415 | /* far_copies must be 1 */ | ||
3416 | conf->prev.stride = conf->dev_sectors; | ||
3417 | } | ||
3252 | spin_lock_init(&conf->device_lock); | 3418 | spin_lock_init(&conf->device_lock); |
3253 | INIT_LIST_HEAD(&conf->retry_list); | 3419 | INIT_LIST_HEAD(&conf->retry_list); |
3254 | 3420 | ||
@@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3263 | return conf; | 3429 | return conf; |
3264 | 3430 | ||
3265 | out: | 3431 | out: |
3266 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", | 3432 | if (err == -ENOMEM) |
3267 | mdname(mddev)); | 3433 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", |
3434 | mdname(mddev)); | ||
3268 | if (conf) { | 3435 | if (conf) { |
3269 | if (conf->r10bio_pool) | 3436 | if (conf->r10bio_pool) |
3270 | mempool_destroy(conf->r10bio_pool); | 3437 | mempool_destroy(conf->r10bio_pool); |
@@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev) | |||
3282 | struct mirror_info *disk; | 3449 | struct mirror_info *disk; |
3283 | struct md_rdev *rdev; | 3450 | struct md_rdev *rdev; |
3284 | sector_t size; | 3451 | sector_t size; |
3285 | 3452 | sector_t min_offset_diff = 0; | |
3286 | /* | 3453 | int first = 1; |
3287 | * copy the already verified devices into our private RAID10 | ||
3288 | * bookkeeping area. [whatever we allocate in run(), | ||
3289 | * should be freed in stop()] | ||
3290 | */ | ||
3291 | 3454 | ||
3292 | if (mddev->private == NULL) { | 3455 | if (mddev->private == NULL) { |
3293 | conf = setup_conf(mddev); | 3456 | conf = setup_conf(mddev); |
@@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev) | |||
3304 | 3467 | ||
3305 | chunk_size = mddev->chunk_sectors << 9; | 3468 | chunk_size = mddev->chunk_sectors << 9; |
3306 | blk_queue_io_min(mddev->queue, chunk_size); | 3469 | blk_queue_io_min(mddev->queue, chunk_size); |
3307 | if (conf->raid_disks % conf->near_copies) | 3470 | if (conf->geo.raid_disks % conf->geo.near_copies) |
3308 | blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); | 3471 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
3309 | else | 3472 | else |
3310 | blk_queue_io_opt(mddev->queue, chunk_size * | 3473 | blk_queue_io_opt(mddev->queue, chunk_size * |
3311 | (conf->raid_disks / conf->near_copies)); | 3474 | (conf->geo.raid_disks / conf->geo.near_copies)); |
3312 | 3475 | ||
3313 | rdev_for_each(rdev, mddev) { | 3476 | rdev_for_each(rdev, mddev) { |
3477 | long long diff; | ||
3314 | 3478 | ||
3315 | disk_idx = rdev->raid_disk; | 3479 | disk_idx = rdev->raid_disk; |
3316 | if (disk_idx >= conf->raid_disks | 3480 | if (disk_idx < 0) |
3317 | || disk_idx < 0) | 3481 | continue; |
3482 | if (disk_idx >= conf->geo.raid_disks && | ||
3483 | disk_idx >= conf->prev.raid_disks) | ||
3318 | continue; | 3484 | continue; |
3319 | disk = conf->mirrors + disk_idx; | 3485 | disk = conf->mirrors + disk_idx; |
3320 | 3486 | ||
@@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev) | |||
3327 | goto out_free_conf; | 3493 | goto out_free_conf; |
3328 | disk->rdev = rdev; | 3494 | disk->rdev = rdev; |
3329 | } | 3495 | } |
3496 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
3497 | if (!mddev->reshape_backwards) | ||
3498 | diff = -diff; | ||
3499 | if (diff < 0) | ||
3500 | diff = 0; | ||
3501 | if (first || diff < min_offset_diff) | ||
3502 | min_offset_diff = diff; | ||
3330 | 3503 | ||
3331 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3504 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
3332 | rdev->data_offset << 9); | 3505 | rdev->data_offset << 9); |
3333 | 3506 | ||
3334 | disk->head_position = 0; | 3507 | disk->head_position = 0; |
3335 | } | 3508 | } |
3509 | |||
3336 | /* need to check that every block has at least one working mirror */ | 3510 | /* need to check that every block has at least one working mirror */ |
3337 | if (!enough(conf, -1)) { | 3511 | if (!enough(conf, -1)) { |
3338 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 3512 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
@@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev) | |||
3340 | goto out_free_conf; | 3514 | goto out_free_conf; |
3341 | } | 3515 | } |
3342 | 3516 | ||
3517 | if (conf->reshape_progress != MaxSector) { | ||
3518 | /* must ensure that shape change is supported */ | ||
3519 | if (conf->geo.far_copies != 1 && | ||
3520 | conf->geo.far_offset == 0) | ||
3521 | goto out_free_conf; | ||
3522 | if (conf->prev.far_copies != 1 && | ||
3523 | conf->geo.far_offset == 0) | ||
3524 | goto out_free_conf; | ||
3525 | } | ||
3526 | |||
3343 | mddev->degraded = 0; | 3527 | mddev->degraded = 0; |
3344 | for (i = 0; i < conf->raid_disks; i++) { | 3528 | for (i = 0; |
3529 | i < conf->geo.raid_disks | ||
3530 | || i < conf->prev.raid_disks; | ||
3531 | i++) { | ||
3345 | 3532 | ||
3346 | disk = conf->mirrors + i; | 3533 | disk = conf->mirrors + i; |
3347 | 3534 | ||
@@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev) | |||
3368 | mdname(mddev)); | 3555 | mdname(mddev)); |
3369 | printk(KERN_INFO | 3556 | printk(KERN_INFO |
3370 | "md/raid10:%s: active with %d out of %d devices\n", | 3557 | "md/raid10:%s: active with %d out of %d devices\n", |
3371 | mdname(mddev), conf->raid_disks - mddev->degraded, | 3558 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, |
3372 | conf->raid_disks); | 3559 | conf->geo.raid_disks); |
3373 | /* | 3560 | /* |
3374 | * Ok, everything is just fine now | 3561 | * Ok, everything is just fine now |
3375 | */ | 3562 | */ |
@@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev) | |||
3386 | * maybe... | 3573 | * maybe... |
3387 | */ | 3574 | */ |
3388 | { | 3575 | { |
3389 | int stripe = conf->raid_disks * | 3576 | int stripe = conf->geo.raid_disks * |
3390 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | 3577 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
3391 | stripe /= conf->near_copies; | 3578 | stripe /= conf->geo.near_copies; |
3392 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 3579 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
3393 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 3580 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
3394 | } | 3581 | } |
3395 | 3582 | ||
3396 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 3583 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
@@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev) | |||
3398 | if (md_integrity_register(mddev)) | 3585 | if (md_integrity_register(mddev)) |
3399 | goto out_free_conf; | 3586 | goto out_free_conf; |
3400 | 3587 | ||
3588 | if (conf->reshape_progress != MaxSector) { | ||
3589 | unsigned long before_length, after_length; | ||
3590 | |||
3591 | before_length = ((1 << conf->prev.chunk_shift) * | ||
3592 | conf->prev.far_copies); | ||
3593 | after_length = ((1 << conf->geo.chunk_shift) * | ||
3594 | conf->geo.far_copies); | ||
3595 | |||
3596 | if (max(before_length, after_length) > min_offset_diff) { | ||
3597 | /* This cannot work */ | ||
3598 | printk(KERN_ERR "md/raid10: offset difference not enough to continue reshape\n"); | ||
3599 | goto out_free_conf; | ||
3600 | } | ||
3601 | conf->offset_diff = min_offset_diff; | ||
3602 | |||
3603 | conf->reshape_safe = conf->reshape_progress; | ||
3604 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
3605 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
3606 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
3607 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
3608 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
3609 | "reshape"); | ||
3610 | } | ||
3611 | |||
3401 | return 0; | 3612 | return 0; |
3402 | 3613 | ||
3403 | out_free_conf: | 3614 | out_free_conf: |
@@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) | |||
3460 | struct r10conf *conf = mddev->private; | 3671 | struct r10conf *conf = mddev->private; |
3461 | sector_t oldsize, size; | 3672 | sector_t oldsize, size; |
3462 | 3673 | ||
3463 | if (conf->far_copies > 1 && !conf->far_offset) | 3674 | if (mddev->reshape_position != MaxSector) |
3675 | return -EBUSY; | ||
3676 | |||
3677 | if (conf->geo.far_copies > 1 && !conf->geo.far_offset) | ||
3464 | return -EINVAL; | 3678 | return -EINVAL; |
3465 | 3679 | ||
3466 | oldsize = raid10_size(mddev, 0, 0); | 3680 | oldsize = raid10_size(mddev, 0, 0); |
3467 | size = raid10_size(mddev, sectors, 0); | 3681 | size = raid10_size(mddev, sectors, 0); |
3468 | md_set_array_sectors(mddev, size); | 3682 | if (mddev->external_size && |
3469 | if (mddev->array_sectors > size) | 3683 | mddev->array_sectors > size) |
3470 | return -EINVAL; | 3684 | return -EINVAL; |
3685 | if (mddev->bitmap) { | ||
3686 | int ret = bitmap_resize(mddev->bitmap, size, 0, 0); | ||
3687 | if (ret) | ||
3688 | return ret; | ||
3689 | } | ||
3690 | md_set_array_sectors(mddev, size); | ||
3471 | set_capacity(mddev->gendisk, mddev->array_sectors); | 3691 | set_capacity(mddev->gendisk, mddev->array_sectors); |
3472 | revalidate_disk(mddev->gendisk); | 3692 | revalidate_disk(mddev->gendisk); |
3473 | if (sectors > mddev->dev_sectors && | 3693 | if (sectors > mddev->dev_sectors && |
@@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev) | |||
3534 | return ERR_PTR(-EINVAL); | 3754 | return ERR_PTR(-EINVAL); |
3535 | } | 3755 | } |
3536 | 3756 | ||
3757 | static int raid10_check_reshape(struct mddev *mddev) | ||
3758 | { | ||
3759 | /* Called when there is a request to change | ||
3760 | * - layout (to ->new_layout) | ||
3761 | * - chunk size (to ->new_chunk_sectors) | ||
3762 | * - raid_disks (by delta_disks) | ||
3763 | * or when trying to restart a reshape that was ongoing. | ||
3764 | * | ||
3765 | * We need to validate the request and possibly allocate | ||
3766 | * space if that might be an issue later. | ||
3767 | * | ||
3768 | * Currently we reject any reshape of a 'far' mode array, | ||
3769 | * allow the chunk size to change if the new size is generally acceptable, | ||
3770 | * allow raid_disks to increase, and allow | ||
3771 | * a switch between 'near' mode and 'offset' mode. | ||
3772 | */ | ||
3773 | struct r10conf *conf = mddev->private; | ||
3774 | struct geom geo; | ||
3775 | |||
3776 | if (conf->geo.far_copies != 1 && !conf->geo.far_offset) | ||
3777 | return -EINVAL; | ||
3778 | |||
3779 | if (setup_geo(&geo, mddev, geo_start) != conf->copies) | ||
3780 | /* mustn't change number of copies */ | ||
3781 | return -EINVAL; | ||
3782 | if (geo.far_copies > 1 && !geo.far_offset) | ||
3783 | /* Cannot switch to 'far' mode */ | ||
3784 | return -EINVAL; | ||
3785 | |||
3786 | if (mddev->array_sectors & geo.chunk_mask) | ||
3787 | /* not factor of array size */ | ||
3788 | return -EINVAL; | ||
3789 | |||
3790 | if (!enough(conf, -1)) | ||
3791 | return -EINVAL; | ||
3792 | |||
3793 | kfree(conf->mirrors_new); | ||
3794 | conf->mirrors_new = NULL; | ||
3795 | if (mddev->delta_disks > 0) { | ||
3796 | /* allocate new 'mirrors' list */ | ||
3797 | conf->mirrors_new = kzalloc( | ||
3798 | sizeof(struct mirror_info) | ||
3799 | *(mddev->raid_disks + | ||
3800 | mddev->delta_disks), | ||
3801 | GFP_KERNEL); | ||
3802 | if (!conf->mirrors_new) | ||
3803 | return -ENOMEM; | ||
3804 | } | ||
3805 | return 0; | ||
3806 | } | ||
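As a rough illustration of the rules raid10_check_reshape() enforces, here is a user-space sketch (hypothetical names: struct geom_lite, reshape_allowed; the real kernel types differ) that accepts a reshape only when the total copy count is unchanged, the target layout is not plain 'far', and the array size is a multiple of the new chunk size.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's struct geom: only the fields
 * the reshape checks look at. */
struct geom_lite {
	int near_copies;
	int far_copies;
	bool far_offset;          /* far copies kept at an offset, not far away */
	unsigned long chunk_mask; /* chunk size in sectors, minus one */
};

/* Hedged restatement of the validation: reject 'far' layouts on either
 * side, reject a change in total copies, require chunk-aligned size. */
static bool reshape_allowed(const struct geom_lite *cur,
			    const struct geom_lite *new,
			    unsigned long long array_sectors)
{
	int cur_copies = cur->near_copies * cur->far_copies;
	int new_copies = new->near_copies * new->far_copies;

	if (cur->far_copies > 1 && !cur->far_offset)
		return false;            /* 'far' arrays cannot be reshaped */
	if (new_copies != cur_copies)
		return false;            /* number of copies must not change */
	if (new->far_copies > 1 && !new->far_offset)
		return false;            /* cannot reshape into 'far' mode */
	if (array_sectors & new->chunk_mask)
		return false;            /* new chunk must divide array size */
	return true;
}

int main(void)
{
	struct geom_lite near2   = { .near_copies = 2, .far_copies = 1,
				     .far_offset = false, .chunk_mask = 1023 };
	struct geom_lite offset2 = { .near_copies = 1, .far_copies = 2,
				     .far_offset = true,  .chunk_mask = 1023 };

	/* near-2 -> offset-2 keeps two copies and stays chunk aligned */
	printf("near2 -> offset2: %s\n",
	       reshape_allowed(&near2, &offset2, 8UL << 20) ? "ok" : "rejected");
	return 0;
}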
3807 | |||
3808 | /* | ||
3809 | * Need to check if array has failed when deciding whether to: | ||
3810 | * - start an array | ||
3811 | * - remove non-faulty devices | ||
3812 | * - add a spare | ||
3813 | * - allow a reshape | ||
3814 | * This determination is simple when no reshape is happening. | ||
3815 | * However if there is a reshape, we need to carefully check | ||
3816 | * both the before and after sections. | ||
3817 | * This is because some failed devices may only affect one | ||
3818 | * of the two sections, and some non-in_sync devices may | ||
3819 | * be insync in the section most affected by failed devices. | ||
3820 | */ | ||
3821 | static int calc_degraded(struct r10conf *conf) | ||
3822 | { | ||
3823 | int degraded, degraded2; | ||
3824 | int i; | ||
3825 | |||
3826 | rcu_read_lock(); | ||
3827 | degraded = 0; | ||
3828 | /* 'prev' section first */ | ||
3829 | for (i = 0; i < conf->prev.raid_disks; i++) { | ||
3830 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
3831 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
3832 | degraded++; | ||
3833 | else if (!test_bit(In_sync, &rdev->flags)) | ||
3834 | /* When we can reduce the number of devices in | ||
3835 | * an array, this might not contribute to | ||
3836 | * 'degraded'. It does now. | ||
3837 | */ | ||
3838 | degraded++; | ||
3839 | } | ||
3840 | rcu_read_unlock(); | ||
3841 | if (conf->geo.raid_disks == conf->prev.raid_disks) | ||
3842 | return degraded; | ||
3843 | rcu_read_lock(); | ||
3844 | degraded2 = 0; | ||
3845 | for (i = 0; i < conf->geo.raid_disks; i++) { | ||
3846 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
3847 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
3848 | degraded2++; | ||
3849 | else if (!test_bit(In_sync, &rdev->flags)) { | ||
3850 | /* If reshape is increasing the number of devices, | ||
3851 | * this section has already been recovered, so | ||
3852 | * it doesn't contribute to degraded. | ||
3853 | * else it does. | ||
3854 | */ | ||
3855 | if (conf->geo.raid_disks <= conf->prev.raid_disks) | ||
3856 | degraded2++; | ||
3857 | } | ||
3858 | } | ||
3859 | rcu_read_unlock(); | ||
3860 | if (degraded2 > degraded) | ||
3861 | return degraded2; | ||
3862 | return degraded; | ||
3863 | } | ||
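The effect of the two passes above can be pictured with a toy model: count missing devices against the old geometry, count them against the new one, and report whichever is worse. The sketch below (made-up device states, hypothetical name toy_calc_degraded) is deliberately simplified: it ignores the special treatment of devices that are present but not yet in_sync.

#include <stdio.h>

/* 'working[i]' says whether slot i has a healthy, in-sync device.
 * Failures are counted against both the old (prev_disks) and new
 * (new_disks) geometries and the larger count wins. */
static int toy_calc_degraded(const int *working, int prev_disks, int new_disks)
{
	int degraded = 0, degraded2 = 0, i;

	for (i = 0; i < prev_disks; i++)
		if (!working[i])
			degraded++;
	if (new_disks == prev_disks)
		return degraded;
	for (i = 0; i < new_disks; i++)
		if (!working[i])
			degraded2++;
	return degraded2 > degraded ? degraded2 : degraded;
}

int main(void)
{
	/* Growing from 4 to 6 devices; slot 5 has no spare yet. */
	int working[6] = { 1, 1, 1, 1, 1, 0 };

	printf("degraded = %d\n", toy_calc_degraded(working, 4, 6)); /* 1 */
	return 0;
}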
3864 | |||
3865 | static int raid10_start_reshape(struct mddev *mddev) | ||
3866 | { | ||
3867 | /* A 'reshape' has been requested. This commits | ||
3868 | * the various 'new' fields and sets MD_RECOVERY_RESHAPE. | ||
3869 | * This also checks if there are enough spares and adds them | ||
3870 | * to the array. | ||
3871 | * We currently require enough spares to make the final | ||
3872 | * array non-degraded. We also require that the difference | ||
3873 | * between old and new data_offset - on each device - is | ||
3874 | * enough that we never risk over-writing. | ||
3875 | */ | ||
3876 | |||
3877 | unsigned long before_length, after_length; | ||
3878 | sector_t min_offset_diff = 0; | ||
3879 | int first = 1; | ||
3880 | struct geom new; | ||
3881 | struct r10conf *conf = mddev->private; | ||
3882 | struct md_rdev *rdev; | ||
3883 | int spares = 0; | ||
3884 | int ret; | ||
3885 | |||
3886 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
3887 | return -EBUSY; | ||
3888 | |||
3889 | if (setup_geo(&new, mddev, geo_start) != conf->copies) | ||
3890 | return -EINVAL; | ||
3891 | |||
3892 | before_length = ((1 << conf->prev.chunk_shift) * | ||
3893 | conf->prev.far_copies); | ||
3894 | after_length = ((1 << conf->geo.chunk_shift) * | ||
3895 | conf->geo.far_copies); | ||
3896 | |||
3897 | rdev_for_each(rdev, mddev) { | ||
3898 | if (!test_bit(In_sync, &rdev->flags) | ||
3899 | && !test_bit(Faulty, &rdev->flags)) | ||
3900 | spares++; | ||
3901 | if (rdev->raid_disk >= 0) { | ||
3902 | long long diff = (rdev->new_data_offset | ||
3903 | - rdev->data_offset); | ||
3904 | if (!mddev->reshape_backwards) | ||
3905 | diff = -diff; | ||
3906 | if (diff < 0) | ||
3907 | diff = 0; | ||
3908 | if (first || diff < min_offset_diff) | ||
3909 | min_offset_diff = diff; | ||
3910 | } | ||
3911 | } | ||
3912 | |||
3913 | if (max(before_length, after_length) > min_offset_diff) | ||
3914 | return -EINVAL; | ||
3915 | |||
3916 | if (spares < mddev->delta_disks) | ||
3917 | return -EINVAL; | ||
3918 | |||
3919 | conf->offset_diff = min_offset_diff; | ||
3920 | spin_lock_irq(&conf->device_lock); | ||
3921 | if (conf->mirrors_new) { | ||
3922 | memcpy(conf->mirrors_new, conf->mirrors, | ||
3923 | sizeof(struct mirror_info)*conf->prev.raid_disks); | ||
3924 | smp_mb(); | ||
3925 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ | ||
3926 | conf->mirrors_old = conf->mirrors; | ||
3927 | conf->mirrors = conf->mirrors_new; | ||
3928 | conf->mirrors_new = NULL; | ||
3929 | } | ||
3930 | setup_geo(&conf->geo, mddev, geo_start); | ||
3931 | smp_mb(); | ||
3932 | if (mddev->reshape_backwards) { | ||
3933 | sector_t size = raid10_size(mddev, 0, 0); | ||
3934 | if (size < mddev->array_sectors) { | ||
3935 | spin_unlock_irq(&conf->device_lock); | ||
3936 | printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n", | ||
3937 | mdname(mddev)); | ||
3938 | return -EINVAL; | ||
3939 | } | ||
3940 | mddev->resync_max_sectors = size; | ||
3941 | conf->reshape_progress = size; | ||
3942 | } else | ||
3943 | conf->reshape_progress = 0; | ||
3944 | spin_unlock_irq(&conf->device_lock); | ||
3945 | |||
3946 | if (mddev->delta_disks && mddev->bitmap) { | ||
3947 | ret = bitmap_resize(mddev->bitmap, | ||
3948 | raid10_size(mddev, 0, | ||
3949 | conf->geo.raid_disks), | ||
3950 | 0, 0); | ||
3951 | if (ret) | ||
3952 | goto abort; | ||
3953 | } | ||
3954 | if (mddev->delta_disks > 0) { | ||
3955 | rdev_for_each(rdev, mddev) | ||
3956 | if (rdev->raid_disk < 0 && | ||
3957 | !test_bit(Faulty, &rdev->flags)) { | ||
3958 | if (raid10_add_disk(mddev, rdev) == 0) { | ||
3959 | if (rdev->raid_disk >= | ||
3960 | conf->prev.raid_disks) | ||
3961 | set_bit(In_sync, &rdev->flags); | ||
3962 | else | ||
3963 | rdev->recovery_offset = 0; | ||
3964 | |||
3965 | if (sysfs_link_rdev(mddev, rdev)) | ||
3966 | /* Failure here is OK */; | ||
3967 | } | ||
3968 | } else if (rdev->raid_disk >= conf->prev.raid_disks | ||
3969 | && !test_bit(Faulty, &rdev->flags)) { | ||
3970 | /* This is a spare that was manually added */ | ||
3971 | set_bit(In_sync, &rdev->flags); | ||
3972 | } | ||
3973 | } | ||
3974 | /* When a reshape changes the number of devices, | ||
3975 | * ->degraded is measured against the larger of the | ||
3976 | * pre and post numbers. | ||
3977 | */ | ||
3978 | spin_lock_irq(&conf->device_lock); | ||
3979 | mddev->degraded = calc_degraded(conf); | ||
3980 | spin_unlock_irq(&conf->device_lock); | ||
3981 | mddev->raid_disks = conf->geo.raid_disks; | ||
3982 | mddev->reshape_position = conf->reshape_progress; | ||
3983 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
3984 | |||
3985 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
3986 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
3987 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
3988 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
3989 | |||
3990 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
3991 | "reshape"); | ||
3992 | if (!mddev->sync_thread) { | ||
3993 | ret = -EAGAIN; | ||
3994 | goto abort; | ||
3995 | } | ||
3996 | conf->reshape_checkpoint = jiffies; | ||
3997 | md_wakeup_thread(mddev->sync_thread); | ||
3998 | md_new_event(mddev); | ||
3999 | return 0; | ||
4000 | |||
4001 | abort: | ||
4002 | mddev->recovery = 0; | ||
4003 | spin_lock_irq(&conf->device_lock); | ||
4004 | conf->geo = conf->prev; | ||
4005 | mddev->raid_disks = conf->geo.raid_disks; | ||
4006 | rdev_for_each(rdev, mddev) | ||
4007 | rdev->new_data_offset = rdev->data_offset; | ||
4008 | smp_wmb(); | ||
4009 | conf->reshape_progress = MaxSector; | ||
4010 | mddev->reshape_position = MaxSector; | ||
4011 | spin_unlock_irq(&conf->device_lock); | ||
4012 | return ret; | ||
4013 | } | ||
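The data_offset requirement enforced above is plain arithmetic: every member device must have moved its new_data_offset away from data_offset by at least the larger of the old and new per-device chunk spans (chunk sectors times far_copies), otherwise the copy could overwrite blocks it has not read yet. A worked example with assumed, illustrative numbers only:

#include <stdio.h>

int main(void)
{
	/* Assumed: old layout 512KiB chunks (shift 10) with far_copies = 1,
	 * new layout 1MiB chunks (shift 11) with far_copies = 1. */
	unsigned long before_length = (1UL << 10) * 1;   /* old chunk span */
	unsigned long after_length  = (1UL << 11) * 1;   /* new chunk span */
	unsigned long min_offset_diff = 2048;  /* smallest data_offset shift
						  seen on any device */
	unsigned long need = before_length > after_length
				? before_length : after_length;

	if (need > min_offset_diff)
		printf("reshape rejected: need %lu sectors of slack, have %lu\n",
		       need, min_offset_diff);
	else
		printf("reshape allowed: %lu sectors of slack cover the %lu needed\n",
		       min_offset_diff, need);
	return 0;
}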
4014 | |||
4015 | /* Calculate the last device-address that could contain | ||
4016 | * any block from the chunk that includes the array-address 's' | ||
4017 | * and report the next address. | ||
4018 | * i.e. the address returned will be chunk-aligned and after | ||
4019 | * any data that is in the chunk containing 's'. | ||
4020 | */ | ||
4021 | static sector_t last_dev_address(sector_t s, struct geom *geo) | ||
4022 | { | ||
4023 | s = (s | geo->chunk_mask) + 1; | ||
4024 | s >>= geo->chunk_shift; | ||
4025 | s *= geo->near_copies; | ||
4026 | s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); | ||
4027 | s *= geo->far_copies; | ||
4028 | s <<= geo->chunk_shift; | ||
4029 | return s; | ||
4030 | } | ||
4031 | |||
4032 | /* Calculate the first device-address that could contain | ||
4033 | * any block from the chunk that includes the array-address 's'. | ||
4034 | * This too will be the start of a chunk | ||
4035 | */ | ||
4036 | static sector_t first_dev_address(sector_t s, struct geom *geo) | ||
4037 | { | ||
4038 | s >>= geo->chunk_shift; | ||
4039 | s *= geo->near_copies; | ||
4040 | sector_div(s, geo->raid_disks); | ||
4041 | s *= geo->far_copies; | ||
4042 | s <<= geo->chunk_shift; | ||
4043 | return s; | ||
4044 | } | ||
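To make the chunk arithmetic above concrete, the sketch below re-implements both helpers in user space and evaluates them for one assumed geometry (4 devices, near_copies = 2, far_copies = 1, 512KiB chunks). The struct and function names are hypothetical and the numbers are illustrative only.

#include <stdio.h>

struct geom_lite {
	int raid_disks;
	int near_copies;
	int far_copies;
	int chunk_shift;               /* log2(chunk size in sectors) */
	unsigned long long chunk_mask;
};

/* Chunk-aligned device address just past any data from the chunk that
 * contains array sector s -- mirrors last_dev_address(). */
static unsigned long long last_dev_addr(unsigned long long s,
					const struct geom_lite *g)
{
	s = (s | g->chunk_mask) + 1;                 /* round up to next chunk */
	s >>= g->chunk_shift;                        /* -> chunk number        */
	s *= g->near_copies;
	s = (s + g->raid_disks - 1) / g->raid_disks; /* chunks per device, up  */
	s *= g->far_copies;
	return s << g->chunk_shift;                  /* back to sectors        */
}

/* First chunk-aligned device address that could hold data from the chunk
 * containing array sector s -- mirrors first_dev_address(). */
static unsigned long long first_dev_addr(unsigned long long s,
					 const struct geom_lite *g)
{
	s >>= g->chunk_shift;
	s *= g->near_copies;
	s /= g->raid_disks;                          /* round down this time   */
	s *= g->far_copies;
	return s << g->chunk_shift;
}

int main(void)
{
	struct geom_lite g = { .raid_disks = 4, .near_copies = 2,
			       .far_copies = 1, .chunk_shift = 10,
			       .chunk_mask = 1023 };

	/* Array sector 5000 lives in chunk 4; for this geometry its data
	 * can only occupy device sectors in [2048, 3072). */
	printf("first = %llu, last = %llu\n",
	       first_dev_addr(5000, &g), last_dev_addr(5000, &g));
	return 0;
}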
4045 | |||
4046 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||
4047 | int *skipped) | ||
4048 | { | ||
4049 | /* We simply copy at most one chunk (smallest of old and new) | ||
4050 | * at a time, possibly less if that exceeds RESYNC_PAGES, | ||
4051 | * or we hit a bad block or something. | ||
4052 | * This might mean we pause for normal IO in the middle of | ||
4053 | * a chunk, but that is not a problem as mddev->reshape_position | ||
4054 | * can record any location. | ||
4055 | * | ||
4056 | * If we will want to write to a location that isn't | ||
4057 | * yet recorded as 'safe' (i.e. in metadata on disk) then | ||
4058 | * we need to flush all reshape requests and update the metadata. | ||
4059 | * | ||
4060 | * When reshaping forwards (e.g. to more devices), we interpret | ||
4061 | * 'safe' as the earliest block which might not have been copied | ||
4062 | * down yet. We divide this by previous stripe size and multiply | ||
4063 | * by previous stripe length to get lowest device offset that we | ||
4064 | * cannot write to yet. | ||
4065 | * We interpret 'sector_nr' as an address that we want to write to. | ||
4066 | * From this we use last_dev_address() to find where we might | ||
4067 | * write to, and first_dev_address() on the 'safe' position. | ||
4068 | * If this 'next' write position is after the 'safe' position, | ||
4069 | * we must update the metadata to increase the 'safe' position. | ||
4070 | * | ||
4071 | * When reshaping backwards, we round in the opposite direction | ||
4072 | * and perform the reverse test: next write position must not be | ||
4073 | * less than current safe position. | ||
4074 | * | ||
4075 | * In all this the minimum difference in data offsets | ||
4076 | * (conf->offset_diff - always positive) allows a bit of slack, | ||
4077 | * so next can be after 'safe', but not by more than offset_diff. | ||
4078 | * | ||
4079 | * We need to prepare all the bios here before we start any IO | ||
4080 | * to ensure the size we choose is acceptable to all devices. | ||
4081 | * That means one for each copy for write-out and an extra one for | ||
4082 | * read-in. | ||
4083 | * We store the read-in bio in ->master_bio and the others in | ||
4084 | * ->devs[x].bio and ->devs[x].repl_bio. | ||
4085 | */ | ||
4086 | struct r10conf *conf = mddev->private; | ||
4087 | struct r10bio *r10_bio; | ||
4088 | sector_t next, safe, last; | ||
4089 | int max_sectors; | ||
4090 | int nr_sectors; | ||
4091 | int s; | ||
4092 | struct md_rdev *rdev; | ||
4093 | int need_flush = 0; | ||
4094 | struct bio *blist; | ||
4095 | struct bio *bio, *read_bio; | ||
4096 | int sectors_done = 0; | ||
4097 | |||
4098 | if (sector_nr == 0) { | ||
4099 | /* If restarting in the middle, skip the initial sectors */ | ||
4100 | if (mddev->reshape_backwards && | ||
4101 | conf->reshape_progress < raid10_size(mddev, 0, 0)) { | ||
4102 | sector_nr = (raid10_size(mddev, 0, 0) | ||
4103 | - conf->reshape_progress); | ||
4104 | } else if (!mddev->reshape_backwards && | ||
4105 | conf->reshape_progress > 0) | ||
4106 | sector_nr = conf->reshape_progress; | ||
4107 | if (sector_nr) { | ||
4108 | mddev->curr_resync_completed = sector_nr; | ||
4109 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
4110 | *skipped = 1; | ||
4111 | return sector_nr; | ||
4112 | } | ||
4113 | } | ||
4114 | |||
4115 | /* We don't use sector_nr to track where we are up to | ||
4116 | * as that doesn't work well for ->reshape_backwards. | ||
4117 | * So just use ->reshape_progress. | ||
4118 | */ | ||
4119 | if (mddev->reshape_backwards) { | ||
4120 | /* 'next' is the earliest device address that we might | ||
4121 | * write to for this chunk in the new layout | ||
4122 | */ | ||
4123 | next = first_dev_address(conf->reshape_progress - 1, | ||
4124 | &conf->geo); | ||
4125 | |||
4126 | /* 'safe' is the last device address that we might read from | ||
4127 | * in the old layout after a restart | ||
4128 | */ | ||
4129 | safe = last_dev_address(conf->reshape_safe - 1, | ||
4130 | &conf->prev); | ||
4131 | |||
4132 | if (next + conf->offset_diff < safe) | ||
4133 | need_flush = 1; | ||
4134 | |||
4135 | last = conf->reshape_progress - 1; | ||
4136 | sector_nr = last & ~(sector_t)(conf->geo.chunk_mask | ||
4137 | & conf->prev.chunk_mask); | ||
4138 | if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) | ||
4139 | sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; | ||
4140 | } else { | ||
4141 | /* 'next' is after the last device address that we | ||
4142 | * might write to for this chunk in the new layout | ||
4143 | */ | ||
4144 | next = last_dev_address(conf->reshape_progress, &conf->geo); | ||
4145 | |||
4146 | /* 'safe' is the earliest device address that we might | ||
4147 | * read from in the old layout after a restart | ||
4148 | */ | ||
4149 | safe = first_dev_address(conf->reshape_safe, &conf->prev); | ||
4150 | |||
4151 | /* Need to update metadata if 'next' might be beyond 'safe' | ||
4152 | * as that would possibly corrupt data | ||
4153 | */ | ||
4154 | if (next > safe + conf->offset_diff) | ||
4155 | need_flush = 1; | ||
4156 | |||
4157 | sector_nr = conf->reshape_progress; | ||
4158 | last = sector_nr | (conf->geo.chunk_mask | ||
4159 | & conf->prev.chunk_mask); | ||
4160 | |||
4161 | if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) | ||
4162 | last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; | ||
4163 | } | ||
4164 | |||
4165 | if (need_flush || | ||
4166 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | ||
4167 | /* Need to update reshape_position in metadata */ | ||
4168 | wait_barrier(conf); | ||
4169 | mddev->reshape_position = conf->reshape_progress; | ||
4170 | if (mddev->reshape_backwards) | ||
4171 | mddev->curr_resync_completed = raid10_size(mddev, 0, 0) | ||
4172 | - conf->reshape_progress; | ||
4173 | else | ||
4174 | mddev->curr_resync_completed = conf->reshape_progress; | ||
4175 | conf->reshape_checkpoint = jiffies; | ||
4176 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
4177 | md_wakeup_thread(mddev->thread); | ||
4178 | wait_event(mddev->sb_wait, mddev->flags == 0 || | ||
4179 | kthread_should_stop()); | ||
4180 | conf->reshape_safe = mddev->reshape_position; | ||
4181 | allow_barrier(conf); | ||
4182 | } | ||
4183 | |||
4184 | read_more: | ||
4185 | /* Now schedule reads for blocks from sector_nr to last */ | ||
4186 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
4187 | raise_barrier(conf, sectors_done != 0); | ||
4188 | atomic_set(&r10_bio->remaining, 0); | ||
4189 | r10_bio->mddev = mddev; | ||
4190 | r10_bio->sector = sector_nr; | ||
4191 | set_bit(R10BIO_IsReshape, &r10_bio->state); | ||
4192 | r10_bio->sectors = last - sector_nr + 1; | ||
4193 | rdev = read_balance(conf, r10_bio, &max_sectors); | ||
4194 | BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); | ||
4195 | |||
4196 | if (!rdev) { | ||
4197 | /* Cannot read from here, so need to record bad blocks | ||
4198 | * on all the target devices. | ||
4199 | */ | ||
4200 | // FIXME | ||
4201 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
4202 | return sectors_done; | ||
4203 | } | ||
4204 | |||
4205 | read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); | ||
4206 | |||
4207 | read_bio->bi_bdev = rdev->bdev; | ||
4208 | read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr | ||
4209 | + rdev->data_offset); | ||
4210 | read_bio->bi_private = r10_bio; | ||
4211 | read_bio->bi_end_io = end_sync_read; | ||
4212 | read_bio->bi_rw = READ; | ||
4213 | read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
4214 | read_bio->bi_flags |= 1 << BIO_UPTODATE; | ||
4215 | read_bio->bi_vcnt = 0; | ||
4216 | read_bio->bi_idx = 0; | ||
4217 | read_bio->bi_size = 0; | ||
4218 | r10_bio->master_bio = read_bio; | ||
4219 | r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; | ||
4220 | |||
4221 | /* Now find the locations in the new layout */ | ||
4222 | __raid10_find_phys(&conf->geo, r10_bio); | ||
4223 | |||
4224 | blist = read_bio; | ||
4225 | read_bio->bi_next = NULL; | ||
4226 | |||
4227 | for (s = 0; s < conf->copies*2; s++) { | ||
4228 | struct bio *b; | ||
4229 | int d = r10_bio->devs[s/2].devnum; | ||
4230 | struct md_rdev *rdev2; | ||
4231 | if (s&1) { | ||
4232 | rdev2 = conf->mirrors[d].replacement; | ||
4233 | b = r10_bio->devs[s/2].repl_bio; | ||
4234 | } else { | ||
4235 | rdev2 = conf->mirrors[d].rdev; | ||
4236 | b = r10_bio->devs[s/2].bio; | ||
4237 | } | ||
4238 | if (!rdev2 || test_bit(Faulty, &rdev2->flags)) | ||
4239 | continue; | ||
4240 | b->bi_bdev = rdev2->bdev; | ||
4241 | b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; | ||
4242 | b->bi_private = r10_bio; | ||
4243 | b->bi_end_io = end_reshape_write; | ||
4244 | b->bi_rw = WRITE; | ||
4245 | b->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
4246 | b->bi_flags |= 1 << BIO_UPTODATE; | ||
4247 | b->bi_next = blist; | ||
4248 | b->bi_vcnt = 0; | ||
4249 | b->bi_idx = 0; | ||
4250 | b->bi_size = 0; | ||
4251 | blist = b; | ||
4252 | } | ||
4253 | |||
4254 | /* Now add as many pages as possible to all of these bios. */ | ||
4255 | |||
4256 | nr_sectors = 0; | ||
4257 | for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { | ||
4258 | struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; | ||
4259 | int len = (max_sectors - s) << 9; | ||
4260 | if (len > PAGE_SIZE) | ||
4261 | len = PAGE_SIZE; | ||
4262 | for (bio = blist; bio ; bio = bio->bi_next) { | ||
4263 | struct bio *bio2; | ||
4264 | if (bio_add_page(bio, page, len, 0)) | ||
4265 | continue; | ||
4266 | |||
4267 | /* Didn't fit, must stop */ | ||
4268 | for (bio2 = blist; | ||
4269 | bio2 && bio2 != bio; | ||
4270 | bio2 = bio2->bi_next) { | ||
4271 | /* Remove last page from this bio */ | ||
4272 | bio2->bi_vcnt--; | ||
4273 | bio2->bi_size -= len; | ||
4274 | bio2->bi_flags &= ~(1<<BIO_SEG_VALID); | ||
4275 | } | ||
4276 | goto bio_full; | ||
4277 | } | ||
4278 | sector_nr += len >> 9; | ||
4279 | nr_sectors += len >> 9; | ||
4280 | } | ||
4281 | bio_full: | ||
4282 | r10_bio->sectors = nr_sectors; | ||
4283 | |||
4284 | /* Now submit the read */ | ||
4285 | md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); | ||
4286 | atomic_inc(&r10_bio->remaining); | ||
4287 | read_bio->bi_next = NULL; | ||
4288 | generic_make_request(read_bio); | ||
4289 | sector_nr += nr_sectors; | ||
4290 | sectors_done += nr_sectors; | ||
4291 | if (sector_nr <= last) | ||
4292 | goto read_more; | ||
4293 | |||
4294 | /* Now that we have done the whole section we can | ||
4295 | * update reshape_progress | ||
4296 | */ | ||
4297 | if (mddev->reshape_backwards) | ||
4298 | conf->reshape_progress -= sectors_done; | ||
4299 | else | ||
4300 | conf->reshape_progress += sectors_done; | ||
4301 | |||
4302 | return sectors_done; | ||
4303 | } | ||
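The decision of when to force a metadata update above can be restated in one line for the forward direction: the highest device address the next chunk's writes might touch (under the new geometry) must not run past the lowest device address a restarted reshape could still need to read (under the old geometry) plus the slack given by offset_diff. A minimal sketch of that test, with made-up device addresses and a hypothetical helper name:

#include <stdbool.h>
#include <stdio.h>

/* Forward-reshape variant of the 'need_flush' test: 'next' and 'safe'
 * are device addresses as computed by last_dev_address() and
 * first_dev_address(); the values below are invented. */
static bool need_metadata_flush(unsigned long long next,
				unsigned long long safe,
				unsigned long long offset_diff)
{
	return next > safe + offset_diff;
}

int main(void)
{
	/* Writes for the next chunk reach device sector 5120, a restart
	 * could still need to read from 3072 onwards, and only 1024
	 * sectors of data_offset slack exist: the superblock must be
	 * written out before the writes are issued. */
	printf("flush needed: %s\n",
	       need_metadata_flush(5120, 3072, 1024) ? "yes" : "no");
	return 0;
}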
4304 | |||
4305 | static void end_reshape_request(struct r10bio *r10_bio); | ||
4306 | static int handle_reshape_read_error(struct mddev *mddev, | ||
4307 | struct r10bio *r10_bio); | ||
4308 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) | ||
4309 | { | ||
4310 | /* Reshape read completed. Hopefully we have a block | ||
4311 | * to write out. | ||
4312 | * If we got a read error then we do sync 1-page reads from | ||
4313 | * elsewhere until we find the data - or give up. | ||
4314 | */ | ||
4315 | struct r10conf *conf = mddev->private; | ||
4316 | int s; | ||
4317 | |||
4318 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
4319 | if (handle_reshape_read_error(mddev, r10_bio) < 0) { | ||
4320 | /* Reshape has been aborted */ | ||
4321 | md_done_sync(mddev, r10_bio->sectors, 0); | ||
4322 | return; | ||
4323 | } | ||
4324 | |||
4325 | /* We definitely have the data in the pages, schedule the | ||
4326 | * writes. | ||
4327 | */ | ||
4328 | atomic_set(&r10_bio->remaining, 1); | ||
4329 | for (s = 0; s < conf->copies*2; s++) { | ||
4330 | struct bio *b; | ||
4331 | int d = r10_bio->devs[s/2].devnum; | ||
4332 | struct md_rdev *rdev; | ||
4333 | if (s&1) { | ||
4334 | rdev = conf->mirrors[d].replacement; | ||
4335 | b = r10_bio->devs[s/2].repl_bio; | ||
4336 | } else { | ||
4337 | rdev = conf->mirrors[d].rdev; | ||
4338 | b = r10_bio->devs[s/2].bio; | ||
4339 | } | ||
4340 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
4341 | continue; | ||
4342 | atomic_inc(&rdev->nr_pending); | ||
4343 | md_sync_acct(b->bi_bdev, r10_bio->sectors); | ||
4344 | atomic_inc(&r10_bio->remaining); | ||
4345 | b->bi_next = NULL; | ||
4346 | generic_make_request(b); | ||
4347 | } | ||
4348 | end_reshape_request(r10_bio); | ||
4349 | } | ||
4350 | |||
4351 | static void end_reshape(struct r10conf *conf) | ||
4352 | { | ||
4353 | if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) | ||
4354 | return; | ||
4355 | |||
4356 | spin_lock_irq(&conf->device_lock); | ||
4357 | conf->prev = conf->geo; | ||
4358 | md_finish_reshape(conf->mddev); | ||
4359 | smp_wmb(); | ||
4360 | conf->reshape_progress = MaxSector; | ||
4361 | spin_unlock_irq(&conf->device_lock); | ||
4362 | |||
4363 | /* read-ahead size must cover two whole stripes, which is | ||
4364 | * 2 * (number of data disks) * chunksize. | ||
4365 | */ | ||
4366 | if (conf->mddev->queue) { | ||
4367 | int stripe = conf->geo.raid_disks * | ||
4368 | ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); | ||
4369 | stripe /= conf->geo.near_copies; | ||
4370 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
4371 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
4372 | } | ||
4373 | conf->fullsync = 0; | ||
4374 | } | ||
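The read-ahead update above is straightforward arithmetic: two full stripes' worth of pages. For instance, with assumed values of 6 devices, near_copies = 2, 512KiB chunks and 4KiB pages, the same formula gives 768 pages:

#include <stdio.h>

int main(void)
{
	/* Assumed example values, applying the formula from end_reshape(). */
	unsigned long chunk_sectors = 1024;          /* 512 KiB chunk */
	unsigned long page_size = 4096;
	int raid_disks = 6, near_copies = 2;

	unsigned long stripe = raid_disks *
		((chunk_sectors << 9) / page_size);  /* pages per full stripe */
	stripe /= near_copies;

	printf("read-ahead should be at least %lu pages (%lu KiB)\n",
	       2 * stripe, 2 * stripe * page_size / 1024);
	return 0;
}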
4375 | |||
4376 | |||
4377 | static int handle_reshape_read_error(struct mddev *mddev, | ||
4378 | struct r10bio *r10_bio) | ||
4379 | { | ||
4380 | /* Use sync reads to get the blocks from somewhere else */ | ||
4381 | int sectors = r10_bio->sectors; | ||
4382 | struct r10bio r10b; | ||
4383 | struct r10conf *conf = mddev->private; | ||
4384 | int slot = 0; | ||
4385 | int idx = 0; | ||
4386 | struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; | ||
4387 | |||
4388 | r10b.sector = r10_bio->sector; | ||
4389 | __raid10_find_phys(&conf->prev, &r10b); | ||
4390 | |||
4391 | while (sectors) { | ||
4392 | int s = sectors; | ||
4393 | int success = 0; | ||
4394 | int first_slot = slot; | ||
4395 | |||
4396 | if (s > (PAGE_SIZE >> 9)) | ||
4397 | s = PAGE_SIZE >> 9; | ||
4398 | |||
4399 | while (!success) { | ||
4400 | int d = r10b.devs[slot].devnum; | ||
4401 | struct md_rdev *rdev = conf->mirrors[d].rdev; | ||
4402 | sector_t addr; | ||
4403 | if (rdev == NULL || | ||
4404 | test_bit(Faulty, &rdev->flags) || | ||
4405 | !test_bit(In_sync, &rdev->flags)) | ||
4406 | goto failed; | ||
4407 | |||
4408 | addr = r10b.devs[slot].addr + idx * PAGE_SIZE; | ||
4409 | success = sync_page_io(rdev, | ||
4410 | addr, | ||
4411 | s << 9, | ||
4412 | bvec[idx].bv_page, | ||
4413 | READ, false); | ||
4414 | if (success) | ||
4415 | break; | ||
4416 | failed: | ||
4417 | slot++; | ||
4418 | if (slot >= conf->copies) | ||
4419 | slot = 0; | ||
4420 | if (slot == first_slot) | ||
4421 | break; | ||
4422 | } | ||
4423 | if (!success) { | ||
4424 | /* couldn't read this block, must give up */ | ||
4425 | set_bit(MD_RECOVERY_INTR, | ||
4426 | &mddev->recovery); | ||
4427 | return -EIO; | ||
4428 | } | ||
4429 | sectors -= s; | ||
4430 | idx++; | ||
4431 | } | ||
4432 | return 0; | ||
4433 | } | ||
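The retry loop above is essentially a round-robin walk over the copies in the old layout: try a slot, move to the next on failure, wrap around, and give up once the starting slot comes back around. A stand-alone sketch of that walk, with a hypothetical read_copy() callback standing in for sync_page_io():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-copy read callback: returns true if the copy held
 * in 'slot' could be read. */
typedef bool (*read_copy_fn)(int slot, void *ctx);

/* Round-robin retry across 'copies' slots: stop at the first success,
 * or fail once every slot has been tried. */
static int read_from_any_copy(int first_slot, int copies,
			      read_copy_fn read_copy, void *ctx)
{
	int slot = first_slot;

	do {
		if (read_copy(slot, ctx))
			return slot;             /* found a readable copy */
		slot = (slot + 1) % copies;
	} while (slot != first_slot);
	return -1;                               /* every copy failed */
}

static bool demo_read(int slot, void *ctx)
{
	(void)ctx;
	return slot == 1;  /* pretend only copy 1 is healthy */
}

int main(void)
{
	printf("read succeeded from slot %d\n",
	       read_from_any_copy(0, 2, demo_read, NULL));
	return 0;
}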
4434 | |||
4435 | static void end_reshape_write(struct bio *bio, int error) | ||
4436 | { | ||
4437 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
4438 | struct r10bio *r10_bio = bio->bi_private; | ||
4439 | struct mddev *mddev = r10_bio->mddev; | ||
4440 | struct r10conf *conf = mddev->private; | ||
4441 | int d; | ||
4442 | int slot; | ||
4443 | int repl; | ||
4444 | struct md_rdev *rdev = NULL; | ||
4445 | |||
4446 | d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); | ||
4447 | if (repl) | ||
4448 | rdev = conf->mirrors[d].replacement; | ||
4449 | if (!rdev) { | ||
4450 | smp_mb(); | ||
4451 | rdev = conf->mirrors[d].rdev; | ||
4452 | } | ||
4453 | |||
4454 | if (!uptodate) { | ||
4455 | /* FIXME should record badblock */ | ||
4456 | md_error(mddev, rdev); | ||
4457 | } | ||
4458 | |||
4459 | rdev_dec_pending(rdev, mddev); | ||
4460 | end_reshape_request(r10_bio); | ||
4461 | } | ||
4462 | |||
4463 | static void end_reshape_request(struct r10bio *r10_bio) | ||
4464 | { | ||
4465 | if (!atomic_dec_and_test(&r10_bio->remaining)) | ||
4466 | return; | ||
4467 | md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); | ||
4468 | bio_put(r10_bio->master_bio); | ||
4469 | put_buf(r10_bio); | ||
4470 | } | ||
4471 | |||
4472 | static void raid10_finish_reshape(struct mddev *mddev) | ||
4473 | { | ||
4474 | struct r10conf *conf = mddev->private; | ||
4475 | |||
4476 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
4477 | return; | ||
4478 | |||
4479 | if (mddev->delta_disks > 0) { | ||
4480 | sector_t size = raid10_size(mddev, 0, 0); | ||
4481 | md_set_array_sectors(mddev, size); | ||
4482 | if (mddev->recovery_cp > mddev->resync_max_sectors) { | ||
4483 | mddev->recovery_cp = mddev->resync_max_sectors; | ||
4484 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
4485 | } | ||
4486 | mddev->resync_max_sectors = size; | ||
4487 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
4488 | revalidate_disk(mddev->gendisk); | ||
4489 | } else { | ||
4490 | int d; | ||
4491 | for (d = conf->geo.raid_disks ; | ||
4492 | d < conf->geo.raid_disks - mddev->delta_disks; | ||
4493 | d++) { | ||
4494 | struct md_rdev *rdev = conf->mirrors[d].rdev; | ||
4495 | if (rdev) | ||
4496 | clear_bit(In_sync, &rdev->flags); | ||
4497 | rdev = conf->mirrors[d].replacement; | ||
4498 | if (rdev) | ||
4499 | clear_bit(In_sync, &rdev->flags); | ||
4500 | } | ||
4501 | } | ||
4502 | mddev->layout = mddev->new_layout; | ||
4503 | mddev->chunk_sectors = 1 << conf->geo.chunk_shift; | ||
4504 | mddev->reshape_position = MaxSector; | ||
4505 | mddev->delta_disks = 0; | ||
4506 | mddev->reshape_backwards = 0; | ||
4507 | } | ||
4508 | |||
3537 | static struct md_personality raid10_personality = | 4509 | static struct md_personality raid10_personality = |
3538 | { | 4510 | { |
3539 | .name = "raid10", | 4511 | .name = "raid10", |
@@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality = | |||
3552 | .size = raid10_size, | 4524 | .size = raid10_size, |
3553 | .resize = raid10_resize, | 4525 | .resize = raid10_resize, |
3554 | .takeover = raid10_takeover, | 4526 | .takeover = raid10_takeover, |
4527 | .check_reshape = raid10_check_reshape, | ||
4528 | .start_reshape = raid10_start_reshape, | ||
4529 | .finish_reshape = raid10_finish_reshape, | ||
3555 | }; | 4530 | }; |
3556 | 4531 | ||
3557 | static int __init raid_init(void) | 4532 | static int __init raid_init(void) |