Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--  drivers/md/raid10.c  582
1 file changed, 468 insertions(+), 114 deletions(-)
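
In brief: the diff below wires hot-replacement support into RAID10. Each mirror slot gains an optional second device, conf->mirrors[d].replacement, which is rebuilt while the original ->rdev stays in service; reads, writes, resync, recovery and hot add/remove all learn to address the pair, and the extra in-flight bio for a slot is tracked in r10_bio->devs[slot].repl_bio. The recurring lookup pattern is: try the replacement first, then fall back to the main rdev behind a memory barrier that guards against a racing removal.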
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 685ddf325ee4..6e8aa213f0d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 	struct r10conf *conf = data;
 	int size = offsetof(struct r10bio, devs[conf->copies]);
 
-	/* allocate a r10bio with room for raid_disks entries in the bios array */
+	/* allocate a r10bio with room for raid_disks entries in the
+	 * bios array */
 	return kzalloc(size, gfp_flags);
 }
 
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		if (!bio)
 			goto out_free_bio;
 		r10_bio->devs[j].bio = bio;
+		if (!conf->have_replacement)
+			continue;
+		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+		if (!bio)
+			goto out_free_bio;
+		r10_bio->devs[j].repl_bio = bio;
 	}
 	/*
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
 	for (j = 0 ; j < nalloc; j++) {
+		struct bio *rbio = r10_bio->devs[j].repl_bio;
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
 			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 				goto out_free_pages;
 
 			bio->bi_io_vec[i].bv_page = page;
+			if (rbio)
+				rbio->bi_io_vec[i].bv_page = page;
 		}
 	}
 
@@ -156,8 +166,11 @@ out_free_pages:
 		safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
-	while ( ++j < nalloc )
+	while (++j < nalloc) {
 		bio_put(r10_bio->devs[j].bio);
+		if (r10_bio->devs[j].repl_bio)
+			bio_put(r10_bio->devs[j].repl_bio);
+	}
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 			}
 			bio_put(bio);
 		}
+		bio = r10bio->devs[j].repl_bio;
+		if (bio)
+			bio_put(bio);
 	}
 	r10bio_pool_free(r10bio, conf);
 }
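
The three hunks above make the resync buffer pool optionally carry a second bio per copy that shares pages with the first, so a recovered block can be written to the original and its replacement without a second data copy. A minimal user-space sketch of that shared-page idea, with simplified stand-in types (nothing here is the kernel's API):

	#include <stdio.h>
	#include <stdlib.h>

	#define RESYNC_PAGES 4

	struct bio { void *pages[RESYNC_PAGES]; };
	struct devslot { struct bio *bio; struct bio *repl_bio; };

	static int alloc_slot(struct devslot *s, int have_replacement)
	{
		s->bio = calloc(1, sizeof(*s->bio));
		s->repl_bio = have_replacement ? calloc(1, sizeof(*s->repl_bio)) : NULL;
		if (!s->bio || (have_replacement && !s->repl_bio))
			return -1;
		for (int i = 0; i < RESYNC_PAGES; i++) {
			void *page = malloc(4096);
			if (!page)
				return -1;
			s->bio->pages[i] = page;
			if (s->repl_bio)	/* same page attached to both bios */
				s->repl_bio->pages[i] = page;
		}
		return 0;
	}

	int main(void)
	{
		struct devslot s;

		if (alloc_slot(&s, 1) == 0)	/* leaks on exit; it is only a sketch */
			printf("page shared by bio and repl_bio: %s\n",
			       s.bio->pages[0] == s.repl_bio->pages[0] ? "yes" : "no");
		return 0;
	}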
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
 		*bio = NULL;
+		bio = &r10_bio->devs[i].repl_bio;
+		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+			bio_put(*bio);
+		*bio = NULL;
 	}
 }
 
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
  * Find the disk number which triggered given bio
  */
 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
-			 struct bio *bio, int *slotp)
+			 struct bio *bio, int *slotp, int *replp)
 {
 	int slot;
+	int repl = 0;
 
-	for (slot = 0; slot < conf->copies; slot++)
+	for (slot = 0; slot < conf->copies; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
+		if (r10_bio->devs[slot].repl_bio == bio) {
+			repl = 1;
+			break;
+		}
+	}
 
 	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 
 	if (slotp)
 		*slotp = slot;
+	if (replp)
+		*replp = repl;
 	return r10_bio->devs[slot].devnum;
 }
 
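
The extended find_bio_disk() above now reports, via *replp, whether a completing bio belonged to the slot's replacement rather than its main device. A minimal user-space model of that two-way match, with simplified stand-in types (not kernel code):

	#include <assert.h>
	#include <stdio.h>

	#define COPIES 2

	struct slot { int devnum; void *bio; void *repl_bio; };

	static int find_bio_disk(struct slot *devs, void *bio, int *slotp, int *replp)
	{
		int slot, repl = 0;

		for (slot = 0; slot < COPIES; slot++) {
			if (devs[slot].bio == bio)
				break;
			if (devs[slot].repl_bio == bio) {
				repl = 1;
				break;
			}
		}
		assert(slot != COPIES);	/* stands in for BUG_ON() */
		if (slotp)
			*slotp = slot;
		if (replp)
			*replp = repl;
		return devs[slot].devnum;
	}

	int main(void)
	{
		int a, b;	/* dummy bios, identified by address */
		struct slot devs[COPIES] = {
			{ .devnum = 0, .bio = &a },
			{ .devnum = 3, .bio = NULL, .repl_bio = &b },
		};
		int slot, repl;
		int dev = find_bio_disk(devs, &b, &slot, &repl);

		printf("dev=%d slot=%d repl=%d\n", dev, slot, repl);	/* dev=3 slot=1 repl=1 */
		return 0;
	}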
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
+	struct md_rdev *rdev;
 	struct r10conf *conf = r10_bio->mddev->private;
 
 
 	slot = r10_bio->read_slot;
 	dev = r10_bio->devs[slot].devnum;
+	rdev = r10_bio->devs[slot].rdev;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
-		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 	} else {
 		/*
 		 * oops, read error - keep the refcount on the rdev
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		printk_ratelimited(KERN_ERR
 				   "md/raid10:%s: %s: rescheduling sector %llu\n",
 				   mdname(conf->mddev),
-				   bdevname(conf->mirrors[dev].rdev->bdev, b),
+				   bdevname(rdev->bdev, b),
 				   (unsigned long long)r10_bio->sector);
 		set_bit(R10BIO_ReadError, &r10_bio->state);
 		reschedule_retry(r10_bio);
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	int dev;
 	int dec_rdev = 1;
 	struct r10conf *conf = r10_bio->mddev->private;
-	int slot;
+	int slot, repl;
+	struct md_rdev *rdev = NULL;
 
-	dev = find_bio_disk(conf, r10_bio, bio, &slot);
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
+	if (repl)
+		rdev = conf->mirrors[dev].replacement;
+	if (!rdev) {
+		smp_rmb();
+		repl = 0;
+		rdev = conf->mirrors[dev].rdev;
+	}
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
-		set_bit(R10BIO_WriteError, &r10_bio->state);
-		dec_rdev = 0;
+		if (repl)
+			/* Never record new bad blocks to replacement,
+			 * just fail it.
+			 */
+			md_error(rdev->mddev, rdev);
+		else {
+			set_bit(WriteErrorSeen, &rdev->flags);
+			if (!test_and_set_bit(WantReplacement, &rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED,
+					&rdev->mddev->recovery);
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+			dec_rdev = 0;
+		}
 	} else {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
-		if (is_badblock(conf->mirrors[dev].rdev,
+		if (is_badblock(rdev,
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
 				&first_bad, &bad_sectors)) {
 			bio_put(bio);
-			r10_bio->devs[slot].bio = IO_MADE_GOOD;
+			if (repl)
+				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
+			else
+				r10_bio->devs[slot].bio = IO_MADE_GOOD;
 			dec_rdev = 0;
 			set_bit(R10BIO_MadeGood, &r10_bio->state);
 		}
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
-
 /*
  * RAID10 layout manager
  * As well as the chunksize and raid_disks count, there are two
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
+static struct md_rdev *read_balance(struct r10conf *conf,
+				    struct r10bio *r10_bio,
+				    int *max_sectors)
 {
 	const sector_t this_sector = r10_bio->sector;
 	int disk, slot;
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev, *best_rdev;
 	int do_balance;
 	int best_slot;
 
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
 retry:
 	sectors = r10_bio->sectors;
 	best_slot = -1;
+	best_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -599,10 +652,16 @@ retry:
 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		rdev = rcu_dereference(conf->mirrors[disk].replacement);
+		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+			rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (rdev == NULL)
 			continue;
-		if (!test_bit(In_sync, &rdev->flags))
+		if (test_bit(Faulty, &rdev->flags))
+			continue;
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			continue;
 
 		dev_sector = r10_bio->devs[slot].addr;
@@ -627,6 +686,7 @@ retry:
 			if (good_sectors > best_good_sectors) {
 				best_good_sectors = good_sectors;
 				best_slot = slot;
+				best_rdev = rdev;
 			}
 			if (!do_balance)
 				/* Must read from here */
@@ -655,16 +715,15 @@ retry:
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
 			best_slot = slot;
+			best_rdev = rdev;
 		}
 	}
-	if (slot == conf->copies)
+	if (slot >= conf->copies) {
 		slot = best_slot;
+		rdev = best_rdev;
+	}
 
 	if (slot >= 0) {
-		disk = r10_bio->devs[slot].devnum;
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (!rdev)
-			goto retry;
 		atomic_inc(&rdev->nr_pending);
 		if (test_bit(Faulty, &rdev->flags)) {
 			/* Cannot risk returning a device that failed
@@ -675,11 +734,11 @@ retry:
 		}
 		r10_bio->read_slot = slot;
 	} else
-		disk = -1;
+		rdev = NULL;
 	rcu_read_unlock();
 	*max_sectors = best_good_sectors;
 
-	return disk;
+	return rdev;
 }
 
 static int raid10_congested(void *data, int bits)
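
read_balance() now hands back the rdev it examined under rcu_read_lock(), already pinned via nr_pending, instead of a disk index the caller had to re-dereference (which is what forced the old "if (!rdev) goto retry"). A user-space sketch of the return-the-validated-pointer pattern, with simplified stand-in types and no real RCU:

	#include <stdio.h>

	struct rdev { const char *name; int nr_pending; };
	struct mirror { struct rdev *rdev; };

	/* new style: return the pointer that was examined, already pinned */
	static struct rdev *pick_rdev(struct mirror *m, int n)
	{
		for (int i = 0; i < n; i++)
			if (m[i].rdev) {
				m[i].rdev->nr_pending++;	/* models atomic_inc(&rdev->nr_pending) */
				return m[i].rdev;
			}
		return NULL;
	}

	int main(void)
	{
		struct rdev a = { "sda", 0 };
		struct mirror m[2] = { { NULL }, { &a } };
		struct rdev *r = pick_rdev(m, 2);

		if (r)
			printf("read from %s, nr_pending=%d\n", r->name, r->nr_pending);
		return 0;
	}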
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r10conf *conf = mddev->private;
-	struct mirror_info *mirror;
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int disk;
+		struct md_rdev *rdev;
 		int slot;
 
 read_again:
-		disk = read_balance(conf, r10_bio, &max_sectors);
-		slot = r10_bio->read_slot;
-		if (disk < 0) {
+		rdev = read_balance(conf, r10_bio, &max_sectors);
+		if (!rdev) {
 			raid_end_bio_io(r10_bio);
 			return;
 		}
-		mirror = conf->mirrors + disk;
+		slot = r10_bio->read_slot;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
 			    max_sectors);
 
 		r10_bio->devs[slot].bio = read_bio;
+		r10_bio->devs[slot].rdev = rdev;
 
 		read_bio->bi_sector = r10_bio->devs[slot].addr +
-			mirror->rdev->data_offset;
-		read_bio->bi_bdev = mirror->rdev->bdev;
+			rdev->data_offset;
+		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
@@ -1025,6 +1083,7 @@ read_again:
 	 */
 	plugged = mddev_check_plugged(mddev);
 
+	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
@@ -1034,12 +1093,25 @@ retry_write:
 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[d].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
 			break;
 		}
+		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+			atomic_inc(&rrdev->nr_pending);
+			blocked_rdev = rrdev;
+			break;
+		}
+		if (rrdev && test_bit(Faulty, &rrdev->flags))
+			rrdev = NULL;
+
 		r10_bio->devs[i].bio = NULL;
+		r10_bio->devs[i].repl_bio = NULL;
 		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
@@ -1088,6 +1160,10 @@ retry_write:
 		}
 		r10_bio->devs[i].bio = bio;
 		atomic_inc(&rdev->nr_pending);
+		if (rrdev) {
+			r10_bio->devs[i].repl_bio = bio;
+			atomic_inc(&rrdev->nr_pending);
+		}
 	}
 	rcu_read_unlock();
 
@@ -1096,11 +1172,23 @@ retry_write:
 		int j;
 		int d;
 
-		for (j = 0; j < i; j++)
+		for (j = 0; j < i; j++) {
 			if (r10_bio->devs[j].bio) {
 				d = r10_bio->devs[j].devnum;
 				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 			}
+			if (r10_bio->devs[j].repl_bio) {
+				struct md_rdev *rdev;
+				d = r10_bio->devs[j].devnum;
+				rdev = conf->mirrors[d].replacement;
+				if (!rdev) {
+					/* Race with remove_disk */
+					smp_mb();
+					rdev = conf->mirrors[d].rdev;
+				}
+				rdev_dec_pending(rdev, mddev);
+			}
+		}
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
@@ -1147,6 +1235,31 @@ retry_write:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+
+		if (!r10_bio->devs[i].repl_bio)
+			continue;
+
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
+		r10_bio->devs[i].repl_bio = mbio;
+
+		/* We are actively writing to the original device
+		 * so it cannot disappear, so the replacement cannot
+		 * become NULL here
+		 */
+		mbio->bi_sector = (r10_bio->devs[i].addr+
+				   conf->mirrors[d].replacement->data_offset);
+		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+		mbio->bi_end_io = raid10_end_write_request;
+		mbio->bi_rw = WRITE | do_sync | do_fua;
+		mbio->bi_private = r10_bio;
+
+		atomic_inc(&r10_bio->remaining);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		conf->pending_count++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
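
The write path above now fans each copy out to up to two devices: one clone to mirrors[d].rdev and, when a replacement is attached, a second clone to mirrors[d].replacement, with every queued clone bumping r10_bio->remaining so the master bio completes only after all copies land. A toy user-space model of that accounting, with stand-in types (not kernel code):

	#include <stdio.h>

	struct rdev { const char *name; };
	struct mirror { struct rdev *rdev; struct rdev *replacement; };

	static int remaining;

	static void submit(struct rdev *to)
	{
		remaining++;		/* models atomic_inc(&r10_bio->remaining) */
		printf("queue write -> %s\n", to->name);
	}

	int main(void)
	{
		struct rdev a = { "sda" }, b = { "sdb" }, spare = { "sdc(repl)" };
		struct mirror m[2] = { { &a, &spare }, { &b, NULL } };

		for (int i = 0; i < 2; i++) {
			submit(m[i].rdev);
			if (m[i].replacement)	/* second clone, as in the hunk above */
				submit(m[i].replacement);
		}
		printf("completions outstanding: %d\n", remaining);
		return 0;
	}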
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev)
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->mirrors + i;
-		if (tmp->rdev
-		    && !test_bit(Faulty, &tmp->rdev->flags)
-		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+		if (tmp->replacement
+		    && tmp->replacement->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &tmp->replacement->flags)
+		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+			/* Replacement has just become active */
+			if (!tmp->rdev
+			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+				count++;
+			if (tmp->rdev) {
+				/* Replaced device not technically faulty,
+				 * but we need to be sure it gets removed
+				 * and never re-added.
+				 */
+				set_bit(Faulty, &tmp->rdev->flags);
+				sysfs_notify_dirent_safe(
+					tmp->rdev->sysfs_state);
+			}
+			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+		} else if (tmp->rdev
+			   && !test_bit(Faulty, &tmp->rdev->flags)
+			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
 			sysfs_notify_dirent(tmp->rdev->sysfs_state);
 		}
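
The spare_active() hunk above encodes a subtle rule: activating a fully recovered replacement only increases the working-device count when it does not merely swap places with a still in-sync original, and the replaced device is forced Faulty so it gets removed and never re-added. A small user-space model of that accounting, with simplified stand-in flags (the recovery-offset check is elided):

	#include <stdbool.h>
	#include <stdio.h>

	struct dev { bool in_sync; bool faulty; };
	struct mirror { struct dev *rdev; struct dev *replacement; };

	/* returns this slot's contribution to the newly-working count */
	static int spare_active_one(struct mirror *tmp)
	{
		int count = 0;

		if (tmp->replacement && !tmp->replacement->faulty &&
		    !tmp->replacement->in_sync) {
			tmp->replacement->in_sync = true;
			if (!tmp->rdev || !tmp->rdev->in_sync)
				count++;		/* genuinely one more working device */
			else
				tmp->rdev->in_sync = false;	/* a pure swap: no change */
			if (tmp->rdev)
				tmp->rdev->faulty = true;	/* remove, never re-add */
		} else if (tmp->rdev && !tmp->rdev->faulty && !tmp->rdev->in_sync) {
			tmp->rdev->in_sync = true;
			count++;
		}
		return count;
	}

	int main(void)
	{
		struct dev old = { .in_sync = true }, repl = { 0 };
		struct mirror m = { &old, &repl };

		printf("count += %d, old: faulty=%d, repl: in_sync=%d\n",
		       spare_active_one(&m), old.faulty, repl.in_sync);
		return 0;
	}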
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		struct mirror_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
-		if (p->rdev)
-			continue;
+		if (p->rdev) {
+			if (!test_bit(WantReplacement, &p->rdev->flags) ||
+			    p->replacement != NULL)
+				continue;
+			clear_bit(In_sync, &rdev->flags);
+			set_bit(Replacement, &rdev->flags);
+			rdev->raid_disk = mirror;
+			err = 0;
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+				blk_queue_max_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
+			conf->fullsync = 1;
+			rcu_assign_pointer(p->replacement, rdev);
+			break;
+		}
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static int raid10_remove_disk(struct mddev *mddev, int number)
+static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
-	struct mirror_info *p = conf->mirrors+ number;
+	int number = rdev->raid_disk;
+	struct md_rdev **rdevp;
+	struct mirror_info *p = conf->mirrors + number;
 
 	print_conf(conf);
-	rdev = p->rdev;
-	if (rdev) {
-		if (test_bit(In_sync, &rdev->flags) ||
-		    atomic_read(&rdev->nr_pending)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		/* Only remove faulty devices in recovery
-		 * is not possible.
-		 */
-		if (!test_bit(Faulty, &rdev->flags) &&
-		    mddev->recovery_disabled != p->recovery_disabled &&
-		    enough(conf, -1)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		p->rdev = NULL;
-		synchronize_rcu();
-		if (atomic_read(&rdev->nr_pending)) {
-			/* lost the race, try later */
-			err = -EBUSY;
-			p->rdev = rdev;
-			goto abort;
-		}
-		err = md_integrity_register(mddev);
+	if (rdev == p->rdev)
+		rdevp = &p->rdev;
+	else if (rdev == p->replacement)
+		rdevp = &p->replacement;
+	else
+		return 0;
+
+	if (test_bit(In_sync, &rdev->flags) ||
+	    atomic_read(&rdev->nr_pending)) {
+		err = -EBUSY;
+		goto abort;
 	}
+	/* Only remove faulty devices if recovery
+	 * is not possible.
+	 */
+	if (!test_bit(Faulty, &rdev->flags) &&
+	    mddev->recovery_disabled != p->recovery_disabled &&
+	    (!p->replacement || p->replacement == rdev) &&
+	    enough(conf, -1)) {
+		err = -EBUSY;
+		goto abort;
+	}
+	*rdevp = NULL;
+	synchronize_rcu();
+	if (atomic_read(&rdev->nr_pending)) {
+		/* lost the race, try later */
+		err = -EBUSY;
+		*rdevp = rdev;
+		goto abort;
+	} else if (p->replacement) {
+		/* We must have just cleared 'rdev' */
+		p->rdev = p->replacement;
+		clear_bit(Replacement, &p->replacement->flags);
+		smp_mb(); /* Make sure other CPUs may see both as identical
+			   * but will never see neither -- if they are careful.
+			   */
+		p->replacement = NULL;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else
+		/* We might have just removed the Replacement as faulty.
+		 * Clear the flag just in case.
+		 */
+		clear_bit(WantReplacement, &rdev->flags);
+
+	err = md_integrity_register(mddev);
+
 abort:
 
 	print_conf(conf);
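
The rewritten raid10_remove_disk() keys on which pointer inside the mirror the passed rdev matches, then clears it through 'rdevp', letting one body serve both the main device and its replacement, and promoting the replacement when the original goes away. A user-space model of that double-pointer selection, with simplified stand-in types (no RCU, no pending counts):

	#include <stdio.h>

	struct rdev { const char *name; };
	struct mirror { struct rdev *rdev; struct rdev *replacement; };

	static int remove_disk(struct mirror *p, struct rdev *dev)
	{
		struct rdev **rdevp;

		if (dev == p->rdev)
			rdevp = &p->rdev;
		else if (dev == p->replacement)
			rdevp = &p->replacement;
		else
			return 0;	/* not attached here */

		*rdevp = NULL;
		if (p->replacement && rdevp == &p->rdev) {
			/* promote, as the hunk above does after synchronize_rcu() */
			p->rdev = p->replacement;
			p->replacement = NULL;
		}
		printf("removed %s; slot now %s/%s\n", dev->name,
		       p->rdev ? p->rdev->name : "-",
		       p->replacement ? p->replacement->name : "-");
		return 0;
	}

	int main(void)
	{
		struct rdev a = { "sda" }, r = { "sdr" };
		struct mirror m = { &a, &r };

		remove_disk(&m, &a);	/* removing the original promotes the replacement */
		return 0;
	}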
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 
-	d = find_bio_disk(conf, r10_bio, bio, NULL);
+	d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
 	sector_t first_bad;
 	int bad_sectors;
 	int slot;
-
-	d = find_bio_disk(conf, r10_bio, bio, &slot);
+	int repl;
+	struct md_rdev *rdev = NULL;
+
+	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[d].replacement;
+	if (!rdev) {
+		smp_mb();
+		rdev = conf->mirrors[d].rdev;
+	}
 
 	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
-		set_bit(R10BIO_WriteError, &r10_bio->state);
-	} else if (is_badblock(conf->mirrors[d].rdev,
+		if (repl)
+			md_error(mddev, rdev);
+		else {
+			set_bit(WriteErrorSeen, &rdev->flags);
+			if (!test_and_set_bit(WantReplacement, &rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED,
+					&rdev->mddev->recovery);
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+		}
+	} else if (is_badblock(rdev,
 			       r10_bio->devs[slot].addr,
 			       r10_bio->sectors,
 			       &first_bad, &bad_sectors))
 		set_bit(R10BIO_MadeGood, &r10_bio->state);
 
-	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+	rdev_dec_pending(rdev, mddev);
 
 	end_sync_request(r10_bio);
 }
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		generic_make_request(tbio);
 	}
 
+	/* Now write out to any replacement devices
+	 * that are active
+	 */
+	for (i = 0; i < conf->copies; i++) {
+		int j, d;
+		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
+
+		tbio = r10_bio->devs[i].repl_bio;
+		if (!tbio || !tbio->bi_end_io)
+			continue;
+		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
+		    && r10_bio->devs[i].bio != fbio)
+			for (j = 0; j < vcnt; j++)
+				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+				       page_address(fbio->bi_io_vec[j].bv_page),
+				       PAGE_SIZE);
+		d = r10_bio->devs[i].devnum;
+		atomic_inc(&r10_bio->remaining);
+		md_sync_acct(conf->mirrors[d].replacement->bdev,
+			     tbio->bi_size >> 9);
+		generic_make_request(tbio);
+	}
+
 done:
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
 		md_done_sync(mddev, r10_bio->sectors, 1);
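
The sync_request_write() hunk above fans the known-good block out to every active replacement, copying pages where the slot's own bio does not already hold the canonical data, and each queued write bumps 'remaining' so the r10bio completes only after the replacements finish too. A toy user-space model of that fan-out, with stand-in types (not kernel code):

	#include <stdio.h>
	#include <string.h>

	#define COPIES 3
	#define BLK 8

	struct copy { int has_repl; char repl_data[BLK]; };

	int main(void)
	{
		const char good[BLK] = "GOODBLK";	/* models fbio's pages */
		struct copy c[COPIES] = { { 1 }, { 0 }, { 1 } };
		int remaining = 1;			/* the bias dropped at 'done:' */

		for (int i = 0; i < COPIES; i++) {
			if (!c[i].has_repl)
				continue;
			memcpy(c[i].repl_data, good, BLK);	/* models the page copy */
			remaining++;				/* models atomic_inc */
			printf("queue resync write of \"%s\" to replacement %d\n",
			       c[i].repl_data, i);
		}
		printf("outstanding completions: %d\n", remaining);
		return 0;
	}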
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 					  s << 9,
 					  bio->bi_io_vec[idx].bv_page,
 					  WRITE, false);
-			if (!ok)
+			if (!ok) {
 				set_bit(WriteErrorSeen, &rdev->flags);
+				if (!test_and_set_bit(WantReplacement,
+						      &rdev->flags))
+					set_bit(MD_RECOVERY_NEEDED,
+						&rdev->mddev->recovery);
+			}
 		}
 		if (!ok) {
 			/* We don't worry if we cannot set a bad block -
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int d;
-	struct bio *wbio;
+	struct bio *wbio, *wbio2;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	 * share the pages with the first bio
 	 * and submit the write request
 	 */
-	wbio = r10_bio->devs[1].bio;
 	d = r10_bio->devs[1].devnum;
-
-	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-	generic_make_request(wbio);
+	wbio = r10_bio->devs[1].bio;
+	wbio2 = r10_bio->devs[1].repl_bio;
+	if (wbio->bi_end_io) {
+		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+		md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+		generic_make_request(wbio);
+	}
+	if (wbio2 && wbio2->bi_end_io) {
+		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
+		md_sync_acct(conf->mirrors[d].replacement->bdev,
+			     wbio2->bi_size >> 9);
+		generic_make_request(wbio2);
+	}
 }
 
 
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
 	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
 		/* success */
 		return 1;
-	if (rw == WRITE)
+	if (rw == WRITE) {
 		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED,
+				&rdev->mddev->recovery);
+	}
 	/* need to record an error - either for the block or the device */
 	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 		md_error(rdev->mddev, rdev);
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int slot = r10_bio->read_slot;
-	int mirror = r10_bio->devs[slot].devnum;
 	struct bio *bio;
 	struct r10conf *conf = mddev->private;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
 	char b[BDEVNAME_SIZE];
 	unsigned long do_sync;
 	int max_sectors;
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	}
-	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
+	rdev_dec_pending(rdev, mddev);
 
 	bio = r10_bio->devs[slot].bio;
 	bdevname(bio->bi_bdev, b);
 	r10_bio->devs[slot].bio =
 		mddev->ro ? IO_BLOCKED : NULL;
 read_more:
-	mirror = read_balance(conf, r10_bio, &max_sectors);
-	if (mirror == -1) {
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	if (rdev == NULL) {
 		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev), b,
@@ -2103,7 +2326,6 @@ read_more:
 	if (bio)
 		bio_put(bio);
 	slot = r10_bio->read_slot;
-	rdev = conf->mirrors[mirror].rdev;
 	printk_ratelimited(
 		KERN_ERR
 		"md/raid10:%s: %s: redirecting"
@@ -2117,6 +2339,7 @@ read_more:
 		    r10_bio->sector - bio->bi_sector,
 		    max_sectors);
 	r10_bio->devs[slot].bio = bio;
+	r10_bio->devs[slot].rdev = rdev;
 	bio->bi_sector = r10_bio->devs[slot].addr
 		+ rdev->data_offset;
 	bio->bi_bdev = rdev->bdev;
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					r10_bio->sectors, 0))
 				md_error(conf->mddev, rdev);
 			}
+			rdev = conf->mirrors[dev].replacement;
+			if (r10_bio->devs[m].repl_bio == NULL)
+				continue;
+			if (test_bit(BIO_UPTODATE,
+				     &r10_bio->devs[m].repl_bio->bi_flags)) {
+				rdev_clear_badblocks(
+					rdev,
+					r10_bio->devs[m].addr,
+					r10_bio->sectors);
+			} else {
+				if (!rdev_set_badblocks(
+					    rdev,
+					    r10_bio->devs[m].addr,
+					    r10_bio->sectors, 0))
+					md_error(conf->mddev, rdev);
+			}
 		}
 		put_buf(r10_bio);
 	} else {
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				}
 				rdev_dec_pending(rdev, conf->mddev);
 			}
+			bio = r10_bio->devs[m].repl_bio;
+			rdev = conf->mirrors[dev].replacement;
+			if (rdev && bio == IO_MADE_GOOD) {
+				rdev_clear_badblocks(
+					rdev,
+					r10_bio->devs[m].addr,
+					r10_bio->sectors);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 		if (test_bit(R10BIO_WriteError,
 			     &r10_bio->state))
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
 static int init_resync(struct r10conf *conf)
 {
 	int buffs;
+	int i;
 
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
 	BUG_ON(conf->r10buf_pool);
+	conf->have_replacement = 0;
+	for (i = 0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].replacement)
+			conf->have_replacement = 1;
 	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
 	if (!conf->r10buf_pool)
 		return -ENOMEM;
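
init_resync() above decides once, before the buffer pool is created, whether any mirror currently has a replacement; only then does r10buf_pool_alloc() pay for the extra repl_bio per copy. A trivial user-space model of that scan, with stand-in types:

	#include <stdbool.h>
	#include <stdio.h>

	struct mirror { void *rdev; void *replacement; };

	static bool any_replacement(const struct mirror *m, int n)
	{
		for (int i = 0; i < n; i++)
			if (m[i].replacement)
				return true;
		return false;
	}

	int main(void)
	{
		int x;
		struct mirror m[3] = { { &x, NULL }, { &x, &x }, { &x, NULL } };

		printf("allocate repl_bio in resync buffers: %s\n",
		       any_replacement(m, 3) ? "yes" : "no");
		return 0;
	}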
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				bitmap_end_sync(mddev->bitmap, sect,
 						&sync_blocks, 1);
 			}
-		} else /* completed sync */
+		} else {
+			/* completed sync */
+			if ((!mddev->bitmap || conf->fullsync)
+			    && conf->have_replacement
+			    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+				/* Completed a full sync so the replacements
+				 * are now fully recovered.
+				 */
+				for (i = 0; i < conf->raid_disks; i++)
+					if (conf->mirrors[i].replacement)
+						conf->mirrors[i].replacement
+							->recovery_offset
+							= MaxSector;
+			}
 			conf->fullsync = 0;
-
+		}
 		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t sect;
 			int must_sync;
 			int any_working;
-
-			if (conf->mirrors[i].rdev == NULL ||
-			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+			struct mirror_info *mirror = &conf->mirrors[i];
+
+			if ((mirror->rdev == NULL ||
+			     test_bit(In_sync, &mirror->rdev->flags))
+			    &&
+			    (mirror->replacement == NULL ||
+			     test_bit(Faulty,
+				      &mirror->replacement->flags)))
 				continue;
 
 			still_degraded = 0;
 			/* want to reconstruct this device */
 			rb2 = r10_bio;
 			sect = raid10_find_virt(conf, sector_nr, i);
-			/* Unless we are doing a full sync, we only need
-			 * to recover the block if it is set in the bitmap
+			/* Unless we are doing a full sync, or a replacement
+			 * we only need to recover the block if it is set in
+			 * the bitmap
 			 */
 			must_sync = bitmap_start_sync(mddev->bitmap, sect,
 						      &sync_blocks, 1);
 			if (sync_blocks < max_sync)
 				max_sync = sync_blocks;
 			if (!must_sync &&
+			    mirror->replacement == NULL &&
 			    !conf->fullsync) {
 				/* yep, skip the sync_blocks here, but don't assume
 				 * that there will never be anything to do here
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio->bi_end_io = end_sync_read;
 				bio->bi_rw = READ;
 				from_addr = r10_bio->devs[j].addr;
-				bio->bi_sector = from_addr +
-					conf->mirrors[d].rdev->data_offset;
-				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-				atomic_inc(&r10_bio->remaining);
-				/* and we write to 'i' */
+				bio->bi_sector = from_addr + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				atomic_inc(&rdev->nr_pending);
+				/* and we write to 'i' (if not in_sync) */
 
 				for (k=0; k<conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
 				BUG_ON(k == conf->copies);
-				bio = r10_bio->devs[1].bio;
-				bio->bi_next = biolist;
-				biolist = bio;
-				bio->bi_private = r10_bio;
-				bio->bi_end_io = end_sync_write;
-				bio->bi_rw = WRITE;
 				to_addr = r10_bio->devs[k].addr;
-				bio->bi_sector = to_addr +
-					conf->mirrors[i].rdev->data_offset;
-				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
 				r10_bio->devs[0].devnum = d;
 				r10_bio->devs[0].addr = from_addr;
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;
 
+				rdev = mirror->rdev;
+				if (!test_bit(In_sync, &rdev->flags)) {
+					bio = r10_bio->devs[1].bio;
+					bio->bi_next = biolist;
+					biolist = bio;
+					bio->bi_private = r10_bio;
+					bio->bi_end_io = end_sync_write;
+					bio->bi_rw = WRITE;
+					bio->bi_sector = to_addr
+						+ rdev->data_offset;
+					bio->bi_bdev = rdev->bdev;
+					atomic_inc(&r10_bio->remaining);
+				} else
+					r10_bio->devs[1].bio->bi_end_io = NULL;
+
+				/* and maybe write to replacement */
+				bio = r10_bio->devs[1].repl_bio;
+				if (bio)
+					bio->bi_end_io = NULL;
+				rdev = mirror->replacement;
+				/* Note: if rdev != NULL, then bio
+				 * cannot be NULL as r10buf_pool_alloc will
+				 * have allocated it.
+				 * So the second test here is pointless.
+				 * But it keeps semantic-checkers happy, and
+				 * this comment keeps human reviewers
+				 * happy.
+				 */
+				if (rdev == NULL || bio == NULL ||
+				    test_bit(Faulty, &rdev->flags))
+					break;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_write;
+				bio->bi_rw = WRITE;
+				bio->bi_sector = to_addr + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				atomic_inc(&r10_bio->remaining);
 				break;
 			}
 		if (j == conf->copies) {
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				for (k = 0; k < conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
-				if (!rdev_set_badblocks(
-					    conf->mirrors[i].rdev,
+				if (!test_bit(In_sync,
+					      &mirror->rdev->flags)
+				    && !rdev_set_badblocks(
+					    mirror->rdev,
+					    r10_bio->devs[k].addr,
+					    max_sync, 0))
+					any_working = 0;
+				if (mirror->replacement &&
+				    !rdev_set_badblocks(
+					    mirror->replacement,
 					    r10_bio->devs[k].addr,
 					    max_sync, 0))
 					any_working = 0;
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				printk(KERN_INFO "md/raid10:%s: insufficient "
 				       "working devices for recovery.\n",
 				       mdname(mddev));
-				conf->mirrors[i].recovery_disabled
+				mirror->recovery_disabled
 					= mddev->recovery_disabled;
 			}
 			break;
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t first_bad, sector;
 			int bad_sectors;
 
+			if (r10_bio->devs[i].repl_bio)
+				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
+
 			bio = r10_bio->devs[i].bio;
 			bio->bi_end_io = NULL;
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				conf->mirrors[d].rdev->data_offset;
 			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 			count++;
+
+			if (conf->mirrors[d].replacement == NULL ||
+			    test_bit(Faulty,
+				     &conf->mirrors[d].replacement->flags))
+				continue;
+
+			/* Need to set up for writing to the replacement */
+			bio = r10_bio->devs[i].repl_bio;
+			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+			sector = r10_bio->devs[i].addr;
+			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+			bio->bi_next = biolist;
+			biolist = bio;
+			bio->bi_private = r10_bio;
+			bio->bi_end_io = end_sync_write;
+			bio->bi_rw = WRITE;
+			bio->bi_sector = sector +
+				conf->mirrors[d].replacement->data_offset;
+			bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+			count++;
 		}
 
 		if (count < 2) {
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (r10_bio->devs[i].bio->bi_end_io)
 				rdev_dec_pending(conf->mirrors[d].rdev,
 						 mddev);
+			if (r10_bio->devs[i].repl_bio &&
+			    r10_bio->devs[i].repl_bio->bi_end_io)
+				rdev_dec_pending(
+					conf->mirrors[d].replacement,
+					mddev);
 		}
 		put_buf(r10_bio);
 		biolist = NULL;
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
 			continue;
 		disk = conf->mirrors + disk_idx;
 
+		if (test_bit(Replacement, &rdev->flags)) {
+			if (disk->replacement)
+				goto out_free_conf;
+			disk->replacement = rdev;
+		} else {
+			if (disk->rdev)
+				goto out_free_conf;
+			disk->rdev = rdev;
+		}
+
 		disk->rdev = rdev;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
 
 		disk = conf->mirrors + i;
 
+		if (!disk->rdev && disk->replacement) {
+			/* The replacement is all we have - use it */
+			disk->rdev = disk->replacement;
+			disk->replacement = NULL;
+			clear_bit(Replacement, &disk->rdev->flags);
+		}
+
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
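
The two run() hunks above cover array assembly: a device flagged Replacement is slotted into disk->replacement, and if a slot comes up with only a replacement, it is promoted to be the main device. A tiny user-space model of that fallback, with stand-in types (not kernel code):

	#include <stdio.h>

	struct rdev { const char *name; int is_replacement; };
	struct mirror { struct rdev *rdev; struct rdev *replacement; };

	int main(void)
	{
		struct rdev r = { "sdr", 1 };
		struct mirror disk = { NULL, &r };

		if (!disk.rdev && disk.replacement) {
			/* the replacement is all we have - use it */
			disk.rdev = disk.replacement;
			disk.replacement = NULL;
			disk.rdev->is_replacement = 0;	/* models clear_bit(Replacement, ...) */
		}
		printf("slot rdev=%s replacement=%s\n",
		       disk.rdev ? disk.rdev->name : "-",
		       disk.replacement ? disk.replacement->name : "-");
		return 0;
	}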