Diffstat (limited to 'drivers/md/raid10.c')

 drivers/md/raid10.c | 582 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 468 insertions(+), 114 deletions(-)
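This patch teaches RAID10 about "replacement" devices: a spare can be attached alongside a still-working mirror member, rebuilt in the background, and promoted to own the slot once fully recovered. Writes are duplicated to the member and its replacement, read balancing may pick a replacement once it has recovered far enough, and raid10_spare_active()/raid10_remove_disk() perform the hand-over. The companion raid10.h change is not part of this diff; the sketch below is an assumption-based outline (stand-in declarations, not the kernel headers) of the fields the hunks below rely on.

/* Assumption-based sketch of the raid10.h side of this change (the header
 * diff is not shown here).  Stand-in types only; the field names are taken
 * from the hunks below.
 */
typedef unsigned long long sector_t;	/* stand-in for the kernel type */
struct md_rdev;				/* md per-device descriptor */
struct bio;				/* block-layer I/O unit */

struct mirror_info {
	struct md_rdev *rdev;		/* device currently serving the slot */
	struct md_rdev *replacement;	/* device being rebuilt to take over */
	/* ... head_position, recovery_disabled, ... */
};

struct r10dev {				/* one entry of r10bio.devs[] */
	struct bio *bio;		/* I/O to mirrors[devnum].rdev */
	struct bio *repl_bio;		/* duplicate I/O to the replacement */
	struct md_rdev *rdev;		/* rdev a read was actually sent to */
	sector_t addr;
	int devnum;
};

/* r10conf additionally gains an int have_replacement, set in init_resync()
 * so r10buf_pool_alloc() knows to allocate repl_bio for resync buffers.
 */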
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 685ddf325ee4..6e8aa213f0d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 	struct r10conf *conf = data;
 	int size = offsetof(struct r10bio, devs[conf->copies]);
 
-	/* allocate a r10bio with room for raid_disks entries in the bios array */
+	/* allocate a r10bio with room for raid_disks entries in the
+	 * bios array */
 	return kzalloc(size, gfp_flags);
 }
 
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		if (!bio)
 			goto out_free_bio;
 		r10_bio->devs[j].bio = bio;
+		if (!conf->have_replacement)
+			continue;
+		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+		if (!bio)
+			goto out_free_bio;
+		r10_bio->devs[j].repl_bio = bio;
 	}
 	/*
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
 	for (j = 0 ; j < nalloc; j++) {
+		struct bio *rbio = r10_bio->devs[j].repl_bio;
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
 			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 				goto out_free_pages;
 
 			bio->bi_io_vec[i].bv_page = page;
+			if (rbio)
+				rbio->bi_io_vec[i].bv_page = page;
 		}
 	}
 
@@ -156,8 +166,11 @@ out_free_pages:
 			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
-	while ( ++j < nalloc )
+	while (++j < nalloc) {
 		bio_put(r10_bio->devs[j].bio);
+		if (r10_bio->devs[j].repl_bio)
+			bio_put(r10_bio->devs[j].repl_bio);
+	}
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 			}
 			bio_put(bio);
 		}
+		bio = r10bio->devs[j].repl_bio;
+		if (bio)
+			bio_put(bio);
 	}
 	r10bio_pool_free(r10bio, conf);
 }
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
 		*bio = NULL;
+		bio = &r10_bio->devs[i].repl_bio;
+		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+			bio_put(*bio);
+		*bio = NULL;
 	}
 }
 
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
  * Find the disk number which triggered given bio
  */
 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
-			 struct bio *bio, int *slotp)
+			 struct bio *bio, int *slotp, int *replp)
 {
 	int slot;
+	int repl = 0;
 
-	for (slot = 0; slot < conf->copies; slot++)
+	for (slot = 0; slot < conf->copies; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
+		if (r10_bio->devs[slot].repl_bio == bio) {
+			repl = 1;
+			break;
+		}
+	}
 
 	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 
 	if (slotp)
 		*slotp = slot;
+	if (replp)
+		*replp = repl;
 	return r10_bio->devs[slot].devnum;
 }
 
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
+	struct md_rdev *rdev;
 	struct r10conf *conf = r10_bio->mddev->private;
 
 
 	slot = r10_bio->read_slot;
 	dev = r10_bio->devs[slot].devnum;
+	rdev = r10_bio->devs[slot].rdev;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
-		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 	} else {
 		/*
 		 * oops, read error - keep the refcount on the rdev
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		printk_ratelimited(KERN_ERR
 				   "md/raid10:%s: %s: rescheduling sector %llu\n",
 				   mdname(conf->mddev),
-				   bdevname(conf->mirrors[dev].rdev->bdev, b),
+				   bdevname(rdev->bdev, b),
 				   (unsigned long long)r10_bio->sector);
 		set_bit(R10BIO_ReadError, &r10_bio->state);
 		reschedule_retry(r10_bio);
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	int dev;
 	int dec_rdev = 1;
 	struct r10conf *conf = r10_bio->mddev->private;
-	int slot;
+	int slot, repl;
+	struct md_rdev *rdev = NULL;
 
-	dev = find_bio_disk(conf, r10_bio, bio, &slot);
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
+	if (repl)
+		rdev = conf->mirrors[dev].replacement;
+	if (!rdev) {
+		smp_rmb();
+		repl = 0;
+		rdev = conf->mirrors[dev].rdev;
+	}
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
-		set_bit(R10BIO_WriteError, &r10_bio->state);
-		dec_rdev = 0;
+		if (repl)
+			/* Never record new bad blocks to replacement,
+			 * just fail it.
+			 */
+			md_error(rdev->mddev, rdev);
+		else {
+			set_bit(WriteErrorSeen,	&rdev->flags);
+			if (!test_and_set_bit(WantReplacement, &rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED,
+					&rdev->mddev->recovery);
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+			dec_rdev = 0;
+		}
 	} else {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
-		if (is_badblock(conf->mirrors[dev].rdev,
+		if (is_badblock(rdev,
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
 				&first_bad, &bad_sectors)) {
 			bio_put(bio);
-			r10_bio->devs[slot].bio = IO_MADE_GOOD;
+			if (repl)
+				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
+			else
+				r10_bio->devs[slot].bio = IO_MADE_GOOD;
 			dec_rdev = 0;
 			set_bit(R10BIO_MadeGood, &r10_bio->state);
 		}
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
-
 /*
  * RAID10 layout manager
  * As well as the chunksize and raid_disks count, there are two
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
+static struct md_rdev *read_balance(struct r10conf *conf,
+				    struct r10bio *r10_bio,
+				    int *max_sectors)
 {
 	const sector_t this_sector = r10_bio->sector;
 	int disk, slot;
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev, *best_rdev;
 	int do_balance;
 	int best_slot;
 
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
 retry:
 	sectors = r10_bio->sectors;
 	best_slot = -1;
+	best_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -599,10 +652,16 @@ retry:
 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		rdev = rcu_dereference(conf->mirrors[disk].replacement);
+		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+			rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (rdev == NULL)
 			continue;
-		if (!test_bit(In_sync, &rdev->flags))
+		if (test_bit(Faulty, &rdev->flags))
+			continue;
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			continue;
 
 		dev_sector = r10_bio->devs[slot].addr;
@@ -627,6 +686,7 @@ retry:
 			if (good_sectors > best_good_sectors) {
 				best_good_sectors = good_sectors;
 				best_slot = slot;
+				best_rdev = rdev;
 			}
 			if (!do_balance)
 				/* Must read from here */
@@ -655,16 +715,15 @@ retry:
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
 			best_slot = slot;
+			best_rdev = rdev;
 		}
 	}
-	if (slot == conf->copies)
+	if (slot >= conf->copies) {
 		slot = best_slot;
+		rdev = best_rdev;
+	}
 
 	if (slot >= 0) {
-		disk = r10_bio->devs[slot].devnum;
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (!rdev)
-			goto retry;
 		atomic_inc(&rdev->nr_pending);
 		if (test_bit(Faulty, &rdev->flags)) {
 			/* Cannot risk returning a device that failed
@@ -675,11 +734,11 @@ retry:
 		}
 		r10_bio->read_slot = slot;
 	} else
-		disk = -1;
+		rdev = NULL;
 	rcu_read_unlock();
 	*max_sectors = best_good_sectors;
 
-	return disk;
+	return rdev;
 }
 
 static int raid10_congested(void *data, int bits)
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r10conf *conf = mddev->private;
-	struct mirror_info *mirror;
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int disk;
+		struct md_rdev *rdev;
 		int slot;
 
 read_again:
-		disk = read_balance(conf, r10_bio, &max_sectors);
-		slot = r10_bio->read_slot;
-		if (disk < 0) {
+		rdev = read_balance(conf, r10_bio, &max_sectors);
+		if (!rdev) {
 			raid_end_bio_io(r10_bio);
 			return;
 		}
-		mirror = conf->mirrors + disk;
+		slot = r10_bio->read_slot;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
 			    max_sectors);
 
 		r10_bio->devs[slot].bio = read_bio;
+		r10_bio->devs[slot].rdev = rdev;
 
 		read_bio->bi_sector = r10_bio->devs[slot].addr +
-			mirror->rdev->data_offset;
-		read_bio->bi_bdev = mirror->rdev->bdev;
+			rdev->data_offset;
+		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
@@ -1025,6 +1083,7 @@ read_again:
 	 */
 	plugged = mddev_check_plugged(mddev);
 
+	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
@@ -1034,12 +1093,25 @@ retry_write:
 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[d].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
 			break;
 		}
+		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+			atomic_inc(&rrdev->nr_pending);
+			blocked_rdev = rrdev;
+			break;
+		}
+		if (rrdev && test_bit(Faulty, &rrdev->flags))
+			rrdev = NULL;
+
 		r10_bio->devs[i].bio = NULL;
+		r10_bio->devs[i].repl_bio = NULL;
 		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
@@ -1088,6 +1160,10 @@ retry_write:
 		}
 		r10_bio->devs[i].bio = bio;
 		atomic_inc(&rdev->nr_pending);
+		if (rrdev) {
+			r10_bio->devs[i].repl_bio = bio;
+			atomic_inc(&rrdev->nr_pending);
+		}
 	}
 	rcu_read_unlock();
 
@@ -1096,11 +1172,23 @@ retry_write:
 		int j;
 		int d;
 
-		for (j = 0; j < i; j++)
+		for (j = 0; j < i; j++) {
 			if (r10_bio->devs[j].bio) {
 				d = r10_bio->devs[j].devnum;
 				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 			}
+			if (r10_bio->devs[j].repl_bio) {
+				struct md_rdev *rdev;
+				d = r10_bio->devs[j].devnum;
+				rdev = conf->mirrors[d].replacement;
+				if (!rdev) {
+					/* Race with remove_disk */
+					smp_mb();
+					rdev = conf->mirrors[d].rdev;
+				}
+				rdev_dec_pending(rdev, mddev);
+			}
+		}
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
@@ -1147,6 +1235,31 @@ retry_write:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+
+		if (!r10_bio->devs[i].repl_bio)
+			continue;
+
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
+		r10_bio->devs[i].repl_bio = mbio;
+
+		/* We are actively writing to the original device
+		 * so it cannot disappear, so the replacement cannot
+		 * become NULL here
+		 */
+		mbio->bi_sector = (r10_bio->devs[i].addr +
+				   conf->mirrors[d].replacement->data_offset);
+		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+		mbio->bi_end_io = raid10_end_write_request;
+		mbio->bi_rw = WRITE | do_sync | do_fua;
+		mbio->bi_private = r10_bio;
+
+		atomic_inc(&r10_bio->remaining);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		conf->pending_count++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev)
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->mirrors + i;
-		if (tmp->rdev
-		    && !test_bit(Faulty, &tmp->rdev->flags)
-		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+		if (tmp->replacement
+		    && tmp->replacement->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &tmp->replacement->flags)
+		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+			/* Replacement has just become active */
+			if (!tmp->rdev
+			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+				count++;
+			if (tmp->rdev) {
+				/* Replaced device not technically faulty,
+				 * but we need to be sure it gets removed
+				 * and never re-added.
+				 */
+				set_bit(Faulty, &tmp->rdev->flags);
+				sysfs_notify_dirent_safe(
+					tmp->rdev->sysfs_state);
+			}
+			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+		} else if (tmp->rdev
+			   && !test_bit(Faulty, &tmp->rdev->flags)
+			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
 			sysfs_notify_dirent(tmp->rdev->sysfs_state);
 		}
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		struct mirror_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
-		if (p->rdev)
-			continue;
+		if (p->rdev) {
+			if (!test_bit(WantReplacement, &p->rdev->flags) ||
+			    p->replacement != NULL)
+				continue;
+			clear_bit(In_sync, &rdev->flags);
+			set_bit(Replacement, &rdev->flags);
+			rdev->raid_disk = mirror;
+			err = 0;
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+				blk_queue_max_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
+			conf->fullsync = 1;
+			rcu_assign_pointer(p->replacement, rdev);
+			break;
+		}
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static int raid10_remove_disk(struct mddev *mddev, int number)
+static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
-	struct mirror_info *p = conf->mirrors+ number;
+	int number = rdev->raid_disk;
+	struct md_rdev **rdevp;
+	struct mirror_info *p = conf->mirrors + number;
 
 	print_conf(conf);
-	rdev = p->rdev;
-	if (rdev) {
-		if (test_bit(In_sync, &rdev->flags) ||
-		    atomic_read(&rdev->nr_pending)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		/* Only remove faulty devices in recovery
-		 * is not possible.
-		 */
-		if (!test_bit(Faulty, &rdev->flags) &&
-		    mddev->recovery_disabled != p->recovery_disabled &&
-		    enough(conf, -1)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		p->rdev = NULL;
-		synchronize_rcu();
-		if (atomic_read(&rdev->nr_pending)) {
-			/* lost the race, try later */
-			err = -EBUSY;
-			p->rdev = rdev;
-			goto abort;
-		}
-		err = md_integrity_register(mddev);
+	if (rdev == p->rdev)
+		rdevp = &p->rdev;
+	else if (rdev == p->replacement)
+		rdevp = &p->replacement;
+	else
+		return 0;
+
+	if (test_bit(In_sync, &rdev->flags) ||
+	    atomic_read(&rdev->nr_pending)) {
+		err = -EBUSY;
+		goto abort;
 	}
+	/* Only remove faulty devices if recovery
+	 * is not possible.
+	 */
+	if (!test_bit(Faulty, &rdev->flags) &&
+	    mddev->recovery_disabled != p->recovery_disabled &&
+	    (!p->replacement || p->replacement == rdev) &&
+	    enough(conf, -1)) {
+		err = -EBUSY;
+		goto abort;
+	}
+	*rdevp = NULL;
+	synchronize_rcu();
+	if (atomic_read(&rdev->nr_pending)) {
+		/* lost the race, try later */
+		err = -EBUSY;
+		*rdevp = rdev;
+		goto abort;
+	} else if (p->replacement) {
+		/* We must have just cleared 'rdev' */
+		p->rdev = p->replacement;
+		clear_bit(Replacement, &p->replacement->flags);
+		smp_mb(); /* Make sure other CPUs may see both as identical
+			   * but will never see neither -- if they are careful.
+			   */
+		p->replacement = NULL;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else
+		/* We might have just remove the Replacement as faulty
+		 * Clear the flag just in case
+		 */
+		clear_bit(WantReplacement, &rdev->flags);
+
+	err = md_integrity_register(mddev);
+
 abort:
 
 	print_conf(conf);
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 
-	d = find_bio_disk(conf, r10_bio, bio, NULL);
+	d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
 	sector_t first_bad;
 	int bad_sectors;
 	int slot;
-
-	d = find_bio_disk(conf, r10_bio, bio, &slot);
+	int repl;
+	struct md_rdev *rdev = NULL;
+
+	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[d].replacement;
+	if (!rdev) {
+		smp_mb();
+		rdev = conf->mirrors[d].rdev;
+	}
 
 	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
-		set_bit(R10BIO_WriteError, &r10_bio->state);
-	} else if (is_badblock(conf->mirrors[d].rdev,
+		if (repl)
+			md_error(mddev, rdev);
+		else {
+			set_bit(WriteErrorSeen, &rdev->flags);
+			if (!test_and_set_bit(WantReplacement, &rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED,
+					&rdev->mddev->recovery);
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+		}
+	} else if (is_badblock(rdev,
 			       r10_bio->devs[slot].addr,
 			       r10_bio->sectors,
 			       &first_bad, &bad_sectors))
 		set_bit(R10BIO_MadeGood, &r10_bio->state);
 
-	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+	rdev_dec_pending(rdev, mddev);
 
 	end_sync_request(r10_bio);
 }
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		generic_make_request(tbio);
 	}
 
+	/* Now write out to any replacement devices
+	 * that are active
+	 */
+	for (i = 0; i < conf->copies; i++) {
+		int j, d;
+		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
+
+		tbio = r10_bio->devs[i].repl_bio;
+		if (!tbio || !tbio->bi_end_io)
+			continue;
+		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
+		    && r10_bio->devs[i].bio != fbio)
+			for (j = 0; j < vcnt; j++)
+				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+				       page_address(fbio->bi_io_vec[j].bv_page),
+				       PAGE_SIZE);
+		d = r10_bio->devs[i].devnum;
+		atomic_inc(&r10_bio->remaining);
+		md_sync_acct(conf->mirrors[d].replacement->bdev,
+			     tbio->bi_size >> 9);
+		generic_make_request(tbio);
+	}
+
 done:
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
 		md_done_sync(mddev, r10_bio->sectors, 1);
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 					  s << 9,
 					  bio->bi_io_vec[idx].bv_page,
 					  WRITE, false);
-			if (!ok)
+			if (!ok) {
 				set_bit(WriteErrorSeen, &rdev->flags);
+				if (!test_and_set_bit(WantReplacement,
+						      &rdev->flags))
+					set_bit(MD_RECOVERY_NEEDED,
+						&rdev->mddev->recovery);
+			}
 		}
 		if (!ok) {
 			/* We don't worry if we cannot set a bad block -
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int d;
-	struct bio *wbio;
+	struct bio *wbio, *wbio2;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	 * share the pages with the first bio
 	 * and submit the write request
 	 */
-	wbio = r10_bio->devs[1].bio;
 	d = r10_bio->devs[1].devnum;
-
-	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-	generic_make_request(wbio);
+	wbio = r10_bio->devs[1].bio;
+	wbio2 = r10_bio->devs[1].repl_bio;
+	if (wbio->bi_end_io) {
+		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+		md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+		generic_make_request(wbio);
+	}
+	if (wbio2 && wbio2->bi_end_io) {
+		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
+		md_sync_acct(conf->mirrors[d].replacement->bdev,
+			     wbio2->bi_size >> 9);
+		generic_make_request(wbio2);
+	}
 }
 
 
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
 	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
 		/* success */
 		return 1;
-	if (rw == WRITE)
+	if (rw == WRITE) {
 		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED,
+				&rdev->mddev->recovery);
+	}
 	/* need to record an error - either for the block or the device */
 	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 		md_error(rdev->mddev, rdev);
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int slot = r10_bio->read_slot;
-	int mirror = r10_bio->devs[slot].devnum;
 	struct bio *bio;
 	struct r10conf *conf = mddev->private;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
 	char b[BDEVNAME_SIZE];
 	unsigned long do_sync;
 	int max_sectors;
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	}
-	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
+	rdev_dec_pending(rdev, mddev);
 
 	bio = r10_bio->devs[slot].bio;
 	bdevname(bio->bi_bdev, b);
 	r10_bio->devs[slot].bio =
 		mddev->ro ? IO_BLOCKED : NULL;
 read_more:
-	mirror = read_balance(conf, r10_bio, &max_sectors);
-	if (mirror == -1) {
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	if (rdev == NULL) {
 		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev), b,
@@ -2103,7 +2326,6 @@ read_more:
 	if (bio)
 		bio_put(bio);
 	slot = r10_bio->read_slot;
-	rdev = conf->mirrors[mirror].rdev;
 	printk_ratelimited(
 		KERN_ERR
 		"md/raid10:%s: %s: redirecting"
@@ -2117,6 +2339,7 @@ read_more:
 		    r10_bio->sector - bio->bi_sector,
 		    max_sectors);
 	r10_bio->devs[slot].bio = bio;
+	r10_bio->devs[slot].rdev = rdev;
 	bio->bi_sector = r10_bio->devs[slot].addr
 		+ rdev->data_offset;
 	bio->bi_bdev = rdev->bdev;
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 						r10_bio->sectors, 0))
 					md_error(conf->mddev, rdev);
 			}
+			rdev = conf->mirrors[dev].replacement;
+			if (r10_bio->devs[m].repl_bio == NULL)
+				continue;
+			if (test_bit(BIO_UPTODATE,
+				     &r10_bio->devs[m].repl_bio->bi_flags)) {
+				rdev_clear_badblocks(
+					rdev,
+					r10_bio->devs[m].addr,
+					r10_bio->sectors);
+			} else {
+				if (!rdev_set_badblocks(
+					    rdev,
+					    r10_bio->devs[m].addr,
+					    r10_bio->sectors, 0))
+					md_error(conf->mddev, rdev);
+			}
 		}
 		put_buf(r10_bio);
 	} else {
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				}
 				rdev_dec_pending(rdev, conf->mddev);
 			}
+			bio = r10_bio->devs[m].repl_bio;
+			rdev = conf->mirrors[dev].replacement;
+			if (rdev && bio == IO_MADE_GOOD) {
+				rdev_clear_badblocks(
+					rdev,
+					r10_bio->devs[m].addr,
+					r10_bio->sectors);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 		if (test_bit(R10BIO_WriteError,
 			     &r10_bio->state))
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
 static int init_resync(struct r10conf *conf)
 {
 	int buffs;
+	int i;
 
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
 	BUG_ON(conf->r10buf_pool);
+	conf->have_replacement = 0;
+	for (i = 0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].replacement)
+			conf->have_replacement = 1;
 	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
 	if (!conf->r10buf_pool)
 		return -ENOMEM;
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				bitmap_end_sync(mddev->bitmap, sect,
 						&sync_blocks, 1);
 			}
-		} else /* completed sync */
+		} else {
+			/* completed sync */
+			if ((!mddev->bitmap || conf->fullsync)
+			    && conf->have_replacement
+			    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+				/* Completed a full sync so the replacements
+				 * are now fully recovered.
+				 */
+				for (i = 0; i < conf->raid_disks; i++)
+					if (conf->mirrors[i].replacement)
+						conf->mirrors[i].replacement
+							->recovery_offset
+							= MaxSector;
+			}
 			conf->fullsync = 0;
-
+		}
 		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t sect;
 			int must_sync;
 			int any_working;
-
-			if (conf->mirrors[i].rdev == NULL ||
-			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+			struct mirror_info *mirror = &conf->mirrors[i];
+
+			if ((mirror->rdev == NULL ||
+			     test_bit(In_sync, &mirror->rdev->flags))
+			    &&
+			    (mirror->replacement == NULL ||
+			     test_bit(Faulty,
+				      &mirror->replacement->flags)))
 				continue;
 
 			still_degraded = 0;
 			/* want to reconstruct this device */
 			rb2 = r10_bio;
 			sect = raid10_find_virt(conf, sector_nr, i);
-			/* Unless we are doing a full sync, we only need
-			 * to recover the block if it is set in the bitmap
+			/* Unless we are doing a full sync, or a replacement
+			 * we only need to recover the block if it is set in
+			 * the bitmap
 			 */
 			must_sync = bitmap_start_sync(mddev->bitmap, sect,
 						      &sync_blocks, 1);
 			if (sync_blocks < max_sync)
 				max_sync = sync_blocks;
 			if (!must_sync &&
+			    mirror->replacement == NULL &&
 			    !conf->fullsync) {
 				/* yep, skip the sync_blocks here, but don't assume
 				 * that there will never be anything to do here
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio->bi_end_io = end_sync_read;
 				bio->bi_rw = READ;
 				from_addr = r10_bio->devs[j].addr;
-				bio->bi_sector = from_addr +
-					conf->mirrors[d].rdev->data_offset;
-				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-				atomic_inc(&r10_bio->remaining);
-				/* and we write to 'i' */
+				bio->bi_sector = from_addr + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				atomic_inc(&rdev->nr_pending);
+				/* and we write to 'i' (if not in_sync) */
 
 				for (k=0; k<conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
 				BUG_ON(k == conf->copies);
-				bio = r10_bio->devs[1].bio;
-				bio->bi_next = biolist;
-				biolist = bio;
-				bio->bi_private = r10_bio;
-				bio->bi_end_io = end_sync_write;
-				bio->bi_rw = WRITE;
 				to_addr = r10_bio->devs[k].addr;
-				bio->bi_sector = to_addr +
-					conf->mirrors[i].rdev->data_offset;
-				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
 				r10_bio->devs[0].devnum = d;
 				r10_bio->devs[0].addr = from_addr;
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;
 
+				rdev = mirror->rdev;
+				if (!test_bit(In_sync, &rdev->flags)) {
+					bio = r10_bio->devs[1].bio;
+					bio->bi_next = biolist;
+					biolist = bio;
+					bio->bi_private = r10_bio;
+					bio->bi_end_io = end_sync_write;
+					bio->bi_rw = WRITE;
+					bio->bi_sector = to_addr
+						+ rdev->data_offset;
+					bio->bi_bdev = rdev->bdev;
+					atomic_inc(&r10_bio->remaining);
+				} else
+					r10_bio->devs[1].bio->bi_end_io = NULL;
+
+				/* and maybe write to replacement */
+				bio = r10_bio->devs[1].repl_bio;
+				if (bio)
+					bio->bi_end_io = NULL;
+				rdev = mirror->replacement;
+				/* Note: if rdev != NULL, then bio
+				 * cannot be NULL as r10buf_pool_alloc will
+				 * have allocated it.
+				 * So the second test here is pointless.
+				 * But it keeps semantic-checkers happy, and
+				 * this comment keeps human reviewers
+				 * happy.
+				 */
+				if (rdev == NULL || bio == NULL ||
+				    test_bit(Faulty, &rdev->flags))
+					break;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_write;
+				bio->bi_rw = WRITE;
+				bio->bi_sector = to_addr + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				atomic_inc(&r10_bio->remaining);
 				break;
 			}
 			if (j == conf->copies) {
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				for (k = 0; k < conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
-				if (!rdev_set_badblocks(
-					    conf->mirrors[i].rdev,
+				if (!test_bit(In_sync,
+					      &mirror->rdev->flags)
+				    && !rdev_set_badblocks(
+					    mirror->rdev,
+					    r10_bio->devs[k].addr,
+					    max_sync, 0))
+					any_working = 0;
+				if (mirror->replacement &&
+				    !rdev_set_badblocks(
+					    mirror->replacement,
 					    r10_bio->devs[k].addr,
 					    max_sync, 0))
 					any_working = 0;
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				printk(KERN_INFO "md/raid10:%s: insufficient "
 				       "working devices for recovery.\n",
 				       mdname(mddev));
-				conf->mirrors[i].recovery_disabled
+				mirror->recovery_disabled
 					= mddev->recovery_disabled;
 			}
 			break;
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t first_bad, sector;
 			int bad_sectors;
 
+			if (r10_bio->devs[i].repl_bio)
+				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
+
 			bio = r10_bio->devs[i].bio;
 			bio->bi_end_io = NULL;
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				conf->mirrors[d].rdev->data_offset;
 			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 			count++;
+
+			if (conf->mirrors[d].replacement == NULL ||
+			    test_bit(Faulty,
+				     &conf->mirrors[d].replacement->flags))
+				continue;
+
+			/* Need to set up for writing to the replacement */
+			bio = r10_bio->devs[i].repl_bio;
+			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+			sector = r10_bio->devs[i].addr;
+			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+			bio->bi_next = biolist;
+			biolist = bio;
+			bio->bi_private = r10_bio;
+			bio->bi_end_io = end_sync_write;
+			bio->bi_rw = WRITE;
+			bio->bi_sector = sector +
+				conf->mirrors[d].replacement->data_offset;
+			bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+			count++;
 		}
 
 		if (count < 2) {
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (r10_bio->devs[i].bio->bi_end_io)
 				rdev_dec_pending(conf->mirrors[d].rdev,
 						 mddev);
+			if (r10_bio->devs[i].repl_bio &&
+			    r10_bio->devs[i].repl_bio->bi_end_io)
+				rdev_dec_pending(
+					conf->mirrors[d].replacement,
+					mddev);
 		}
 		put_buf(r10_bio);
 		biolist = NULL;
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
 			continue;
 		disk = conf->mirrors + disk_idx;
 
+		if (test_bit(Replacement, &rdev->flags)) {
+			if (disk->replacement)
+				goto out_free_conf;
+			disk->replacement = rdev;
+		} else {
+			if (disk->rdev)
+				goto out_free_conf;
+			disk->rdev = rdev;
+		}
+
 		disk->rdev = rdev;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
 
 		disk = conf->mirrors + i;
 
+		if (!disk->rdev && disk->replacement) {
+			/* The replacement is all we have - use it */
+			disk->rdev = disk->replacement;
+			disk->replacement = NULL;
+			clear_bit(Replacement, &disk->rdev->flags);
+		}
+
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
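A note on the ordering this patch leans on: raid10_remove_disk() promotes a finished replacement by setting p->rdev = p->replacement, issuing smp_mb(), and only then clearing p->replacement, while the completion paths read ->replacement first and fall back to ->rdev after a barrier (the smp_rmb() in raid10_end_write_request() and the "Race with remove_disk" smp_mb() in the write retry path), so a racing reader may see both pointers name the same device but never see neither. A minimal, self-contained C11 sketch of that publish/consume pattern, with stand-in types rather than the kernel's primitives:

#include <stdatomic.h>

struct dev { int id; };

struct slot {
	_Atomic(struct dev *) rdev;
	_Atomic(struct dev *) replacement;
};

/* Writer, as in raid10_remove_disk(): make both pointers name the
 * replacement before the old name disappears.
 */
static void promote_replacement(struct slot *s)
{
	struct dev *r = atomic_load(&s->replacement);
	atomic_store(&s->rdev, r);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	atomic_store(&s->replacement, NULL);
}

/* Reader, as in raid10_end_write_request(): try the replacement, then
 * fall back to rdev; the fence pairs with the writer's barrier so the
 * fallback load cannot observe state from before the hand-over.
 */
static struct dev *pick_dev(struct slot *s)
{
	struct dev *d = atomic_load(&s->replacement);
	if (!d) {
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
		d = atomic_load(&s->rdev);
	}
	return d;
}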