Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--  drivers/md/raid10.c | 1209
1 file changed, 916 insertions(+), 293 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e846688962..1d44228530a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include <linux/ratelimit.h>
25#include "md.h" 26#include "md.h"
26#include "raid10.h" 27#include "raid10.h"
27#include "raid0.h" 28#include "raid0.h"
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 for (j = 0 ; j < nalloc; j++) { 124 for (j = 0 ; j < nalloc; j++) {
124 bio = r10_bio->devs[j].bio; 125 bio = r10_bio->devs[j].bio;
125 for (i = 0; i < RESYNC_PAGES; i++) { 126 for (i = 0; i < RESYNC_PAGES; i++) {
126 page = alloc_page(gfp_flags); 127 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
128 &conf->mddev->recovery)) {
129 /* we can share bv_page's during recovery */
130 struct bio *rbio = r10_bio->devs[0].bio;
131 page = rbio->bi_io_vec[i].bv_page;
132 get_page(page);
133 } else
134 page = alloc_page(gfp_flags);
127 if (unlikely(!page)) 135 if (unlikely(!page))
128 goto out_free_pages; 136 goto out_free_pages;
129 137
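The change above lets recovery share one set of pages between the read bio (devs[0]) and the write bio (devs[1]), taking an extra reference with get_page() instead of allocating the data twice. As an illustrative aside (not part of the patch), the same ownership rule can be modelled in plain userspace C; struct shared_page and the page_*_shared() helpers below are invented stand-ins for struct page and its reference counting:

#include <assert.h>
#include <stdlib.h>

/* Invented userspace stand-in for a struct page shared by two bios:
 * whoever drops the last reference frees the buffer.
 */
struct shared_page {
	int refcount;
	char data[4096];
};

static struct shared_page *page_alloc_shared(void)
{
	struct shared_page *p = calloc(1, sizeof(*p));

	if (p)
		p->refcount = 1;
	return p;
}

static void page_get_shared(struct shared_page *p)
{
	p->refcount++;
}

static void page_put_shared(struct shared_page *p)
{
	if (--p->refcount == 0)
		free(p);
}

int main(void)
{
	struct shared_page *p = page_alloc_shared();	/* owned by the read bio */

	page_get_shared(p);				/* shared with the write bio */
	assert(p->refcount == 2);
	page_put_shared(p);				/* read side finished */
	page_put_shared(p);				/* write side: last reference, freed */
	return 0;
}

Either bio can then release its pages independently; the buffer only goes away when the last reference is dropped.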
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
173 181
174 for (i = 0; i < conf->copies; i++) { 182 for (i = 0; i < conf->copies; i++) {
175 struct bio **bio = & r10_bio->devs[i].bio; 183 struct bio **bio = & r10_bio->devs[i].bio;
176 if (*bio && *bio != IO_BLOCKED) 184 if (!BIO_SPECIAL(*bio))
177 bio_put(*bio); 185 bio_put(*bio);
178 *bio = NULL; 186 *bio = NULL;
179 } 187 }
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
183{ 191{
184 conf_t *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
185 193
186 /*
187 * Wake up any possible resync thread that waits for the device
188 * to go idle.
189 */
190 allow_barrier(conf);
191
192 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
193 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
194} 196}
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
227static void raid_end_bio_io(r10bio_t *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
228{ 230{
229 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
232 int done;
233 conf_t *conf = r10_bio->mddev->private;
230 234
231 bio_endio(bio, 235 if (bio->bi_phys_segments) {
232 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); 236 unsigned long flags;
237 spin_lock_irqsave(&conf->device_lock, flags);
238 bio->bi_phys_segments--;
239 done = (bio->bi_phys_segments == 0);
240 spin_unlock_irqrestore(&conf->device_lock, flags);
241 } else
242 done = 1;
243 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
244 clear_bit(BIO_UPTODATE, &bio->bi_flags);
245 if (done) {
246 bio_endio(bio, 0);
247 /*
248 * Wake up any possible resync thread that waits for the device
249 * to go idle.
250 */
251 allow_barrier(conf);
252 }
233 free_r10bio(r10_bio); 253 free_r10bio(r10_bio);
234} 254}
235 255
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
244 r10_bio->devs[slot].addr + (r10_bio->sectors); 264 r10_bio->devs[slot].addr + (r10_bio->sectors);
245} 265}
246 266
267/*
268 * Find the disk number which triggered given bio
269 */
270static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
271 struct bio *bio, int *slotp)
272{
273 int slot;
274
275 for (slot = 0; slot < conf->copies; slot++)
276 if (r10_bio->devs[slot].bio == bio)
277 break;
278
279 BUG_ON(slot == conf->copies);
280 update_head_pos(slot, r10_bio);
281
282 if (slotp)
283 *slotp = slot;
284 return r10_bio->devs[slot].devnum;
285}
286
247static void raid10_end_read_request(struct bio *bio, int error) 287static void raid10_end_read_request(struct bio *bio, int error)
248{ 288{
249 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 289 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -277,34 +317,60 @@ static void raid10_end_read_request(struct bio *bio, int error)
277 * oops, read error - keep the refcount on the rdev 317 * oops, read error - keep the refcount on the rdev
278 */ 318 */
279 char b[BDEVNAME_SIZE]; 319 char b[BDEVNAME_SIZE];
280 if (printk_ratelimit()) 320 printk_ratelimited(KERN_ERR
281 printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", 321 "md/raid10:%s: %s: rescheduling sector %llu\n",
282 mdname(conf->mddev), 322 mdname(conf->mddev),
283 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 323 bdevname(conf->mirrors[dev].rdev->bdev, b),
324 (unsigned long long)r10_bio->sector);
325 set_bit(R10BIO_ReadError, &r10_bio->state);
284 reschedule_retry(r10_bio); 326 reschedule_retry(r10_bio);
285 } 327 }
286} 328}
287 329
330static void close_write(r10bio_t *r10_bio)
331{
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334 r10_bio->sectors,
335 !test_bit(R10BIO_Degraded, &r10_bio->state),
336 0);
337 md_write_end(r10_bio->mddev);
338}
339
340static void one_write_done(r10bio_t *r10_bio)
341{
342 if (atomic_dec_and_test(&r10_bio->remaining)) {
343 if (test_bit(R10BIO_WriteError, &r10_bio->state))
344 reschedule_retry(r10_bio);
345 else {
346 close_write(r10_bio);
347 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
348 reschedule_retry(r10_bio);
349 else
350 raid_end_bio_io(r10_bio);
351 }
352 }
353}
354
288static void raid10_end_write_request(struct bio *bio, int error) 355static void raid10_end_write_request(struct bio *bio, int error)
289{ 356{
290 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 357 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
291 r10bio_t *r10_bio = bio->bi_private; 358 r10bio_t *r10_bio = bio->bi_private;
292 int slot, dev; 359 int dev;
360 int dec_rdev = 1;
293 conf_t *conf = r10_bio->mddev->private; 361 conf_t *conf = r10_bio->mddev->private;
362 int slot;
294 363
295 for (slot = 0; slot < conf->copies; slot++) 364 dev = find_bio_disk(conf, r10_bio, bio, &slot);
296 if (r10_bio->devs[slot].bio == bio)
297 break;
298 dev = r10_bio->devs[slot].devnum;
299 365
300 /* 366 /*
301 * this branch is our 'one mirror IO has finished' event handler: 367 * this branch is our 'one mirror IO has finished' event handler:
302 */ 368 */
303 if (!uptodate) { 369 if (!uptodate) {
304 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 370 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
305 /* an I/O failed, we can't clear the bitmap */ 371 set_bit(R10BIO_WriteError, &r10_bio->state);
306 set_bit(R10BIO_Degraded, &r10_bio->state); 372 dec_rdev = 0;
307 } else 373 } else {
308 /* 374 /*
309 * Set R10BIO_Uptodate in our master bio, so that 375 * Set R10BIO_Uptodate in our master bio, so that
310 * we will return a good error code to the higher 376 * we will return a good error code to the higher
@@ -314,26 +380,31 @@ static void raid10_end_write_request(struct bio *bio, int error)
314 * user-side. So if something waits for IO, then it will 380 * user-side. So if something waits for IO, then it will
315 * wait for the 'master' bio. 381 * wait for the 'master' bio.
316 */ 382 */
383 sector_t first_bad;
384 int bad_sectors;
385
317 set_bit(R10BIO_Uptodate, &r10_bio->state); 386 set_bit(R10BIO_Uptodate, &r10_bio->state);
318 387
319 update_head_pos(slot, r10_bio); 388 /* Maybe we can clear some bad blocks. */
389 if (is_badblock(conf->mirrors[dev].rdev,
390 r10_bio->devs[slot].addr,
391 r10_bio->sectors,
392 &first_bad, &bad_sectors)) {
393 bio_put(bio);
394 r10_bio->devs[slot].bio = IO_MADE_GOOD;
395 dec_rdev = 0;
396 set_bit(R10BIO_MadeGood, &r10_bio->state);
397 }
398 }
320 399
321 /* 400 /*
322 * 401 *
323 * Let's see if all mirrored write operations have finished 402 * Let's see if all mirrored write operations have finished
324 * already. 403 * already.
325 */ 404 */
326 if (atomic_dec_and_test(&r10_bio->remaining)) { 405 one_write_done(r10_bio);
327 /* clear the bitmap if all writes complete successfully */ 406 if (dec_rdev)
328 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 407 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
329 r10_bio->sectors,
330 !test_bit(R10BIO_Degraded, &r10_bio->state),
331 0);
332 md_write_end(r10_bio->mddev);
333 raid_end_bio_io(r10_bio);
334 }
335
336 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
337} 408}
338 409
339 410
@@ -484,11 +555,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
484 * FIXME: possibly should rethink readbalancing and do it differently 555 * FIXME: possibly should rethink readbalancing and do it differently
485 * depending on near_copies / far_copies geometry. 556 * depending on near_copies / far_copies geometry.
486 */ 557 */
487static int read_balance(conf_t *conf, r10bio_t *r10_bio) 558static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
488{ 559{
489 const sector_t this_sector = r10_bio->sector; 560 const sector_t this_sector = r10_bio->sector;
490 int disk, slot; 561 int disk, slot;
491 const int sectors = r10_bio->sectors; 562 int sectors = r10_bio->sectors;
563 int best_good_sectors;
492 sector_t new_distance, best_dist; 564 sector_t new_distance, best_dist;
493 mdk_rdev_t *rdev; 565 mdk_rdev_t *rdev;
494 int do_balance; 566 int do_balance;
@@ -497,8 +569,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
497 raid10_find_phys(conf, r10_bio); 569 raid10_find_phys(conf, r10_bio);
498 rcu_read_lock(); 570 rcu_read_lock();
499retry: 571retry:
572 sectors = r10_bio->sectors;
500 best_slot = -1; 573 best_slot = -1;
501 best_dist = MaxSector; 574 best_dist = MaxSector;
575 best_good_sectors = 0;
502 do_balance = 1; 576 do_balance = 1;
503 /* 577 /*
504 * Check if we can balance. We can balance on the whole 578 * Check if we can balance. We can balance on the whole
@@ -511,6 +585,10 @@ retry:
511 do_balance = 0; 585 do_balance = 0;
512 586
513 for (slot = 0; slot < conf->copies ; slot++) { 587 for (slot = 0; slot < conf->copies ; slot++) {
588 sector_t first_bad;
589 int bad_sectors;
590 sector_t dev_sector;
591
514 if (r10_bio->devs[slot].bio == IO_BLOCKED) 592 if (r10_bio->devs[slot].bio == IO_BLOCKED)
515 continue; 593 continue;
516 disk = r10_bio->devs[slot].devnum; 594 disk = r10_bio->devs[slot].devnum;
@@ -520,6 +598,37 @@ retry:
520 if (!test_bit(In_sync, &rdev->flags)) 598 if (!test_bit(In_sync, &rdev->flags))
521 continue; 599 continue;
522 600
601 dev_sector = r10_bio->devs[slot].addr;
602 if (is_badblock(rdev, dev_sector, sectors,
603 &first_bad, &bad_sectors)) {
604 if (best_dist < MaxSector)
605 /* Already have a better slot */
606 continue;
607 if (first_bad <= dev_sector) {
608 /* Cannot read here. If this is the
609 * 'primary' device, then we must not read
610 * beyond 'bad_sectors' from another device.
611 */
612 bad_sectors -= (dev_sector - first_bad);
613 if (!do_balance && sectors > bad_sectors)
614 sectors = bad_sectors;
615 if (best_good_sectors > sectors)
616 best_good_sectors = sectors;
617 } else {
618 sector_t good_sectors =
619 first_bad - dev_sector;
620 if (good_sectors > best_good_sectors) {
621 best_good_sectors = good_sectors;
622 best_slot = slot;
623 }
624 if (!do_balance)
625 /* Must read from here */
626 break;
627 }
628 continue;
629 } else
630 best_good_sectors = sectors;
631
523 if (!do_balance) 632 if (!do_balance)
524 break; 633 break;
525 634
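The hunk above teaches read_balance() about known bad blocks: a device whose bad range starts at or before the requested sector is ruled out, while a bad range further along merely limits how many sectors this copy can serve (reported back through best_good_sectors and, later, *max_sectors). As an aside, that clamping is easy to model in isolation; the standalone sketch below uses invented names and a single bad range to approximate is_badblock() semantics, and is not part of the patch:

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Invented stand-in for one entry of the kernel's per-device bad block list */
struct bad_range {
	sector_t start;
	sector_t len;
};

/*
 * Model of the clamping above: how many sectors, starting at dev_sector,
 * can be read from a device whose only bad range is *bad?  Zero means the
 * request begins inside the bad range, so another copy must be used.
 */
static sector_t readable_sectors(sector_t dev_sector, sector_t sectors,
				 const struct bad_range *bad)
{
	sector_t bad_end = bad->start + bad->len;

	if (bad_end <= dev_sector || bad->start >= dev_sector + sectors)
		return sectors;			/* no overlap at all */
	if (bad->start <= dev_sector)
		return 0;			/* cannot read here */
	return bad->start - dev_sector;		/* good prefix only */
}

int main(void)
{
	struct bad_range bad = { 100, 20 };	/* sectors 100..119 are bad */

	assert(readable_sectors(90, 64, &bad) == 10);	/* truncated read */
	assert(readable_sectors(105, 64, &bad) == 0);	/* use another copy */
	assert(readable_sectors(200, 64, &bad) == 64);	/* unaffected */
	printf("read_balance clamping model OK\n");
	return 0;
}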
@@ -561,6 +670,7 @@ retry:
561 } else 670 } else
562 disk = -1; 671 disk = -1;
563 rcu_read_unlock(); 672 rcu_read_unlock();
673 *max_sectors = best_good_sectors;
564 674
565 return disk; 675 return disk;
566} 676}
@@ -734,6 +844,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
734 unsigned long flags; 844 unsigned long flags;
735 mdk_rdev_t *blocked_rdev; 845 mdk_rdev_t *blocked_rdev;
736 int plugged; 846 int plugged;
847 int sectors_handled;
848 int max_sectors;
737 849
738 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 850 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
739 md_flush_request(mddev, bio); 851 md_flush_request(mddev, bio);
@@ -808,12 +920,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
808 r10_bio->sector = bio->bi_sector; 920 r10_bio->sector = bio->bi_sector;
809 r10_bio->state = 0; 921 r10_bio->state = 0;
810 922
923 /* We might need to issue multiple reads to different
924 * devices if there are bad blocks around, so we keep
925 * track of the number of reads in bio->bi_phys_segments.
926 * If this is 0, there is only one r10_bio and no locking
927 * will be needed when the request completes. If it is
928 * non-zero, then it is the number of not-completed requests.
929 */
930 bio->bi_phys_segments = 0;
931 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
932
811 if (rw == READ) { 933 if (rw == READ) {
812 /* 934 /*
813 * read balancing logic: 935 * read balancing logic:
814 */ 936 */
815 int disk = read_balance(conf, r10_bio); 937 int disk;
816 int slot = r10_bio->read_slot; 938 int slot;
939
940read_again:
941 disk = read_balance(conf, r10_bio, &max_sectors);
942 slot = r10_bio->read_slot;
817 if (disk < 0) { 943 if (disk < 0) {
818 raid_end_bio_io(r10_bio); 944 raid_end_bio_io(r10_bio);
819 return 0; 945 return 0;
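The comment added above describes the accounting scheme the rest of the patch relies on: bio->bi_phys_segments is left at 0 while the request is a single piece, and otherwise counts the not-yet-completed sub-requests, protected by conf->device_lock (raid_end_bio_io() earlier in this patch does the matching decrement). A minimal userspace model of that scheme, with invented names and a pthread mutex in place of the spinlock, might look like this (illustrative only):

#include <pthread.h>
#include <stdio.h>

/* Userspace model of the accounting described above: 'segments' is 0 for an
 * unsplit request, otherwise it counts the not-yet-completed pieces.  The
 * mutex plays the role of conf->device_lock.
 */
struct master_req {
	pthread_mutex_t lock;
	int segments;		/* models bio->bi_phys_segments */
};

/* Called whenever the request has to be split into one more piece. */
static void note_extra_piece(struct master_req *m)
{
	pthread_mutex_lock(&m->lock);
	if (m->segments == 0)
		m->segments = 2;	/* first split: original piece + new one */
	else
		m->segments++;
	pthread_mutex_unlock(&m->lock);
}

/* Called when one piece completes; returns 1 when the whole request is done
 * (the point at which the real code would call bio_endio() on the master bio).
 */
static int piece_done(struct master_req *m)
{
	int done;

	pthread_mutex_lock(&m->lock);
	if (m->segments) {
		m->segments--;
		done = (m->segments == 0);
	} else {
		done = 1;	/* never split: a single piece finishes it */
	}
	pthread_mutex_unlock(&m->lock);
	return done;
}

int main(void)
{
	struct master_req m = { PTHREAD_MUTEX_INITIALIZER, 0 };

	note_extra_piece(&m);		/* request split into 2 pieces */
	note_extra_piece(&m);		/* ...and then into 3 */
	printf("%d\n", piece_done(&m));	/* 0 */
	printf("%d\n", piece_done(&m));	/* 0 */
	printf("%d\n", piece_done(&m));	/* 1: last piece ends the request */
	return 0;
}

Leaving the counter at zero in the common unsplit case is what lets the completion path finish without ever taking the lock.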
@@ -821,6 +947,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 mirror = conf->mirrors + disk; 947 mirror = conf->mirrors + disk;
822 948
823 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 949 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
950 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
951 max_sectors);
824 952
825 r10_bio->devs[slot].bio = read_bio; 953 r10_bio->devs[slot].bio = read_bio;
826 954
@@ -831,7 +959,37 @@ static int make_request(mddev_t *mddev, struct bio * bio)
831 read_bio->bi_rw = READ | do_sync; 959 read_bio->bi_rw = READ | do_sync;
832 read_bio->bi_private = r10_bio; 960 read_bio->bi_private = r10_bio;
833 961
834 generic_make_request(read_bio); 962 if (max_sectors < r10_bio->sectors) {
963 /* Could not read all from this device, so we will
964 * need another r10_bio.
965 */
966 sectors_handled = (r10_bio->sectors + max_sectors
967 - bio->bi_sector);
968 r10_bio->sectors = max_sectors;
969 spin_lock_irq(&conf->device_lock);
970 if (bio->bi_phys_segments == 0)
971 bio->bi_phys_segments = 2;
972 else
973 bio->bi_phys_segments++;
974 spin_unlock(&conf->device_lock);
975 /* Cannot call generic_make_request directly
976 * as that will be queued in __generic_make_request
977 * and subsequent mempool_alloc might block
978 * waiting for it. so hand bio over to raid10d.
979 */
980 reschedule_retry(r10_bio);
981
982 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
983
984 r10_bio->master_bio = bio;
985 r10_bio->sectors = ((bio->bi_size >> 9)
986 - sectors_handled);
987 r10_bio->state = 0;
988 r10_bio->mddev = mddev;
989 r10_bio->sector = bio->bi_sector + sectors_handled;
990 goto read_again;
991 } else
992 generic_make_request(read_bio);
835 return 0; 993 return 0;
836 } 994 }
837 995
@@ -841,13 +999,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
841 /* first select target devices under rcu_lock and 999 /* first select target devices under rcu_lock and
842 * inc refcount on their rdev. Record them by setting 1000 * inc refcount on their rdev. Record them by setting
843 * bios[x] to bio 1001 * bios[x] to bio
1002 * If there are known/acknowledged bad blocks on any device
1003 * on which we have seen a write error, we want to avoid
1004 * writing to those blocks. This potentially requires several
1005 * writes to write around the bad blocks. Each set of writes
1006 * gets its own r10_bio with a set of bios attached. The number
1007 * of r10_bios is recorded in bio->bi_phys_segments just as with
1008 * the read case.
844 */ 1009 */
845 plugged = mddev_check_plugged(mddev); 1010 plugged = mddev_check_plugged(mddev);
846 1011
847 raid10_find_phys(conf, r10_bio); 1012 raid10_find_phys(conf, r10_bio);
848 retry_write: 1013retry_write:
849 blocked_rdev = NULL; 1014 blocked_rdev = NULL;
850 rcu_read_lock(); 1015 rcu_read_lock();
1016 max_sectors = r10_bio->sectors;
1017
851 for (i = 0; i < conf->copies; i++) { 1018 for (i = 0; i < conf->copies; i++) {
852 int d = r10_bio->devs[i].devnum; 1019 int d = r10_bio->devs[i].devnum;
853 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 1020 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -856,13 +1023,55 @@ static int make_request(mddev_t *mddev, struct bio * bio)
856 blocked_rdev = rdev; 1023 blocked_rdev = rdev;
857 break; 1024 break;
858 } 1025 }
859 if (rdev && !test_bit(Faulty, &rdev->flags)) { 1026 r10_bio->devs[i].bio = NULL;
860 atomic_inc(&rdev->nr_pending); 1027 if (!rdev || test_bit(Faulty, &rdev->flags)) {
861 r10_bio->devs[i].bio = bio;
862 } else {
863 r10_bio->devs[i].bio = NULL;
864 set_bit(R10BIO_Degraded, &r10_bio->state); 1028 set_bit(R10BIO_Degraded, &r10_bio->state);
1029 continue;
1030 }
1031 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1032 sector_t first_bad;
1033 sector_t dev_sector = r10_bio->devs[i].addr;
1034 int bad_sectors;
1035 int is_bad;
1036
1037 is_bad = is_badblock(rdev, dev_sector,
1038 max_sectors,
1039 &first_bad, &bad_sectors);
1040 if (is_bad < 0) {
1041 /* Mustn't write here until the bad block
1042 * is acknowledged
1043 */
1044 atomic_inc(&rdev->nr_pending);
1045 set_bit(BlockedBadBlocks, &rdev->flags);
1046 blocked_rdev = rdev;
1047 break;
1048 }
1049 if (is_bad && first_bad <= dev_sector) {
1050 /* Cannot write here at all */
1051 bad_sectors -= (dev_sector - first_bad);
1052 if (bad_sectors < max_sectors)
1053 /* Mustn't write more than bad_sectors
1054 * to other devices yet
1055 */
1056 max_sectors = bad_sectors;
1057 /* We don't set R10BIO_Degraded as that
1058 * only applies if the disk is missing,
1059 * so it might be re-added, and we want to
1060 * know to recover this chunk.
1061 * In this case the device is here, and the
1062 * fact that this chunk is not in-sync is
1063 * recorded in the bad block log.
1064 */
1065 continue;
1066 }
1067 if (is_bad) {
1068 int good_sectors = first_bad - dev_sector;
1069 if (good_sectors < max_sectors)
1070 max_sectors = good_sectors;
1071 }
865 } 1072 }
1073 r10_bio->devs[i].bio = bio;
1074 atomic_inc(&rdev->nr_pending);
866 } 1075 }
867 rcu_read_unlock(); 1076 rcu_read_unlock();
868 1077
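The write path above reacts to three different is_badblock() outcomes: a negative return means an unacknowledged bad block, so the rdev is treated as blocked and the request waits; an acknowledged bad block covering the start of the write rules this device out while capping max_sectors so the other copies do not run ahead of the unwritten region; and a bad block further in simply truncates max_sectors to the good prefix. A compact userspace model of that decision (invented names, not the kernel API) is sketched below:

#include <assert.h>

typedef unsigned long long sector_t;

enum write_decision {
	WRITE_BLOCKED,	/* unacknowledged bad block: must wait for the ack */
	WRITE_SKIP_DEV,	/* acknowledged bad block at the start: skip device */
	WRITE_OK	/* write may proceed, possibly truncated */
};

/*
 * Userspace model of the decision above.  'is_bad' mimics is_badblock():
 * negative for an unacknowledged bad block, positive for an acknowledged
 * overlap described by first_bad/bad_sectors, zero for no overlap.
 * *max_sectors may be reduced as a side effect, exactly as in the hunk.
 */
static enum write_decision classify_write(int is_bad, sector_t dev_sector,
					  sector_t first_bad,
					  sector_t bad_sectors,
					  sector_t *max_sectors)
{
	if (is_bad < 0)
		return WRITE_BLOCKED;
	if (is_bad && first_bad <= dev_sector) {
		/* cannot write here; other copies must not run ahead of it */
		bad_sectors -= dev_sector - first_bad;
		if (bad_sectors < *max_sectors)
			*max_sectors = bad_sectors;
		return WRITE_SKIP_DEV;
	}
	if (is_bad) {
		sector_t good = first_bad - dev_sector;

		if (good < *max_sectors)
			*max_sectors = good;
	}
	return WRITE_OK;
}

int main(void)
{
	sector_t max = 64;

	assert(classify_write(-1, 100, 0, 0, &max) == WRITE_BLOCKED);
	/* bad range 120..127: only the 20 sectors before it may be written */
	assert(classify_write(1, 100, 120, 8, &max) == WRITE_OK && max == 20);
	/* bad range 96..111 covers the start: skip, but cap max at 12 */
	assert(classify_write(1, 100, 96, 16, &max) == WRITE_SKIP_DEV && max == 12);
	return 0;
}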
@@ -882,8 +1091,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
882 goto retry_write; 1091 goto retry_write;
883 } 1092 }
884 1093
1094 if (max_sectors < r10_bio->sectors) {
1095 /* We are splitting this into multiple parts, so
1096 * we need to prepare for allocating another r10_bio.
1097 */
1098 r10_bio->sectors = max_sectors;
1099 spin_lock_irq(&conf->device_lock);
1100 if (bio->bi_phys_segments == 0)
1101 bio->bi_phys_segments = 2;
1102 else
1103 bio->bi_phys_segments++;
1104 spin_unlock_irq(&conf->device_lock);
1105 }
1106 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1107
885 atomic_set(&r10_bio->remaining, 1); 1108 atomic_set(&r10_bio->remaining, 1);
886 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); 1109 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
887 1110
888 for (i = 0; i < conf->copies; i++) { 1111 for (i = 0; i < conf->copies; i++) {
889 struct bio *mbio; 1112 struct bio *mbio;
@@ -892,10 +1115,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
892 continue; 1115 continue;
893 1116
894 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1117 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1118 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1119 max_sectors);
895 r10_bio->devs[i].bio = mbio; 1120 r10_bio->devs[i].bio = mbio;
896 1121
897 mbio->bi_sector = r10_bio->devs[i].addr+ 1122 mbio->bi_sector = (r10_bio->devs[i].addr+
898 conf->mirrors[d].rdev->data_offset; 1123 conf->mirrors[d].rdev->data_offset);
899 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1124 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
900 mbio->bi_end_io = raid10_end_write_request; 1125 mbio->bi_end_io = raid10_end_write_request;
901 mbio->bi_rw = WRITE | do_sync | do_fua; 1126 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -907,15 +1132,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
907 spin_unlock_irqrestore(&conf->device_lock, flags); 1132 spin_unlock_irqrestore(&conf->device_lock, flags);
908 } 1133 }
909 1134
910 if (atomic_dec_and_test(&r10_bio->remaining)) { 1135 /* Don't remove the bias on 'remaining' (one_write_done) until
911 /* This matches the end of raid10_end_write_request() */ 1136 * after checking if we need to go around again.
912 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 1137 */
913 r10_bio->sectors, 1138
914 !test_bit(R10BIO_Degraded, &r10_bio->state), 1139 if (sectors_handled < (bio->bi_size >> 9)) {
915 0); 1140 one_write_done(r10_bio);
916 md_write_end(mddev); 1141 /* We need another r10_bio. It has already been counted
917 raid_end_bio_io(r10_bio); 1142 * in bio->bi_phys_segments.
1143 */
1144 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1145
1146 r10_bio->master_bio = bio;
1147 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1148
1149 r10_bio->mddev = mddev;
1150 r10_bio->sector = bio->bi_sector + sectors_handled;
1151 r10_bio->state = 0;
1152 goto retry_write;
918 } 1153 }
1154 one_write_done(r10_bio);
919 1155
920 /* In case raid10d snuck in to freeze_array */ 1156 /* In case raid10d snuck in to freeze_array */
921 wake_up(&conf->wait_barrier); 1157 wake_up(&conf->wait_barrier);
@@ -949,6 +1185,30 @@ static void status(struct seq_file *seq, mddev_t *mddev)
949 seq_printf(seq, "]"); 1185 seq_printf(seq, "]");
950} 1186}
951 1187
1188/* check if there are enough drives for
1189 * every block to appear on at least one.
1190 * Don't consider the device numbered 'ignore'
1191 * as we might be about to remove it.
1192 */
1193static int enough(conf_t *conf, int ignore)
1194{
1195 int first = 0;
1196
1197 do {
1198 int n = conf->copies;
1199 int cnt = 0;
1200 while (n--) {
1201 if (conf->mirrors[first].rdev &&
1202 first != ignore)
1203 cnt++;
1204 first = (first+1) % conf->raid_disks;
1205 }
1206 if (cnt == 0)
1207 return 0;
1208 } while (first != 0);
1209 return 1;
1210}
1211
952static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1212static void error(mddev_t *mddev, mdk_rdev_t *rdev)
953{ 1213{
954 char b[BDEVNAME_SIZE]; 1214 char b[BDEVNAME_SIZE];
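enough(), moved here and given an 'ignore' parameter, walks the drives in groups of 'copies' consecutive slots (wrapping at the end of the array) and reports failure as soon as one whole group is unavailable, optionally pretending that slot 'ignore' is already gone. The standalone model below reproduces that loop over a plain array so its behaviour can be checked in isolation; it is illustrative only, and assumes the grouping matches the mirror sets, as it does for the common 'near' layout:

#include <stdio.h>

/*
 * Userspace model of enough() above: present[i] is non-zero when slot i
 * holds a working drive; 'ignore' is a slot we pretend is already gone
 * (the drive about to be removed), or -1.
 */
static int enough_model(const int *present, int raid_disks, int copies,
			int ignore)
{
	int first = 0;

	do {
		int n = copies, cnt = 0;

		while (n--) {
			if (present[first] && first != ignore)
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}

int main(void)
{
	/* 4 drives, 2 near copies: slots {0,1} and {2,3} mirror each other */
	int all_ok[4]    = { 1, 1, 1, 1 };
	int one_gone[4]  = { 1, 0, 1, 1 };
	int pair_gone[4] = { 0, 0, 1, 1 };

	printf("%d\n", enough_model(all_ok, 4, 2, -1));		/* 1 */
	printf("%d\n", enough_model(one_gone, 4, 2, -1));	/* 1 */
	printf("%d\n", enough_model(pair_gone, 4, 2, -1));	/* 0 */
	printf("%d\n", enough_model(one_gone, 4, 2, 0));	/* 0 */
	return 0;
}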
@@ -961,13 +1221,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
961 * else mark the drive as failed 1221 * else mark the drive as failed
962 */ 1222 */
963 if (test_bit(In_sync, &rdev->flags) 1223 if (test_bit(In_sync, &rdev->flags)
964 && conf->raid_disks-mddev->degraded == 1) 1224 && !enough(conf, rdev->raid_disk))
965 /* 1225 /*
966 * Don't fail the drive, just return an IO error. 1226 * Don't fail the drive, just return an IO error.
967 * The test should really be more sophisticated than
968 * "working_disks == 1", but it isn't critical, and
969 * can wait until we do more sophisticated "is the drive
970 * really dead" tests...
971 */ 1227 */
972 return; 1228 return;
973 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1229 if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -980,6 +1236,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
980 */ 1236 */
981 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1237 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
982 } 1238 }
1239 set_bit(Blocked, &rdev->flags);
983 set_bit(Faulty, &rdev->flags); 1240 set_bit(Faulty, &rdev->flags);
984 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1241 set_bit(MD_CHANGE_DEVS, &mddev->flags);
985 printk(KERN_ALERT 1242 printk(KERN_ALERT
@@ -1022,27 +1279,6 @@ static void close_sync(conf_t *conf)
1022 conf->r10buf_pool = NULL; 1279 conf->r10buf_pool = NULL;
1023} 1280}
1024 1281
1025/* check if there are enough drives for
1026 * every block to appear on atleast one
1027 */
1028static int enough(conf_t *conf)
1029{
1030 int first = 0;
1031
1032 do {
1033 int n = conf->copies;
1034 int cnt = 0;
1035 while (n--) {
1036 if (conf->mirrors[first].rdev)
1037 cnt++;
1038 first = (first+1) % conf->raid_disks;
1039 }
1040 if (cnt == 0)
1041 return 0;
1042 } while (first != 0);
1043 return 1;
1044}
1045
1046static int raid10_spare_active(mddev_t *mddev) 1282static int raid10_spare_active(mddev_t *mddev)
1047{ 1283{
1048 int i; 1284 int i;
@@ -1078,7 +1314,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1078 conf_t *conf = mddev->private; 1314 conf_t *conf = mddev->private;
1079 int err = -EEXIST; 1315 int err = -EEXIST;
1080 int mirror; 1316 int mirror;
1081 mirror_info_t *p;
1082 int first = 0; 1317 int first = 0;
1083 int last = conf->raid_disks - 1; 1318 int last = conf->raid_disks - 1;
1084 1319
@@ -1087,44 +1322,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1087 * very different from resync 1322 * very different from resync
1088 */ 1323 */
1089 return -EBUSY; 1324 return -EBUSY;
1090 if (!enough(conf)) 1325 if (!enough(conf, -1))
1091 return -EINVAL; 1326 return -EINVAL;
1092 1327
1093 if (rdev->raid_disk >= 0) 1328 if (rdev->raid_disk >= 0)
1094 first = last = rdev->raid_disk; 1329 first = last = rdev->raid_disk;
1095 1330
1096 if (rdev->saved_raid_disk >= 0 && 1331 if (rdev->saved_raid_disk >= first &&
1097 rdev->saved_raid_disk >= first &&
1098 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1332 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1099 mirror = rdev->saved_raid_disk; 1333 mirror = rdev->saved_raid_disk;
1100 else 1334 else
1101 mirror = first; 1335 mirror = first;
1102 for ( ; mirror <= last ; mirror++) 1336 for ( ; mirror <= last ; mirror++) {
1103 if ( !(p=conf->mirrors+mirror)->rdev) { 1337 mirror_info_t *p = &conf->mirrors[mirror];
1104 1338 if (p->recovery_disabled == mddev->recovery_disabled)
1105 disk_stack_limits(mddev->gendisk, rdev->bdev, 1339 continue;
1106 rdev->data_offset << 9); 1340 if (p->rdev)
1107 /* as we don't honour merge_bvec_fn, we must 1341 continue;
1108 * never risk violating it, so limit
1109 * ->max_segments to one lying with a single
1110 * page, as a one page request is never in
1111 * violation.
1112 */
1113 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1114 blk_queue_max_segments(mddev->queue, 1);
1115 blk_queue_segment_boundary(mddev->queue,
1116 PAGE_CACHE_SIZE - 1);
1117 }
1118 1342
1119 p->head_position = 0; 1343 disk_stack_limits(mddev->gendisk, rdev->bdev,
1120 rdev->raid_disk = mirror; 1344 rdev->data_offset << 9);
1121 err = 0; 1345 /* as we don't honour merge_bvec_fn, we must
1122 if (rdev->saved_raid_disk != mirror) 1346 * never risk violating it, so limit
1123 conf->fullsync = 1; 1347 * ->max_segments to one lying with a single
1124 rcu_assign_pointer(p->rdev, rdev); 1348 * page, as a one page request is never in
1125 break; 1349 * violation.
1350 */
1351 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1352 blk_queue_max_segments(mddev->queue, 1);
1353 blk_queue_segment_boundary(mddev->queue,
1354 PAGE_CACHE_SIZE - 1);
1126 } 1355 }
1127 1356
1357 p->head_position = 0;
1358 rdev->raid_disk = mirror;
1359 err = 0;
1360 if (rdev->saved_raid_disk != mirror)
1361 conf->fullsync = 1;
1362 rcu_assign_pointer(p->rdev, rdev);
1363 break;
1364 }
1365
1128 md_integrity_add_rdev(rdev, mddev); 1366 md_integrity_add_rdev(rdev, mddev);
1129 print_conf(conf); 1367 print_conf(conf);
1130 return err; 1368 return err;
@@ -1149,7 +1387,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1149 * is not possible. 1387 * is not possible.
1150 */ 1388 */
1151 if (!test_bit(Faulty, &rdev->flags) && 1389 if (!test_bit(Faulty, &rdev->flags) &&
1152 enough(conf)) { 1390 mddev->recovery_disabled != p->recovery_disabled &&
1391 enough(conf, -1)) {
1153 err = -EBUSY; 1392 err = -EBUSY;
1154 goto abort; 1393 goto abort;
1155 } 1394 }
@@ -1174,24 +1413,18 @@ static void end_sync_read(struct bio *bio, int error)
1174{ 1413{
1175 r10bio_t *r10_bio = bio->bi_private; 1414 r10bio_t *r10_bio = bio->bi_private;
1176 conf_t *conf = r10_bio->mddev->private; 1415 conf_t *conf = r10_bio->mddev->private;
1177 int i,d; 1416 int d;
1178 1417
1179 for (i=0; i<conf->copies; i++) 1418 d = find_bio_disk(conf, r10_bio, bio, NULL);
1180 if (r10_bio->devs[i].bio == bio)
1181 break;
1182 BUG_ON(i == conf->copies);
1183 update_head_pos(i, r10_bio);
1184 d = r10_bio->devs[i].devnum;
1185 1419
1186 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1420 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1187 set_bit(R10BIO_Uptodate, &r10_bio->state); 1421 set_bit(R10BIO_Uptodate, &r10_bio->state);
1188 else { 1422 else
1423 /* The write handler will notice the lack of
1424 * R10BIO_Uptodate and record any errors etc
1425 */
1189 atomic_add(r10_bio->sectors, 1426 atomic_add(r10_bio->sectors,
1190 &conf->mirrors[d].rdev->corrected_errors); 1427 &conf->mirrors[d].rdev->corrected_errors);
1191 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1192 md_error(r10_bio->mddev,
1193 conf->mirrors[d].rdev);
1194 }
1195 1428
1196 /* for reconstruct, we always reschedule after a read. 1429 /* for reconstruct, we always reschedule after a read.
1197 * for resync, only after all reads 1430 * for resync, only after all reads
@@ -1206,40 +1439,60 @@ static void end_sync_read(struct bio *bio, int error)
1206 } 1439 }
1207} 1440}
1208 1441
1209static void end_sync_write(struct bio *bio, int error) 1442static void end_sync_request(r10bio_t *r10_bio)
1210{ 1443{
1211 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1212 r10bio_t *r10_bio = bio->bi_private;
1213 mddev_t *mddev = r10_bio->mddev; 1444 mddev_t *mddev = r10_bio->mddev;
1214 conf_t *conf = mddev->private;
1215 int i,d;
1216
1217 for (i = 0; i < conf->copies; i++)
1218 if (r10_bio->devs[i].bio == bio)
1219 break;
1220 d = r10_bio->devs[i].devnum;
1221
1222 if (!uptodate)
1223 md_error(mddev, conf->mirrors[d].rdev);
1224
1225 update_head_pos(i, r10_bio);
1226 1445
1227 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1228 while (atomic_dec_and_test(&r10_bio->remaining)) { 1446 while (atomic_dec_and_test(&r10_bio->remaining)) {
1229 if (r10_bio->master_bio == NULL) { 1447 if (r10_bio->master_bio == NULL) {
1230 /* the primary of several recovery bios */ 1448 /* the primary of several recovery bios */
1231 sector_t s = r10_bio->sectors; 1449 sector_t s = r10_bio->sectors;
1232 put_buf(r10_bio); 1450 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1451 test_bit(R10BIO_WriteError, &r10_bio->state))
1452 reschedule_retry(r10_bio);
1453 else
1454 put_buf(r10_bio);
1233 md_done_sync(mddev, s, 1); 1455 md_done_sync(mddev, s, 1);
1234 break; 1456 break;
1235 } else { 1457 } else {
1236 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; 1458 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1237 put_buf(r10_bio); 1459 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1460 test_bit(R10BIO_WriteError, &r10_bio->state))
1461 reschedule_retry(r10_bio);
1462 else
1463 put_buf(r10_bio);
1238 r10_bio = r10_bio2; 1464 r10_bio = r10_bio2;
1239 } 1465 }
1240 } 1466 }
1241} 1467}
1242 1468
1469static void end_sync_write(struct bio *bio, int error)
1470{
1471 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1472 r10bio_t *r10_bio = bio->bi_private;
1473 mddev_t *mddev = r10_bio->mddev;
1474 conf_t *conf = mddev->private;
1475 int d;
1476 sector_t first_bad;
1477 int bad_sectors;
1478 int slot;
1479
1480 d = find_bio_disk(conf, r10_bio, bio, &slot);
1481
1482 if (!uptodate) {
1483 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1484 set_bit(R10BIO_WriteError, &r10_bio->state);
1485 } else if (is_badblock(conf->mirrors[d].rdev,
1486 r10_bio->devs[slot].addr,
1487 r10_bio->sectors,
1488 &first_bad, &bad_sectors))
1489 set_bit(R10BIO_MadeGood, &r10_bio->state);
1490
1491 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1492
1493 end_sync_request(r10_bio);
1494}
1495
1243/* 1496/*
1244 * Note: sync and recover are handled very differently for raid10 1497 * Note: sync and recover are handled very differently for raid10
1245 * This code is for resync. 1498 * This code is for resync.
@@ -1299,11 +1552,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1299 if (j == vcnt) 1552 if (j == vcnt)
1300 continue; 1553 continue;
1301 mddev->resync_mismatches += r10_bio->sectors; 1554 mddev->resync_mismatches += r10_bio->sectors;
1555 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1556 /* Don't fix anything. */
1557 continue;
1302 } 1558 }
1303 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1559 /* Ok, we need to write this bio, either to correct an
1304 /* Don't fix anything. */ 1560 * inconsistency or to correct an unreadable block.
1305 continue;
1306 /* Ok, we need to write this bio
1307 * First we need to fixup bv_offset, bv_len and 1561 * First we need to fixup bv_offset, bv_len and
1308 * bi_vecs, as the read request might have corrupted these 1562 * bi_vecs, as the read request might have corrupted these
1309 */ 1563 */
@@ -1355,32 +1609,107 @@ done:
1355 * The second for writing. 1609 * The second for writing.
1356 * 1610 *
1357 */ 1611 */
1612static void fix_recovery_read_error(r10bio_t *r10_bio)
1613{
1614 /* We got a read error during recovery.
1615 * We repeat the read in smaller page-sized sections.
1616 * If a read succeeds, write it to the new device or record
1617 * a bad block if we cannot.
1618 * If a read fails, record a bad block on both old and
1619 * new devices.
1620 */
1621 mddev_t *mddev = r10_bio->mddev;
1622 conf_t *conf = mddev->private;
1623 struct bio *bio = r10_bio->devs[0].bio;
1624 sector_t sect = 0;
1625 int sectors = r10_bio->sectors;
1626 int idx = 0;
1627 int dr = r10_bio->devs[0].devnum;
1628 int dw = r10_bio->devs[1].devnum;
1629
1630 while (sectors) {
1631 int s = sectors;
1632 mdk_rdev_t *rdev;
1633 sector_t addr;
1634 int ok;
1635
1636 if (s > (PAGE_SIZE>>9))
1637 s = PAGE_SIZE >> 9;
1638
1639 rdev = conf->mirrors[dr].rdev;
1640 addr = r10_bio->devs[0].addr + sect,
1641 ok = sync_page_io(rdev,
1642 addr,
1643 s << 9,
1644 bio->bi_io_vec[idx].bv_page,
1645 READ, false);
1646 if (ok) {
1647 rdev = conf->mirrors[dw].rdev;
1648 addr = r10_bio->devs[1].addr + sect;
1649 ok = sync_page_io(rdev,
1650 addr,
1651 s << 9,
1652 bio->bi_io_vec[idx].bv_page,
1653 WRITE, false);
1654 if (!ok)
1655 set_bit(WriteErrorSeen, &rdev->flags);
1656 }
1657 if (!ok) {
1658 /* We don't worry if we cannot set a bad block -
1659 * it really is bad so there is no loss in not
1660 * recording it yet
1661 */
1662 rdev_set_badblocks(rdev, addr, s, 0);
1663
1664 if (rdev != conf->mirrors[dw].rdev) {
1665 /* need bad block on destination too */
1666 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
1667 addr = r10_bio->devs[1].addr + sect;
1668 ok = rdev_set_badblocks(rdev2, addr, s, 0);
1669 if (!ok) {
1670 /* just abort the recovery */
1671 printk(KERN_NOTICE
1672 "md/raid10:%s: recovery aborted"
1673 " due to read error\n",
1674 mdname(mddev));
1675
1676 conf->mirrors[dw].recovery_disabled
1677 = mddev->recovery_disabled;
1678 set_bit(MD_RECOVERY_INTR,
1679 &mddev->recovery);
1680 break;
1681 }
1682 }
1683 }
1684
1685 sectors -= s;
1686 sect += s;
1687 idx++;
1688 }
1689}
1358 1690
1359static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1691static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1360{ 1692{
1361 conf_t *conf = mddev->private; 1693 conf_t *conf = mddev->private;
1362 int i, d; 1694 int d;
1363 struct bio *bio, *wbio; 1695 struct bio *wbio;
1364 1696
1697 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1698 fix_recovery_read_error(r10_bio);
1699 end_sync_request(r10_bio);
1700 return;
1701 }
1365 1702
1366 /* move the pages across to the second bio 1703 /*
1704 * share the pages with the first bio
1367 * and submit the write request 1705 * and submit the write request
1368 */ 1706 */
1369 bio = r10_bio->devs[0].bio;
1370 wbio = r10_bio->devs[1].bio; 1707 wbio = r10_bio->devs[1].bio;
1371 for (i=0; i < wbio->bi_vcnt; i++) {
1372 struct page *p = bio->bi_io_vec[i].bv_page;
1373 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1374 wbio->bi_io_vec[i].bv_page = p;
1375 }
1376 d = r10_bio->devs[1].devnum; 1708 d = r10_bio->devs[1].devnum;
1377 1709
1378 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1710 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1379 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1711 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1380 if (test_bit(R10BIO_Uptodate, &r10_bio->state)) 1712 generic_make_request(wbio);
1381 generic_make_request(wbio);
1382 else
1383 bio_endio(wbio, -EIO);
1384} 1713}
1385 1714
1386 1715
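fix_recovery_read_error(), added above, re-reads the failed recovery region in page-sized chunks, writes each chunk that can be read to the device being rebuilt, and records bad blocks where a chunk cannot be recovered (on the destination for a write failure, on both devices for a read failure). The userspace sketch below models that control flow; the chunk_fn callbacks are invented stand-ins for sync_page_io() and rdev_set_badblocks():

#include <stdio.h>

#define PAGE_SECTORS (4096 >> 9)	/* sectors per page, assuming 4KiB pages */

/* Invented callbacks standing in for sync_page_io() and rdev_set_badblocks();
 * each returns non-zero on success. */
typedef int (*chunk_fn)(unsigned long long addr, int sectors);

/*
 * Userspace model of the loop above: walk the region one page at a time.
 * A chunk that reads and writes cleanly is simply copied; a failed write
 * marks the destination bad; a failed read marks the chunk bad on both the
 * source and the (never written) destination.  Returns the failed chunk count.
 */
static int copy_around_bad_blocks(unsigned long long src,
				  unsigned long long dst, int sectors,
				  chunk_fn read_chunk, chunk_fn write_chunk,
				  chunk_fn record_bad)
{
	int failed = 0;
	unsigned long long off = 0;

	while (sectors) {
		int s = sectors > PAGE_SECTORS ? PAGE_SECTORS : sectors;

		if (read_chunk(src + off, s)) {
			if (!write_chunk(dst + off, s)) {
				record_bad(dst + off, s);
				failed++;
			}
		} else {
			record_bad(src + off, s);
			record_bad(dst + off, s);
			failed++;
		}
		sectors -= s;
		off += s;
	}
	return failed;
}

/* trivial stubs so the model runs: the source is unreadable at 1016..1023 */
static int stub_read(unsigned long long a, int s)
{
	(void)s;
	return a < 1016 || a >= 1024;
}

static int stub_write(unsigned long long a, int s)
{
	(void)a; (void)s;
	return 1;
}

static int stub_bad(unsigned long long a, int s)
{
	printf("record bad block at %llu+%d\n", a, s);
	return 1;
}

int main(void)
{
	/* copy 32 sectors from source sector 1000 to destination sector 5000 */
	printf("failed chunks: %d\n",
	       copy_around_bad_blocks(1000, 5000, 32,
				      stub_read, stub_write, stub_bad));
	return 0;
}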
@@ -1421,6 +1750,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1421 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 1750 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1422} 1751}
1423 1752
1753static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1754 int sectors, struct page *page, int rw)
1755{
1756 sector_t first_bad;
1757 int bad_sectors;
1758
1759 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
1760 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
1761 return -1;
1762 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1763 /* success */
1764 return 1;
1765 if (rw == WRITE)
1766 set_bit(WriteErrorSeen, &rdev->flags);
1767 /* need to record an error - either for the block or the device */
1768 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1769 md_error(rdev->mddev, rdev);
1770 return 0;
1771}
1772
1424/* 1773/*
1425 * This is a kernel thread which: 1774 * This is a kernel thread which:
1426 * 1775 *
@@ -1476,10 +1825,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1476 1825
1477 rcu_read_lock(); 1826 rcu_read_lock();
1478 do { 1827 do {
1828 sector_t first_bad;
1829 int bad_sectors;
1830
1479 d = r10_bio->devs[sl].devnum; 1831 d = r10_bio->devs[sl].devnum;
1480 rdev = rcu_dereference(conf->mirrors[d].rdev); 1832 rdev = rcu_dereference(conf->mirrors[d].rdev);
1481 if (rdev && 1833 if (rdev &&
1482 test_bit(In_sync, &rdev->flags)) { 1834 test_bit(In_sync, &rdev->flags) &&
1835 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
1836 &first_bad, &bad_sectors) == 0) {
1483 atomic_inc(&rdev->nr_pending); 1837 atomic_inc(&rdev->nr_pending);
1484 rcu_read_unlock(); 1838 rcu_read_unlock();
1485 success = sync_page_io(rdev, 1839 success = sync_page_io(rdev,
@@ -1499,9 +1853,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1499 rcu_read_unlock(); 1853 rcu_read_unlock();
1500 1854
1501 if (!success) { 1855 if (!success) {
1502 /* Cannot read from anywhere -- bye bye array */ 1856 /* Cannot read from anywhere, just mark the block
1857 * as bad on the first device to discourage future
1858 * reads.
1859 */
1503 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 1860 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1504 md_error(mddev, conf->mirrors[dn].rdev); 1861 rdev = conf->mirrors[dn].rdev;
1862
1863 if (!rdev_set_badblocks(
1864 rdev,
1865 r10_bio->devs[r10_bio->read_slot].addr
1866 + sect,
1867 s, 0))
1868 md_error(mddev, rdev);
1505 break; 1869 break;
1506 } 1870 }
1507 1871
@@ -1516,80 +1880,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1516 sl--; 1880 sl--;
1517 d = r10_bio->devs[sl].devnum; 1881 d = r10_bio->devs[sl].devnum;
1518 rdev = rcu_dereference(conf->mirrors[d].rdev); 1882 rdev = rcu_dereference(conf->mirrors[d].rdev);
1519 if (rdev && 1883 if (!rdev ||
1520 test_bit(In_sync, &rdev->flags)) { 1884 !test_bit(In_sync, &rdev->flags))
1521 atomic_inc(&rdev->nr_pending); 1885 continue;
1522 rcu_read_unlock(); 1886
1523 atomic_add(s, &rdev->corrected_errors); 1887 atomic_inc(&rdev->nr_pending);
1524 if (sync_page_io(rdev, 1888 rcu_read_unlock();
1525 r10_bio->devs[sl].addr + 1889 if (r10_sync_page_io(rdev,
1526 sect, 1890 r10_bio->devs[sl].addr +
1527 s<<9, conf->tmppage, WRITE, false) 1891 sect,
1528 == 0) { 1892 s<<9, conf->tmppage, WRITE)
1529 /* Well, this device is dead */ 1893 == 0) {
1530 printk(KERN_NOTICE 1894 /* Well, this device is dead */
1531 "md/raid10:%s: read correction " 1895 printk(KERN_NOTICE
1532 "write failed" 1896 "md/raid10:%s: read correction "
1533 " (%d sectors at %llu on %s)\n", 1897 "write failed"
1534 mdname(mddev), s, 1898 " (%d sectors at %llu on %s)\n",
1535 (unsigned long long)( 1899 mdname(mddev), s,
1536 sect + rdev->data_offset), 1900 (unsigned long long)(
1537 bdevname(rdev->bdev, b)); 1901 sect + rdev->data_offset),
1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1902 bdevname(rdev->bdev, b));
1539 "drive\n", 1903 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1540 mdname(mddev), 1904 "drive\n",
1541 bdevname(rdev->bdev, b)); 1905 mdname(mddev),
1542 md_error(mddev, rdev); 1906 bdevname(rdev->bdev, b));
1543 }
1544 rdev_dec_pending(rdev, mddev);
1545 rcu_read_lock();
1546 } 1907 }
1908 rdev_dec_pending(rdev, mddev);
1909 rcu_read_lock();
1547 } 1910 }
1548 sl = start; 1911 sl = start;
1549 while (sl != r10_bio->read_slot) { 1912 while (sl != r10_bio->read_slot) {
1913 char b[BDEVNAME_SIZE];
1550 1914
1551 if (sl==0) 1915 if (sl==0)
1552 sl = conf->copies; 1916 sl = conf->copies;
1553 sl--; 1917 sl--;
1554 d = r10_bio->devs[sl].devnum; 1918 d = r10_bio->devs[sl].devnum;
1555 rdev = rcu_dereference(conf->mirrors[d].rdev); 1919 rdev = rcu_dereference(conf->mirrors[d].rdev);
1556 if (rdev && 1920 if (!rdev ||
1557 test_bit(In_sync, &rdev->flags)) { 1921 !test_bit(In_sync, &rdev->flags))
1558 char b[BDEVNAME_SIZE]; 1922 continue;
1559 atomic_inc(&rdev->nr_pending);
1560 rcu_read_unlock();
1561 if (sync_page_io(rdev,
1562 r10_bio->devs[sl].addr +
1563 sect,
1564 s<<9, conf->tmppage,
1565 READ, false) == 0) {
1566 /* Well, this device is dead */
1567 printk(KERN_NOTICE
1568 "md/raid10:%s: unable to read back "
1569 "corrected sectors"
1570 " (%d sectors at %llu on %s)\n",
1571 mdname(mddev), s,
1572 (unsigned long long)(
1573 sect + rdev->data_offset),
1574 bdevname(rdev->bdev, b));
1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1576 mdname(mddev),
1577 bdevname(rdev->bdev, b));
1578
1579 md_error(mddev, rdev);
1580 } else {
1581 printk(KERN_INFO
1582 "md/raid10:%s: read error corrected"
1583 " (%d sectors at %llu on %s)\n",
1584 mdname(mddev), s,
1585 (unsigned long long)(
1586 sect + rdev->data_offset),
1587 bdevname(rdev->bdev, b));
1588 }
1589 1923
1590 rdev_dec_pending(rdev, mddev); 1924 atomic_inc(&rdev->nr_pending);
1591 rcu_read_lock(); 1925 rcu_read_unlock();
1926 switch (r10_sync_page_io(rdev,
1927 r10_bio->devs[sl].addr +
1928 sect,
1929 s<<9, conf->tmppage,
1930 READ)) {
1931 case 0:
1932 /* Well, this device is dead */
1933 printk(KERN_NOTICE
1934 "md/raid10:%s: unable to read back "
1935 "corrected sectors"
1936 " (%d sectors at %llu on %s)\n",
1937 mdname(mddev), s,
1938 (unsigned long long)(
1939 sect + rdev->data_offset),
1940 bdevname(rdev->bdev, b));
1941 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1942 "drive\n",
1943 mdname(mddev),
1944 bdevname(rdev->bdev, b));
1945 break;
1946 case 1:
1947 printk(KERN_INFO
1948 "md/raid10:%s: read error corrected"
1949 " (%d sectors at %llu on %s)\n",
1950 mdname(mddev), s,
1951 (unsigned long long)(
1952 sect + rdev->data_offset),
1953 bdevname(rdev->bdev, b));
1954 atomic_add(s, &rdev->corrected_errors);
1592 } 1955 }
1956
1957 rdev_dec_pending(rdev, mddev);
1958 rcu_read_lock();
1593 } 1959 }
1594 rcu_read_unlock(); 1960 rcu_read_unlock();
1595 1961
@@ -1598,21 +1964,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1598 } 1964 }
1599} 1965}
1600 1966
1967static void bi_complete(struct bio *bio, int error)
1968{
1969 complete((struct completion *)bio->bi_private);
1970}
1971
1972static int submit_bio_wait(int rw, struct bio *bio)
1973{
1974 struct completion event;
1975 rw |= REQ_SYNC;
1976
1977 init_completion(&event);
1978 bio->bi_private = &event;
1979 bio->bi_end_io = bi_complete;
1980 submit_bio(rw, bio);
1981 wait_for_completion(&event);
1982
1983 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1984}
1985
1986static int narrow_write_error(r10bio_t *r10_bio, int i)
1987{
1988 struct bio *bio = r10_bio->master_bio;
1989 mddev_t *mddev = r10_bio->mddev;
1990 conf_t *conf = mddev->private;
1991 mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
1992 /* bio has the data to be written to slot 'i' where
1993 * we just recently had a write error.
1994 * We repeatedly clone the bio and trim down to one block,
1995 * then try the write. Where the write fails we record
1996 * a bad block.
1997 * It is conceivable that the bio doesn't exactly align with
1998 * blocks. We must handle this.
1999 *
2000 * We currently own a reference to the rdev.
2001 */
2002
2003 int block_sectors;
2004 sector_t sector;
2005 int sectors;
2006 int sect_to_write = r10_bio->sectors;
2007 int ok = 1;
2008
2009 if (rdev->badblocks.shift < 0)
2010 return 0;
2011
2012 block_sectors = 1 << rdev->badblocks.shift;
2013 sector = r10_bio->sector;
2014 sectors = ((r10_bio->sector + block_sectors)
2015 & ~(sector_t)(block_sectors - 1))
2016 - sector;
2017
2018 while (sect_to_write) {
2019 struct bio *wbio;
2020 if (sectors > sect_to_write)
2021 sectors = sect_to_write;
2022 /* Write at 'sector' for 'sectors' */
2023 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2024 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2025 wbio->bi_sector = (r10_bio->devs[i].addr+
2026 rdev->data_offset+
2027 (sector - r10_bio->sector));
2028 wbio->bi_bdev = rdev->bdev;
2029 if (submit_bio_wait(WRITE, wbio) == 0)
2030 /* Failure! */
2031 ok = rdev_set_badblocks(rdev, sector,
2032 sectors, 0)
2033 && ok;
2034
2035 bio_put(wbio);
2036 sect_to_write -= sectors;
2037 sector += sectors;
2038 sectors = block_sectors;
2039 }
2040 return ok;
2041}
2042
2043static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2044{
2045 int slot = r10_bio->read_slot;
2046 int mirror = r10_bio->devs[slot].devnum;
2047 struct bio *bio;
2048 conf_t *conf = mddev->private;
2049 mdk_rdev_t *rdev;
2050 char b[BDEVNAME_SIZE];
2051 unsigned long do_sync;
2052 int max_sectors;
2053
2054 /* we got a read error. Maybe the drive is bad. Maybe just
2055 * the block and we can fix it.
2056 * We freeze all other IO, and try reading the block from
2057 * other devices. When we find one, we re-write
2058 * and check it that fixes the read error.
2059 * This is all done synchronously while the array is
2060 * frozen.
2061 */
2062 if (mddev->ro == 0) {
2063 freeze_array(conf);
2064 fix_read_error(conf, mddev, r10_bio);
2065 unfreeze_array(conf);
2066 }
2067 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2068
2069 bio = r10_bio->devs[slot].bio;
2070 bdevname(bio->bi_bdev, b);
2071 r10_bio->devs[slot].bio =
2072 mddev->ro ? IO_BLOCKED : NULL;
2073read_more:
2074 mirror = read_balance(conf, r10_bio, &max_sectors);
2075 if (mirror == -1) {
2076 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2077 " read error for block %llu\n",
2078 mdname(mddev), b,
2079 (unsigned long long)r10_bio->sector);
2080 raid_end_bio_io(r10_bio);
2081 bio_put(bio);
2082 return;
2083 }
2084
2085 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2086 if (bio)
2087 bio_put(bio);
2088 slot = r10_bio->read_slot;
2089 rdev = conf->mirrors[mirror].rdev;
2090 printk_ratelimited(
2091 KERN_ERR
2092 "md/raid10:%s: %s: redirecting"
2093 " sector %llu to another mirror\n",
2094 mdname(mddev),
2095 bdevname(rdev->bdev, b),
2096 (unsigned long long)r10_bio->sector);
2097 bio = bio_clone_mddev(r10_bio->master_bio,
2098 GFP_NOIO, mddev);
2099 md_trim_bio(bio,
2100 r10_bio->sector - bio->bi_sector,
2101 max_sectors);
2102 r10_bio->devs[slot].bio = bio;
2103 bio->bi_sector = r10_bio->devs[slot].addr
2104 + rdev->data_offset;
2105 bio->bi_bdev = rdev->bdev;
2106 bio->bi_rw = READ | do_sync;
2107 bio->bi_private = r10_bio;
2108 bio->bi_end_io = raid10_end_read_request;
2109 if (max_sectors < r10_bio->sectors) {
2110 /* Drat - have to split this up more */
2111 struct bio *mbio = r10_bio->master_bio;
2112 int sectors_handled =
2113 r10_bio->sector + max_sectors
2114 - mbio->bi_sector;
2115 r10_bio->sectors = max_sectors;
2116 spin_lock_irq(&conf->device_lock);
2117 if (mbio->bi_phys_segments == 0)
2118 mbio->bi_phys_segments = 2;
2119 else
2120 mbio->bi_phys_segments++;
2121 spin_unlock_irq(&conf->device_lock);
2122 generic_make_request(bio);
2123 bio = NULL;
2124
2125 r10_bio = mempool_alloc(conf->r10bio_pool,
2126 GFP_NOIO);
2127 r10_bio->master_bio = mbio;
2128 r10_bio->sectors = (mbio->bi_size >> 9)
2129 - sectors_handled;
2130 r10_bio->state = 0;
2131 set_bit(R10BIO_ReadError,
2132 &r10_bio->state);
2133 r10_bio->mddev = mddev;
2134 r10_bio->sector = mbio->bi_sector
2135 + sectors_handled;
2136
2137 goto read_more;
2138 } else
2139 generic_make_request(bio);
2140}
2141
2142static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2143{
2144 /* Some sort of write request has finished and it
2145 * succeeded in writing where we thought there was a
2146 * bad block. So forget the bad block.
2147 * Or possibly if failed and we need to record
2148 * a bad block.
2149 */
2150 int m;
2151 mdk_rdev_t *rdev;
2152
2153 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2154 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2155 for (m = 0; m < conf->copies; m++) {
2156 int dev = r10_bio->devs[m].devnum;
2157 rdev = conf->mirrors[dev].rdev;
2158 if (r10_bio->devs[m].bio == NULL)
2159 continue;
2160 if (test_bit(BIO_UPTODATE,
2161 &r10_bio->devs[m].bio->bi_flags)) {
2162 rdev_clear_badblocks(
2163 rdev,
2164 r10_bio->devs[m].addr,
2165 r10_bio->sectors);
2166 } else {
2167 if (!rdev_set_badblocks(
2168 rdev,
2169 r10_bio->devs[m].addr,
2170 r10_bio->sectors, 0))
2171 md_error(conf->mddev, rdev);
2172 }
2173 }
2174 put_buf(r10_bio);
2175 } else {
2176 for (m = 0; m < conf->copies; m++) {
2177 int dev = r10_bio->devs[m].devnum;
2178 struct bio *bio = r10_bio->devs[m].bio;
2179 rdev = conf->mirrors[dev].rdev;
2180 if (bio == IO_MADE_GOOD) {
2181 rdev_clear_badblocks(
2182 rdev,
2183 r10_bio->devs[m].addr,
2184 r10_bio->sectors);
2185 rdev_dec_pending(rdev, conf->mddev);
2186 } else if (bio != NULL &&
2187 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2188 if (!narrow_write_error(r10_bio, m)) {
2189 md_error(conf->mddev, rdev);
2190 set_bit(R10BIO_Degraded,
2191 &r10_bio->state);
2192 }
2193 rdev_dec_pending(rdev, conf->mddev);
2194 }
2195 }
2196 if (test_bit(R10BIO_WriteError,
2197 &r10_bio->state))
2198 close_write(r10_bio);
2199 raid_end_bio_io(r10_bio);
2200 }
2201}
2202
1601static void raid10d(mddev_t *mddev) 2203static void raid10d(mddev_t *mddev)
1602{ 2204{
1603 r10bio_t *r10_bio; 2205 r10bio_t *r10_bio;
1604 struct bio *bio;
1605 unsigned long flags; 2206 unsigned long flags;
1606 conf_t *conf = mddev->private; 2207 conf_t *conf = mddev->private;
1607 struct list_head *head = &conf->retry_list; 2208 struct list_head *head = &conf->retry_list;
1608 mdk_rdev_t *rdev;
1609 struct blk_plug plug; 2209 struct blk_plug plug;
1610 2210
1611 md_check_recovery(mddev); 2211 md_check_recovery(mddev);
1612 2212
1613 blk_start_plug(&plug); 2213 blk_start_plug(&plug);
1614 for (;;) { 2214 for (;;) {
1615 char b[BDEVNAME_SIZE];
1616 2215
1617 flush_pending_writes(conf); 2216 flush_pending_writes(conf);
1618 2217
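narrow_write_error(), added in the hunk above, retries a failed write in pieces that line up with the bad-block table granularity of 1 << rdev->badblocks.shift sectors, so that any range it has to record lines up with the table's granularity; only the first piece may be shorter, to reach the next boundary. The alignment arithmetic is easiest to see in a standalone model (illustrative, invented function name):

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/*
 * Model of the chunking in narrow_write_error() above: split a write of
 * 'len' sectors starting at 'sector' into pieces that never cross a
 * bad-block-table boundary of (1 << shift) sectors.  Only the first piece
 * may be short, to reach the next boundary; returns the number of pieces.
 */
static int badblock_aligned_pieces(sector_t sector, int len, int shift)
{
	int block_sectors = 1 << shift;
	int pieces = 0;
	/* first piece runs up to the next block boundary */
	int sectors = ((sector + block_sectors) &
		       ~(sector_t)(block_sectors - 1)) - sector;

	while (len) {
		if (sectors > len)
			sectors = len;
		printf("piece at %llu, %d sectors\n", sector, sectors);
		pieces++;
		len -= sectors;
		sector += sectors;
		sectors = block_sectors;	/* later pieces are aligned */
	}
	return pieces;
}

int main(void)
{
	/*
	 * 8-sector (4KiB) bad block granularity, 20-sector write at sector
	 * 13: pieces are 13..15 (3), 16..23 (8), 24..31 (8) and 32 (1).
	 */
	assert(badblock_aligned_pieces(13, 20, 3) == 4);
	return 0;
}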
@@ -1628,64 +2227,26 @@ static void raid10d(mddev_t *mddev)
1628 2227
1629 mddev = r10_bio->mddev; 2228 mddev = r10_bio->mddev;
1630 conf = mddev->private; 2229 conf = mddev->private;
1631 if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2230 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2231 test_bit(R10BIO_WriteError, &r10_bio->state))
2232 handle_write_completed(conf, r10_bio);
2233 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
1632 sync_request_write(mddev, r10_bio); 2234 sync_request_write(mddev, r10_bio);
1633 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2235 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1634 recovery_request_write(mddev, r10_bio); 2236 recovery_request_write(mddev, r10_bio);
2237 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2238 handle_read_error(mddev, r10_bio);
1635 else { 2239 else {
1636 int slot = r10_bio->read_slot; 2240 /* just a partial read to be scheduled from a
1637 int mirror = r10_bio->devs[slot].devnum; 2241 * separate context
1638 /* we got a read error. Maybe the drive is bad. Maybe just
1639 * the block and we can fix it.
1640 * We freeze all other IO, and try reading the block from
1641 * other devices. When we find one, we re-write
1642 * and check it that fixes the read error.
1643 * This is all done synchronously while the array is
1644 * frozen.
1645 */ 2242 */
1646 if (mddev->ro == 0) { 2243 int slot = r10_bio->read_slot;
1647 freeze_array(conf); 2244 generic_make_request(r10_bio->devs[slot].bio);
1648 fix_read_error(conf, mddev, r10_bio);
1649 unfreeze_array(conf);
1650 }
1651 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1652
1653 bio = r10_bio->devs[slot].bio;
1654 r10_bio->devs[slot].bio =
1655 mddev->ro ? IO_BLOCKED : NULL;
1656 mirror = read_balance(conf, r10_bio);
1657 if (mirror == -1) {
1658 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1659 " read error for block %llu\n",
1660 mdname(mddev),
1661 bdevname(bio->bi_bdev,b),
1662 (unsigned long long)r10_bio->sector);
1663 raid_end_bio_io(r10_bio);
1664 bio_put(bio);
1665 } else {
1666 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1667 bio_put(bio);
1668 slot = r10_bio->read_slot;
1669 rdev = conf->mirrors[mirror].rdev;
1670 if (printk_ratelimit())
1671 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1672 " another mirror\n",
1673 mdname(mddev),
1674 bdevname(rdev->bdev,b),
1675 (unsigned long long)r10_bio->sector);
1676 bio = bio_clone_mddev(r10_bio->master_bio,
1677 GFP_NOIO, mddev);
1678 r10_bio->devs[slot].bio = bio;
1679 bio->bi_sector = r10_bio->devs[slot].addr
1680 + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1682 bio->bi_rw = READ | do_sync;
1683 bio->bi_private = r10_bio;
1684 bio->bi_end_io = raid10_end_read_request;
1685 generic_make_request(bio);
1686 }
1687 } 2245 }
2246
1688 cond_resched(); 2247 cond_resched();
2248 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2249 md_check_recovery(mddev);
1689 } 2250 }
1690 blk_finish_plug(&plug); 2251 blk_finish_plug(&plug);
1691} 2252}
@@ -1746,7 +2307,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1746 int i; 2307 int i;
1747 int max_sync; 2308 int max_sync;
1748 sector_t sync_blocks; 2309 sector_t sync_blocks;
1749
1750 sector_t sectors_skipped = 0; 2310 sector_t sectors_skipped = 0;
1751 int chunks_skipped = 0; 2311 int chunks_skipped = 0;
1752 2312
@@ -1828,7 +2388,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1828 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 2388 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1829 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2389 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1830 /* recovery... the complicated one */ 2390 /* recovery... the complicated one */
1831 int j, k; 2391 int j;
1832 r10_bio = NULL; 2392 r10_bio = NULL;
1833 2393
1834 for (i=0 ; i<conf->raid_disks; i++) { 2394 for (i=0 ; i<conf->raid_disks; i++) {
@@ -1836,6 +2396,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1836 r10bio_t *rb2; 2396 r10bio_t *rb2;
1837 sector_t sect; 2397 sector_t sect;
1838 int must_sync; 2398 int must_sync;
2399 int any_working;
1839 2400
1840 if (conf->mirrors[i].rdev == NULL || 2401 if (conf->mirrors[i].rdev == NULL ||
1841 test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 2402 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
@@ -1887,19 +2448,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1887 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2448 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1888 &sync_blocks, still_degraded); 2449 &sync_blocks, still_degraded);
1889 2450
2451 any_working = 0;
1890 for (j=0; j<conf->copies;j++) { 2452 for (j=0; j<conf->copies;j++) {
2453 int k;
1891 int d = r10_bio->devs[j].devnum; 2454 int d = r10_bio->devs[j].devnum;
2455 sector_t from_addr, to_addr;
2456 mdk_rdev_t *rdev;
2457 sector_t sector, first_bad;
2458 int bad_sectors;
1892 if (!conf->mirrors[d].rdev || 2459 if (!conf->mirrors[d].rdev ||
1893 !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) 2460 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
1894 continue; 2461 continue;
1895 /* This is where we read from */ 2462 /* This is where we read from */
2463 any_working = 1;
2464 rdev = conf->mirrors[d].rdev;
2465 sector = r10_bio->devs[j].addr;
2466
2467 if (is_badblock(rdev, sector, max_sync,
2468 &first_bad, &bad_sectors)) {
2469 if (first_bad > sector)
2470 max_sync = first_bad - sector;
2471 else {
2472 bad_sectors -= (sector
2473 - first_bad);
2474 if (max_sync > bad_sectors)
2475 max_sync = bad_sectors;
2476 continue;
2477 }
2478 }
1896 bio = r10_bio->devs[0].bio; 2479 bio = r10_bio->devs[0].bio;
1897 bio->bi_next = biolist; 2480 bio->bi_next = biolist;
1898 biolist = bio; 2481 biolist = bio;
1899 bio->bi_private = r10_bio; 2482 bio->bi_private = r10_bio;
1900 bio->bi_end_io = end_sync_read; 2483 bio->bi_end_io = end_sync_read;
1901 bio->bi_rw = READ; 2484 bio->bi_rw = READ;
1902 bio->bi_sector = r10_bio->devs[j].addr + 2485 from_addr = r10_bio->devs[j].addr;
2486 bio->bi_sector = from_addr +
1903 conf->mirrors[d].rdev->data_offset; 2487 conf->mirrors[d].rdev->data_offset;
1904 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2488 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1905 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2489 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
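
The is_badblock() call in this hunk reports where a known-bad range starts (first_bad) and how many sectors it covers (bad_sectors); recovery then either shortens the window so it ends just before the bad range, or skips this copy when the bad range already covers the starting sector. A self-contained restatement of that arithmetic, with plain integers instead of the kernel types, might look like this:

/* Clip a sync window of 'max_sync' sectors starting at 'sector' against a
 * bad range [first_bad, first_bad + bad_sectors).  Sets *skip when this
 * copy cannot be read at all at the current position. */
static long long clip_sync_window(long long sector, long long max_sync,
                                  long long first_bad, long long bad_sectors,
                                  int *skip)
{
        *skip = 0;
        if (first_bad > sector)
                /* Bad range starts later: read only up to its first sector. */
                return first_bad - sector;

        /* Bad range already covers 'sector': skip this copy and shrink the
         * window to the bad sectors still ahead, so the next pass can start
         * again just past them. */
        bad_sectors -= sector - first_bad;
        *skip = 1;
        return max_sync > bad_sectors ? bad_sectors : max_sync;
}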
@@ -1916,26 +2500,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1916 bio->bi_private = r10_bio; 2500 bio->bi_private = r10_bio;
1917 bio->bi_end_io = end_sync_write; 2501 bio->bi_end_io = end_sync_write;
1918 bio->bi_rw = WRITE; 2502 bio->bi_rw = WRITE;
1919 bio->bi_sector = r10_bio->devs[k].addr + 2503 to_addr = r10_bio->devs[k].addr;
2504 bio->bi_sector = to_addr +
1920 conf->mirrors[i].rdev->data_offset; 2505 conf->mirrors[i].rdev->data_offset;
1921 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 2506 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1922 2507
1923 r10_bio->devs[0].devnum = d; 2508 r10_bio->devs[0].devnum = d;
2509 r10_bio->devs[0].addr = from_addr;
1924 r10_bio->devs[1].devnum = i; 2510 r10_bio->devs[1].devnum = i;
2511 r10_bio->devs[1].addr = to_addr;
1925 2512
1926 break; 2513 break;
1927 } 2514 }
1928 if (j == conf->copies) { 2515 if (j == conf->copies) {
1929 /* Cannot recover, so abort the recovery */ 2516 /* Cannot recover, so abort the recovery or
2517 * record a bad block */
1930 put_buf(r10_bio); 2518 put_buf(r10_bio);
1931 if (rb2) 2519 if (rb2)
1932 atomic_dec(&rb2->remaining); 2520 atomic_dec(&rb2->remaining);
1933 r10_bio = rb2; 2521 r10_bio = rb2;
1934 if (!test_and_set_bit(MD_RECOVERY_INTR, 2522 if (any_working) {
1935 &mddev->recovery)) 2523 /* problem is that there are bad blocks
1936 printk(KERN_INFO "md/raid10:%s: insufficient " 2524 * on other device(s)
1937 "working devices for recovery.\n", 2525 */
1938 mdname(mddev)); 2526 int k;
2527 for (k = 0; k < conf->copies; k++)
2528 if (r10_bio->devs[k].devnum == i)
2529 break;
2530 if (!rdev_set_badblocks(
2531 conf->mirrors[i].rdev,
2532 r10_bio->devs[k].addr,
2533 max_sync, 0))
2534 any_working = 0;
2535 }
2536 if (!any_working) {
2537 if (!test_and_set_bit(MD_RECOVERY_INTR,
2538 &mddev->recovery))
2539 printk(KERN_INFO "md/raid10:%s: insufficient "
2540 "working devices for recovery.\n",
2541 mdname(mddev));
2542 conf->mirrors[i].recovery_disabled
2543 = mddev->recovery_disabled;
2544 }
1939 break; 2545 break;
1940 } 2546 }
1941 } 2547 }
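
Previously this branch aborted the whole recovery as soon as one block had no readable source; the new code instead tries to mark that block bad on the device being rebuilt, and only gives up (MD_RECOVERY_INTR plus recovery_disabled) when no source exists at all or the bad block cannot be recorded. A rough decision sketch, with record_badblock() as a hypothetical stand-in for rdev_set_badblocks():

#include <stdbool.h>

/* 'record_badblock' stands in for rdev_set_badblocks(); it returns false
 * when the bad-block log cannot take the entry. */
static bool skip_unreadable_block(bool any_working_source,
                                  bool (*record_badblock)(long long addr,
                                                          long long len),
                                  long long addr, long long len)
{
        if (any_working_source && record_badblock(addr, len))
                return true;    /* noted as bad on the target; keep recovering */

        /* No readable copy anywhere, or the log is full: the real code sets
         * MD_RECOVERY_INTR and latches recovery_disabled for this mirror. */
        return false;
}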
@@ -1979,12 +2585,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1979 2585
1980 for (i=0; i<conf->copies; i++) { 2586 for (i=0; i<conf->copies; i++) {
1981 int d = r10_bio->devs[i].devnum; 2587 int d = r10_bio->devs[i].devnum;
2588 sector_t first_bad, sector;
2589 int bad_sectors;
2590
1982 bio = r10_bio->devs[i].bio; 2591 bio = r10_bio->devs[i].bio;
1983 bio->bi_end_io = NULL; 2592 bio->bi_end_io = NULL;
1984 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2593 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1985 if (conf->mirrors[d].rdev == NULL || 2594 if (conf->mirrors[d].rdev == NULL ||
1986 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 2595 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1987 continue; 2596 continue;
2597 sector = r10_bio->devs[i].addr;
2598 if (is_badblock(conf->mirrors[d].rdev,
2599 sector, max_sync,
2600 &first_bad, &bad_sectors)) {
2601 if (first_bad > sector)
2602 max_sync = first_bad - sector;
2603 else {
2604 bad_sectors -= (sector - first_bad);
2605 if (max_sync > bad_sectors)
2606 max_sync = bad_sectors;
2607 continue;
2608 }
2609 }
1988 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2610 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1989 atomic_inc(&r10_bio->remaining); 2611 atomic_inc(&r10_bio->remaining);
1990 bio->bi_next = biolist; 2612 bio->bi_next = biolist;
@@ -1992,7 +2614,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1992 bio->bi_private = r10_bio; 2614 bio->bi_private = r10_bio;
1993 bio->bi_end_io = end_sync_read; 2615 bio->bi_end_io = end_sync_read;
1994 bio->bi_rw = READ; 2616 bio->bi_rw = READ;
1995 bio->bi_sector = r10_bio->devs[i].addr + 2617 bio->bi_sector = sector +
1996 conf->mirrors[d].rdev->data_offset; 2618 conf->mirrors[d].rdev->data_offset;
1997 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2619 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1998 count++; 2620 count++;
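
In the resync (non-recovery) path above, one read is queued per copy that is present, not faulty and not clipped away by a bad block, and r10_bio->remaining counts the outstanding reads so completion handling can tell when every copy has arrived. A minimal model of that fan-out, assuming a caller-supplied queue_read():

/* 'usable' marks copies that passed the rdev/Faulty/bad-block checks above;
 * 'queue_read' stands in for building and submitting the read bio. */
static int queue_resync_reads(const int *usable, int ncopies,
                              int *remaining, void (*queue_read)(int copy))
{
        int count = 0;

        for (int i = 0; i < ncopies; i++) {
                if (!usable[i])
                        continue;
                (*remaining)++;         /* dropped again as each read completes */
                queue_read(i);
                count++;
        }
        return count;                   /* number of copies that will be read */
}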
@@ -2079,7 +2701,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2079 return sectors_skipped + nr_sectors; 2701 return sectors_skipped + nr_sectors;
2080 giveup: 2702 giveup:
2081 /* There is nowhere to write, so all non-sync 2703 /* There is nowhere to write, so all non-sync
2082 * drives must be failed, so try the next chunk... 2704 * drives must be failed or in resync, or all drives
2705 * have a bad block, so try the next chunk...
2083 */ 2706 */
2084 if (sector_nr + max_sync < max_sector) 2707 if (sector_nr + max_sync < max_sector)
2085 max_sector = sector_nr + max_sync; 2708 max_sector = sector_nr + max_sync;
@@ -2249,6 +2872,7 @@ static int run(mddev_t *mddev)
2249 (conf->raid_disks / conf->near_copies)); 2872 (conf->raid_disks / conf->near_copies));
2250 2873
2251 list_for_each_entry(rdev, &mddev->disks, same_set) { 2874 list_for_each_entry(rdev, &mddev->disks, same_set) {
2875
2252 disk_idx = rdev->raid_disk; 2876 disk_idx = rdev->raid_disk;
2253 if (disk_idx >= conf->raid_disks 2877 if (disk_idx >= conf->raid_disks
2254 || disk_idx < 0) 2878 || disk_idx < 0)
@@ -2271,7 +2895,7 @@ static int run(mddev_t *mddev)
2271 disk->head_position = 0; 2895 disk->head_position = 0;
2272 } 2896 }
2273 /* need to check that every block has at least one working mirror */ 2897 /* need to check that every block has at least one working mirror */
2274 if (!enough(conf)) { 2898 if (!enough(conf, -1)) {
2275 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 2899 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2276 mdname(mddev)); 2900 mdname(mddev));
2277 goto out_free_conf; 2901 goto out_free_conf;
@@ -2331,7 +2955,7 @@ static int run(mddev_t *mddev)
2331 return 0; 2955 return 0;
2332 2956
2333out_free_conf: 2957out_free_conf:
2334 md_unregister_thread(mddev->thread); 2958 md_unregister_thread(&mddev->thread);
2335 if (conf->r10bio_pool) 2959 if (conf->r10bio_pool)
2336 mempool_destroy(conf->r10bio_pool); 2960 mempool_destroy(conf->r10bio_pool);
2337 safe_put_page(conf->tmppage); 2961 safe_put_page(conf->tmppage);
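
Both call sites now pass &mddev->thread rather than mddev->thread, which is why the separate "mddev->thread = NULL;" line can be dropped in the stop() hunk that follows: the helper clears the caller's pointer itself before tearing the thread down. A toy illustration of that ownership pattern (struct worker and stop_worker() are invented names, not md API):

#include <stdlib.h>

struct worker { int dummy; };           /* stand-in for the md thread object */

/* Take the address of the owner's pointer, clear it first, then tear the
 * worker down; callers no longer need their own '... = NULL' afterwards. */
static void stop_worker(struct worker **wp)
{
        struct worker *w = *wp;

        *wp = NULL;                     /* owner's pointer cleared here */
        free(w);                        /* stand-in for the real teardown */
}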
@@ -2349,8 +2973,7 @@ static int stop(mddev_t *mddev)
2349 raise_barrier(conf, 0); 2973 raise_barrier(conf, 0);
2350 lower_barrier(conf); 2974 lower_barrier(conf);
2351 2975
2352 md_unregister_thread(mddev->thread); 2976 md_unregister_thread(&mddev->thread);
2353 mddev->thread = NULL;
2354 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2977 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2355 if (conf->r10bio_pool) 2978 if (conf->r10bio_pool)
2356 mempool_destroy(conf->r10bio_pool); 2979 mempool_destroy(conf->r10bio_pool);