aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2011-07-27 21:33:42 -0400
committer NeilBrown <neilb@suse.de> 2011-07-27 21:33:42 -0400
commit 3a9f28a5117e00a868dd8b4395f9a707ae56764b (patch)
tree 36fe0fc7a7ccfc0da03dea546286b7bdef581246
parent d8f05d2995d467a91db1af01637e6ffd94660ca8 (diff)
md/raid1: improve handling of read failure during recovery.
If we cannot read a block from anywhere during recovery, there is now a better approach than just giving up. We can record a bad block on each device and keep going, being careful not to clear the bad block when a subsequent write succeeds (as it might), because that write will contain incorrect data. We have now reached the state where - for raid1 - we only call md_error if md_set_badblocks has failed. Signed-off-by: NeilBrown <neilb@suse.de>. Reviewed-by: Namhyung Kim <namhyung@gmail.com>.
-rw-r--r-- drivers/md/raid1.c 41
1 files changed, 34 insertions, 7 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e6957151233b..039e3af72929 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1392,7 +1392,12 @@ static void end_sync_write(struct bio *bio, int error)
1392 } else if (is_badblock(conf->mirrors[mirror].rdev, 1392 } else if (is_badblock(conf->mirrors[mirror].rdev,
1393 r1_bio->sector, 1393 r1_bio->sector,
1394 r1_bio->sectors, 1394 r1_bio->sectors,
1395 &first_bad, &bad_sectors)) 1395 &first_bad, &bad_sectors) &&
1396 !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1397 r1_bio->sector,
1398 r1_bio->sectors,
1399 &first_bad, &bad_sectors)
1400 )
1396 set_bit(R1BIO_MadeGood, &r1_bio->state); 1401 set_bit(R1BIO_MadeGood, &r1_bio->state);
1397 1402
1398 update_head_pos(mirror, r1_bio); 1403 update_head_pos(mirror, r1_bio);
@@ -1473,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1473 1478
1474 if (!success) { 1479 if (!success) {
1475 char b[BDEVNAME_SIZE]; 1480 char b[BDEVNAME_SIZE];
1476 /* Cannot read from anywhere, array is toast */ 1481 int abort = 0;
1477 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1482 /* Cannot read from anywhere, this block is lost.
1483 * Record a bad block on each device. If that doesn't
1484 * work just disable and interrupt the recovery.
1485 * Don't fail devices as that won't really help.
1486 */
1478 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1487 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1479 " for block %llu\n", 1488 " for block %llu\n",
1480 mdname(mddev), 1489 mdname(mddev),
1481 bdevname(bio->bi_bdev, b), 1490 bdevname(bio->bi_bdev, b),
1482 (unsigned long long)r1_bio->sector); 1491 (unsigned long long)r1_bio->sector);
1483 md_done_sync(mddev, r1_bio->sectors, 0); 1492 for (d = 0; d < conf->raid_disks; d++) {
1484 put_buf(r1_bio); 1493 rdev = conf->mirrors[d].rdev;
1485 return 0; 1494 if (!rdev || test_bit(Faulty, &rdev->flags))
1495 continue;
1496 if (!rdev_set_badblocks(rdev, sect, s, 0))
1497 abort = 1;
1498 }
1499 if (abort) {
1500 mddev->recovery_disabled = 1;
1501 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1502 md_done_sync(mddev, r1_bio->sectors, 0);
1503 put_buf(r1_bio);
1504 return 0;
1505 }
1506 /* Try next page */
1507 sectors -= s;
1508 sect += s;
1509 idx++;
1510 continue;
1486 } 1511 }
1487 1512
1488 start = d; 1513 start = d;
@@ -1879,7 +1904,9 @@ static void raid1d(mddev_t *mddev)
1879 if (bio->bi_end_io == NULL) 1904 if (bio->bi_end_io == NULL)
1880 continue; 1905 continue;
1881 if (test_bit(BIO_UPTODATE, 1906 if (test_bit(BIO_UPTODATE,
1882 &bio->bi_flags)) { 1907 &bio->bi_flags) &&
1908 test_bit(R1BIO_MadeGood,
1909 &r1_bio->state)) {
1883 rdev_clear_badblocks( 1910 rdev_clear_badblocks(
1884 rdev, 1911 rdev,
1885 r1_bio->sector, 1912 r1_bio->sector,