aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2008-05-23 16:04:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-05-24 12:56:10 -0400
commitdfc7064500061677720fa26352963c772d3ebe6b (patch)
treea8ca495bccf98837c6762ffba54a8009c9772259 /drivers/md/raid10.c
parent90b08710e41a07d4ff0fb8940dcce3a552991a56 (diff)
md: restart recovery cleanly after device failure.
When we get any IO error during a recovery (rebuilding a spare), we abort the recovery and restart it. For RAID6 (and multi-drive RAID1) it may not be best to restart at the beginning: when multiple failures can be tolerated, the recovery may be able to continue and re-doing all that has already been done doesn't make sense. We already have the infrastructure to record where a recovery is up to and restart from there, but it is not being used properly. This is because: - We sometimes abort with MD_RECOVERY_ERR rather than just MD_RECOVERY_INTR, which causes the recovery not be be checkpointed. - We remove spares and then re-added them which loses important state information. The distinction between MD_RECOVERY_ERR and MD_RECOVERY_INTR really isn't needed. If there is an error, the relevant drive will be marked as Faulty, and that is enough to ensure correct handling of the error. So we first remove MD_RECOVERY_ERR, changing some of the uses of it to MD_RECOVERY_INTR. Then we cause the attempt to remove a non-faulty device from an array to fail (unless recovery is impossible as the array is too degraded). Then when remove_and_add_spares attempts to remove the devices on which recovery can continue, it will fail, they will remain in place, and recovery will continue on them as desired. Issue: If we are halfway through rebuilding a spare and another drive fails, and a new spare is immediately available, do we want to: 1/ complete the current rebuild, then go back and rebuild the new spare or 2/ restart the rebuild from the start and rebuild both devices in parallel. Both options can be argued for. The code currently takes option 2 as a/ this requires least code change b/ this results in a minimally-degraded array in minimal time. Cc: "Eivind Sarto" <ivan@kasenna.com> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--drivers/md/raid10.c14
1 files changed, 12 insertions, 2 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8536ede1e712..1de17da34a95 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1020,7 +1020,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1020 /* 1020 /*
1021 * if recovery is running, make sure it aborts. 1021 * if recovery is running, make sure it aborts.
1022 */ 1022 */
1023 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 1023 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1024 } 1024 }
1025 set_bit(Faulty, &rdev->flags); 1025 set_bit(Faulty, &rdev->flags);
1026 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1026 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1171,6 +1171,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1171 err = -EBUSY; 1171 err = -EBUSY;
1172 goto abort; 1172 goto abort;
1173 } 1173 }
1174 /* Only remove faulty devices in recovery
1175 * is not possible.
1176 */
1177 if (!test_bit(Faulty, &rdev->flags) &&
1178 enough(conf)) {
1179 err = -EBUSY;
1180 goto abort;
1181 }
1174 p->rdev = NULL; 1182 p->rdev = NULL;
1175 synchronize_rcu(); 1183 synchronize_rcu();
1176 if (atomic_read(&rdev->nr_pending)) { 1184 if (atomic_read(&rdev->nr_pending)) {
@@ -1237,6 +1245,7 @@ static void end_sync_write(struct bio *bio, int error)
1237 1245
1238 if (!uptodate) 1246 if (!uptodate)
1239 md_error(mddev, conf->mirrors[d].rdev); 1247 md_error(mddev, conf->mirrors[d].rdev);
1248
1240 update_head_pos(i, r10_bio); 1249 update_head_pos(i, r10_bio);
1241 1250
1242 while (atomic_dec_and_test(&r10_bio->remaining)) { 1251 while (atomic_dec_and_test(&r10_bio->remaining)) {
@@ -1844,7 +1853,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1844 if (rb2) 1853 if (rb2)
1845 atomic_dec(&rb2->remaining); 1854 atomic_dec(&rb2->remaining);
1846 r10_bio = rb2; 1855 r10_bio = rb2;
1847 if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) 1856 if (!test_and_set_bit(MD_RECOVERY_INTR,
1857 &mddev->recovery))
1848 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", 1858 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1849 mdname(mddev)); 1859 mdname(mddev));
1850 break; 1860 break;