aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2008-05-23 16:04:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-05-24 12:56:10 -0400
commitdfc7064500061677720fa26352963c772d3ebe6b (patch)
treea8ca495bccf98837c6762ffba54a8009c9772259
parent90b08710e41a07d4ff0fb8940dcce3a552991a56 (diff)
md: restart recovery cleanly after device failure.
When we get any IO error during a recovery (rebuilding a spare), we abort the recovery and restart it. For RAID6 (and multi-drive RAID1) it may not be best to restart at the beginning: when multiple failures can be tolerated, the recovery may be able to continue and re-doing all that has already been done doesn't make sense. We already have the infrastructure to record where a recovery is up to and restart from there, but it is not being used properly. This is because: - We sometimes abort with MD_RECOVERY_ERR rather than just MD_RECOVERY_INTR, which causes the recovery not to be checkpointed. - We remove spares and then re-add them, which loses important state information. The distinction between MD_RECOVERY_ERR and MD_RECOVERY_INTR really isn't needed. If there is an error, the relevant drive will be marked as Faulty, and that is enough to ensure correct handling of the error. So we first remove MD_RECOVERY_ERR, changing some of the uses of it to MD_RECOVERY_INTR. Then we cause the attempt to remove a non-faulty device from an array to fail (unless recovery is impossible as the array is too degraded). Then when remove_and_add_spares attempts to remove the devices on which recovery can continue, it will fail, they will remain in place, and recovery will continue on them as desired. Issue: If we are halfway through rebuilding a spare and another drive fails, and a new spare is immediately available, do we want to: 1/ complete the current rebuild, then go back and rebuild the new spare or 2/ restart the rebuild from the start and rebuild both devices in parallel. Both options can be argued for. The code currently takes option 2 as a/ this requires least code change b/ this results in a minimally-degraded array in minimal time. Cc: "Eivind Sarto" <ivan@kasenna.com> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/md/md.c22
-rw-r--r--drivers/md/multipath.c3
-rw-r--r--drivers/md/raid1.c10
-rw-r--r--drivers/md/raid10.c14
-rw-r--r--drivers/md/raid5.c10
-rw-r--r--include/linux/raid/md_k.h4
6 files changed, 44 insertions, 19 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 295be1a68806..51c19f86ff99 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5434,7 +5434,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
5434 atomic_sub(blocks, &mddev->recovery_active); 5434 atomic_sub(blocks, &mddev->recovery_active);
5435 wake_up(&mddev->recovery_wait); 5435 wake_up(&mddev->recovery_wait);
5436 if (!ok) { 5436 if (!ok) {
5437 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5437 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5438 md_wakeup_thread(mddev->thread); 5438 md_wakeup_thread(mddev->thread);
5439 // stop recovery, signal do_sync .... 5439 // stop recovery, signal do_sync ....
5440 } 5440 }
@@ -5690,7 +5690,7 @@ void md_do_sync(mddev_t *mddev)
5690 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5690 sectors = mddev->pers->sync_request(mddev, j, &skipped,
5691 currspeed < speed_min(mddev)); 5691 currspeed < speed_min(mddev));
5692 if (sectors == 0) { 5692 if (sectors == 0) {
5693 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5693 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5694 goto out; 5694 goto out;
5695 } 5695 }
5696 5696
@@ -5713,8 +5713,7 @@ void md_do_sync(mddev_t *mddev)
5713 5713
5714 last_check = io_sectors; 5714 last_check = io_sectors;
5715 5715
5716 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5716 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5717 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5718 break; 5717 break;
5719 5718
5720 repeat: 5719 repeat:
@@ -5768,8 +5767,7 @@ void md_do_sync(mddev_t *mddev)
5768 /* tell personality that we are finished */ 5767 /* tell personality that we are finished */
5769 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5768 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5770 5769
5771 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5770 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5772 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5773 mddev->curr_resync > 2) { 5771 mddev->curr_resync > 2) {
5774 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5772 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5775 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5773 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5838,7 +5836,10 @@ static int remove_and_add_spares(mddev_t *mddev)
5838 } 5836 }
5839 5837
5840 if (mddev->degraded) { 5838 if (mddev->degraded) {
5841 rdev_for_each(rdev, rtmp, mddev) 5839 rdev_for_each(rdev, rtmp, mddev) {
5840 if (rdev->raid_disk >= 0 &&
5841 !test_bit(In_sync, &rdev->flags))
5842 spares++;
5842 if (rdev->raid_disk < 0 5843 if (rdev->raid_disk < 0
5843 && !test_bit(Faulty, &rdev->flags)) { 5844 && !test_bit(Faulty, &rdev->flags)) {
5844 rdev->recovery_offset = 0; 5845 rdev->recovery_offset = 0;
@@ -5856,6 +5857,7 @@ static int remove_and_add_spares(mddev_t *mddev)
5856 } else 5857 } else
5857 break; 5858 break;
5858 } 5859 }
5860 }
5859 } 5861 }
5860 return spares; 5862 return spares;
5861} 5863}
@@ -5869,7 +5871,7 @@ static int remove_and_add_spares(mddev_t *mddev)
5869 * to do that as needed. 5871 * to do that as needed.
5870 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 5872 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5871 * "->recovery" and create a thread at ->sync_thread. 5873 * "->recovery" and create a thread at ->sync_thread.
5872 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 5874 * When the thread finishes it sets MD_RECOVERY_DONE
5873 * and wakes up this thread which will reap the thread and finish up. 5875 * and wakes up this thread which will reap the thread and finish up.
5874 * This thread also removes any faulty devices (with nr_pending == 0). 5876 * This thread also removes any faulty devices (with nr_pending == 0).
5875 * 5877 *
@@ -5944,8 +5946,7 @@ void md_check_recovery(mddev_t *mddev)
5944 /* resync has finished, collect result */ 5946 /* resync has finished, collect result */
5945 md_unregister_thread(mddev->sync_thread); 5947 md_unregister_thread(mddev->sync_thread);
5946 mddev->sync_thread = NULL; 5948 mddev->sync_thread = NULL;
5947 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5949 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5948 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5949 /* success...*/ 5950 /* success...*/
5950 /* activate any spares */ 5951 /* activate any spares */
5951 mddev->pers->spare_active(mddev); 5952 mddev->pers->spare_active(mddev);
@@ -5969,7 +5970,6 @@ void md_check_recovery(mddev_t *mddev)
5969 * might be left set 5970 * might be left set
5970 */ 5971 */
5971 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5972 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5972 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5973 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5973 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5974 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5974 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5975 5975
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4f4d1f383842..e968116e0de9 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -327,7 +327,8 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
327 if (rdev) { 327 if (rdev) {
328 if (test_bit(In_sync, &rdev->flags) || 328 if (test_bit(In_sync, &rdev->flags) ||
329 atomic_read(&rdev->nr_pending)) { 329 atomic_read(&rdev->nr_pending)) {
330 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); 330 printk(KERN_ERR "hot-remove-disk, slot %d is identified"
331 " but is still operational!\n", number);
331 err = -EBUSY; 332 err = -EBUSY;
332 goto abort; 333 goto abort;
333 } 334 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d0f4021bbc2e..c610b947218a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,7 +1027,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1027 /* 1027 /*
1028 * if recovery is running, make sure it aborts. 1028 * if recovery is running, make sure it aborts.
1029 */ 1029 */
1030 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 1030 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1031 } else 1031 } else
1032 set_bit(Faulty, &rdev->flags); 1032 set_bit(Faulty, &rdev->flags);
1033 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1033 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1148,6 +1148,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1148 err = -EBUSY; 1148 err = -EBUSY;
1149 goto abort; 1149 goto abort;
1150 } 1150 }
1151 /* Only remove non-faulty devices if recovery
1152 * is not possible.
1153 */
1154 if (!test_bit(Faulty, &rdev->flags) &&
1155 mddev->degraded < conf->raid_disks) {
1156 err = -EBUSY;
1157 goto abort;
1158 }
1151 p->rdev = NULL; 1159 p->rdev = NULL;
1152 synchronize_rcu(); 1160 synchronize_rcu();
1153 if (atomic_read(&rdev->nr_pending)) { 1161 if (atomic_read(&rdev->nr_pending)) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8536ede1e712..1de17da34a95 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1020,7 +1020,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1020 /* 1020 /*
1021 * if recovery is running, make sure it aborts. 1021 * if recovery is running, make sure it aborts.
1022 */ 1022 */
1023 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 1023 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1024 } 1024 }
1025 set_bit(Faulty, &rdev->flags); 1025 set_bit(Faulty, &rdev->flags);
1026 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1026 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1171,6 +1171,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1171 err = -EBUSY; 1171 err = -EBUSY;
1172 goto abort; 1172 goto abort;
1173 } 1173 }
1174 /* Only remove non-faulty devices if recovery
1175 * is not possible.
1176 */
1177 if (!test_bit(Faulty, &rdev->flags) &&
1178 enough(conf)) {
1179 err = -EBUSY;
1180 goto abort;
1181 }
1174 p->rdev = NULL; 1182 p->rdev = NULL;
1175 synchronize_rcu(); 1183 synchronize_rcu();
1176 if (atomic_read(&rdev->nr_pending)) { 1184 if (atomic_read(&rdev->nr_pending)) {
@@ -1237,6 +1245,7 @@ static void end_sync_write(struct bio *bio, int error)
1237 1245
1238 if (!uptodate) 1246 if (!uptodate)
1239 md_error(mddev, conf->mirrors[d].rdev); 1247 md_error(mddev, conf->mirrors[d].rdev);
1248
1240 update_head_pos(i, r10_bio); 1249 update_head_pos(i, r10_bio);
1241 1250
1242 while (atomic_dec_and_test(&r10_bio->remaining)) { 1251 while (atomic_dec_and_test(&r10_bio->remaining)) {
@@ -1844,7 +1853,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1844 if (rb2) 1853 if (rb2)
1845 atomic_dec(&rb2->remaining); 1854 atomic_dec(&rb2->remaining);
1846 r10_bio = rb2; 1855 r10_bio = rb2;
1847 if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) 1856 if (!test_and_set_bit(MD_RECOVERY_INTR,
1857 &mddev->recovery))
1848 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", 1858 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1849 mdname(mddev)); 1859 mdname(mddev));
1850 break; 1860 break;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2f28745dacf9..425958a76b84 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1268,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1268 /* 1268 /*
1269 * if recovery was running, make sure it aborts. 1269 * if recovery was running, make sure it aborts.
1270 */ 1270 */
1271 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 1271 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1272 } 1272 }
1273 set_bit(Faulty, &rdev->flags); 1273 set_bit(Faulty, &rdev->flags);
1274 printk (KERN_ALERT 1274 printk (KERN_ALERT
@@ -4574,6 +4574,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
4574 err = -EBUSY; 4574 err = -EBUSY;
4575 goto abort; 4575 goto abort;
4576 } 4576 }
4577 /* Only remove non-faulty devices if recovery
4578 * isn't possible.
4579 */
4580 if (!test_bit(Faulty, &rdev->flags) &&
4581 mddev->degraded <= conf->max_degraded) {
4582 err = -EBUSY;
4583 goto abort;
4584 }
4577 p->rdev = NULL; 4585 p->rdev = NULL;
4578 synchronize_rcu(); 4586 synchronize_rcu();
4579 if (atomic_read(&rdev->nr_pending)) { 4587 if (atomic_read(&rdev->nr_pending)) {
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index a6d7ab688ede..3dea9f545c8f 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -188,8 +188,7 @@ struct mddev_s
188 * NEEDED: we might need to start a resync/recover 188 * NEEDED: we might need to start a resync/recover
189 * RUNNING: a thread is running, or about to be started 189 * RUNNING: a thread is running, or about to be started
190 * SYNC: actually doing a resync, not a recovery 190 * SYNC: actually doing a resync, not a recovery
191 * ERR: and IO error was detected - abort the resync/recovery 191 * INTR: resync needs to be aborted for some reason
192 * INTR: someone requested a (clean) early abort.
193 * DONE: thread is done and is waiting to be reaped 192 * DONE: thread is done and is waiting to be reaped
194 * REQUEST: user-space has requested a sync (used with SYNC) 193 * REQUEST: user-space has requested a sync (used with SYNC)
195 * CHECK: user-space request for check-only, no repair 194 * CHECK: user-space request for check-only, no repair
@@ -199,7 +198,6 @@ struct mddev_s
199 */ 198 */
200#define MD_RECOVERY_RUNNING 0 199#define MD_RECOVERY_RUNNING 0
201#define MD_RECOVERY_SYNC 1 200#define MD_RECOVERY_SYNC 1
202#define MD_RECOVERY_ERR 2
203#define MD_RECOVERY_INTR 3 201#define MD_RECOVERY_INTR 3
204#define MD_RECOVERY_DONE 4 202#define MD_RECOVERY_DONE 4
205#define MD_RECOVERY_NEEDED 5 203#define MD_RECOVERY_NEEDED 5