aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2009-03-30 23:33:13 -0400
committerNeilBrown <neilb@suse.de>2009-03-30 23:33:13 -0400
commit97e4f42d62badb0f9fbc27c013e89bc1336a03bc (patch)
tree04cc809702a6b080c417c4ddf605642bbf7de521
parent43b2e5d86d8bdd77386226db0bc961529492c043 (diff)
md: occasionally checkpoint drive recovery to reduce duplicate effort after a crash
Version 1.x metadata has the ability to record the status of a partially completed drive recovery. However we only update that record on a clean shutdown. It would be nice to update it on unclean shutdowns too, particularly when using a bitmap that removes much of the 'sync' effort after an unclean shutdown. One complication with checkpointing recovery is that we only know where we are up to in terms of IO requests started, not which ones have completed. And we need to know what has completed to record how much is recovered. So occasionally pause the recovery until all submitted requests are completed, then update the record of where we are up to. When we have a bitmap, we already do that pause occasionally to keep the bitmap up-to-date. So enhance that code to record the recovery offset and schedule a superblock update. And when there is no bitmap, just pause 16 times during the resync to do a checkpoint. '16' is a fairly arbitrary number. But we don't really have any good way to judge how often is acceptable, and it seems like a reasonable number for now. Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/bitmap.c2
-rw-r--r--drivers/md/md.c27
-rw-r--r--drivers/md/md.h7
3 files changed, 32 insertions, 4 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 623292a5473e..5d64da990804 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1470,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1470 wait_event(bitmap->mddev->recovery_wait, 1470 wait_event(bitmap->mddev->recovery_wait,
1471 atomic_read(&bitmap->mddev->recovery_active) == 0); 1471 atomic_read(&bitmap->mddev->recovery_active) == 0);
1472 1472
1473 bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
1474 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1473 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1475 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
1474 s = 0; 1476 s = 0;
1475 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1477 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aad0ac54bf90..8ea208847a6d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1324,10 +1324,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1324 } 1324 }
1325 1325
1326 if (rdev->raid_disk >= 0 && 1326 if (rdev->raid_disk >= 0 &&
1327 !test_bit(In_sync, &rdev->flags) && 1327 !test_bit(In_sync, &rdev->flags)) {
1328 rdev->recovery_offset > 0) { 1328 if (mddev->curr_resync_completed > rdev->recovery_offset)
1329 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1329 rdev->recovery_offset = mddev->curr_resync_completed;
1330 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1330 if (rdev->recovery_offset > 0) {
1331 sb->feature_map |=
1332 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1333 sb->recovery_offset =
1334 cpu_to_le64(rdev->recovery_offset);
1335 }
1331 } 1336 }
1332 1337
1333 if (mddev->reshape_position != MaxSector) { 1338 if (mddev->reshape_position != MaxSector) {
@@ -6072,6 +6077,18 @@ void md_do_sync(mddev_t *mddev)
6072 } 6077 }
6073 if (kthread_should_stop()) 6078 if (kthread_should_stop())
6074 goto interrupted; 6079 goto interrupted;
6080
6081 if (mddev->curr_resync > mddev->curr_resync_completed &&
6082 (mddev->curr_resync - mddev->curr_resync_completed)
6083 > (max_sectors >> 4)) {
6084 /* time to update curr_resync_completed */
6085 blk_unplug(mddev->queue);
6086 wait_event(mddev->recovery_wait,
6087 atomic_read(&mddev->recovery_active) == 0);
6088 mddev->curr_resync_completed =
6089 mddev->curr_resync;
6090 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6091 }
6075 sectors = mddev->pers->sync_request(mddev, j, &skipped, 6092 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6076 currspeed < speed_min(mddev)); 6093 currspeed < speed_min(mddev));
6077 if (sectors == 0) { 6094 if (sectors == 0) {
@@ -6205,6 +6222,8 @@ static int remove_and_add_spares(mddev_t *mddev)
6205 mdk_rdev_t *rdev; 6222 mdk_rdev_t *rdev;
6206 int spares = 0; 6223 int spares = 0;
6207 6224
6225 mddev->curr_resync_completed = 0;
6226
6208 list_for_each_entry(rdev, &mddev->disks, same_set) 6227 list_for_each_entry(rdev, &mddev->disks, same_set)
6209 if (rdev->raid_disk >= 0 && 6228 if (rdev->raid_disk >= 0 &&
6210 !test_bit(Blocked, &rdev->flags) && 6229 !test_bit(Blocked, &rdev->flags) &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e78b3c1d55fd..bede26c9d4a9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -159,6 +159,13 @@ struct mddev_s
159 struct mdk_thread_s *thread; /* management thread */ 159 struct mdk_thread_s *thread; /* management thread */
160 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ 160 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
161 sector_t curr_resync; /* last block scheduled */ 161 sector_t curr_resync; /* last block scheduled */
162 /* As resync requests can complete out of order, we cannot easily track
163 * how much resync has been completed. So we occasionally pause until
164 * everything completes, then set curr_resync_completed to curr_resync.
165 * As such it may be well behind the real resync mark, but it is a value
166 * we are certain of.
167 */
168 sector_t curr_resync_completed;
162 unsigned long resync_mark; /* a recent timestamp */ 169 unsigned long resync_mark; /* a recent timestamp */
163 sector_t resync_mark_cnt;/* blocks written at resync_mark */ 170 sector_t resync_mark_cnt;/* blocks written at resync_mark */
164 sector_t curr_mark_cnt; /* blocks scheduled now */ 171 sector_t curr_mark_cnt; /* blocks scheduled now */