diff options
author | NeilBrown <neilb@suse.de> | 2009-03-30 23:33:13 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2009-03-30 23:33:13 -0400 |
commit | 97e4f42d62badb0f9fbc27c013e89bc1336a03bc (patch) | |
tree | 04cc809702a6b080c417c4ddf605642bbf7de521 /drivers | |
parent | 43b2e5d86d8bdd77386226db0bc961529492c043 (diff) |
md: occasionally checkpoint drive recovery to reduce duplicate effort after a crash
Version 1.x metadata has the ability to record the status of a
partially completed drive recovery.
However we only update that record on a clean shutdown.
It would be nice to update it on unclean shutdowns too, particularly
when using a bitmap that removes much of the 'sync' effort after an
unclean shutdown.
One complication with checkpointing recovery is that we only know
where we are up to in terms of IO requests started, not which ones
have completed. And we need to know what has completed to record
how much is recovered. So occasionally pause the recovery until all
submitted requests are completed, then update the record of where
we are up to.
When we have a bitmap, we already do that pause occasionally to keep
the bitmap up-to-date. So enhance that code to record the recovery
offset and schedule a superblock update.
And when there is no bitmap, just pause about 16 times during the
resync (each time more than a further 1/16 of the total sectors has
been scheduled) to do a checkpoint.
'16' is a fairly arbitrary number. But we don't really have any good
way to judge how often is acceptable, and it seems like a reasonable
number for now.
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/bitmap.c | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 27 | ||||
-rw-r--r-- | drivers/md/md.h | 7 |
3 files changed, 32 insertions, 4 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 623292a5473e..5d64da990804 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -1470,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1470 | wait_event(bitmap->mddev->recovery_wait, | 1470 | wait_event(bitmap->mddev->recovery_wait, |
1471 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1471 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
1472 | 1472 | ||
1473 | bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; | ||
1474 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | ||
1473 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); | 1475 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); |
1474 | s = 0; | 1476 | s = 0; |
1475 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1477 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
diff --git a/drivers/md/md.c b/drivers/md/md.c index aad0ac54bf90..8ea208847a6d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -1324,10 +1324,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1324 | } | 1324 | } |
1325 | 1325 | ||
1326 | if (rdev->raid_disk >= 0 && | 1326 | if (rdev->raid_disk >= 0 && |
1327 | !test_bit(In_sync, &rdev->flags) && | 1327 | !test_bit(In_sync, &rdev->flags)) { |
1328 | rdev->recovery_offset > 0) { | 1328 | if (mddev->curr_resync_completed > rdev->recovery_offset) |
1329 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | 1329 | rdev->recovery_offset = mddev->curr_resync_completed; |
1330 | sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); | 1330 | if (rdev->recovery_offset > 0) { |
1331 | sb->feature_map |= | ||
1332 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | ||
1333 | sb->recovery_offset = | ||
1334 | cpu_to_le64(rdev->recovery_offset); | ||
1335 | } | ||
1331 | } | 1336 | } |
1332 | 1337 | ||
1333 | if (mddev->reshape_position != MaxSector) { | 1338 | if (mddev->reshape_position != MaxSector) { |
@@ -6072,6 +6077,18 @@ void md_do_sync(mddev_t *mddev) | |||
6072 | } | 6077 | } |
6073 | if (kthread_should_stop()) | 6078 | if (kthread_should_stop()) |
6074 | goto interrupted; | 6079 | goto interrupted; |
6080 | |||
6081 | if (mddev->curr_resync > mddev->curr_resync_completed && | ||
6082 | (mddev->curr_resync - mddev->curr_resync_completed) | ||
6083 | > (max_sectors >> 4)) { | ||
6084 | /* time to update curr_resync_completed */ | ||
6085 | blk_unplug(mddev->queue); | ||
6086 | wait_event(mddev->recovery_wait, | ||
6087 | atomic_read(&mddev->recovery_active) == 0); | ||
6088 | mddev->curr_resync_completed = | ||
6089 | mddev->curr_resync; | ||
6090 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | ||
6091 | } | ||
6075 | sectors = mddev->pers->sync_request(mddev, j, &skipped, | 6092 | sectors = mddev->pers->sync_request(mddev, j, &skipped, |
6076 | currspeed < speed_min(mddev)); | 6093 | currspeed < speed_min(mddev)); |
6077 | if (sectors == 0) { | 6094 | if (sectors == 0) { |
@@ -6205,6 +6222,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
6205 | mdk_rdev_t *rdev; | 6222 | mdk_rdev_t *rdev; |
6206 | int spares = 0; | 6223 | int spares = 0; |
6207 | 6224 | ||
6225 | mddev->curr_resync_completed = 0; | ||
6226 | |||
6208 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6227 | list_for_each_entry(rdev, &mddev->disks, same_set) |
6209 | if (rdev->raid_disk >= 0 && | 6228 | if (rdev->raid_disk >= 0 && |
6210 | !test_bit(Blocked, &rdev->flags) && | 6229 | !test_bit(Blocked, &rdev->flags) && |
diff --git a/drivers/md/md.h b/drivers/md/md.h index e78b3c1d55fd..bede26c9d4a9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -159,6 +159,13 @@ struct mddev_s | |||
159 | struct mdk_thread_s *thread; /* management thread */ | 159 | struct mdk_thread_s *thread; /* management thread */ |
160 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | 160 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ |
161 | sector_t curr_resync; /* last block scheduled */ | 161 | sector_t curr_resync; /* last block scheduled */ |
162 | /* As resync requests can complete out of order, we cannot easily track | ||
163 | * how much resync has been completed. So we occasionally pause until | ||
164 | * everything completes, then set curr_resync_completed to curr_resync. | ||
165 | * As such it may be well behind the real resync mark, but it is a value | ||
166 | * we are certain of. | ||
167 | */ | ||
168 | sector_t curr_resync_completed; | ||
162 | unsigned long resync_mark; /* a recent timestamp */ | 169 | unsigned long resync_mark; /* a recent timestamp */ |
163 | sector_t resync_mark_cnt;/* blocks written at resync_mark */ | 170 | sector_t resync_mark_cnt;/* blocks written at resync_mark */ |
164 | sector_t curr_mark_cnt; /* blocks scheduled now */ | 171 | sector_t curr_mark_cnt; /* blocks scheduled now */ |