diff options
author | Song Liu <songliubraving@fb.com> | 2017-01-23 20:12:57 -0500 |
---|---|---|
committer | Shaohua Li <shli@fb.com> | 2017-01-24 14:20:15 -0500 |
commit | a85dd7b8df52e35d8ee3794c65cac5c39128fd80 (patch) | |
tree | 3d69f1c7c7ce9c9a85c806763c12c774cd95643f | |
parent | ba02684daf7fb4a827580f909b7c7db61c05ae7d (diff) |
md/r5cache: flush data only stripes in r5l_recovery_log()
For safer operation, all arrays start in write-through mode, which has been
better tested and is more mature. In fact, the write-through/write-back mode
setting isn't persistent across array restarts, so we always start the array in
write-through mode. However, if recovery finds data-only stripes left over from
before the shutdown (from a previous write-back mode), it is not safe to start
the array in write-through mode, as write-through mode cannot handle stripes
with data in the write-back cache. To solve this problem, we flush all
data-only stripes in r5l_recovery_log(). When r5l_recovery_log() returns, the
array starts with an empty cache in write-through mode.
This logic is implemented in r5c_recovery_flush_data_only_stripes():
1. enable write back cache
2. flush all stripes
3. wake up conf->mddev->thread
4. wait for all stripes to get flushed (reuse wait_for_quiescent)
5. disable write back cache
The wait in step 4 is woken up in release_inactive_stripe_list()
when conf->active_stripes reaches 0.
It is safe to wake up mddev->thread here because all the resources
required for the thread have been initialized.
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
-rw-r--r-- | drivers/md/md.c | 5 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 56 |
2 files changed, 45 insertions, 16 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 82821ee0d57f..01175dac0db6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev) | |||
5291 | if (start_readonly && mddev->ro == 0) | 5291 | if (start_readonly && mddev->ro == 0) |
5292 | mddev->ro = 2; /* read-only, but switch on first write */ | 5292 | mddev->ro = 2; /* read-only, but switch on first write */ |
5293 | 5293 | ||
5294 | /* | ||
5295 | * NOTE: some pers->run(), for example r5l_recovery_log(), wakes | ||
5296 | * up mddev->thread. It is important to initialize critical | ||
5297 | * resources for mddev->thread BEFORE calling pers->run(). | ||
5298 | */ | ||
5294 | err = pers->run(mddev); | 5299 | err = pers->run(mddev); |
5295 | if (err) | 5300 | if (err) |
5296 | pr_warn("md: pers->run() failed ...\n"); | 5301 | pr_warn("md: pers->run() failed ...\n"); |
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 95dcaa022e1f..3d7dda85494c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
@@ -2060,7 +2060,7 @@ static int | |||
2060 | r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | 2060 | r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, |
2061 | struct r5l_recovery_ctx *ctx) | 2061 | struct r5l_recovery_ctx *ctx) |
2062 | { | 2062 | { |
2063 | struct stripe_head *sh, *next; | 2063 | struct stripe_head *sh; |
2064 | struct mddev *mddev = log->rdev->mddev; | 2064 | struct mddev *mddev = log->rdev->mddev; |
2065 | struct page *page; | 2065 | struct page *page; |
2066 | sector_t next_checkpoint = MaxSector; | 2066 | sector_t next_checkpoint = MaxSector; |
@@ -2074,7 +2074,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | |||
2074 | 2074 | ||
2075 | WARN_ON(list_empty(&ctx->cached_list)); | 2075 | WARN_ON(list_empty(&ctx->cached_list)); |
2076 | 2076 | ||
2077 | list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { | 2077 | list_for_each_entry(sh, &ctx->cached_list, lru) { |
2078 | struct r5l_meta_block *mb; | 2078 | struct r5l_meta_block *mb; |
2079 | int i; | 2079 | int i; |
2080 | int offset; | 2080 | int offset; |
@@ -2124,14 +2124,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | |||
2124 | ctx->pos = write_pos; | 2124 | ctx->pos = write_pos; |
2125 | ctx->seq += 1; | 2125 | ctx->seq += 1; |
2126 | next_checkpoint = sh->log_start; | 2126 | next_checkpoint = sh->log_start; |
2127 | list_del_init(&sh->lru); | ||
2128 | raid5_release_stripe(sh); | ||
2129 | } | 2127 | } |
2130 | log->next_checkpoint = next_checkpoint; | 2128 | log->next_checkpoint = next_checkpoint; |
2131 | __free_page(page); | 2129 | __free_page(page); |
2132 | return 0; | 2130 | return 0; |
2133 | } | 2131 | } |
2134 | 2132 | ||
2133 | static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, | ||
2134 | struct r5l_recovery_ctx *ctx) | ||
2135 | { | ||
2136 | struct mddev *mddev = log->rdev->mddev; | ||
2137 | struct r5conf *conf = mddev->private; | ||
2138 | struct stripe_head *sh, *next; | ||
2139 | |||
2140 | if (ctx->data_only_stripes == 0) | ||
2141 | return; | ||
2142 | |||
2143 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; | ||
2144 | |||
2145 | list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { | ||
2146 | r5c_make_stripe_write_out(sh); | ||
2147 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2148 | list_del_init(&sh->lru); | ||
2149 | raid5_release_stripe(sh); | ||
2150 | } | ||
2151 | |||
2152 | md_wakeup_thread(conf->mddev->thread); | ||
2153 | /* reuse conf->wait_for_quiescent in recovery */ | ||
2154 | wait_event(conf->wait_for_quiescent, | ||
2155 | atomic_read(&conf->active_stripes) == 0); | ||
2156 | |||
2157 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
2158 | } | ||
2159 | |||
2135 | static int r5l_recovery_log(struct r5l_log *log) | 2160 | static int r5l_recovery_log(struct r5l_log *log) |
2136 | { | 2161 | { |
2137 | struct mddev *mddev = log->rdev->mddev; | 2162 | struct mddev *mddev = log->rdev->mddev; |
@@ -2158,32 +2183,31 @@ static int r5l_recovery_log(struct r5l_log *log) | |||
2158 | pos = ctx.pos; | 2183 | pos = ctx.pos; |
2159 | ctx.seq += 10000; | 2184 | ctx.seq += 10000; |
2160 | 2185 | ||
2161 | if (ctx.data_only_stripes == 0) { | ||
2162 | log->next_checkpoint = ctx.pos; | ||
2163 | r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); | ||
2164 | ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); | ||
2165 | } | ||
2166 | 2186 | ||
2167 | if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) | 2187 | if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) |
2168 | pr_debug("md/raid:%s: starting from clean shutdown\n", | 2188 | pr_debug("md/raid:%s: starting from clean shutdown\n", |
2169 | mdname(mddev)); | 2189 | mdname(mddev)); |
2170 | else { | 2190 | else |
2171 | pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", | 2191 | pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", |
2172 | mdname(mddev), ctx.data_only_stripes, | 2192 | mdname(mddev), ctx.data_only_stripes, |
2173 | ctx.data_parity_stripes); | 2193 | ctx.data_parity_stripes); |
2174 | 2194 | ||
2175 | if (ctx.data_only_stripes > 0) | 2195 | if (ctx.data_only_stripes == 0) { |
2176 | if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { | 2196 | log->next_checkpoint = ctx.pos; |
2177 | pr_err("md/raid:%s: failed to rewrite stripes to journal\n", | 2197 | r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); |
2178 | mdname(mddev)); | 2198 | ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); |
2179 | return -EIO; | 2199 | } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { |
2180 | } | 2200 | pr_err("md/raid:%s: failed to rewrite stripes to journal\n", |
2201 | mdname(mddev)); | ||
2202 | return -EIO; | ||
2181 | } | 2203 | } |
2182 | 2204 | ||
2183 | log->log_start = ctx.pos; | 2205 | log->log_start = ctx.pos; |
2184 | log->seq = ctx.seq; | 2206 | log->seq = ctx.seq; |
2185 | log->last_checkpoint = pos; | 2207 | log->last_checkpoint = pos; |
2186 | r5l_write_super(log, pos); | 2208 | r5l_write_super(log, pos); |
2209 | |||
2210 | r5c_recovery_flush_data_only_stripes(log, &ctx); | ||
2187 | return 0; | 2211 | return 0; |
2188 | } | 2212 | } |
2189 | 2213 | ||