aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd/checkpoint.c
diff options
context:
space:
mode:
authorHidehiro Kawai <hidehiro.kawai.ez@hitachi.com>2008-10-22 17:15:00 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-23 11:55:01 -0400
commit4afe978530702c934dfdb11f54073136818b2119 (patch)
tree5f7fb9539b46c0b390157f55c84017e14b7f605c /fs/jbd/checkpoint.c
parent66f50ee3cee4c9d98eea0add6f439e6e5e0ca4a5 (diff)
jbd: fix error handling for checkpoint io
When a checkpointing IO fails, current JBD code doesn't check the error and continue journaling. This means latest metadata can be lost from both the journal and filesystem. This patch leaves the failed metadata blocks in the journal space and aborts journaling in the case of log_do_checkpoint(). To achieve this, we need to do: 1. don't remove the failed buffer from the checkpoint list where in the case of __try_to_free_cp_buf() because it may be released or overwritten by a later transaction 2. log_do_checkpoint() is the last chance, remove the failed buffer from the checkpoint list and abort the journal 3. when checkpointing fails, don't update the journal super block to prevent the journaled contents from being cleaned. For safety, don't update j_tail and j_tail_sequence either 4. when checkpointing fails, notify this error to the ext3 layer so that ext3 don't clear the needs_recovery flag, otherwise the journaled contents are ignored and cleaned in the recovery phase 5. if the recovery fails, keep the needs_recovery flag 6. prevent cleanup_journal_tail() from being called between __journal_drop_transaction() and journal_abort() (a race issue between journal_flush() and __log_wait_for_space() Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> Acked-by: Jan Kara <jack@suse.cz> Cc: <linux-ext4@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/jbd/checkpoint.c')
-rw-r--r--fs/jbd/checkpoint.c49
1 files changed, 37 insertions, 12 deletions
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb88..e29293501d42 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
93 int ret = 0; 93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 94 struct buffer_head *bh = jh2bh(jh);
95 95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 !buffer_dirty(bh) && buffer_uptodate(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 98 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __journal_remove_checkpoint(jh) + 1; 99 ret = __journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 100 jbd_unlock_bh_state(bh);
@@ -160,21 +161,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
160 * buffers. Note that we take the buffers in the opposite ordering 161 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 162 * from the one in which they were submitted for IO.
162 * 163 *
164 * Return 0 on success, and return <0 if some buffers have failed
165 * to be written out.
166 *
163 * Called with j_list_lock held. 167 * Called with j_list_lock held.
164 */ 168 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 169static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{ 170{
167 struct journal_head *jh; 171 struct journal_head *jh;
168 struct buffer_head *bh; 172 struct buffer_head *bh;
169 tid_t this_tid; 173 tid_t this_tid;
170 int released = 0; 174 int released = 0;
175 int ret = 0;
171 176
172 this_tid = transaction->t_tid; 177 this_tid = transaction->t_tid;
173restart: 178restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 179 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 180 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 181 transaction->t_tid != this_tid)
177 return; 182 return ret;
178 while (!released && transaction->t_checkpoint_io_list) { 183 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 184 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 185 bh = jh2bh(jh);
@@ -194,6 +199,9 @@ restart:
194 spin_lock(&journal->j_list_lock); 199 spin_lock(&journal->j_list_lock);
195 goto restart; 200 goto restart;
196 } 201 }
202 if (unlikely(!buffer_uptodate(bh)))
203 ret = -EIO;
204
197 /* 205 /*
198 * Now in whatever state the buffer currently is, we know that 206 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 207 * it has been written out and so we can drop it from the list
@@ -203,6 +211,8 @@ restart:
203 journal_remove_journal_head(bh); 211 journal_remove_journal_head(bh);
204 __brelse(bh); 212 __brelse(bh);
205 } 213 }
214
215 return ret;
206} 216}
207 217
208#define NR_BATCH 64 218#define NR_BATCH 64
@@ -226,7 +236,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
226 * Try to flush one buffer from the checkpoint list to disk. 236 * Try to flush one buffer from the checkpoint list to disk.
227 * 237 *
228 * Return 1 if something happened which requires us to abort the current 238 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 239 * scan of the checkpoint list. Return <0 if the buffer has failed to
240 * be written out.
230 * 241 *
231 * Called with j_list_lock held and drops it if 1 is returned 242 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 243 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +267,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
256 log_wait_commit(journal, tid); 267 log_wait_commit(journal, tid);
257 ret = 1; 268 ret = 1;
258 } else if (!buffer_dirty(bh)) { 269 } else if (!buffer_dirty(bh)) {
270 ret = 1;
271 if (unlikely(!buffer_uptodate(bh)))
272 ret = -EIO;
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 273 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint"); 274 BUFFER_TRACE(bh, "remove from checkpoint");
261 __journal_remove_checkpoint(jh); 275 __journal_remove_checkpoint(jh);
@@ -263,7 +277,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
263 jbd_unlock_bh_state(bh); 277 jbd_unlock_bh_state(bh);
264 journal_remove_journal_head(bh); 278 journal_remove_journal_head(bh);
265 __brelse(bh); 279 __brelse(bh);
266 ret = 1;
267 } else { 280 } else {
268 /* 281 /*
269 * Important: we are about to write the buffer, and 282 * Important: we are about to write the buffer, and
@@ -295,6 +308,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
295 * to disk. We submit larger chunks of data at once. 308 * to disk. We submit larger chunks of data at once.
296 * 309 *
297 * The journal should be locked before calling this function. 310 * The journal should be locked before calling this function.
311 * Called with j_checkpoint_mutex held.
298 */ 312 */
299int log_do_checkpoint(journal_t *journal) 313int log_do_checkpoint(journal_t *journal)
300{ 314{
@@ -318,6 +332,7 @@ int log_do_checkpoint(journal_t *journal)
318 * OK, we need to start writing disk blocks. Take one transaction 332 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it. 333 * and write it.
320 */ 334 */
335 result = 0;
321 spin_lock(&journal->j_list_lock); 336 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions) 337 if (!journal->j_checkpoint_transactions)
323 goto out; 338 goto out;
@@ -334,7 +349,7 @@ restart:
334 int batch_count = 0; 349 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH]; 350 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh; 351 struct journal_head *jh;
337 int retry = 0; 352 int retry = 0, err;
338 353
339 while (!retry && transaction->t_checkpoint_list) { 354 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh; 355 struct buffer_head *bh;
@@ -347,6 +362,8 @@ restart:
347 break; 362 break;
348 } 363 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 364 retry = __process_buffer(journal, jh, bhs,&batch_count);
365 if (retry < 0 && !result)
366 result = retry;
350 if (!retry && (need_resched() || 367 if (!retry && (need_resched() ||
351 spin_needbreak(&journal->j_list_lock))) { 368 spin_needbreak(&journal->j_list_lock))) {
352 spin_unlock(&journal->j_list_lock); 369 spin_unlock(&journal->j_list_lock);
@@ -371,14 +388,18 @@ restart:
371 * Now we have cleaned up the first transaction's checkpoint 388 * Now we have cleaned up the first transaction's checkpoint
372 * list. Let's clean up the second one 389 * list. Let's clean up the second one
373 */ 390 */
374 __wait_cp_io(journal, transaction); 391 err = __wait_cp_io(journal, transaction);
392 if (!result)
393 result = err;
375 } 394 }
376out: 395out:
377 spin_unlock(&journal->j_list_lock); 396 spin_unlock(&journal->j_list_lock);
378 result = cleanup_journal_tail(journal);
379 if (result < 0) 397 if (result < 0)
380 return result; 398 journal_abort(journal, result);
381 return 0; 399 else
400 result = cleanup_journal_tail(journal);
401
402 return (result < 0) ? result : 0;
382} 403}
383 404
384/* 405/*
@@ -394,8 +415,9 @@ out:
394 * This is the only part of the journaling code which really needs to be 415 * This is the only part of the journaling code which really needs to be
395 * aware of transaction aborts. Checkpointing involves writing to the 416 * aware of transaction aborts. Checkpointing involves writing to the
396 * main filesystem area rather than to the journal, so it can proceed 417 * main filesystem area rather than to the journal, so it can proceed
397 * even in abort state, but we must not update the journal superblock if 418 * even in abort state, but we must not update the super block if
398 * we have an abort error outstanding. 419 * checkpointing may have failed. Otherwise, we would lose some metadata
420 * buffers which should be written-back to the filesystem.
399 */ 421 */
400 422
401int cleanup_journal_tail(journal_t *journal) 423int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +426,9 @@ int cleanup_journal_tail(journal_t *journal)
404 tid_t first_tid; 426 tid_t first_tid;
405 unsigned long blocknr, freed; 427 unsigned long blocknr, freed;
406 428
429 if (is_journal_aborted(journal))
430 return 1;
431
407 /* OK, work out the oldest transaction remaining in the log, and 432 /* OK, work out the oldest transaction remaining in the log, and
408 * the log block it starts at. 433 * the log block it starts at.
409 * 434 *