aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd2/checkpoint.c
diff options
context:
space:
mode:
authorHidehiro Kawai <hidehiro.kawai.ez@hitachi.com>2008-10-10 20:29:13 -0400
committerTheodore Ts'o <tytso@mit.edu>2008-10-10 20:29:13 -0400
commit44519faf22ad6ce924ad0352d3dc200d9e0b66e8 (patch)
tree332dd28cf16439fc4c78ad198e04c12ff7c16e66 /fs/jbd2/checkpoint.c
parent77e841de8abac4755cc83ca224fdf71418d65380 (diff)
jbd2: fix error handling for checkpoint io
When a checkpointing IO fails, the current JBD2 code doesn't check the error and continues journaling. This means the latest metadata can be lost from both the journal and the filesystem. This patch leaves the failed metadata blocks in the journal space and aborts journaling in the case of jbd2_log_do_checkpoint(). To achieve this, we need to do the following: 1. don't remove the failed buffer from the checkpoint list in the case of __try_to_free_cp_buf(), because it may be released or overwritten by a later transaction 2. jbd2_log_do_checkpoint() is the last chance, so remove the failed buffer from the checkpoint list and abort the journal 3. when checkpointing fails, don't update the journal super block, to prevent the journaled contents from being cleaned. For safety, don't update j_tail and j_tail_sequence either 4. when checkpointing fails, report this error to the ext4 layer so that ext4 doesn't clear the needs_recovery flag; otherwise the journaled contents are ignored and cleaned in the recovery phase 5. if the recovery fails, keep the needs_recovery flag 6. prevent jbd2_cleanup_journal_tail() from being called between __jbd2_journal_drop_transaction() and jbd2_journal_abort() (a possible race between the jbd2_log_do_checkpoint() calls made by jbd2_journal_flush() and __jbd2_log_wait_for_space()) Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/jbd2/checkpoint.c')
-rw-r--r--fs/jbd2/checkpoint.c49
1 files changed, 37 insertions, 12 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42895d369458..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -94,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
94 int ret = 0; 94 int ret = 0;
95 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
96 96
97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
98 JBUFFER_TRACE(jh, "remove from checkpoint list"); 99 JBUFFER_TRACE(jh, "remove from checkpoint list");
99 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
100 jbd_unlock_bh_state(bh); 101 jbd_unlock_bh_state(bh);
@@ -176,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
176 * buffers. Note that we take the buffers in the opposite ordering 177 * buffers. Note that we take the buffers in the opposite ordering
177 * from the one in which they were submitted for IO. 178 * from the one in which they were submitted for IO.
178 * 179 *
180 * Return 0 on success, and return <0 if some buffers have failed
181 * to be written out.
182 *
179 * Called with j_list_lock held. 183 * Called with j_list_lock held.
180 */ 184 */
181static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 185static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
182{ 186{
183 struct journal_head *jh; 187 struct journal_head *jh;
184 struct buffer_head *bh; 188 struct buffer_head *bh;
185 tid_t this_tid; 189 tid_t this_tid;
186 int released = 0; 190 int released = 0;
191 int ret = 0;
187 192
188 this_tid = transaction->t_tid; 193 this_tid = transaction->t_tid;
189restart: 194restart:
190 /* Did somebody clean up the transaction in the meanwhile? */ 195 /* Did somebody clean up the transaction in the meanwhile? */
191 if (journal->j_checkpoint_transactions != transaction || 196 if (journal->j_checkpoint_transactions != transaction ||
192 transaction->t_tid != this_tid) 197 transaction->t_tid != this_tid)
193 return; 198 return ret;
194 while (!released && transaction->t_checkpoint_io_list) { 199 while (!released && transaction->t_checkpoint_io_list) {
195 jh = transaction->t_checkpoint_io_list; 200 jh = transaction->t_checkpoint_io_list;
196 bh = jh2bh(jh); 201 bh = jh2bh(jh);
@@ -210,6 +215,9 @@ restart:
210 spin_lock(&journal->j_list_lock); 215 spin_lock(&journal->j_list_lock);
211 goto restart; 216 goto restart;
212 } 217 }
218 if (unlikely(buffer_write_io_error(bh)))
219 ret = -EIO;
220
213 /* 221 /*
214 * Now in whatever state the buffer currently is, we know that 222 * Now in whatever state the buffer currently is, we know that
215 * it has been written out and so we can drop it from the list 223 * it has been written out and so we can drop it from the list
@@ -219,6 +227,8 @@ restart:
219 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
220 __brelse(bh); 228 __brelse(bh);
221 } 229 }
230
231 return ret;
222} 232}
223 233
224#define NR_BATCH 64 234#define NR_BATCH 64
@@ -242,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
242 * Try to flush one buffer from the checkpoint list to disk. 252 * Try to flush one buffer from the checkpoint list to disk.
243 * 253 *
244 * Return 1 if something happened which requires us to abort the current 254 * Return 1 if something happened which requires us to abort the current
245 * scan of the checkpoint list. 255 * scan of the checkpoint list. Return <0 if the buffer has failed to
256 * be written out.
246 * 257 *
247 * Called with j_list_lock held and drops it if 1 is returned 258 * Called with j_list_lock held and drops it if 1 is returned
248 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 259 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -274,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
274 jbd2_log_wait_commit(journal, tid); 285 jbd2_log_wait_commit(journal, tid);
275 ret = 1; 286 ret = 1;
276 } else if (!buffer_dirty(bh)) { 287 } else if (!buffer_dirty(bh)) {
288 ret = 1;
289 if (unlikely(buffer_write_io_error(bh)))
290 ret = -EIO;
277 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 291 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
278 BUFFER_TRACE(bh, "remove from checkpoint"); 292 BUFFER_TRACE(bh, "remove from checkpoint");
279 __jbd2_journal_remove_checkpoint(jh); 293 __jbd2_journal_remove_checkpoint(jh);
@@ -281,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
281 jbd_unlock_bh_state(bh); 295 jbd_unlock_bh_state(bh);
282 jbd2_journal_remove_journal_head(bh); 296 jbd2_journal_remove_journal_head(bh);
283 __brelse(bh); 297 __brelse(bh);
284 ret = 1;
285 } else { 298 } else {
286 /* 299 /*
287 * Important: we are about to write the buffer, and 300 * Important: we are about to write the buffer, and
@@ -314,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
314 * to disk. We submit larger chunks of data at once. 327 * to disk. We submit larger chunks of data at once.
315 * 328 *
316 * The journal should be locked before calling this function. 329 * The journal should be locked before calling this function.
330 * Called with j_checkpoint_mutex held.
317 */ 331 */
318int jbd2_log_do_checkpoint(journal_t *journal) 332int jbd2_log_do_checkpoint(journal_t *journal)
319{ 333{
@@ -339,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
339 * OK, we need to start writing disk blocks. Take one transaction 353 * OK, we need to start writing disk blocks. Take one transaction
340 * and write it. 354 * and write it.
341 */ 355 */
356 result = 0;
342 spin_lock(&journal->j_list_lock); 357 spin_lock(&journal->j_list_lock);
343 if (!journal->j_checkpoint_transactions) 358 if (!journal->j_checkpoint_transactions)
344 goto out; 359 goto out;
@@ -357,7 +372,7 @@ restart:
357 int batch_count = 0; 372 int batch_count = 0;
358 struct buffer_head *bhs[NR_BATCH]; 373 struct buffer_head *bhs[NR_BATCH];
359 struct journal_head *jh; 374 struct journal_head *jh;
360 int retry = 0; 375 int retry = 0, err;
361 376
362 while (!retry && transaction->t_checkpoint_list) { 377 while (!retry && transaction->t_checkpoint_list) {
363 struct buffer_head *bh; 378 struct buffer_head *bh;
@@ -371,6 +386,8 @@ restart:
371 } 386 }
372 retry = __process_buffer(journal, jh, bhs, &batch_count, 387 retry = __process_buffer(journal, jh, bhs, &batch_count,
373 transaction); 388 transaction);
389 if (retry < 0 && !result)
390 result = retry;
374 if (!retry && (need_resched() || 391 if (!retry && (need_resched() ||
375 spin_needbreak(&journal->j_list_lock))) { 392 spin_needbreak(&journal->j_list_lock))) {
376 spin_unlock(&journal->j_list_lock); 393 spin_unlock(&journal->j_list_lock);
@@ -395,14 +412,18 @@ restart:
395 * Now we have cleaned up the first transaction's checkpoint 412 * Now we have cleaned up the first transaction's checkpoint
396 * list. Let's clean up the second one 413 * list. Let's clean up the second one
397 */ 414 */
398 __wait_cp_io(journal, transaction); 415 err = __wait_cp_io(journal, transaction);
416 if (!result)
417 result = err;
399 } 418 }
400out: 419out:
401 spin_unlock(&journal->j_list_lock); 420 spin_unlock(&journal->j_list_lock);
402 result = jbd2_cleanup_journal_tail(journal);
403 if (result < 0) 421 if (result < 0)
404 return result; 422 jbd2_journal_abort(journal, result);
405 return 0; 423 else
424 result = jbd2_cleanup_journal_tail(journal);
425
426 return (result < 0) ? result : 0;
406} 427}
407 428
408/* 429/*
@@ -418,8 +439,9 @@ out:
418 * This is the only part of the journaling code which really needs to be 439 * This is the only part of the journaling code which really needs to be
419 * aware of transaction aborts. Checkpointing involves writing to the 440 * aware of transaction aborts. Checkpointing involves writing to the
420 * main filesystem area rather than to the journal, so it can proceed 441 * main filesystem area rather than to the journal, so it can proceed
421 * even in abort state, but we must not update the journal superblock if 442 * even in abort state, but we must not update the super block if
422 * we have an abort error outstanding. 443 * checkpointing may have failed. Otherwise, we would lose some metadata
444 * buffers which should be written-back to the filesystem.
423 */ 445 */
424 446
425int jbd2_cleanup_journal_tail(journal_t *journal) 447int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -428,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
428 tid_t first_tid; 450 tid_t first_tid;
429 unsigned long blocknr, freed; 451 unsigned long blocknr, freed;
430 452
453 if (is_journal_aborted(journal))
454 return 1;
455
431 /* OK, work out the oldest transaction remaining in the log, and 456 /* OK, work out the oldest transaction remaining in the log, and
432 * the log block it starts at. 457 * the log block it starts at.
433 * 458 *