aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> 2008-10-10 20:29:13 -0400
committer Theodore Ts'o <tytso@mit.edu> 2008-10-10 20:29:13 -0400
commit 44519faf22ad6ce924ad0352d3dc200d9e0b66e8 (patch)
tree 332dd28cf16439fc4c78ad198e04c12ff7c16e66
parent 77e841de8abac4755cc83ca224fdf71418d65380 (diff)
jbd2: fix error handling for checkpoint io
When a checkpointing IO fails, the current JBD2 code doesn't check the error and continues journaling. This means the latest metadata can be lost from both the journal and the filesystem.

This patch leaves the failed metadata blocks in the journal space and aborts journaling in the case of jbd2_log_do_checkpoint(). To achieve this, we need to do the following:

1. Don't remove the failed buffer from the checkpoint list in the case of __try_to_free_cp_buf(), because it may be released or overwritten by a later transaction.
2. jbd2_log_do_checkpoint() is the last chance: remove the failed buffer from the checkpoint list and abort the journal.
3. When checkpointing fails, don't update the journal super block, to prevent the journaled contents from being cleaned. For safety, don't update j_tail and j_tail_sequence either.
4. When checkpointing fails, report the error to the ext4 layer so that ext4 doesn't clear the needs_recovery flag; otherwise the journaled contents are ignored and cleaned in the recovery phase.
5. If the recovery fails, keep the needs_recovery flag.
6. Prevent jbd2_cleanup_journal_tail() from being called between __jbd2_journal_drop_transaction() and jbd2_journal_abort() (a possible race between the jbd2_log_do_checkpoint() calls made by jbd2_journal_flush() and __jbd2_log_wait_for_space()).

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r-- fs/jbd2/checkpoint.c 49
-rw-r--r-- fs/jbd2/journal.c 28
-rw-r--r-- fs/jbd2/recovery.c 7
-rw-r--r-- include/linux/jbd2.h 2
4 files changed, 65 insertions, 21 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42895d369458..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -94,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
94 int ret = 0; 94 int ret = 0;
95 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
96 96
97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
98 JBUFFER_TRACE(jh, "remove from checkpoint list"); 99 JBUFFER_TRACE(jh, "remove from checkpoint list");
99 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
100 jbd_unlock_bh_state(bh); 101 jbd_unlock_bh_state(bh);
@@ -176,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
176 * buffers. Note that we take the buffers in the opposite ordering 177 * buffers. Note that we take the buffers in the opposite ordering
177 * from the one in which they were submitted for IO. 178 * from the one in which they were submitted for IO.
178 * 179 *
180 * Return 0 on success, and return <0 if some buffers have failed
181 * to be written out.
182 *
179 * Called with j_list_lock held. 183 * Called with j_list_lock held.
180 */ 184 */
181static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 185static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
182{ 186{
183 struct journal_head *jh; 187 struct journal_head *jh;
184 struct buffer_head *bh; 188 struct buffer_head *bh;
185 tid_t this_tid; 189 tid_t this_tid;
186 int released = 0; 190 int released = 0;
191 int ret = 0;
187 192
188 this_tid = transaction->t_tid; 193 this_tid = transaction->t_tid;
189restart: 194restart:
190 /* Did somebody clean up the transaction in the meanwhile? */ 195 /* Did somebody clean up the transaction in the meanwhile? */
191 if (journal->j_checkpoint_transactions != transaction || 196 if (journal->j_checkpoint_transactions != transaction ||
192 transaction->t_tid != this_tid) 197 transaction->t_tid != this_tid)
193 return; 198 return ret;
194 while (!released && transaction->t_checkpoint_io_list) { 199 while (!released && transaction->t_checkpoint_io_list) {
195 jh = transaction->t_checkpoint_io_list; 200 jh = transaction->t_checkpoint_io_list;
196 bh = jh2bh(jh); 201 bh = jh2bh(jh);
@@ -210,6 +215,9 @@ restart:
210 spin_lock(&journal->j_list_lock); 215 spin_lock(&journal->j_list_lock);
211 goto restart; 216 goto restart;
212 } 217 }
218 if (unlikely(buffer_write_io_error(bh)))
219 ret = -EIO;
220
213 /* 221 /*
214 * Now in whatever state the buffer currently is, we know that 222 * Now in whatever state the buffer currently is, we know that
215 * it has been written out and so we can drop it from the list 223 * it has been written out and so we can drop it from the list
@@ -219,6 +227,8 @@ restart:
219 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
220 __brelse(bh); 228 __brelse(bh);
221 } 229 }
230
231 return ret;
222} 232}
223 233
224#define NR_BATCH 64 234#define NR_BATCH 64
@@ -242,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
242 * Try to flush one buffer from the checkpoint list to disk. 252 * Try to flush one buffer from the checkpoint list to disk.
243 * 253 *
244 * Return 1 if something happened which requires us to abort the current 254 * Return 1 if something happened which requires us to abort the current
245 * scan of the checkpoint list. 255 * scan of the checkpoint list. Return <0 if the buffer has failed to
256 * be written out.
246 * 257 *
247 * Called with j_list_lock held and drops it if 1 is returned 258 * Called with j_list_lock held and drops it if 1 is returned
248 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 259 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -274,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
274 jbd2_log_wait_commit(journal, tid); 285 jbd2_log_wait_commit(journal, tid);
275 ret = 1; 286 ret = 1;
276 } else if (!buffer_dirty(bh)) { 287 } else if (!buffer_dirty(bh)) {
288 ret = 1;
289 if (unlikely(buffer_write_io_error(bh)))
290 ret = -EIO;
277 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 291 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
278 BUFFER_TRACE(bh, "remove from checkpoint"); 292 BUFFER_TRACE(bh, "remove from checkpoint");
279 __jbd2_journal_remove_checkpoint(jh); 293 __jbd2_journal_remove_checkpoint(jh);
@@ -281,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
281 jbd_unlock_bh_state(bh); 295 jbd_unlock_bh_state(bh);
282 jbd2_journal_remove_journal_head(bh); 296 jbd2_journal_remove_journal_head(bh);
283 __brelse(bh); 297 __brelse(bh);
284 ret = 1;
285 } else { 298 } else {
286 /* 299 /*
287 * Important: we are about to write the buffer, and 300 * Important: we are about to write the buffer, and
@@ -314,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
314 * to disk. We submit larger chunks of data at once. 327 * to disk. We submit larger chunks of data at once.
315 * 328 *
316 * The journal should be locked before calling this function. 329 * The journal should be locked before calling this function.
330 * Called with j_checkpoint_mutex held.
317 */ 331 */
318int jbd2_log_do_checkpoint(journal_t *journal) 332int jbd2_log_do_checkpoint(journal_t *journal)
319{ 333{
@@ -339,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
339 * OK, we need to start writing disk blocks. Take one transaction 353 * OK, we need to start writing disk blocks. Take one transaction
340 * and write it. 354 * and write it.
341 */ 355 */
356 result = 0;
342 spin_lock(&journal->j_list_lock); 357 spin_lock(&journal->j_list_lock);
343 if (!journal->j_checkpoint_transactions) 358 if (!journal->j_checkpoint_transactions)
344 goto out; 359 goto out;
@@ -357,7 +372,7 @@ restart:
357 int batch_count = 0; 372 int batch_count = 0;
358 struct buffer_head *bhs[NR_BATCH]; 373 struct buffer_head *bhs[NR_BATCH];
359 struct journal_head *jh; 374 struct journal_head *jh;
360 int retry = 0; 375 int retry = 0, err;
361 376
362 while (!retry && transaction->t_checkpoint_list) { 377 while (!retry && transaction->t_checkpoint_list) {
363 struct buffer_head *bh; 378 struct buffer_head *bh;
@@ -371,6 +386,8 @@ restart:
371 } 386 }
372 retry = __process_buffer(journal, jh, bhs, &batch_count, 387 retry = __process_buffer(journal, jh, bhs, &batch_count,
373 transaction); 388 transaction);
389 if (retry < 0 && !result)
390 result = retry;
374 if (!retry && (need_resched() || 391 if (!retry && (need_resched() ||
375 spin_needbreak(&journal->j_list_lock))) { 392 spin_needbreak(&journal->j_list_lock))) {
376 spin_unlock(&journal->j_list_lock); 393 spin_unlock(&journal->j_list_lock);
@@ -395,14 +412,18 @@ restart:
395 * Now we have cleaned up the first transaction's checkpoint 412 * Now we have cleaned up the first transaction's checkpoint
396 * list. Let's clean up the second one 413 * list. Let's clean up the second one
397 */ 414 */
398 __wait_cp_io(journal, transaction); 415 err = __wait_cp_io(journal, transaction);
416 if (!result)
417 result = err;
399 } 418 }
400out: 419out:
401 spin_unlock(&journal->j_list_lock); 420 spin_unlock(&journal->j_list_lock);
402 result = jbd2_cleanup_journal_tail(journal);
403 if (result < 0) 421 if (result < 0)
404 return result; 422 jbd2_journal_abort(journal, result);
405 return 0; 423 else
424 result = jbd2_cleanup_journal_tail(journal);
425
426 return (result < 0) ? result : 0;
406} 427}
407 428
408/* 429/*
@@ -418,8 +439,9 @@ out:
418 * This is the only part of the journaling code which really needs to be 439 * This is the only part of the journaling code which really needs to be
419 * aware of transaction aborts. Checkpointing involves writing to the 440 * aware of transaction aborts. Checkpointing involves writing to the
420 * main filesystem area rather than to the journal, so it can proceed 441 * main filesystem area rather than to the journal, so it can proceed
421 * even in abort state, but we must not update the journal superblock if 442 * even in abort state, but we must not update the super block if
422 * we have an abort error outstanding. 443 * checkpointing may have failed. Otherwise, we would lose some metadata
444 * buffers which should be written-back to the filesystem.
423 */ 445 */
424 446
425int jbd2_cleanup_journal_tail(journal_t *journal) 447int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -428,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
428 tid_t first_tid; 450 tid_t first_tid;
429 unsigned long blocknr, freed; 451 unsigned long blocknr, freed;
430 452
453 if (is_journal_aborted(journal))
454 return 1;
455
431 /* OK, work out the oldest transaction remaining in the log, and 456 /* OK, work out the oldest transaction remaining in the log, and
432 * the log block it starts at. 457 * the log block it starts at.
433 * 458 *
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 01c3901c3a07..783de118de92 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1451,9 +1451,12 @@ recovery_error:
1451 * 1451 *
1452 * Release a journal_t structure once it is no longer in use by the 1452 * Release a journal_t structure once it is no longer in use by the
1453 * journaled object. 1453 * journaled object.
1454 * Return <0 if we couldn't clean up the journal.
1454 */ 1455 */
1455void jbd2_journal_destroy(journal_t *journal) 1456int jbd2_journal_destroy(journal_t *journal)
1456{ 1457{
1458 int err = 0;
1459
1457 /* Wait for the commit thread to wake up and die. */ 1460 /* Wait for the commit thread to wake up and die. */
1458 journal_kill_thread(journal); 1461 journal_kill_thread(journal);
1459 1462
@@ -1476,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
1476 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1479 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1477 spin_unlock(&journal->j_list_lock); 1480 spin_unlock(&journal->j_list_lock);
1478 1481
1479 /* We can now mark the journal as empty. */
1480 journal->j_tail = 0;
1481 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1482 if (journal->j_sb_buffer) { 1482 if (journal->j_sb_buffer) {
1483 jbd2_journal_update_superblock(journal, 1); 1483 if (!is_journal_aborted(journal)) {
1484 /* We can now mark the journal as empty. */
1485 journal->j_tail = 0;
1486 journal->j_tail_sequence =
1487 ++journal->j_transaction_sequence;
1488 jbd2_journal_update_superblock(journal, 1);
1489 } else {
1490 err = -EIO;
1491 }
1484 brelse(journal->j_sb_buffer); 1492 brelse(journal->j_sb_buffer);
1485 } 1493 }
1486 1494
@@ -1492,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
1492 jbd2_journal_destroy_revoke(journal); 1500 jbd2_journal_destroy_revoke(journal);
1493 kfree(journal->j_wbuf); 1501 kfree(journal->j_wbuf);
1494 kfree(journal); 1502 kfree(journal);
1503
1504 return err;
1495} 1505}
1496 1506
1497 1507
@@ -1717,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
1717 spin_lock(&journal->j_list_lock); 1727 spin_lock(&journal->j_list_lock);
1718 while (!err && journal->j_checkpoint_transactions != NULL) { 1728 while (!err && journal->j_checkpoint_transactions != NULL) {
1719 spin_unlock(&journal->j_list_lock); 1729 spin_unlock(&journal->j_list_lock);
1730 mutex_lock(&journal->j_checkpoint_mutex);
1720 err = jbd2_log_do_checkpoint(journal); 1731 err = jbd2_log_do_checkpoint(journal);
1732 mutex_unlock(&journal->j_checkpoint_mutex);
1721 spin_lock(&journal->j_list_lock); 1733 spin_lock(&journal->j_list_lock);
1722 } 1734 }
1723 spin_unlock(&journal->j_list_lock); 1735 spin_unlock(&journal->j_list_lock);
1736
1737 if (is_journal_aborted(journal))
1738 return -EIO;
1739
1724 jbd2_cleanup_journal_tail(journal); 1740 jbd2_cleanup_journal_tail(journal);
1725 1741
1726 /* Finally, mark the journal as really needing no recovery. 1742 /* Finally, mark the journal as really needing no recovery.
@@ -1742,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
1742 J_ASSERT(journal->j_head == journal->j_tail); 1758 J_ASSERT(journal->j_head == journal->j_tail);
1743 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1759 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1744 spin_unlock(&journal->j_state_lock); 1760 spin_unlock(&journal->j_state_lock);
1745 return err; 1761 return 0;
1746} 1762}
1747 1763
1748/** 1764/**
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b76..73063285b13f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do { \
225 */ 225 */
226int jbd2_journal_recover(journal_t *journal) 226int jbd2_journal_recover(journal_t *journal)
227{ 227{
228 int err; 228 int err, err2;
229 journal_superblock_t * sb; 229 journal_superblock_t * sb;
230 230
231 struct recovery_info info; 231 struct recovery_info info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
263 journal->j_transaction_sequence = ++info.end_transaction; 263 journal->j_transaction_sequence = ++info.end_transaction;
264 264
265 jbd2_journal_clear_revoke(journal); 265 jbd2_journal_clear_revoke(journal);
266 sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
267 if (!err)
268 err = err2;
269
267 return err; 270 return err;
268} 271}
269 272
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 66c3499478b5..c9e7d781db31 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1060,7 +1060,7 @@ extern void jbd2_journal_clear_features
1060 (journal_t *, unsigned long, unsigned long, unsigned long); 1060 (journal_t *, unsigned long, unsigned long, unsigned long);
1061extern int jbd2_journal_create (journal_t *); 1061extern int jbd2_journal_create (journal_t *);
1062extern int jbd2_journal_load (journal_t *journal); 1062extern int jbd2_journal_load (journal_t *journal);
1063extern void jbd2_journal_destroy (journal_t *); 1063extern int jbd2_journal_destroy (journal_t *);
1064extern int jbd2_journal_recover (journal_t *journal); 1064extern int jbd2_journal_recover (journal_t *journal);
1065extern int jbd2_journal_wipe (journal_t *, int); 1065extern int jbd2_journal_wipe (journal_t *, int);
1066extern int jbd2_journal_skip_recovery (journal_t *); 1066extern int jbd2_journal_skip_recovery (journal_t *);