aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> 2008-10-22 17:15:00 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-10-23 11:55:01 -0400
commit: 4afe978530702c934dfdb11f54073136818b2119 (patch)
tree: 5f7fb9539b46c0b390157f55c84017e14b7f605c
parent: 66f50ee3cee4c9d98eea0add6f439e6e5e0ca4a5 (diff)
jbd: fix error handling for checkpoint io
When a checkpointing IO fails, the current JBD code doesn't check the error and continues journaling. This means the latest metadata can be lost from both the journal and the filesystem.

This patch leaves the failed metadata blocks in the journal space and aborts journaling in the case of log_do_checkpoint(). To achieve this, we need to do the following:

1. don't remove the failed buffer from the checkpoint list in the case of __try_to_free_cp_buf(), because it may be released or overwritten by a later transaction
2. log_do_checkpoint() is the last chance, so remove the failed buffer from the checkpoint list and abort the journal
3. when checkpointing fails, don't update the journal super block, to prevent the journaled contents from being cleaned. For safety, don't update j_tail and j_tail_sequence either
4. when checkpointing fails, notify the ext3 layer of this error so that ext3 doesn't clear the needs_recovery flag; otherwise the journaled contents are ignored and cleaned in the recovery phase
5. if the recovery fails, keep the needs_recovery flag
6. prevent cleanup_journal_tail() from being called between __journal_drop_transaction() and journal_abort() (a race issue between journal_flush() and __log_wait_for_space())

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/jbd/checkpoint.c 49
-rw-r--r-- fs/jbd/journal.c 28
-rw-r--r-- fs/jbd/recovery.c 7
-rw-r--r-- include/linux/jbd.h 2
4 files changed, 65 insertions, 21 deletions
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb88..e29293501d42 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
93 int ret = 0; 93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 94 struct buffer_head *bh = jh2bh(jh);
95 95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 !buffer_dirty(bh) && buffer_uptodate(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 98 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __journal_remove_checkpoint(jh) + 1; 99 ret = __journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 100 jbd_unlock_bh_state(bh);
@@ -160,21 +161,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
160 * buffers. Note that we take the buffers in the opposite ordering 161 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 162 * from the one in which they were submitted for IO.
162 * 163 *
164 * Return 0 on success, and return <0 if some buffers have failed
165 * to be written out.
166 *
163 * Called with j_list_lock held. 167 * Called with j_list_lock held.
164 */ 168 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 169static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{ 170{
167 struct journal_head *jh; 171 struct journal_head *jh;
168 struct buffer_head *bh; 172 struct buffer_head *bh;
169 tid_t this_tid; 173 tid_t this_tid;
170 int released = 0; 174 int released = 0;
175 int ret = 0;
171 176
172 this_tid = transaction->t_tid; 177 this_tid = transaction->t_tid;
173restart: 178restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 179 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 180 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 181 transaction->t_tid != this_tid)
177 return; 182 return ret;
178 while (!released && transaction->t_checkpoint_io_list) { 183 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 184 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 185 bh = jh2bh(jh);
@@ -194,6 +199,9 @@ restart:
194 spin_lock(&journal->j_list_lock); 199 spin_lock(&journal->j_list_lock);
195 goto restart; 200 goto restart;
196 } 201 }
202 if (unlikely(!buffer_uptodate(bh)))
203 ret = -EIO;
204
197 /* 205 /*
198 * Now in whatever state the buffer currently is, we know that 206 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 207 * it has been written out and so we can drop it from the list
@@ -203,6 +211,8 @@ restart:
203 journal_remove_journal_head(bh); 211 journal_remove_journal_head(bh);
204 __brelse(bh); 212 __brelse(bh);
205 } 213 }
214
215 return ret;
206} 216}
207 217
208#define NR_BATCH 64 218#define NR_BATCH 64
@@ -226,7 +236,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
226 * Try to flush one buffer from the checkpoint list to disk. 236 * Try to flush one buffer from the checkpoint list to disk.
227 * 237 *
228 * Return 1 if something happened which requires us to abort the current 238 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 239 * scan of the checkpoint list. Return <0 if the buffer has failed to
240 * be written out.
230 * 241 *
231 * Called with j_list_lock held and drops it if 1 is returned 242 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 243 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +267,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
256 log_wait_commit(journal, tid); 267 log_wait_commit(journal, tid);
257 ret = 1; 268 ret = 1;
258 } else if (!buffer_dirty(bh)) { 269 } else if (!buffer_dirty(bh)) {
270 ret = 1;
271 if (unlikely(!buffer_uptodate(bh)))
272 ret = -EIO;
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 273 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint"); 274 BUFFER_TRACE(bh, "remove from checkpoint");
261 __journal_remove_checkpoint(jh); 275 __journal_remove_checkpoint(jh);
@@ -263,7 +277,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
263 jbd_unlock_bh_state(bh); 277 jbd_unlock_bh_state(bh);
264 journal_remove_journal_head(bh); 278 journal_remove_journal_head(bh);
265 __brelse(bh); 279 __brelse(bh);
266 ret = 1;
267 } else { 280 } else {
268 /* 281 /*
269 * Important: we are about to write the buffer, and 282 * Important: we are about to write the buffer, and
@@ -295,6 +308,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
295 * to disk. We submit larger chunks of data at once. 308 * to disk. We submit larger chunks of data at once.
296 * 309 *
297 * The journal should be locked before calling this function. 310 * The journal should be locked before calling this function.
311 * Called with j_checkpoint_mutex held.
298 */ 312 */
299int log_do_checkpoint(journal_t *journal) 313int log_do_checkpoint(journal_t *journal)
300{ 314{
@@ -318,6 +332,7 @@ int log_do_checkpoint(journal_t *journal)
318 * OK, we need to start writing disk blocks. Take one transaction 332 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it. 333 * and write it.
320 */ 334 */
335 result = 0;
321 spin_lock(&journal->j_list_lock); 336 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions) 337 if (!journal->j_checkpoint_transactions)
323 goto out; 338 goto out;
@@ -334,7 +349,7 @@ restart:
334 int batch_count = 0; 349 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH]; 350 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh; 351 struct journal_head *jh;
337 int retry = 0; 352 int retry = 0, err;
338 353
339 while (!retry && transaction->t_checkpoint_list) { 354 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh; 355 struct buffer_head *bh;
@@ -347,6 +362,8 @@ restart:
347 break; 362 break;
348 } 363 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 364 retry = __process_buffer(journal, jh, bhs,&batch_count);
365 if (retry < 0 && !result)
366 result = retry;
350 if (!retry && (need_resched() || 367 if (!retry && (need_resched() ||
351 spin_needbreak(&journal->j_list_lock))) { 368 spin_needbreak(&journal->j_list_lock))) {
352 spin_unlock(&journal->j_list_lock); 369 spin_unlock(&journal->j_list_lock);
@@ -371,14 +388,18 @@ restart:
371 * Now we have cleaned up the first transaction's checkpoint 388 * Now we have cleaned up the first transaction's checkpoint
372 * list. Let's clean up the second one 389 * list. Let's clean up the second one
373 */ 390 */
374 __wait_cp_io(journal, transaction); 391 err = __wait_cp_io(journal, transaction);
392 if (!result)
393 result = err;
375 } 394 }
376out: 395out:
377 spin_unlock(&journal->j_list_lock); 396 spin_unlock(&journal->j_list_lock);
378 result = cleanup_journal_tail(journal);
379 if (result < 0) 397 if (result < 0)
380 return result; 398 journal_abort(journal, result);
381 return 0; 399 else
400 result = cleanup_journal_tail(journal);
401
402 return (result < 0) ? result : 0;
382} 403}
383 404
384/* 405/*
@@ -394,8 +415,9 @@ out:
394 * This is the only part of the journaling code which really needs to be 415 * This is the only part of the journaling code which really needs to be
395 * aware of transaction aborts. Checkpointing involves writing to the 416 * aware of transaction aborts. Checkpointing involves writing to the
396 * main filesystem area rather than to the journal, so it can proceed 417 * main filesystem area rather than to the journal, so it can proceed
397 * even in abort state, but we must not update the journal superblock if 418 * even in abort state, but we must not update the super block if
398 * we have an abort error outstanding. 419 * checkpointing may have failed. Otherwise, we would lose some metadata
420 * buffers which should be written-back to the filesystem.
399 */ 421 */
400 422
401int cleanup_journal_tail(journal_t *journal) 423int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +426,9 @@ int cleanup_journal_tail(journal_t *journal)
404 tid_t first_tid; 426 tid_t first_tid;
405 unsigned long blocknr, freed; 427 unsigned long blocknr, freed;
406 428
429 if (is_journal_aborted(journal))
430 return 1;
431
407 /* OK, work out the oldest transaction remaining in the log, and 432 /* OK, work out the oldest transaction remaining in the log, and
408 * the log block it starts at. 433 * the log block it starts at.
409 * 434 *
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index aa7143a8349b..9e4fa52d7dc8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1121,9 +1121,12 @@ recovery_error:
1121 * 1121 *
1122 * Release a journal_t structure once it is no longer in use by the 1122 * Release a journal_t structure once it is no longer in use by the
1123 * journaled object. 1123 * journaled object.
1124 * Return <0 if we couldn't clean up the journal.
1124 */ 1125 */
1125void journal_destroy(journal_t *journal) 1126int journal_destroy(journal_t *journal)
1126{ 1127{
1128 int err = 0;
1129
1127 /* Wait for the commit thread to wake up and die. */ 1130 /* Wait for the commit thread to wake up and die. */
1128 journal_kill_thread(journal); 1131 journal_kill_thread(journal);
1129 1132
@@ -1146,11 +1149,16 @@ void journal_destroy(journal_t *journal)
1146 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1149 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1147 spin_unlock(&journal->j_list_lock); 1150 spin_unlock(&journal->j_list_lock);
1148 1151
1149 /* We can now mark the journal as empty. */
1150 journal->j_tail = 0;
1151 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1152 if (journal->j_sb_buffer) { 1152 if (journal->j_sb_buffer) {
1153 journal_update_superblock(journal, 1); 1153 if (!is_journal_aborted(journal)) {
1154 /* We can now mark the journal as empty. */
1155 journal->j_tail = 0;
1156 journal->j_tail_sequence =
1157 ++journal->j_transaction_sequence;
1158 journal_update_superblock(journal, 1);
1159 } else {
1160 err = -EIO;
1161 }
1154 brelse(journal->j_sb_buffer); 1162 brelse(journal->j_sb_buffer);
1155 } 1163 }
1156 1164
@@ -1160,6 +1168,8 @@ void journal_destroy(journal_t *journal)
1160 journal_destroy_revoke(journal); 1168 journal_destroy_revoke(journal);
1161 kfree(journal->j_wbuf); 1169 kfree(journal->j_wbuf);
1162 kfree(journal); 1170 kfree(journal);
1171
1172 return err;
1163} 1173}
1164 1174
1165 1175
@@ -1359,10 +1369,16 @@ int journal_flush(journal_t *journal)
1359 spin_lock(&journal->j_list_lock); 1369 spin_lock(&journal->j_list_lock);
1360 while (!err && journal->j_checkpoint_transactions != NULL) { 1370 while (!err && journal->j_checkpoint_transactions != NULL) {
1361 spin_unlock(&journal->j_list_lock); 1371 spin_unlock(&journal->j_list_lock);
1372 mutex_lock(&journal->j_checkpoint_mutex);
1362 err = log_do_checkpoint(journal); 1373 err = log_do_checkpoint(journal);
1374 mutex_unlock(&journal->j_checkpoint_mutex);
1363 spin_lock(&journal->j_list_lock); 1375 spin_lock(&journal->j_list_lock);
1364 } 1376 }
1365 spin_unlock(&journal->j_list_lock); 1377 spin_unlock(&journal->j_list_lock);
1378
1379 if (is_journal_aborted(journal))
1380 return -EIO;
1381
1366 cleanup_journal_tail(journal); 1382 cleanup_journal_tail(journal);
1367 1383
1368 /* Finally, mark the journal as really needing no recovery. 1384 /* Finally, mark the journal as really needing no recovery.
@@ -1384,7 +1400,7 @@ int journal_flush(journal_t *journal)
1384 J_ASSERT(journal->j_head == journal->j_tail); 1400 J_ASSERT(journal->j_head == journal->j_tail);
1385 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1401 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1386 spin_unlock(&journal->j_state_lock); 1402 spin_unlock(&journal->j_state_lock);
1387 return err; 1403 return 0;
1388} 1404}
1389 1405
1390/** 1406/**
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 43bc5e5ed064..db5e982c5ddf 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -223,7 +223,7 @@ do { \
223 */ 223 */
224int journal_recover(journal_t *journal) 224int journal_recover(journal_t *journal)
225{ 225{
226 int err; 226 int err, err2;
227 journal_superblock_t * sb; 227 journal_superblock_t * sb;
228 228
229 struct recovery_info info; 229 struct recovery_info info;
@@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
261 journal->j_transaction_sequence = ++info.end_transaction; 261 journal->j_transaction_sequence = ++info.end_transaction;
262 262
263 journal_clear_revoke(journal); 263 journal_clear_revoke(journal);
264 sync_blockdev(journal->j_fs_dev); 264 err2 = sync_blockdev(journal->j_fs_dev);
265 if (!err)
266 err = err2;
267
265 return err; 268 return err;
266} 269}
267 270
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 35d4f6342fac..346e2b80be7d 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -911,7 +911,7 @@ extern int journal_set_features
911 (journal_t *, unsigned long, unsigned long, unsigned long); 911 (journal_t *, unsigned long, unsigned long, unsigned long);
912extern int journal_create (journal_t *); 912extern int journal_create (journal_t *);
913extern int journal_load (journal_t *journal); 913extern int journal_load (journal_t *journal);
914extern void journal_destroy (journal_t *); 914extern int journal_destroy (journal_t *);
915extern int journal_recover (journal_t *journal); 915extern int journal_recover (journal_t *journal);
916extern int journal_wipe (journal_t *, int); 916extern int journal_wipe (journal_t *, int);
917extern int journal_skip_recovery (journal_t *); 917extern int journal_skip_recovery (journal_t *);