author		Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
commit		7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree		e730a4565e0318140d2fbd2f0415d18a339d7336 /fs/jbd
parent		41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent		0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'fs/jbd')
-rw-r--r--	fs/jbd/Kconfig		30
-rw-r--r--	fs/jbd/checkpoint.c	68
-rw-r--r--	fs/jbd/commit.c		78
-rw-r--r--	fs/jbd/journal.c	36
-rw-r--r--	fs/jbd/recovery.c	7
-rw-r--r--	fs/jbd/revoke.c		163
-rw-r--r--	fs/jbd/transaction.c	77
7 files changed, 318 insertions(+), 141 deletions(-)
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
new file mode 100644
index 000000000000..4e28beeed157
--- /dev/null
+++ b/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+	tristate
+	help
+	  This is a generic journalling layer for block devices.  It is
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
+
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here: the module will be
+	  called jbd.  If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
+
+config JBD_DEBUG
+	bool "JBD (ext3) debugging support"
+	depends on JBD && DEBUG_FS
+	help
+	  If you are using the ext3 journaled file system (or potentially any
+	  other file system/device using JBD), this option allows you to
+	  enable debugging output while the system is running, in order to
+	  help track down any problems you are having.  By default the
+	  debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+	  number between 1 and 5, the higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb88..1bd8d4acc6f2 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -126,14 +127,29 @@ void __log_wait_for_space(journal_t *journal)
 
 	/*
 	 * Test again, another process may have checkpointed while we
-	 * were waiting for the checkpoint lock
+	 * were waiting for the checkpoint lock. If there are no
+	 * outstanding transactions there is nothing to checkpoint and
+	 * we can't make progress. Abort the journal in this case.
 	 */
 	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
 	nblocks = jbd_space_needed(journal);
 	if (__log_space_left(journal) < nblocks) {
+		int chkpt = journal->j_checkpoint_transactions != NULL;
+
+		spin_unlock(&journal->j_list_lock);
 		spin_unlock(&journal->j_state_lock);
-		log_do_checkpoint(journal);
+		if (chkpt) {
+			log_do_checkpoint(journal);
+		} else {
+			printk(KERN_ERR "%s: no transactions\n",
+			       __func__);
+			journal_abort(journal, 0);
+		}
+
 		spin_lock(&journal->j_state_lock);
+	} else {
+		spin_unlock(&journal->j_list_lock);
 	}
 	mutex_unlock(&journal->j_checkpoint_mutex);
 }
@@ -160,21 +176,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 	    transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +214,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +226,8 @@ restart:
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -226,7 +251,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
 *
 * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
 *
 * Called with j_list_lock held and drops it if 1 is returned
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +282,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__journal_remove_checkpoint(jh);
@@ -263,7 +292,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -295,6 +323,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 * to disk. We submit larger chunks of data at once.
 *
 * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
 */
 int log_do_checkpoint(journal_t *journal)
 {
@@ -318,6 +347,7 @@ int log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks. Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -334,7 +364,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -347,6 +377,8 @@ restart:
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -371,14 +403,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -394,8 +430,9 @@ out:
 * This is the only part of the journaling code which really needs to be
 * aware of transaction aborts.  Checkpointing involves writing to the
 * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
 */
 
 int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +441,9 @@ int cleanup_journal_tail(journal_t *journal)
 	tid_t first_tid;
 	unsigned long blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
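
Taken together, the checkpoint.c hunks above establish a new contract: log_do_checkpoint() must be entered with j_checkpoint_mutex held, it returns the first write error (<0) it encountered, and it aborts the journal itself on error. A minimal sketch of a conforming caller (hypothetical; the journal_flush() hunk in journal.c below is the real in-tree example):

	int err;

	mutex_lock(&journal->j_checkpoint_mutex);
	err = log_do_checkpoint(journal);	/* <0: a checkpoint buffer failed to write */
	mutex_unlock(&journal->j_checkpoint_mutex);
	if (err)
		return err;	/* the journal has already been aborted internally */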
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 5a8ca61498ca..25719d902c51 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 
 /*
 * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
- * Called under lock_journal(), and possibly under journal_datalist_lock.  The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock.  The caller provided us with a ref
+ * against the buffer, and we drop that here.
 */
 static void release_buffer_page(struct buffer_head *bh)
 {
@@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto nope;
 
 	page_cache_get(page);
@@ -78,6 +78,19 @@ nope:
 }
 
 /*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+	if (buffer_freed(bh)) {
+		clear_buffer_freed(bh);
+		release_buffer_page(bh);
+	} else
+		put_bh(bh);
+}
+
+/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
@@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 /*
 * Submit all the data buffers to disk
 */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
 				transaction_t *commit_transaction)
 {
 	struct journal_head *jh;
@@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal,
 	int locked;
 	int bufs = 0;
 	struct buffer_head **wbuf = journal->j_wbuf;
+	int err = 0;
 
 	/*
 	 * Whenever we unlock the journal and sleep, things can get added
@@ -207,7 +221,7 @@ write_out_data:
 		 * blocking lock_buffer().
 		 */
 		if (buffer_dirty(bh)) {
-			if (test_set_buffer_locked(bh)) {
+			if (!trylock_buffer(bh)) {
 				BUFFER_TRACE(bh, "needs blocking lock");
 				spin_unlock(&journal->j_list_lock);
 				/* Write out all data to prevent deadlocks */
@@ -231,7 +245,7 @@ write_out_data:
 			if (locked)
 				unlock_buffer(bh);
 			BUFFER_TRACE(bh, "already cleaned up");
-			put_bh(bh);
+			release_data_buffer(bh);
 			continue;
 		}
 		if (locked && test_clear_buffer_dirty(bh)) {
@@ -253,15 +267,17 @@ write_out_data:
 			put_bh(bh);
 		} else {
 			BUFFER_TRACE(bh, "writeout complete: unfile");
+			if (unlikely(!buffer_uptodate(bh)))
+				err = -EIO;
 			__journal_unfile_buffer(jh);
 			jbd_unlock_bh_state(bh);
 			if (locked)
 				unlock_buffer(bh);
 			journal_remove_journal_head(bh);
-			/* Once for our safety reference, once for
+			/* One for our safety reference, other for
 			 * journal_remove_journal_head() */
 			put_bh(bh);
-			put_bh(bh);
+			release_data_buffer(bh);
 		}
 
 		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
@@ -271,6 +287,8 @@ write_out_data:
 	}
 	spin_unlock(&journal->j_list_lock);
 	journal_do_submit_data(wbuf, bufs);
+
+	return err;
 }
 
 /*
@@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = 0;
-	journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal)
 		if (buffer_locked(bh)) {
 			spin_unlock(&journal->j_list_lock);
 			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				err = -EIO;
 			spin_lock(&journal->j_list_lock);
 		}
+		if (unlikely(!buffer_uptodate(bh))) {
+			if (!trylock_page(bh->b_page)) {
+				spin_unlock(&journal->j_list_lock);
+				lock_page(bh->b_page);
+				spin_lock(&journal->j_list_lock);
+			}
+			if (bh->b_page->mapping)
+				set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+			unlock_page(bh->b_page);
+			SetPageError(bh->b_page);
+			err = -EIO;
+		}
 		if (!inverted_lock(journal, bh)) {
 			put_bh(bh);
 			spin_lock(&journal->j_list_lock);
@@ -443,17 +471,23 @@ void journal_commit_transaction(journal_t *journal)
 		} else {
 			jbd_unlock_bh_state(bh);
 		}
-		put_bh(bh);
+		release_data_buffer(bh);
 		cond_resched_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
 
-	if (err)
-		journal_abort(journal, err);
+	if (err) {
+		char b[BDEVNAME_SIZE];
 
-	journal_write_revoke_records(journal, commit_transaction);
+		printk(KERN_WARNING
+			"JBD: Detected IO errors while flushing file data "
+			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
+		err = 0;
+	}
 
-	jbd_debug(3, "JBD: commit phase 2\n");
+	journal_write_revoke_records(journal, commit_transaction);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
@@ -486,9 +520,10 @@ void journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -730,6 +765,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 6\n");
 
 	if (journal_write_commit_record(journal, commit_transaction))
@@ -820,6 +858,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
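
Note the behavioural change in the commit path above: an ordered-mode data write error now only aborts the journal when JFS_ABORT_ON_SYNCDATA_ERR is set; otherwise it is reported with a printk and the commit proceeds. A hedged sketch of how a filesystem would opt in at mount time (the test_opt()/DATA_ERR_ABORT names mirror ext3's data_err=abort mount option and are assumptions here, not part of this diff):

	/* assumption: illustrative mount-time opt-in, not from this patch */
	if (test_opt(sb, DATA_ERR_ABORT))
		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
	else
		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;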
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b99c3b3654c4..9e4fa52d7dc8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features);
 EXPORT_SYMBOL(journal_create);
 EXPORT_SYMBOL(journal_load);
 EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_update_superblock);
 EXPORT_SYMBOL(journal_abort);
 EXPORT_SYMBOL(journal_errno);
 EXPORT_SYMBOL(journal_ack_err);
@@ -1122,9 +1121,12 @@ recovery_error:
 *
 * Release a journal_t structure once it is no longer in use by the
 * journaled object.
+ * Return <0 if we couldn't clean up the journal.
 */
-void journal_destroy(journal_t *journal)
+int journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1147,11 +1149,16 @@ void journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1161,6 +1168,8 @@ void journal_destroy(journal_t *journal)
 	journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1360,10 +1369,16 @@ int journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1385,7 +1400,7 @@ int journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
@@ -1636,9 +1651,10 @@ static int journal_init_journal_head_cache(void)
 
 static void journal_destroy_journal_head_cache(void)
 {
-	J_ASSERT(journal_head_cache != NULL);
-	kmem_cache_destroy(journal_head_cache);
-	journal_head_cache = NULL;
+	if (journal_head_cache) {
+		kmem_cache_destroy(journal_head_cache);
+		journal_head_cache = NULL;
+	}
 }
 
 /*
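
Since journal_destroy() now returns an int (and no longer rewrites the superblock when the journal is aborted), unmount paths can detect that the journal was not cleanly emptied. A simplified sketch of a caller (modelled on what an ext3_put_super()-style path would do; the ext3_abort() call here is an assumption, not part of this diff):

	/* assumption: simplified unmount-path caller, not from this patch */
	err = journal_destroy(sbi->s_journal);
	sbi->s_journal = NULL;
	if (err < 0)
		ext3_abort(sb, __func__, "Couldn't clean up the journal");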
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 43bc5e5ed064..db5e982c5ddf 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -223,7 +223,7 @@ do { \
 */
 int journal_recover(journal_t *journal)
 {
-	int		err;
+	int		err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 1bb43e987f4b..c7bd649bbbdc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
 	return NULL;
 }
 
+void journal_destroy_revoke_caches(void)
+{
+	if (revoke_record_cache) {
+		kmem_cache_destroy(revoke_record_cache);
+		revoke_record_cache = NULL;
+	}
+	if (revoke_table_cache) {
+		kmem_cache_destroy(revoke_table_cache);
+		revoke_table_cache = NULL;
+	}
+}
+
 int __init journal_init_revoke_caches(void)
 {
+	J_ASSERT(!revoke_record_cache);
+	J_ASSERT(!revoke_table_cache);
+
 	revoke_record_cache = kmem_cache_create("revoke_record",
 					   sizeof(struct jbd_revoke_record_s),
 					   0,
 					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
 					   NULL);
 	if (!revoke_record_cache)
-		return -ENOMEM;
+		goto record_cache_failure;
 
 	revoke_table_cache = kmem_cache_create("revoke_table",
 					   sizeof(struct jbd_revoke_table_s),
 					   0, SLAB_TEMPORARY, NULL);
-	if (!revoke_table_cache) {
-		kmem_cache_destroy(revoke_record_cache);
-		revoke_record_cache = NULL;
-		return -ENOMEM;
-	}
+	if (!revoke_table_cache)
+		goto table_cache_failure;
+
 	return 0;
-}
 
-void journal_destroy_revoke_caches(void)
-{
-	kmem_cache_destroy(revoke_record_cache);
-	revoke_record_cache = NULL;
-	kmem_cache_destroy(revoke_table_cache);
-	revoke_table_cache = NULL;
+table_cache_failure:
+	journal_destroy_revoke_caches();
+record_cache_failure:
+	return -ENOMEM;
 }
 
-/* Initialise the revoke table for a given journal to a given size. */
-
-int journal_init_revoke(journal_t *journal, int hash_size)
+static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
 {
-	int shift, tmp;
+	int shift = 0;
+	int tmp = hash_size;
+	struct jbd_revoke_table_s *table;
 
-	J_ASSERT (journal->j_revoke_table[0] == NULL);
+	table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+	if (!table)
+		goto out;
 
-	shift = 0;
-	tmp = hash_size;
 	while((tmp >>= 1UL) != 0UL)
 		shift++;
 
-	journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!journal->j_revoke_table[0])
-		return -ENOMEM;
-	journal->j_revoke = journal->j_revoke_table[0];
-
-	/* Check that the hash_size is a power of two */
-	J_ASSERT(is_power_of_2(hash_size));
-
-	journal->j_revoke->hash_size = hash_size;
-
-	journal->j_revoke->hash_shift = shift;
-
-	journal->j_revoke->hash_table =
+	table->hash_size = hash_size;
+	table->hash_shift = shift;
+	table->hash_table =
 		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!journal->j_revoke->hash_table) {
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		journal->j_revoke = NULL;
-		return -ENOMEM;
+	if (!table->hash_table) {
+		kmem_cache_free(revoke_table_cache, table);
+		table = NULL;
+		goto out;
 	}
 
 	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+		INIT_LIST_HEAD(&table->hash_table[tmp]);
 
-	journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!journal->j_revoke_table[1]) {
-		kfree(journal->j_revoke_table[0]->hash_table);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		return -ENOMEM;
+out:
+	return table;
+}
+
+static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
+{
+	int i;
+	struct list_head *hash_list;
+
+	for (i = 0; i < table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT(list_empty(hash_list));
 	}
 
-	journal->j_revoke = journal->j_revoke_table[1];
+	kfree(table->hash_table);
+	kmem_cache_free(revoke_table_cache, table);
+}
 
-	/* Check that the hash_size is a power of two */
+/* Initialise the revoke table for a given journal to a given size. */
+int journal_init_revoke(journal_t *journal, int hash_size)
+{
+	J_ASSERT(journal->j_revoke_table[0] == NULL);
 	J_ASSERT(is_power_of_2(hash_size));
 
-	journal->j_revoke->hash_size = hash_size;
+	journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[0])
+		goto fail0;
 
-	journal->j_revoke->hash_shift = shift;
+	journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[1])
+		goto fail1;
 
-	journal->j_revoke->hash_table =
-		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!journal->j_revoke->hash_table) {
-		kfree(journal->j_revoke_table[0]->hash_table);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
-		journal->j_revoke = NULL;
-		return -ENOMEM;
-	}
-
-	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+	journal->j_revoke = journal->j_revoke_table[1];
 
 	spin_lock_init(&journal->j_revoke_lock);
 
 	return 0;
-}
 
-/* Destoy a journal's revoke table. The table must already be empty! */
+fail1:
+	journal_destroy_revoke_table(journal->j_revoke_table[0]);
+fail0:
+	return -ENOMEM;
+}
 
+/* Destroy a journal's revoke table. The table must already be empty! */
 void journal_destroy_revoke(journal_t *journal)
 {
-	struct jbd_revoke_table_s *table;
-	struct list_head *hash_list;
-	int i;
-
-	table = journal->j_revoke_table[0];
-	if (!table)
-		return;
-
-	for (i=0; i<table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT (list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
-	journal->j_revoke = NULL;
-
-	table = journal->j_revoke_table[1];
-	if (!table)
-		return;
-
-	for (i=0; i<table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT (list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
 	journal->j_revoke = NULL;
+	if (journal->j_revoke_table[0])
+		journal_destroy_revoke_table(journal->j_revoke_table[0]);
+	if (journal->j_revoke_table[1])
+		journal_destroy_revoke_table(journal->j_revoke_table[1]);
 }
 
 
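
The revoke.c refactoring makes journal_destroy_revoke_caches() safe to call at any point (each cache pointer is checked and reset to NULL), which is what lets journal_init_revoke_caches() unwind through it via the new goto labels. A sketch of the resulting pairing (the jbd_setup() wrapper is hypothetical, for illustration only):

	/* assumption: hypothetical init path illustrating the pairing */
	static int __init jbd_setup(void)
	{
		int ret = journal_init_revoke_caches();

		if (ret)
			return ret;	/* partial allocations already freed */
		return 0;
	}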
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 67ff2024c23c..d15cd6e7251e 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		goto out;
 	}
 
-	lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+	lock_map_acquire(&handle->h_lockdep_map);
 
 out:
 	return handle;
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_brelse = 0;
 	struct journal_head *jh;
+	int ret = 0;
 
 	if (is_handle_aborted(handle))
-		return 0;
+		return ret;
 
 	jh = journal_add_journal_head(bh);
 	JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 		   time if it is redirtied */
 	}
 
-	/* journal_clean_data_list() may have got there first */
+	/*
+	 * We cannot remove the buffer with io error from the
+	 * committing transaction, because otherwise it would
+	 * miss the error and the commit would not abort.
+	 */
+	if (unlikely(!buffer_uptodate(bh))) {
+		ret = -EIO;
+		goto no_journal;
+	}
+
 	if (jh->b_transaction != NULL) {
 		JBUFFER_TRACE(jh, "unfile from commit");
 		__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
 	}
 	JBUFFER_TRACE(jh, "exit");
 	journal_put_journal_head(jh);
-	return 0;
+	return ret;
 }
 
 /**
@@ -1448,7 +1458,7 @@ int journal_stop(handle_t *handle)
 		spin_unlock(&journal->j_state_lock);
 	}
 
-	lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+	lock_map_release(&handle->h_lockdep_map);
 
 	jbd_free_handle(handle);
 	return err;
@@ -1648,12 +1658,42 @@ out:
 	return;
 }
 
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction().
+ * The latter might still hold a count on buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free that buffer.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	log_wait_commit(journal, tid);
+}
 
 /**
 * int journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
 *
 *
 * For all the buffers on this page,
@@ -1682,9 +1722,11 @@ out:
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
 */
 int journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t unused_gfp_mask)
+				struct page *page, gfp_t gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
@@ -1713,7 +1755,28 @@ int journal_try_to_free_buffers(journal_t *journal,
 		if (buffer_jbd(bh))
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
+
 	ret = try_to_free_buffers(page);
+
+	/*
+	 * There are a number of places where journal_try_to_free_buffers()
+	 * could race with journal_commit_transaction(); the latter still
+	 * holds the reference to the buffers to free while processing them.
+	 * try_to_free_buffers() fails to free those buffers. Some callers
+	 * of releasepage() request page buffers to be dropped; others treat
+	 * the failure to free as an error (such as generic_file_direct_IO())
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour (i.e. make sure buffers are dropped upon return),
+	 * let's wait for the current transaction to finish flushing
+	 * dirty data buffers, then try to free those buffers again,
+	 * with the journal locked.
+	 */
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
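
One point worth spelling out about the journal_try_to_free_buffers() change: the blocking retry only fires for callers whose gfp_mask allows it. In kernels of this era GFP_KERNEL contains both __GFP_WAIT and __GFP_FS, while GFP_NOFS lacks __GFP_FS, so reclaim running in filesystem context never waits here. A sketch from the caller's side (try_to_release_page() is the real VM entry point; the two calls are illustrative):

	/* assumption: illustrative callers, not part of this patch */
	ok = try_to_release_page(page, GFP_KERNEL);	/* may wait for the committing
							 * transaction, then retry the free */
	ok = try_to_release_page(page, GFP_NOFS);	/* __GFP_FS clear: never waits */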