author     Jan Kara <jack@suse.cz>    2015-06-18 10:52:29 -0400
committer  Jan Kara <jack@suse.com>   2015-07-23 14:59:40 -0400
commit     c290ea01abb7907fde602f3ba55905ef10a37477 (patch)
tree       67b3f47105259178034ef42d096bb5accd9407a3 /fs/jbd
parent     82ff50b222d8ac645cdeba974c612c9eef01c3dd (diff)
fs: Remove ext3 filesystem driver
The functionality of ext3 is fully supported by the ext4 driver, and major
distributions (SUSE, Red Hat) have been using the ext4 driver to handle ext3
filesystems for quite some time. There is some ugliness in mm resulting from
jbd cleaning buffers in a dirty page without clearing the page's dirty bit,
and buffer bouncing support in the block layer when stable pages are required
exists only because of jbd. So let's remove the ext3 driver. This saves us
some 28k lines of duplicated code.

Acked-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Diffstat (limited to 'fs/jbd')
-rw-r--r--  fs/jbd/Kconfig          30
-rw-r--r--  fs/jbd/Makefile          7
-rw-r--r--  fs/jbd/checkpoint.c    782
-rw-r--r--  fs/jbd/commit.c       1021
-rw-r--r--  fs/jbd/journal.c      2145
-rw-r--r--  fs/jbd/recovery.c      594
-rw-r--r--  fs/jbd/revoke.c        733
-rw-r--r--  fs/jbd/transaction.c  2237
8 files changed, 0 insertions(+), 7549 deletions(-)
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
deleted file mode 100644
index 4e28beeed157..000000000000
--- a/fs/jbd/Kconfig
+++ /dev/null
@@ -1,30 +0,0 @@
1config JBD
2 tristate
3 help
4 This is a generic journalling layer for block devices. It is
5 currently used by the ext3 file system, but it could also be
6 used to add journal support to other file systems or block
7 devices such as RAID or LVM.
8
9 If you are using the ext3 file system, you need to say Y here.
10 If you are not using ext3 then you will probably want to say N.
11
12 To compile this device as a module, choose M here: the module will be
13 called jbd. If you are compiling ext3 into the kernel, you
14 cannot compile this code as a module.
15
16config JBD_DEBUG
17 bool "JBD (ext3) debugging support"
18 depends on JBD && DEBUG_FS
19 help
20 If you are using the ext3 journaled file system (or potentially any
21 other file system/device using JBD), this option allows you to
22 enable debugging output while the system is running, in order to
23 help track down any problems you are having. By default the
24 debugging output will be turned off.
25
26 If you select Y here, then you will be able to turn on debugging
27 with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
28 number between 1 and 5, the higher the number, the more debugging
29 output is generated. To turn debugging off again, do
30 "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile
deleted file mode 100644
index 54aca4868a36..000000000000
--- a/fs/jbd/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
1#
2# Makefile for the linux journaling routines.
3#
4
5obj-$(CONFIG_JBD) += jbd.o
6
7jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
deleted file mode 100644
index 08c03044abdd..000000000000
--- a/fs/jbd/checkpoint.c
+++ /dev/null
@@ -1,782 +0,0 @@
1/*
2 * linux/fs/jbd/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/blkdev.h>
26#include <trace/events/jbd.h>
27
28/*
29 * Unlink a buffer from a transaction checkpoint list.
30 *
31 * Called with j_list_lock held.
32 */
33static inline void __buffer_unlink_first(struct journal_head *jh)
34{
35 transaction_t *transaction = jh->b_cp_transaction;
36
37 jh->b_cpnext->b_cpprev = jh->b_cpprev;
38 jh->b_cpprev->b_cpnext = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh) {
40 transaction->t_checkpoint_list = jh->b_cpnext;
41 if (transaction->t_checkpoint_list == jh)
42 transaction->t_checkpoint_list = NULL;
43 }
44}
45
46/*
47 * Unlink a buffer from a transaction checkpoint(io) list.
48 *
49 * Called with j_list_lock held.
50 */
51static inline void __buffer_unlink(struct journal_head *jh)
52{
53 transaction_t *transaction = jh->b_cp_transaction;
54
55 __buffer_unlink_first(jh);
56 if (transaction->t_checkpoint_io_list == jh) {
57 transaction->t_checkpoint_io_list = jh->b_cpnext;
58 if (transaction->t_checkpoint_io_list == jh)
59 transaction->t_checkpoint_io_list = NULL;
60 }
61}
62
63/*
64 * Move a buffer from the checkpoint list to the checkpoint io list
65 *
66 * Called with j_list_lock held
67 */
68static inline void __buffer_relink_io(struct journal_head *jh)
69{
70 transaction_t *transaction = jh->b_cp_transaction;
71
72 __buffer_unlink_first(jh);
73
74 if (!transaction->t_checkpoint_io_list) {
75 jh->b_cpnext = jh->b_cpprev = jh;
76 } else {
77 jh->b_cpnext = transaction->t_checkpoint_io_list;
78 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
79 jh->b_cpprev->b_cpnext = jh;
80 jh->b_cpnext->b_cpprev = jh;
81 }
82 transaction->t_checkpoint_io_list = jh;
83}
84
85/*
86 * Try to release a checkpointed buffer from its transaction.
87 * Returns 1 if we released it and 2 if we also released the
88 * whole transaction.
89 *
90 * Requires j_list_lock
91 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
92 */
93static int __try_to_free_cp_buf(struct journal_head *jh)
94{
95 int ret = 0;
96 struct buffer_head *bh = jh2bh(jh);
97
98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /*
101 * Get our reference so that bh cannot be freed before
102 * we unlock it
103 */
104 get_bh(bh);
105 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __journal_remove_checkpoint(jh) + 1;
107 jbd_unlock_bh_state(bh);
108 BUFFER_TRACE(bh, "release");
109 __brelse(bh);
110 } else {
111 jbd_unlock_bh_state(bh);
112 }
113 return ret;
114}
115
116/*
117 * __log_wait_for_space: wait until there is space in the journal.
118 *
119 * Called under j_state_lock *only*. It will be unlocked if we have to wait
120 * for a checkpoint to free up some space in the log.
121 */
122void __log_wait_for_space(journal_t *journal)
123{
124 int nblocks, space_left;
125 assert_spin_locked(&journal->j_state_lock);
126
127 nblocks = jbd_space_needed(journal);
128 while (__log_space_left(journal) < nblocks) {
129 if (journal->j_flags & JFS_ABORT)
130 return;
131 spin_unlock(&journal->j_state_lock);
132 mutex_lock(&journal->j_checkpoint_mutex);
133
134 /*
135 * Test again, another process may have checkpointed while we
136 * were waiting for the checkpoint lock. If there are no
137 * transactions ready to be checkpointed, try to recover
138 * journal space by calling cleanup_journal_tail(), and if
139 * that doesn't work, by waiting for the currently committing
140 * transaction to complete. If there is absolutely no way
141 * to make progress, this is either a BUG or corrupted
142 * filesystem, so abort the journal and leave a stack
143 * trace for forensic evidence.
144 */
145 spin_lock(&journal->j_state_lock);
146 spin_lock(&journal->j_list_lock);
147 nblocks = jbd_space_needed(journal);
148 space_left = __log_space_left(journal);
149 if (space_left < nblocks) {
150 int chkpt = journal->j_checkpoint_transactions != NULL;
151 tid_t tid = 0;
152
153 if (journal->j_committing_transaction)
154 tid = journal->j_committing_transaction->t_tid;
155 spin_unlock(&journal->j_list_lock);
156 spin_unlock(&journal->j_state_lock);
157 if (chkpt) {
158 log_do_checkpoint(journal);
159 } else if (cleanup_journal_tail(journal) == 0) {
160 /* We were able to recover space; yay! */
161 ;
162 } else if (tid) {
163 log_wait_commit(journal, tid);
164 } else {
165 printk(KERN_ERR "%s: needed %d blocks and "
166 "only had %d space available\n",
167 __func__, nblocks, space_left);
168 printk(KERN_ERR "%s: no way to get more "
169 "journal space\n", __func__);
170 WARN_ON(1);
171 journal_abort(journal, 0);
172 }
173 spin_lock(&journal->j_state_lock);
174 } else {
175 spin_unlock(&journal->j_list_lock);
176 }
177 mutex_unlock(&journal->j_checkpoint_mutex);
178 }
179}
180
181/*
182 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
183 * The caller must restart a list walk. Wait for someone else to run
184 * jbd_unlock_bh_state().
185 */
186static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
187 __releases(journal->j_list_lock)
188{
189 get_bh(bh);
190 spin_unlock(&journal->j_list_lock);
191 jbd_lock_bh_state(bh);
192 jbd_unlock_bh_state(bh);
193 put_bh(bh);
194}
195
196/*
197 * Clean up transaction's list of buffers submitted for io.
198 * We wait for any pending IO to complete and remove any clean
199 * buffers. Note that we take the buffers in the opposite ordering
200 * from the one in which they were submitted for IO.
201 *
202 * Return 0 on success, and return <0 if some buffers have failed
203 * to be written out.
204 *
205 * Called with j_list_lock held.
206 */
207static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
208{
209 struct journal_head *jh;
210 struct buffer_head *bh;
211 tid_t this_tid;
212 int released = 0;
213 int ret = 0;
214
215 this_tid = transaction->t_tid;
216restart:
217 /* Did somebody clean up the transaction in the meanwhile? */
218 if (journal->j_checkpoint_transactions != transaction ||
219 transaction->t_tid != this_tid)
220 return ret;
221 while (!released && transaction->t_checkpoint_io_list) {
222 jh = transaction->t_checkpoint_io_list;
223 bh = jh2bh(jh);
224 if (!jbd_trylock_bh_state(bh)) {
225 jbd_sync_bh(journal, bh);
226 spin_lock(&journal->j_list_lock);
227 goto restart;
228 }
229 get_bh(bh);
230 if (buffer_locked(bh)) {
231 spin_unlock(&journal->j_list_lock);
232 jbd_unlock_bh_state(bh);
233 wait_on_buffer(bh);
234 /* the journal_head may have gone by now */
235 BUFFER_TRACE(bh, "brelse");
236 __brelse(bh);
237 spin_lock(&journal->j_list_lock);
238 goto restart;
239 }
240 if (unlikely(buffer_write_io_error(bh)))
241 ret = -EIO;
242
243 /*
244 * Now in whatever state the buffer currently is, we know that
245 * it has been written out and so we can drop it from the list
246 */
247 released = __journal_remove_checkpoint(jh);
248 jbd_unlock_bh_state(bh);
249 __brelse(bh);
250 }
251
252 return ret;
253}
254
255#define NR_BATCH 64
256
257static void
258__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
259{
260 int i;
261 struct blk_plug plug;
262
263 blk_start_plug(&plug);
264 for (i = 0; i < *batch_count; i++)
265 write_dirty_buffer(bhs[i], WRITE_SYNC);
266 blk_finish_plug(&plug);
267
268 for (i = 0; i < *batch_count; i++) {
269 struct buffer_head *bh = bhs[i];
270 clear_buffer_jwrite(bh);
271 BUFFER_TRACE(bh, "brelse");
272 __brelse(bh);
273 }
274 *batch_count = 0;
275}
276
277/*
278 * Try to flush one buffer from the checkpoint list to disk.
279 *
280 * Return 1 if something happened which requires us to abort the current
281 * scan of the checkpoint list. Return <0 if the buffer has failed to
282 * be written out.
283 *
284 * Called with j_list_lock held and drops it if 1 is returned
285 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
286 */
287static int __process_buffer(journal_t *journal, struct journal_head *jh,
288 struct buffer_head **bhs, int *batch_count)
289{
290 struct buffer_head *bh = jh2bh(jh);
291 int ret = 0;
292
293 if (buffer_locked(bh)) {
294 get_bh(bh);
295 spin_unlock(&journal->j_list_lock);
296 jbd_unlock_bh_state(bh);
297 wait_on_buffer(bh);
298 /* the journal_head may have gone by now */
299 BUFFER_TRACE(bh, "brelse");
300 __brelse(bh);
301 ret = 1;
302 } else if (jh->b_transaction != NULL) {
303 transaction_t *t = jh->b_transaction;
304 tid_t tid = t->t_tid;
305
306 spin_unlock(&journal->j_list_lock);
307 jbd_unlock_bh_state(bh);
308 log_start_commit(journal, tid);
309 log_wait_commit(journal, tid);
310 ret = 1;
311 } else if (!buffer_dirty(bh)) {
312 ret = 1;
313 if (unlikely(buffer_write_io_error(bh)))
314 ret = -EIO;
315 get_bh(bh);
316 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
317 BUFFER_TRACE(bh, "remove from checkpoint");
318 __journal_remove_checkpoint(jh);
319 spin_unlock(&journal->j_list_lock);
320 jbd_unlock_bh_state(bh);
321 __brelse(bh);
322 } else {
323 /*
324 * Important: we are about to write the buffer, and
325 * possibly block, while still holding the journal lock.
326 * We cannot afford to let the transaction logic start
327 * messing around with this buffer before we write it to
328 * disk, as that would break recoverability.
329 */
330 BUFFER_TRACE(bh, "queue");
331 get_bh(bh);
332 J_ASSERT_BH(bh, !buffer_jwrite(bh));
333 set_buffer_jwrite(bh);
334 bhs[*batch_count] = bh;
335 __buffer_relink_io(jh);
336 jbd_unlock_bh_state(bh);
337 (*batch_count)++;
338 if (*batch_count == NR_BATCH) {
339 spin_unlock(&journal->j_list_lock);
340 __flush_batch(journal, bhs, batch_count);
341 ret = 1;
342 }
343 }
344 return ret;
345}
346
347/*
348 * Perform an actual checkpoint. We take the first transaction on the
349 * list of transactions to be checkpointed and send all its buffers
350 * to disk. We submit larger chunks of data at once.
351 *
352 * The journal should be locked before calling this function.
353 * Called with j_checkpoint_mutex held.
354 */
355int log_do_checkpoint(journal_t *journal)
356{
357 transaction_t *transaction;
358 tid_t this_tid;
359 int result;
360
361 jbd_debug(1, "Start checkpoint\n");
362
363 /*
364 * First thing: if there are any transactions in the log which
365 * don't need checkpointing, just eliminate them from the
366 * journal straight away.
367 */
368 result = cleanup_journal_tail(journal);
369 trace_jbd_checkpoint(journal, result);
370 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
371 if (result <= 0)
372 return result;
373
374 /*
375 * OK, we need to start writing disk blocks. Take one transaction
376 * and write it.
377 */
378 result = 0;
379 spin_lock(&journal->j_list_lock);
380 if (!journal->j_checkpoint_transactions)
381 goto out;
382 transaction = journal->j_checkpoint_transactions;
383 this_tid = transaction->t_tid;
384restart:
385 /*
386 * If someone cleaned up this transaction while we slept, we're
387 * done (maybe it's a new transaction, but it fell at the same
388 * address).
389 */
390 if (journal->j_checkpoint_transactions == transaction &&
391 transaction->t_tid == this_tid) {
392 int batch_count = 0;
393 struct buffer_head *bhs[NR_BATCH];
394 struct journal_head *jh;
395 int retry = 0, err;
396
397 while (!retry && transaction->t_checkpoint_list) {
398 struct buffer_head *bh;
399
400 jh = transaction->t_checkpoint_list;
401 bh = jh2bh(jh);
402 if (!jbd_trylock_bh_state(bh)) {
403 jbd_sync_bh(journal, bh);
404 retry = 1;
405 break;
406 }
407 retry = __process_buffer(journal, jh, bhs,&batch_count);
408 if (retry < 0 && !result)
409 result = retry;
410 if (!retry && (need_resched() ||
411 spin_needbreak(&journal->j_list_lock))) {
412 spin_unlock(&journal->j_list_lock);
413 retry = 1;
414 break;
415 }
416 }
417
418 if (batch_count) {
419 if (!retry) {
420 spin_unlock(&journal->j_list_lock);
421 retry = 1;
422 }
423 __flush_batch(journal, bhs, &batch_count);
424 }
425
426 if (retry) {
427 spin_lock(&journal->j_list_lock);
428 goto restart;
429 }
430 /*
431 * Now we have cleaned up the first transaction's checkpoint
432 * list. Let's clean up the second one
433 */
434 err = __wait_cp_io(journal, transaction);
435 if (!result)
436 result = err;
437 }
438out:
439 spin_unlock(&journal->j_list_lock);
440 if (result < 0)
441 journal_abort(journal, result);
442 else
443 result = cleanup_journal_tail(journal);
444
445 return (result < 0) ? result : 0;
446}
447
448/*
449 * Check the list of checkpoint transactions for the journal to see if
450 * we have already got rid of any since the last update of the log tail
451 * in the journal superblock. If so, we can instantly roll the
452 * superblock forward to remove those transactions from the log.
453 *
454 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
455 *
456 * This is the only part of the journaling code which really needs to be
457 * aware of transaction aborts. Checkpointing involves writing to the
458 * main filesystem area rather than to the journal, so it can proceed
459 * even in abort state, but we must not update the super block if
460 * checkpointing may have failed. Otherwise, we would lose some metadata
461 * buffers which should be written-back to the filesystem.
462 */
463
464int cleanup_journal_tail(journal_t *journal)
465{
466 transaction_t * transaction;
467 tid_t first_tid;
468 unsigned int blocknr, freed;
469
470 if (is_journal_aborted(journal))
471 return 1;
472
473 /*
474 * OK, work out the oldest transaction remaining in the log, and
475 * the log block it starts at.
476 *
477 * If the log is now empty, we need to work out which is the
478 * next transaction ID we will write, and where it will
479 * start.
480 */
481 spin_lock(&journal->j_state_lock);
482 spin_lock(&journal->j_list_lock);
483 transaction = journal->j_checkpoint_transactions;
484 if (transaction) {
485 first_tid = transaction->t_tid;
486 blocknr = transaction->t_log_start;
487 } else if ((transaction = journal->j_committing_transaction) != NULL) {
488 first_tid = transaction->t_tid;
489 blocknr = transaction->t_log_start;
490 } else if ((transaction = journal->j_running_transaction) != NULL) {
491 first_tid = transaction->t_tid;
492 blocknr = journal->j_head;
493 } else {
494 first_tid = journal->j_transaction_sequence;
495 blocknr = journal->j_head;
496 }
497 spin_unlock(&journal->j_list_lock);
498 J_ASSERT(blocknr != 0);
499
500 /* If the oldest pinned transaction is at the tail of the log
501 already then there's not much we can do right now. */
502 if (journal->j_tail_sequence == first_tid) {
503 spin_unlock(&journal->j_state_lock);
504 return 1;
505 }
506 spin_unlock(&journal->j_state_lock);
507
508 /*
509 * We need to make sure that any blocks that were recently written out
510 * --- perhaps by log_do_checkpoint() --- are flushed out before we
511 * drop the transactions from the journal. Similarly we need to be sure
512 * superblock makes it to disk before next transaction starts reusing
513 * freed space (otherwise we could replay some blocks of the new
514 * transaction thinking they belong to the old one). So we use
515 * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
516 * with an appropriately sized journal, but we need this to guarantee
517 * correctness. Fortunately cleanup_journal_tail() doesn't get called
518 * all that often.
519 */
520 journal_update_sb_log_tail(journal, first_tid, blocknr,
521 WRITE_FLUSH_FUA);
522
523 spin_lock(&journal->j_state_lock);
524 /* OK, update the superblock to recover the freed space.
525 * Physical blocks come first: have we wrapped beyond the end of
526 * the log? */
527 freed = blocknr - journal->j_tail;
528 if (blocknr < journal->j_tail)
529 freed = freed + journal->j_last - journal->j_first;
530
531 trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
532 jbd_debug(1,
533 "Cleaning journal tail from %d to %d (offset %u), "
534 "freeing %u\n",
535 journal->j_tail_sequence, first_tid, blocknr, freed);
536
537 journal->j_free += freed;
538 journal->j_tail_sequence = first_tid;
539 journal->j_tail = blocknr;
540 spin_unlock(&journal->j_state_lock);
541 return 0;
542}
543
544
545/* Checkpoint list management */
546
547/*
548 * journal_clean_one_cp_list
549 *
550 * Find all the written-back checkpoint buffers in the given list and release
551 * them.
552 *
553 * Called with j_list_lock held.
554 * Returns number of buffers reaped (for debug)
555 */
556
557static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
558{
559 struct journal_head *last_jh;
560 struct journal_head *next_jh = jh;
561 int ret, freed = 0;
562
563 *released = 0;
564 if (!jh)
565 return 0;
566
567 last_jh = jh->b_cpprev;
568 do {
569 jh = next_jh;
570 next_jh = jh->b_cpnext;
571 /* Use trylock because of the ranking */
572 if (jbd_trylock_bh_state(jh2bh(jh))) {
573 ret = __try_to_free_cp_buf(jh);
574 if (ret) {
575 freed++;
576 if (ret == 2) {
577 *released = 1;
578 return freed;
579 }
580 }
581 }
582 /*
583 * This function only frees up some memory
584 * if possible so we don't have an obligation
585 * to finish processing. Bail out if preemption
586 * requested:
587 */
588 if (need_resched())
589 return freed;
590 } while (jh != last_jh);
591
592 return freed;
593}
594
595/*
596 * journal_clean_checkpoint_list
597 *
598 * Find all the written-back checkpoint buffers in the journal and release them.
599 *
600 * Called with the journal locked.
601 * Called with j_list_lock held.
602 * Returns number of buffers reaped (for debug)
603 */
604
605int __journal_clean_checkpoint_list(journal_t *journal)
606{
607 transaction_t *transaction, *last_transaction, *next_transaction;
608 int ret = 0;
609 int released;
610
611 transaction = journal->j_checkpoint_transactions;
612 if (!transaction)
613 goto out;
614
615 last_transaction = transaction->t_cpprev;
616 next_transaction = transaction;
617 do {
618 transaction = next_transaction;
619 next_transaction = transaction->t_cpnext;
620 ret += journal_clean_one_cp_list(transaction->
621 t_checkpoint_list, &released);
622 /*
623 * This function only frees up some memory if possible so we
624 * don't have an obligation to finish processing. Bail out if
625 * preemption requested:
626 */
627 if (need_resched())
628 goto out;
629 if (released)
630 continue;
631 /*
632 * It is essential that we are as careful as in the case of
633 * t_checkpoint_list with removing the buffer from the list as
634 * we can possibly see not yet submitted buffers on io_list
635 */
636 ret += journal_clean_one_cp_list(transaction->
637 t_checkpoint_io_list, &released);
638 if (need_resched())
639 goto out;
640 } while (transaction != last_transaction);
641out:
642 return ret;
643}
644
645/*
646 * journal_remove_checkpoint: called after a buffer has been committed
647 * to disk (either by being write-back flushed to disk, or being
648 * committed to the log).
649 *
650 * We cannot safely clean a transaction out of the log until all of the
651 * buffer updates committed in that transaction have safely been stored
652 * elsewhere on disk. To achieve this, all of the buffers in a
653 * transaction need to be maintained on the transaction's checkpoint
654 * lists until they have been rewritten, at which point this function is
655 * called to remove the buffer from the existing transaction's
656 * checkpoint lists.
657 *
658 * The function returns 1 if it frees the transaction, 0 otherwise.
659 * The function can free jh and bh.
660 *
661 * This function is called with j_list_lock held.
662 * This function is called with jbd_lock_bh_state(jh2bh(jh))
663 */
664
665int __journal_remove_checkpoint(struct journal_head *jh)
666{
667 transaction_t *transaction;
668 journal_t *journal;
669 int ret = 0;
670
671 JBUFFER_TRACE(jh, "entry");
672
673 if ((transaction = jh->b_cp_transaction) == NULL) {
674 JBUFFER_TRACE(jh, "not on transaction");
675 goto out;
676 }
677 journal = transaction->t_journal;
678
679 JBUFFER_TRACE(jh, "removing from transaction");
680 __buffer_unlink(jh);
681 jh->b_cp_transaction = NULL;
682 journal_put_journal_head(jh);
683
684 if (transaction->t_checkpoint_list != NULL ||
685 transaction->t_checkpoint_io_list != NULL)
686 goto out;
687
688 /*
689 * There is one special case to worry about: if we have just pulled the
690 * buffer off a running or committing transaction's checkpoint list,
691 * then even if the checkpoint list is empty, the transaction obviously
692 * cannot be dropped!
693 *
694 * The locking here around t_state is a bit sleazy.
695 * See the comment at the end of journal_commit_transaction().
696 */
697 if (transaction->t_state != T_FINISHED)
698 goto out;
699
700 /* OK, that was the last buffer for the transaction: we can now
701 safely remove this transaction from the log */
702
703 __journal_drop_transaction(journal, transaction);
704
705 /* Just in case anybody was waiting for more transactions to be
706 checkpointed... */
707 wake_up(&journal->j_wait_logspace);
708 ret = 1;
709out:
710 return ret;
711}
712
713/*
714 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
715 * list so that we know when it is safe to clean the transaction out of
716 * the log.
717 *
718 * Called with the journal locked.
719 * Called with j_list_lock held.
720 */
721void __journal_insert_checkpoint(struct journal_head *jh,
722 transaction_t *transaction)
723{
724 JBUFFER_TRACE(jh, "entry");
725 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
726 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
727
728 /* Get reference for checkpointing transaction */
729 journal_grab_journal_head(jh2bh(jh));
730 jh->b_cp_transaction = transaction;
731
732 if (!transaction->t_checkpoint_list) {
733 jh->b_cpnext = jh->b_cpprev = jh;
734 } else {
735 jh->b_cpnext = transaction->t_checkpoint_list;
736 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
737 jh->b_cpprev->b_cpnext = jh;
738 jh->b_cpnext->b_cpprev = jh;
739 }
740 transaction->t_checkpoint_list = jh;
741}
742
743/*
744 * We've finished with this transaction structure: adios...
745 *
746 * The transaction must have no links except for the checkpoint by this
747 * point.
748 *
749 * Called with the journal locked.
750 * Called with j_list_lock held.
751 */
752
753void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
754{
755 assert_spin_locked(&journal->j_list_lock);
756 if (transaction->t_cpnext) {
757 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
758 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
759 if (journal->j_checkpoint_transactions == transaction)
760 journal->j_checkpoint_transactions =
761 transaction->t_cpnext;
762 if (journal->j_checkpoint_transactions == transaction)
763 journal->j_checkpoint_transactions = NULL;
764 }
765
766 J_ASSERT(transaction->t_state == T_FINISHED);
767 J_ASSERT(transaction->t_buffers == NULL);
768 J_ASSERT(transaction->t_sync_datalist == NULL);
769 J_ASSERT(transaction->t_forget == NULL);
770 J_ASSERT(transaction->t_iobuf_list == NULL);
771 J_ASSERT(transaction->t_shadow_list == NULL);
772 J_ASSERT(transaction->t_log_list == NULL);
773 J_ASSERT(transaction->t_checkpoint_list == NULL);
774 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
775 J_ASSERT(transaction->t_updates == 0);
776 J_ASSERT(journal->j_committing_transaction != transaction);
777 J_ASSERT(journal->j_running_transaction != transaction);
778
779 trace_jbd_drop_transaction(journal, transaction);
780 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
781 kfree(transaction);
782}
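The checkpoint lists manipulated above (t_checkpoint_list and
t_checkpoint_io_list) are circular, doubly linked lists threaded through the
journal_head's b_cpnext/b_cpprev pointers, with the list pointer in the
transaction naming the most recently inserted element. A minimal standalone
sketch of that insert/unlink pattern, using a hypothetical simplified node
type rather than the kernel's journal_head, could look like this:

#include <stddef.h>

/* Hypothetical stand-in for struct journal_head's checkpoint linkage. */
struct cp_node {
	struct cp_node *cpnext;
	struct cp_node *cpprev;
};

/*
 * Insert 'node' into the circular list whose head is *list, mirroring
 * __journal_insert_checkpoint(): the new node becomes the head.
 */
static void cp_list_insert(struct cp_node **list, struct cp_node *node)
{
	if (!*list) {
		node->cpnext = node->cpprev = node;
	} else {
		node->cpnext = *list;
		node->cpprev = (*list)->cpprev;
		node->cpprev->cpnext = node;
		node->cpnext->cpprev = node;
	}
	*list = node;
}

/*
 * Unlink 'node', mirroring __buffer_unlink_first(): advance the head if
 * it pointed at 'node', and clear it once the list becomes empty.
 */
static void cp_list_unlink(struct cp_node **list, struct cp_node *node)
{
	node->cpnext->cpprev = node->cpprev;
	node->cpprev->cpnext = node->cpnext;
	if (*list == node) {
		*list = node->cpnext;
		if (*list == node)	/* 'node' was the only element */
			*list = NULL;
	}
}

The kernel code differs mainly in that the head lives in the transaction and
every manipulation happens under j_list_lock.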
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
deleted file mode 100644
index bb217dcb41af..000000000000
--- a/fs/jbd/commit.c
+++ /dev/null
@@ -1,1021 +0,0 @@
1/*
2 * linux/fs/jbd/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd.h>
19#include <linux/errno.h>
20#include <linux/mm.h>
21#include <linux/pagemap.h>
22#include <linux/bio.h>
23#include <linux/blkdev.h>
24#include <trace/events/jbd.h>
25
26/*
27 * Default IO end handler for temporary BJ_IO buffer_heads.
28 */
29static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
30{
31 BUFFER_TRACE(bh, "");
32 if (uptodate)
33 set_buffer_uptodate(bh);
34 else
35 clear_buffer_uptodate(bh);
36 unlock_buffer(bh);
37}
38
39/*
40 * When an ext3-ordered file is truncated, it is possible that many pages are
41 * not successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes
45 * the numbers in /proc/meminfo look odd.
46 *
47 * So here, we have a buffer which has just come off the forget list. Look to
48 * see if we can strip all buffers from the backing page.
49 *
50 * Called under journal->j_list_lock. The caller provided us with a ref
51 * against the buffer, and we drop that here.
52 */
53static void release_buffer_page(struct buffer_head *bh)
54{
55 struct page *page;
56
57 if (buffer_dirty(bh))
58 goto nope;
59 if (atomic_read(&bh->b_count) != 1)
60 goto nope;
61 page = bh->b_page;
62 if (!page)
63 goto nope;
64 if (page->mapping)
65 goto nope;
66
67 /* OK, it's a truncated page */
68 if (!trylock_page(page))
69 goto nope;
70
71 page_cache_get(page);
72 __brelse(bh);
73 try_to_free_buffers(page);
74 unlock_page(page);
75 page_cache_release(page);
76 return;
77
78nope:
79 __brelse(bh);
80}
81
82/*
83 * Decrement reference counter for data buffer. If it has been marked
84 * 'BH_Freed', release it and the page to which it belongs if possible.
85 */
86static void release_data_buffer(struct buffer_head *bh)
87{
88 if (buffer_freed(bh)) {
89 WARN_ON_ONCE(buffer_dirty(bh));
90 clear_buffer_freed(bh);
91 clear_buffer_mapped(bh);
92 clear_buffer_new(bh);
93 clear_buffer_req(bh);
94 bh->b_bdev = NULL;
95 release_buffer_page(bh);
96 } else
97 put_bh(bh);
98}
99
100/*
101 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
102 * held. For ranking reasons we must trylock. If we lose, schedule away and
103 * return 0. j_list_lock is dropped in this case.
104 */
105static int inverted_lock(journal_t *journal, struct buffer_head *bh)
106{
107 if (!jbd_trylock_bh_state(bh)) {
108 spin_unlock(&journal->j_list_lock);
109 schedule();
110 return 0;
111 }
112 return 1;
113}
114
115/* Done it all: now write the commit record. We should have
116 * cleaned up our previous buffers by now, so if we are in abort
117 * mode we can now just skip the rest of the journal write
118 * entirely.
119 *
120 * Returns 1 if the journal needs to be aborted or 0 on success
121 */
122static int journal_write_commit_record(journal_t *journal,
123 transaction_t *commit_transaction)
124{
125 struct journal_head *descriptor;
126 struct buffer_head *bh;
127 journal_header_t *header;
128 int ret;
129
130 if (is_journal_aborted(journal))
131 return 0;
132
133 descriptor = journal_get_descriptor_buffer(journal);
134 if (!descriptor)
135 return 1;
136
137 bh = jh2bh(descriptor);
138
139 header = (journal_header_t *)(bh->b_data);
140 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
141 header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
142 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
143
144 JBUFFER_TRACE(descriptor, "write commit block");
145 set_buffer_dirty(bh);
146
147 if (journal->j_flags & JFS_BARRIER)
148 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
149 else
150 ret = sync_dirty_buffer(bh);
151
152 put_bh(bh); /* One for getblk() */
153 journal_put_journal_head(descriptor);
154
155 return (ret == -EIO);
156}
157
158static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
159 int write_op)
160{
161 int i;
162
163 for (i = 0; i < bufs; i++) {
164 wbuf[i]->b_end_io = end_buffer_write_sync;
165 /*
166 * Here we write back pagecache data that may be mmaped. Since
167 * we cannot afford to clean the page and set PageWriteback
168 * here due to lock ordering (page lock ranks above transaction
169 * start), the data can change while IO is in flight. Tell the
170 * block layer it should bounce the bio pages if stable data
171 * during write is required.
172 *
173 * We use up our safety reference in submit_bh().
174 */
175 _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
176 }
177}
178
179/*
180 * Submit all the data buffers to disk
181 */
182static int journal_submit_data_buffers(journal_t *journal,
183 transaction_t *commit_transaction,
184 int write_op)
185{
186 struct journal_head *jh;
187 struct buffer_head *bh;
188 int locked;
189 int bufs = 0;
190 struct buffer_head **wbuf = journal->j_wbuf;
191 int err = 0;
192
193 /*
194 * Whenever we unlock the journal and sleep, things can get added
195 * onto ->t_sync_datalist, so we have to keep looping back to
196 * write_out_data until we *know* that the list is empty.
197 *
198 * Cleanup any flushed data buffers from the data list. Even in
199 * abort mode, we want to flush this out as soon as possible.
200 */
201write_out_data:
202 cond_resched();
203 spin_lock(&journal->j_list_lock);
204
205 while (commit_transaction->t_sync_datalist) {
206 jh = commit_transaction->t_sync_datalist;
207 bh = jh2bh(jh);
208 locked = 0;
209
210 /* Get reference just to make sure buffer does not disappear
211 * when we are forced to drop various locks */
212 get_bh(bh);
213 /* If the buffer is dirty, we need to submit IO and hence
214 * we need the buffer lock. We try to lock the buffer without
215 * blocking. If we fail, we need to drop j_list_lock and do
216 * blocking lock_buffer().
217 */
218 if (buffer_dirty(bh)) {
219 if (!trylock_buffer(bh)) {
220 BUFFER_TRACE(bh, "needs blocking lock");
221 spin_unlock(&journal->j_list_lock);
222 trace_jbd_do_submit_data(journal,
223 commit_transaction);
224 /* Write out all data to prevent deadlocks */
225 journal_do_submit_data(wbuf, bufs, write_op);
226 bufs = 0;
227 lock_buffer(bh);
228 spin_lock(&journal->j_list_lock);
229 }
230 locked = 1;
231 }
232 /* We have to get bh_state lock. Again out of order, sigh. */
233 if (!inverted_lock(journal, bh)) {
234 jbd_lock_bh_state(bh);
235 spin_lock(&journal->j_list_lock);
236 }
237 /* Someone already cleaned up the buffer? */
238 if (!buffer_jbd(bh) || bh2jh(bh) != jh
239 || jh->b_transaction != commit_transaction
240 || jh->b_jlist != BJ_SyncData) {
241 jbd_unlock_bh_state(bh);
242 if (locked)
243 unlock_buffer(bh);
244 BUFFER_TRACE(bh, "already cleaned up");
245 release_data_buffer(bh);
246 continue;
247 }
248 if (locked && test_clear_buffer_dirty(bh)) {
249 BUFFER_TRACE(bh, "needs writeout, adding to array");
250 wbuf[bufs++] = bh;
251 __journal_file_buffer(jh, commit_transaction,
252 BJ_Locked);
253 jbd_unlock_bh_state(bh);
254 if (bufs == journal->j_wbufsize) {
255 spin_unlock(&journal->j_list_lock);
256 trace_jbd_do_submit_data(journal,
257 commit_transaction);
258 journal_do_submit_data(wbuf, bufs, write_op);
259 bufs = 0;
260 goto write_out_data;
261 }
262 } else if (!locked && buffer_locked(bh)) {
263 __journal_file_buffer(jh, commit_transaction,
264 BJ_Locked);
265 jbd_unlock_bh_state(bh);
266 put_bh(bh);
267 } else {
268 BUFFER_TRACE(bh, "writeout complete: unfile");
269 if (unlikely(!buffer_uptodate(bh)))
270 err = -EIO;
271 __journal_unfile_buffer(jh);
272 jbd_unlock_bh_state(bh);
273 if (locked)
274 unlock_buffer(bh);
275 release_data_buffer(bh);
276 }
277
278 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
279 spin_unlock(&journal->j_list_lock);
280 goto write_out_data;
281 }
282 }
283 spin_unlock(&journal->j_list_lock);
284 trace_jbd_do_submit_data(journal, commit_transaction);
285 journal_do_submit_data(wbuf, bufs, write_op);
286
287 return err;
288}
289
290/*
291 * journal_commit_transaction
292 *
293 * The primary function for committing a transaction to the log. This
294 * function is called by the journal thread to begin a complete commit.
295 */
296void journal_commit_transaction(journal_t *journal)
297{
298 transaction_t *commit_transaction;
299 struct journal_head *jh, *new_jh, *descriptor;
300 struct buffer_head **wbuf = journal->j_wbuf;
301 int bufs;
302 int flags;
303 int err;
304 unsigned int blocknr;
305 ktime_t start_time;
306 u64 commit_time;
307 char *tagp = NULL;
308 journal_header_t *header;
309 journal_block_tag_t *tag = NULL;
310 int space_left = 0;
311 int first_tag = 0;
312 int tag_flag;
313 int i;
314 struct blk_plug plug;
315 int write_op = WRITE;
316
317 /*
318 * First job: lock down the current transaction and wait for
319 * all outstanding updates to complete.
320 */
321
322 /* Do we need to erase the effects of a prior journal_flush? */
323 if (journal->j_flags & JFS_FLUSHED) {
324 jbd_debug(3, "super block updated\n");
325 mutex_lock(&journal->j_checkpoint_mutex);
326 /*
327 * We hold j_checkpoint_mutex so tail cannot change under us.
328 * We don't need any special data guarantees for writing sb
329 * since journal is empty and it is ok for write to be
330 * flushed only with transaction commit.
331 */
332 journal_update_sb_log_tail(journal, journal->j_tail_sequence,
333 journal->j_tail, WRITE_SYNC);
334 mutex_unlock(&journal->j_checkpoint_mutex);
335 } else {
336 jbd_debug(3, "superblock not updated\n");
337 }
338
339 J_ASSERT(journal->j_running_transaction != NULL);
340 J_ASSERT(journal->j_committing_transaction == NULL);
341
342 commit_transaction = journal->j_running_transaction;
343
344 trace_jbd_start_commit(journal, commit_transaction);
345 jbd_debug(1, "JBD: starting commit of transaction %d\n",
346 commit_transaction->t_tid);
347
348 spin_lock(&journal->j_state_lock);
349 J_ASSERT(commit_transaction->t_state == T_RUNNING);
350 commit_transaction->t_state = T_LOCKED;
351
352 trace_jbd_commit_locking(journal, commit_transaction);
353 spin_lock(&commit_transaction->t_handle_lock);
354 while (commit_transaction->t_updates) {
355 DEFINE_WAIT(wait);
356
357 prepare_to_wait(&journal->j_wait_updates, &wait,
358 TASK_UNINTERRUPTIBLE);
359 if (commit_transaction->t_updates) {
360 spin_unlock(&commit_transaction->t_handle_lock);
361 spin_unlock(&journal->j_state_lock);
362 schedule();
363 spin_lock(&journal->j_state_lock);
364 spin_lock(&commit_transaction->t_handle_lock);
365 }
366 finish_wait(&journal->j_wait_updates, &wait);
367 }
368 spin_unlock(&commit_transaction->t_handle_lock);
369
370 J_ASSERT (commit_transaction->t_outstanding_credits <=
371 journal->j_max_transaction_buffers);
372
373 /*
374 * First thing we are allowed to do is to discard any remaining
375 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
376 * that there are no such buffers: if a large filesystem
377 * operation like a truncate needs to split itself over multiple
378 * transactions, then it may try to do a journal_restart() while
379 * there are still BJ_Reserved buffers outstanding. These must
380 * be released cleanly from the current transaction.
381 *
382 * In this case, the filesystem must still reserve write access
383 * again before modifying the buffer in the new transaction, but
384 * we do not require it to remember exactly which old buffers it
385 * has reserved. This is consistent with the existing behaviour
386 * that multiple journal_get_write_access() calls to the same
387 * buffer are perfectly permissible.
388 */
389 while (commit_transaction->t_reserved_list) {
390 jh = commit_transaction->t_reserved_list;
391 JBUFFER_TRACE(jh, "reserved, unused: refile");
392 /*
393 * A journal_get_undo_access()+journal_release_buffer() may
394 * leave undo-committed data.
395 */
396 if (jh->b_committed_data) {
397 struct buffer_head *bh = jh2bh(jh);
398
399 jbd_lock_bh_state(bh);
400 jbd_free(jh->b_committed_data, bh->b_size);
401 jh->b_committed_data = NULL;
402 jbd_unlock_bh_state(bh);
403 }
404 journal_refile_buffer(journal, jh);
405 }
406
407 /*
408 * Now try to drop any written-back buffers from the journal's
409 * checkpoint lists. We do this *before* commit because it potentially
410 * frees some memory
411 */
412 spin_lock(&journal->j_list_lock);
413 __journal_clean_checkpoint_list(journal);
414 spin_unlock(&journal->j_list_lock);
415
416 jbd_debug (3, "JBD: commit phase 1\n");
417
418 /*
419 * Clear revoked flag to reflect there are no revoked buffers
420 * in the next transaction which is going to be started.
421 */
422 journal_clear_buffer_revoked_flags(journal);
423
424 /*
425 * Switch to a new revoke table.
426 */
427 journal_switch_revoke_table(journal);
428
429 trace_jbd_commit_flushing(journal, commit_transaction);
430 commit_transaction->t_state = T_FLUSH;
431 journal->j_committing_transaction = commit_transaction;
432 journal->j_running_transaction = NULL;
433 start_time = ktime_get();
434 commit_transaction->t_log_start = journal->j_head;
435 wake_up(&journal->j_wait_transaction_locked);
436 spin_unlock(&journal->j_state_lock);
437
438 jbd_debug (3, "JBD: commit phase 2\n");
439
440 if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
441 write_op = WRITE_SYNC;
442
443 /*
444 * Now start flushing things to disk, in the order they appear
445 * on the transaction lists. Data blocks go first.
446 */
447 blk_start_plug(&plug);
448 err = journal_submit_data_buffers(journal, commit_transaction,
449 write_op);
450 blk_finish_plug(&plug);
451
452 /*
453 * Wait for all previously submitted IO to complete.
454 */
455 spin_lock(&journal->j_list_lock);
456 while (commit_transaction->t_locked_list) {
457 struct buffer_head *bh;
458
459 jh = commit_transaction->t_locked_list->b_tprev;
460 bh = jh2bh(jh);
461 get_bh(bh);
462 if (buffer_locked(bh)) {
463 spin_unlock(&journal->j_list_lock);
464 wait_on_buffer(bh);
465 spin_lock(&journal->j_list_lock);
466 }
467 if (unlikely(!buffer_uptodate(bh))) {
468 if (!trylock_page(bh->b_page)) {
469 spin_unlock(&journal->j_list_lock);
470 lock_page(bh->b_page);
471 spin_lock(&journal->j_list_lock);
472 }
473 if (bh->b_page->mapping)
474 set_bit(AS_EIO, &bh->b_page->mapping->flags);
475
476 unlock_page(bh->b_page);
477 SetPageError(bh->b_page);
478 err = -EIO;
479 }
480 if (!inverted_lock(journal, bh)) {
481 put_bh(bh);
482 spin_lock(&journal->j_list_lock);
483 continue;
484 }
485 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
486 jh->b_transaction == commit_transaction &&
487 jh->b_jlist == BJ_Locked)
488 __journal_unfile_buffer(jh);
489 jbd_unlock_bh_state(bh);
490 release_data_buffer(bh);
491 cond_resched_lock(&journal->j_list_lock);
492 }
493 spin_unlock(&journal->j_list_lock);
494
495 if (err) {
496 char b[BDEVNAME_SIZE];
497
498 printk(KERN_WARNING
499 "JBD: Detected IO errors while flushing file data "
500 "on %s\n", bdevname(journal->j_fs_dev, b));
501 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
502 journal_abort(journal, err);
503 err = 0;
504 }
505
506 blk_start_plug(&plug);
507
508 journal_write_revoke_records(journal, commit_transaction, write_op);
509
510 /*
511 * If we found any dirty or locked buffers, then we should have
512 * looped back up to the write_out_data label. If there weren't
513 * any then journal_clean_data_list should have wiped the list
514 * clean by now, so check that it is in fact empty.
515 */
516 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
517
518 jbd_debug (3, "JBD: commit phase 3\n");
519
520 /*
521 * Way to go: we have now written out all of the data for a
522 * transaction! Now comes the tricky part: we need to write out
523 * metadata. Loop over the transaction's entire buffer list:
524 */
525 spin_lock(&journal->j_state_lock);
526 commit_transaction->t_state = T_COMMIT;
527 spin_unlock(&journal->j_state_lock);
528
529 trace_jbd_commit_logging(journal, commit_transaction);
530 J_ASSERT(commit_transaction->t_nr_buffers <=
531 commit_transaction->t_outstanding_credits);
532
533 descriptor = NULL;
534 bufs = 0;
535 while (commit_transaction->t_buffers) {
536
537 /* Find the next buffer to be journaled... */
538
539 jh = commit_transaction->t_buffers;
540
541 /* If we're in abort mode, we just un-journal the buffer and
542 release it. */
543
544 if (is_journal_aborted(journal)) {
545 clear_buffer_jbddirty(jh2bh(jh));
546 JBUFFER_TRACE(jh, "journal is aborting: refile");
547 journal_refile_buffer(journal, jh);
548 /* If that was the last one, we need to clean up
549 * any descriptor buffers which may have been
550 * already allocated, even if we are now
551 * aborting. */
552 if (!commit_transaction->t_buffers)
553 goto start_journal_io;
554 continue;
555 }
556
557 /* Make sure we have a descriptor block in which to
558 record the metadata buffer. */
559
560 if (!descriptor) {
561 struct buffer_head *bh;
562
563 J_ASSERT (bufs == 0);
564
565 jbd_debug(4, "JBD: get descriptor\n");
566
567 descriptor = journal_get_descriptor_buffer(journal);
568 if (!descriptor) {
569 journal_abort(journal, -EIO);
570 continue;
571 }
572
573 bh = jh2bh(descriptor);
574 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
575 (unsigned long long)bh->b_blocknr, bh->b_data);
576 header = (journal_header_t *)&bh->b_data[0];
577 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
578 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
579 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
580
581 tagp = &bh->b_data[sizeof(journal_header_t)];
582 space_left = bh->b_size - sizeof(journal_header_t);
583 first_tag = 1;
584 set_buffer_jwrite(bh);
585 set_buffer_dirty(bh);
586 wbuf[bufs++] = bh;
587
588 /* Record it so that we can wait for IO
589 completion later */
590 BUFFER_TRACE(bh, "ph3: file as descriptor");
591 journal_file_buffer(descriptor, commit_transaction,
592 BJ_LogCtl);
593 }
594
595 /* Where is the buffer to be written? */
596
597 err = journal_next_log_block(journal, &blocknr);
598 /* If the block mapping failed, just abandon the buffer
599 and repeat this loop: we'll fall into the
600 refile-on-abort condition above. */
601 if (err) {
602 journal_abort(journal, err);
603 continue;
604 }
605
606 /*
607 * start_this_handle() uses t_outstanding_credits to determine
608 * the free space in the log, but this counter is changed
609 * by journal_next_log_block() also.
610 */
611 commit_transaction->t_outstanding_credits--;
612
613 /* Bump b_count to prevent truncate from stumbling over
614 the shadowed buffer! @@@ This can go if we ever get
615 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
616 get_bh(jh2bh(jh));
617
618 /* Make a temporary IO buffer with which to write it out
619 (this will requeue both the metadata buffer and the
620 temporary IO buffer). new_bh goes on BJ_IO */
621
622 set_buffer_jwrite(jh2bh(jh));
623 /*
624 * akpm: journal_write_metadata_buffer() sets
625 * new_bh->b_transaction to commit_transaction.
626 * We need to clean this up before we release new_bh
627 * (which is of type BJ_IO)
628 */
629 JBUFFER_TRACE(jh, "ph3: write metadata");
630 flags = journal_write_metadata_buffer(commit_transaction,
631 jh, &new_jh, blocknr);
632 set_buffer_jwrite(jh2bh(new_jh));
633 wbuf[bufs++] = jh2bh(new_jh);
634
635 /* Record the new block's tag in the current descriptor
636 buffer */
637
638 tag_flag = 0;
639 if (flags & 1)
640 tag_flag |= JFS_FLAG_ESCAPE;
641 if (!first_tag)
642 tag_flag |= JFS_FLAG_SAME_UUID;
643
644 tag = (journal_block_tag_t *) tagp;
645 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
646 tag->t_flags = cpu_to_be32(tag_flag);
647 tagp += sizeof(journal_block_tag_t);
648 space_left -= sizeof(journal_block_tag_t);
649
650 if (first_tag) {
651 memcpy (tagp, journal->j_uuid, 16);
652 tagp += 16;
653 space_left -= 16;
654 first_tag = 0;
655 }
656
657 /* If there's no more to do, or if the descriptor is full,
658 let the IO rip! */
659
660 if (bufs == journal->j_wbufsize ||
661 commit_transaction->t_buffers == NULL ||
662 space_left < sizeof(journal_block_tag_t) + 16) {
663
664 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
665
666 /* Write an end-of-descriptor marker before
667 submitting the IOs. "tag" still points to
668 the last tag we set up. */
669
670 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
671
672start_journal_io:
673 for (i = 0; i < bufs; i++) {
674 struct buffer_head *bh = wbuf[i];
675 lock_buffer(bh);
676 clear_buffer_dirty(bh);
677 set_buffer_uptodate(bh);
678 bh->b_end_io = journal_end_buffer_io_sync;
679 /*
680 * In data=journal mode, here we can end up
681 * writing pagecache data that might be
682 * mmapped. Since we can't afford to clean the
683 * page and set PageWriteback (see the comment
684 * near the other use of _submit_bh()), the
685 * data can change while the write is in
686 * flight. Tell the block layer to bounce the
687 * bio pages if stable pages are required.
688 */
689 _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
690 }
691 cond_resched();
692
693 /* Force a new descriptor to be generated next
694 time round the loop. */
695 descriptor = NULL;
696 bufs = 0;
697 }
698 }
699
700 blk_finish_plug(&plug);
701
702 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the
705 transaction's t_log_list queue, and metadata buffers are on
706 the t_iobuf_list queue.
707
708 Wait for the buffers in reverse order. That way we are
709 less likely to be woken up until all IOs have completed, and
710 so we incur less scheduling load.
711 */
712
713 jbd_debug(3, "JBD: commit phase 4\n");
714
715 /*
716 * akpm: these are BJ_IO, and j_list_lock is not needed.
717 * See __journal_try_to_free_buffer.
718 */
719wait_for_iobuf:
720 while (commit_transaction->t_iobuf_list != NULL) {
721 struct buffer_head *bh;
722
723 jh = commit_transaction->t_iobuf_list->b_tprev;
724 bh = jh2bh(jh);
725 if (buffer_locked(bh)) {
726 wait_on_buffer(bh);
727 goto wait_for_iobuf;
728 }
729 if (cond_resched())
730 goto wait_for_iobuf;
731
732 if (unlikely(!buffer_uptodate(bh)))
733 err = -EIO;
734
735 clear_buffer_jwrite(bh);
736
737 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
738 journal_unfile_buffer(journal, jh);
739
740 /*
741 * ->t_iobuf_list should contain only dummy buffer_heads
742 * which were created by journal_write_metadata_buffer().
743 */
744 BUFFER_TRACE(bh, "dumping temporary bh");
745 journal_put_journal_head(jh);
746 __brelse(bh);
747 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
748 free_buffer_head(bh);
749
750 /* We also have to unlock and free the corresponding
751 shadowed buffer */
752 jh = commit_transaction->t_shadow_list->b_tprev;
753 bh = jh2bh(jh);
754 clear_buffer_jwrite(bh);
755 J_ASSERT_BH(bh, buffer_jbddirty(bh));
756
757 /* The metadata is now released for reuse, but we need
758 to remember it against this transaction so that when
759 we finally commit, we can do any checkpointing
760 required. */
761 JBUFFER_TRACE(jh, "file as BJ_Forget");
762 journal_file_buffer(jh, commit_transaction, BJ_Forget);
763 /*
764 * Wake up any transactions which were waiting for this
765 * IO to complete. The barrier must be here so that changes
766 * by journal_file_buffer() take effect before wake_up_bit()
767 * does the waitqueue check.
768 */
769 smp_mb();
770 wake_up_bit(&bh->b_state, BH_Unshadow);
771 JBUFFER_TRACE(jh, "brelse shadowed buffer");
772 __brelse(bh);
773 }
774
775 J_ASSERT (commit_transaction->t_shadow_list == NULL);
776
777 jbd_debug(3, "JBD: commit phase 5\n");
778
779 /* Here we wait for the revoke record and descriptor record buffers */
780 wait_for_ctlbuf:
781 while (commit_transaction->t_log_list != NULL) {
782 struct buffer_head *bh;
783
784 jh = commit_transaction->t_log_list->b_tprev;
785 bh = jh2bh(jh);
786 if (buffer_locked(bh)) {
787 wait_on_buffer(bh);
788 goto wait_for_ctlbuf;
789 }
790 if (cond_resched())
791 goto wait_for_ctlbuf;
792
793 if (unlikely(!buffer_uptodate(bh)))
794 err = -EIO;
795
796 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
797 clear_buffer_jwrite(bh);
798 journal_unfile_buffer(journal, jh);
799 journal_put_journal_head(jh);
800 __brelse(bh); /* One for getblk */
801 /* AKPM: bforget here */
802 }
803
804 if (err)
805 journal_abort(journal, err);
806
807 jbd_debug(3, "JBD: commit phase 6\n");
808
809 /* All metadata is written, now write commit record and do cleanup */
810 spin_lock(&journal->j_state_lock);
811 J_ASSERT(commit_transaction->t_state == T_COMMIT);
812 commit_transaction->t_state = T_COMMIT_RECORD;
813 spin_unlock(&journal->j_state_lock);
814
815 if (journal_write_commit_record(journal, commit_transaction))
816 err = -EIO;
817
818 if (err)
819 journal_abort(journal, err);
820
821 /* End of a transaction! Finally, we can do checkpoint
822 processing: any buffers committed as a result of this
823 transaction can be removed from any checkpoint list it was on
824 before. */
825
826 jbd_debug(3, "JBD: commit phase 7\n");
827
828 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
829 J_ASSERT(commit_transaction->t_buffers == NULL);
830 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
831 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
832 J_ASSERT(commit_transaction->t_shadow_list == NULL);
833 J_ASSERT(commit_transaction->t_log_list == NULL);
834
835restart_loop:
836 /*
837 * As there are other places (journal_unmap_buffer()) adding buffers
838 * to this list we have to be careful and hold the j_list_lock.
839 */
840 spin_lock(&journal->j_list_lock);
841 while (commit_transaction->t_forget) {
842 transaction_t *cp_transaction;
843 struct buffer_head *bh;
844 int try_to_free = 0;
845
846 jh = commit_transaction->t_forget;
847 spin_unlock(&journal->j_list_lock);
848 bh = jh2bh(jh);
849 /*
850 * Get a reference so that bh cannot be freed before we are
851 * done with it.
852 */
853 get_bh(bh);
854 jbd_lock_bh_state(bh);
855 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
856 jh->b_transaction == journal->j_running_transaction);
857
858 /*
859 * If there is undo-protected committed data against
860 * this buffer, then we can remove it now. If it is a
861 * buffer needing such protection, the old frozen_data
862 * field now points to a committed version of the
863 * buffer, so rotate that field to the new committed
864 * data.
865 *
866 * Otherwise, we can just throw away the frozen data now.
867 */
868 if (jh->b_committed_data) {
869 jbd_free(jh->b_committed_data, bh->b_size);
870 jh->b_committed_data = NULL;
871 if (jh->b_frozen_data) {
872 jh->b_committed_data = jh->b_frozen_data;
873 jh->b_frozen_data = NULL;
874 }
875 } else if (jh->b_frozen_data) {
876 jbd_free(jh->b_frozen_data, bh->b_size);
877 jh->b_frozen_data = NULL;
878 }
879
880 spin_lock(&journal->j_list_lock);
881 cp_transaction = jh->b_cp_transaction;
882 if (cp_transaction) {
883 JBUFFER_TRACE(jh, "remove from old cp transaction");
884 __journal_remove_checkpoint(jh);
885 }
886
887 /* Only re-checkpoint the buffer_head if it is marked
888 * dirty. If the buffer was added to the BJ_Forget list
889 * by journal_forget, it may no longer be dirty and
890 * there's no point in keeping a checkpoint record for
891 * it. */
892
893 /*
894 * A buffer which has been freed while still being journaled by
895 * a previous transaction.
896 */
897 if (buffer_freed(bh)) {
898 /*
899 * If the running transaction is the one containing
900 * "add to orphan" operation (b_next_transaction !=
901 * NULL), we have to wait for that transaction to
902 * commit before we can really get rid of the buffer.
903 * So just clear b_modified to not confuse transaction
904 * credit accounting and refile the buffer to
905 * BJ_Forget of the running transaction. If the just
906 * committed transaction contains "add to orphan"
907 * operation, we can completely invalidate the buffer
908 * now. We are rather thorough in that since the
909 * buffer may be still accessible when blocksize <
910 * pagesize and it is attached to the last partial
911 * page.
912 */
913 jh->b_modified = 0;
914 if (!jh->b_next_transaction) {
915 clear_buffer_freed(bh);
916 clear_buffer_jbddirty(bh);
917 clear_buffer_mapped(bh);
918 clear_buffer_new(bh);
919 clear_buffer_req(bh);
920 bh->b_bdev = NULL;
921 }
922 }
923
924 if (buffer_jbddirty(bh)) {
925 JBUFFER_TRACE(jh, "add to new checkpointing trans");
926 __journal_insert_checkpoint(jh, commit_transaction);
927 if (is_journal_aborted(journal))
928 clear_buffer_jbddirty(bh);
929 } else {
930 J_ASSERT_BH(bh, !buffer_dirty(bh));
931 /*
932 * The buffer on BJ_Forget list and not jbddirty means
933 * it has been freed by this transaction and hence it
934 * could not have been reallocated until this
935 * transaction has committed. *BUT* it could be
936 * reallocated once we have written all the data to
937 * disk and before we process the buffer on BJ_Forget
938 * list.
939 */
940 if (!jh->b_next_transaction)
941 try_to_free = 1;
942 }
943 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
944 __journal_refile_buffer(jh);
945 jbd_unlock_bh_state(bh);
946 if (try_to_free)
947 release_buffer_page(bh);
948 else
949 __brelse(bh);
950 cond_resched_lock(&journal->j_list_lock);
951 }
952 spin_unlock(&journal->j_list_lock);
953 /*
954 * This is a bit sleazy. We use j_list_lock to protect transition
955 * of a transaction into T_FINISHED state and calling
956 * __journal_drop_transaction(). Otherwise we could race with
957 * other checkpointing code processing the transaction...
958 */
959 spin_lock(&journal->j_state_lock);
960 spin_lock(&journal->j_list_lock);
961 /*
962 * Now recheck if some buffers did not get attached to the transaction
963 * while the lock was dropped...
964 */
965 if (commit_transaction->t_forget) {
966 spin_unlock(&journal->j_list_lock);
967 spin_unlock(&journal->j_state_lock);
968 goto restart_loop;
969 }
970
971 /* Done with this transaction! */
972
973 jbd_debug(3, "JBD: commit phase 8\n");
974
975 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
976
977 commit_transaction->t_state = T_FINISHED;
978 J_ASSERT(commit_transaction == journal->j_committing_transaction);
979 journal->j_commit_sequence = commit_transaction->t_tid;
980 journal->j_committing_transaction = NULL;
981 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
982
983 /*
984 * weight the commit time higher than the average time so we don't
985 * react too strongly to vast changes in commit time
986 */
987 if (likely(journal->j_average_commit_time))
988 journal->j_average_commit_time = (commit_time*3 +
989 journal->j_average_commit_time) / 4;
990 else
991 journal->j_average_commit_time = commit_time;
992
993 spin_unlock(&journal->j_state_lock);
994
995 if (commit_transaction->t_checkpoint_list == NULL &&
996 commit_transaction->t_checkpoint_io_list == NULL) {
997 __journal_drop_transaction(journal, commit_transaction);
998 } else {
999 if (journal->j_checkpoint_transactions == NULL) {
1000 journal->j_checkpoint_transactions = commit_transaction;
1001 commit_transaction->t_cpnext = commit_transaction;
1002 commit_transaction->t_cpprev = commit_transaction;
1003 } else {
1004 commit_transaction->t_cpnext =
1005 journal->j_checkpoint_transactions;
1006 commit_transaction->t_cpprev =
1007 commit_transaction->t_cpnext->t_cpprev;
1008 commit_transaction->t_cpnext->t_cpprev =
1009 commit_transaction;
1010 commit_transaction->t_cpprev->t_cpnext =
1011 commit_transaction;
1012 }
1013 }
1014 spin_unlock(&journal->j_list_lock);
1015
1016 trace_jbd_end_commit(journal, commit_transaction);
1017 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1018 journal->j_commit_sequence, journal->j_tail_sequence);
1019
1020 wake_up(&journal->j_wait_done_commit);
1021}
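The tail of journal_commit_transaction() above links the finished transaction into the journal's circular, doubly-linked checkpoint list through t_cpnext/t_cpprev. A minimal standalone sketch of that insertion pattern (hypothetical names, plain userspace C rather than kernel code) might look like this:

/* Illustrative only: models the t_cpnext/t_cpprev insertion above. */
#include <stdio.h>

struct cp_trans {
	int tid;
	struct cp_trans *t_cpnext, *t_cpprev;
};

static void cp_list_add(struct cp_trans **head, struct cp_trans *t)
{
	if (*head == NULL) {
		/* Empty list: the node points at itself both ways. */
		*head = t;
		t->t_cpnext = t;
		t->t_cpprev = t;
	} else {
		/* Insert just before the current head, i.e. at the tail. */
		t->t_cpnext = *head;
		t->t_cpprev = (*head)->t_cpprev;
		t->t_cpnext->t_cpprev = t;
		t->t_cpprev->t_cpnext = t;
	}
}

int main(void)
{
	struct cp_trans a = { .tid = 1 }, b = { .tid = 2 };
	struct cp_trans *head = NULL;

	cp_list_add(&head, &a);
	cp_list_add(&head, &b);
	printf("head=%d next=%d prev=%d\n",
	       head->tid, head->t_cpnext->tid, head->t_cpprev->tid);
	return 0;
}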
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
deleted file mode 100644
index c46a79adb6ad..000000000000
--- a/fs/jbd/journal.c
+++ /dev/null
@@ -1,2145 +0,0 @@
1/*
2 * linux/fs/jbd/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/init.h>
32#include <linux/mm.h>
33#include <linux/freezer.h>
34#include <linux/pagemap.h>
35#include <linux/kthread.h>
36#include <linux/poison.h>
37#include <linux/proc_fs.h>
38#include <linux/debugfs.h>
39#include <linux/ratelimit.h>
40
41#define CREATE_TRACE_POINTS
42#include <trace/events/jbd.h>
43
44#include <asm/uaccess.h>
45#include <asm/page.h>
46
47EXPORT_SYMBOL(journal_start);
48EXPORT_SYMBOL(journal_restart);
49EXPORT_SYMBOL(journal_extend);
50EXPORT_SYMBOL(journal_stop);
51EXPORT_SYMBOL(journal_lock_updates);
52EXPORT_SYMBOL(journal_unlock_updates);
53EXPORT_SYMBOL(journal_get_write_access);
54EXPORT_SYMBOL(journal_get_create_access);
55EXPORT_SYMBOL(journal_get_undo_access);
56EXPORT_SYMBOL(journal_dirty_data);
57EXPORT_SYMBOL(journal_dirty_metadata);
58EXPORT_SYMBOL(journal_release_buffer);
59EXPORT_SYMBOL(journal_forget);
60#if 0
61EXPORT_SYMBOL(journal_sync_buffer);
62#endif
63EXPORT_SYMBOL(journal_flush);
64EXPORT_SYMBOL(journal_revoke);
65
66EXPORT_SYMBOL(journal_init_dev);
67EXPORT_SYMBOL(journal_init_inode);
68EXPORT_SYMBOL(journal_update_format);
69EXPORT_SYMBOL(journal_check_used_features);
70EXPORT_SYMBOL(journal_check_available_features);
71EXPORT_SYMBOL(journal_set_features);
72EXPORT_SYMBOL(journal_create);
73EXPORT_SYMBOL(journal_load);
74EXPORT_SYMBOL(journal_destroy);
75EXPORT_SYMBOL(journal_abort);
76EXPORT_SYMBOL(journal_errno);
77EXPORT_SYMBOL(journal_ack_err);
78EXPORT_SYMBOL(journal_clear_err);
79EXPORT_SYMBOL(log_wait_commit);
80EXPORT_SYMBOL(log_start_commit);
81EXPORT_SYMBOL(journal_start_commit);
82EXPORT_SYMBOL(journal_force_commit_nested);
83EXPORT_SYMBOL(journal_wipe);
84EXPORT_SYMBOL(journal_blocks_per_page);
85EXPORT_SYMBOL(journal_invalidatepage);
86EXPORT_SYMBOL(journal_try_to_free_buffers);
87EXPORT_SYMBOL(journal_force_commit);
88
89static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
90static void __journal_abort_soft (journal_t *journal, int errno);
91static const char *journal_dev_name(journal_t *journal, char *buffer);
92
93#ifdef CONFIG_JBD_DEBUG
94void __jbd_debug(int level, const char *file, const char *func,
95 unsigned int line, const char *fmt, ...)
96{
97 struct va_format vaf;
98 va_list args;
99
100 if (level > journal_enable_debug)
101 return;
102 va_start(args, fmt);
103 vaf.fmt = fmt;
104 vaf.va = &args;
105 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
106 va_end(args);
107}
108EXPORT_SYMBOL(__jbd_debug);
109#endif
110
111/*
112 * Helper function used to manage commit timeouts
113 */
114
115static void commit_timeout(unsigned long __data)
116{
117 struct task_struct * p = (struct task_struct *) __data;
118
119 wake_up_process(p);
120}
121
122/*
123 * kjournald: The main thread function used to manage a logging device
124 * journal.
125 *
126 * This kernel thread is responsible for two things:
127 *
128 * 1) COMMIT: Every so often we need to commit the current state of the
129 * filesystem to disk. The journal thread is responsible for writing
130 * all of the metadata buffers to disk.
131 *
132 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
133 * of the data in that part of the log has been rewritten elsewhere on
134 * the disk. Flushing these old buffers to reclaim space in the log is
135 * known as checkpointing, and this thread is responsible for that job.
136 */
137
138static int kjournald(void *arg)
139{
140 journal_t *journal = arg;
141 transaction_t *transaction;
142
143 /*
144 * Set up an interval timer which can be used to trigger a commit wakeup
145 * after the commit interval expires
146 */
147 setup_timer(&journal->j_commit_timer, commit_timeout,
148 (unsigned long)current);
149
150 set_freezable();
151
152 /* Record that the journal thread is running */
153 journal->j_task = current;
154 wake_up(&journal->j_wait_done_commit);
155
156 printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
157 journal->j_commit_interval / HZ);
158
159 /*
160 * And now, wait forever for commit wakeup events.
161 */
162 spin_lock(&journal->j_state_lock);
163
164loop:
165 if (journal->j_flags & JFS_UNMOUNT)
166 goto end_loop;
167
168 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
169 journal->j_commit_sequence, journal->j_commit_request);
170
171 if (journal->j_commit_sequence != journal->j_commit_request) {
172 jbd_debug(1, "OK, requests differ\n");
173 spin_unlock(&journal->j_state_lock);
174 del_timer_sync(&journal->j_commit_timer);
175 journal_commit_transaction(journal);
176 spin_lock(&journal->j_state_lock);
177 goto loop;
178 }
179
180 wake_up(&journal->j_wait_done_commit);
181 if (freezing(current)) {
182 /*
183 * The simpler the better. Flushing journal isn't a
184 * good idea, because that depends on threads that may
185 * be already stopped.
186 */
187 jbd_debug(1, "Now suspending kjournald\n");
188 spin_unlock(&journal->j_state_lock);
189 try_to_freeze();
190 spin_lock(&journal->j_state_lock);
191 } else {
192 /*
193 * We assume on resume that commits are already there,
194 * so we don't sleep
195 */
196 DEFINE_WAIT(wait);
197 int should_sleep = 1;
198
199 prepare_to_wait(&journal->j_wait_commit, &wait,
200 TASK_INTERRUPTIBLE);
201 if (journal->j_commit_sequence != journal->j_commit_request)
202 should_sleep = 0;
203 transaction = journal->j_running_transaction;
204 if (transaction && time_after_eq(jiffies,
205 transaction->t_expires))
206 should_sleep = 0;
207 if (journal->j_flags & JFS_UNMOUNT)
208 should_sleep = 0;
209 if (should_sleep) {
210 spin_unlock(&journal->j_state_lock);
211 schedule();
212 spin_lock(&journal->j_state_lock);
213 }
214 finish_wait(&journal->j_wait_commit, &wait);
215 }
216
217 jbd_debug(1, "kjournald wakes\n");
218
219 /*
220 * Were we woken up by a commit wakeup event?
221 */
222 transaction = journal->j_running_transaction;
223 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
224 journal->j_commit_request = transaction->t_tid;
225 jbd_debug(1, "woke because of timeout\n");
226 }
227 goto loop;
228
229end_loop:
230 spin_unlock(&journal->j_state_lock);
231 del_timer_sync(&journal->j_commit_timer);
232 journal->j_task = NULL;
233 wake_up(&journal->j_wait_done_commit);
234 jbd_debug(1, "Journal thread exiting.\n");
235 return 0;
236}
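Each pass through the kjournald loop above decides between committing and sleeping: it commits when a commit has been requested, when the running transaction's commit interval has expired, or when the journal is being unmounted, and sleeps otherwise. A small standalone model of just that decision (hypothetical struct and field names) could read:

/* Illustrative only: the should_sleep decision in kjournald's loop. */
#include <stdio.h>
#include <stdbool.h>

struct fake_journal {
	int commit_sequence;	/* last committed tid */
	int commit_request;	/* highest tid a commit was requested for */
	bool has_running;	/* is there a running transaction? */
	bool expired;		/* has its commit interval elapsed? */
	bool unmounting;	/* JFS_UNMOUNT equivalent */
};

static bool should_sleep(const struct fake_journal *j)
{
	if (j->commit_sequence != j->commit_request)
		return false;	/* someone already asked for a commit */
	if (j->has_running && j->expired)
		return false;	/* the commit interval timer fired */
	if (j->unmounting)
		return false;	/* time for the thread to exit */
	return true;
}

int main(void)
{
	struct fake_journal j = { .commit_sequence = 5, .commit_request = 5,
				  .has_running = true, .expired = true };
	printf("sleep? %s\n", should_sleep(&j) ? "yes" : "no");	/* "no" */
	return 0;
}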
237
238static int journal_start_thread(journal_t *journal)
239{
240 struct task_struct *t;
241
242 t = kthread_run(kjournald, journal, "kjournald");
243 if (IS_ERR(t))
244 return PTR_ERR(t);
245
246 wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
247 return 0;
248}
249
250static void journal_kill_thread(journal_t *journal)
251{
252 spin_lock(&journal->j_state_lock);
253 journal->j_flags |= JFS_UNMOUNT;
254
255 while (journal->j_task) {
256 wake_up(&journal->j_wait_commit);
257 spin_unlock(&journal->j_state_lock);
258 wait_event(journal->j_wait_done_commit,
259 journal->j_task == NULL);
260 spin_lock(&journal->j_state_lock);
261 }
262 spin_unlock(&journal->j_state_lock);
263}
264
265/*
266 * journal_write_metadata_buffer: write a metadata buffer to the journal.
267 *
268 * Writes a metadata buffer to a given disk block. The actual IO is not
269 * performed but a new buffer_head is constructed which labels the data
270 * to be written with the correct destination disk block.
271 *
272 * Any magic-number escaping which needs to be done will cause a
273 * copy-out here. If the buffer happens to start with the
274 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
275 * magic number is only written to the log for descriptor blocks. In
276 * this case, we copy the data and replace the first word with 0, and we
277 * return a result code which indicates that this buffer needs to be
278 * marked as an escaped buffer in the corresponding log descriptor
279 * block. The missing word can then be restored when the block is read
280 * during recovery.
281 *
282 * If the source buffer has already been modified by a new transaction
283 * since we took the last commit snapshot, we use the frozen copy of
284 * that data for IO. If we end up using the existing buffer_head's data
285 * for the write, then we *have* to lock the buffer to prevent anyone
286 * else from using and possibly modifying it while the IO is in
287 * progress.
288 *
289 * The function returns a pointer to the buffer_heads to be used for IO.
290 *
291 * We assume that the journal has already been locked in this function.
292 *
293 * Return value:
294 * <0: Error
295 * >=0: Finished OK
296 *
297 * On success:
298 * Bit 0 set == escape performed on the data
299 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
300 */
301
302int journal_write_metadata_buffer(transaction_t *transaction,
303 struct journal_head *jh_in,
304 struct journal_head **jh_out,
305 unsigned int blocknr)
306{
307 int need_copy_out = 0;
308 int done_copy_out = 0;
309 int do_escape = 0;
310 char *mapped_data;
311 struct buffer_head *new_bh;
312 struct journal_head *new_jh;
313 struct page *new_page;
314 unsigned int new_offset;
315 struct buffer_head *bh_in = jh2bh(jh_in);
316 journal_t *journal = transaction->t_journal;
317
318 /*
319 * The buffer really shouldn't be locked: only the current committing
320 * transaction is allowed to write it, so nobody else is allowed
321 * to do any IO.
322 *
323 * akpm: except if we're journalling data, and write() output is
324 * also part of a shared mapping, and another thread has
325 * decided to launch a writepage() against this buffer.
326 */
327 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
328
329 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
330 /* keep subsequent assertions sane */
331 atomic_set(&new_bh->b_count, 1);
332 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
333
334 /*
335 * If a new transaction has already done a buffer copy-out, then
336 * we use that version of the data for the commit.
337 */
338 jbd_lock_bh_state(bh_in);
339repeat:
340 if (jh_in->b_frozen_data) {
341 done_copy_out = 1;
342 new_page = virt_to_page(jh_in->b_frozen_data);
343 new_offset = offset_in_page(jh_in->b_frozen_data);
344 } else {
345 new_page = jh2bh(jh_in)->b_page;
346 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
347 }
348
349 mapped_data = kmap_atomic(new_page);
350 /*
351 * Check for escaping
352 */
353 if (*((__be32 *)(mapped_data + new_offset)) ==
354 cpu_to_be32(JFS_MAGIC_NUMBER)) {
355 need_copy_out = 1;
356 do_escape = 1;
357 }
358 kunmap_atomic(mapped_data);
359
360 /*
361 * Do we need to do a data copy?
362 */
363 if (need_copy_out && !done_copy_out) {
364 char *tmp;
365
366 jbd_unlock_bh_state(bh_in);
367 tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
368 jbd_lock_bh_state(bh_in);
369 if (jh_in->b_frozen_data) {
370 jbd_free(tmp, bh_in->b_size);
371 goto repeat;
372 }
373
374 jh_in->b_frozen_data = tmp;
375 mapped_data = kmap_atomic(new_page);
376 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
377 kunmap_atomic(mapped_data);
378
379 new_page = virt_to_page(tmp);
380 new_offset = offset_in_page(tmp);
381 done_copy_out = 1;
382 }
383
384 /*
385 * Did we need to do an escape? Now that we've done all the
386 * copying, we can finally do so.
387 */
388 if (do_escape) {
389 mapped_data = kmap_atomic(new_page);
390 *((unsigned int *)(mapped_data + new_offset)) = 0;
391 kunmap_atomic(mapped_data);
392 }
393
394 set_bh_page(new_bh, new_page, new_offset);
395 new_jh->b_transaction = NULL;
396 new_bh->b_size = jh2bh(jh_in)->b_size;
397 new_bh->b_bdev = transaction->t_journal->j_dev;
398 new_bh->b_blocknr = blocknr;
399 set_buffer_mapped(new_bh);
400 set_buffer_dirty(new_bh);
401
402 *jh_out = new_jh;
403
404 /*
405 * The to-be-written buffer needs to get moved to the io queue,
406 * and the original buffer whose contents we are shadowing or
407 * copying is moved to the transaction's shadow queue.
408 */
409 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
410 spin_lock(&journal->j_list_lock);
411 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
412 spin_unlock(&journal->j_list_lock);
413 jbd_unlock_bh_state(bh_in);
414
415 JBUFFER_TRACE(new_jh, "file as BJ_IO");
416 journal_file_buffer(new_jh, transaction, BJ_IO);
417
418 return do_escape | (done_copy_out << 1);
419}
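journal_write_metadata_buffer() escapes any block whose first word happens to equal the journal magic, so that recovery can tell real descriptor blocks from data. A rough standalone sketch of the escape/restore rule (the helper names and block layout here are made up; only the 0xc03b3998 constant matches JFS_MAGIC_NUMBER) might be:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define FAKE_JFS_MAGIC 0xc03b3998u	/* JFS_MAGIC_NUMBER in jbd */

/* Returns 1 if the block had to be escaped (the caller would then set
 * the escape flag in the descriptor tag), 0 otherwise. */
static int escape_block(unsigned char *data)
{
	uint32_t first;

	memcpy(&first, data, sizeof(first));
	if (first == FAKE_JFS_MAGIC) {		/* endianness ignored here */
		memset(data, 0, sizeof(first));	/* zero the magic word */
		return 1;
	}
	return 0;
}

/* Recovery side: put the magic word back if the tag said "escaped". */
static void unescape_block(unsigned char *data, int escaped)
{
	uint32_t magic = FAKE_JFS_MAGIC;

	if (escaped)
		memcpy(data, &magic, sizeof(magic));
}

int main(void)
{
	unsigned char blk[16] = { 0 };
	uint32_t magic = FAKE_JFS_MAGIC, check;

	memcpy(blk, &magic, sizeof(magic));
	int esc = escape_block(blk);
	unescape_block(blk, esc);
	memcpy(&check, blk, sizeof(check));
	printf("escaped=%d restored=0x%08x\n", esc, check);
	return 0;
}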
420
421/*
422 * Allocation code for the journal file. Manage the space left in the
423 * journal, so that we can begin checkpointing when appropriate.
424 */
425
426/*
427 * __log_space_left: Return the number of free blocks left in the journal.
428 *
429 * Called with the journal already locked.
430 *
431 * Called under j_state_lock
432 */
433
434int __log_space_left(journal_t *journal)
435{
436 int left = journal->j_free;
437
438 assert_spin_locked(&journal->j_state_lock);
439
440 /*
441 * Be pessimistic here about the number of those free blocks which
442 * might be required for log descriptor control blocks.
443 */
444
445#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
446
447 left -= MIN_LOG_RESERVED_BLOCKS;
448
449 if (left <= 0)
450 return 0;
451 left -= (left >> 3);
452 return left;
453}
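The estimate in __log_space_left() is deliberately conservative: it subtracts MIN_LOG_RESERVED_BLOCKS and then another eighth of what remains. As a worked example in standalone form (same arithmetic, hypothetical wrapper function):

#include <stdio.h>

#define RESERVED 32	/* MIN_LOG_RESERVED_BLOCKS in the code above */

static int log_space_left(int j_free)
{
	int left = j_free - RESERVED;	/* keep room for descriptor blocks */

	if (left <= 0)
		return 0;
	return left - (left >> 3);	/* and knock off another 12.5% */
}

int main(void)
{
	/* (1024 - 32) - (992 >> 3) = 992 - 124 = 868 usable blocks */
	printf("%d\n", log_space_left(1024));
	return 0;
}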
454
455/*
456 * Called under j_state_lock. Returns true if a transaction commit was started.
457 */
458int __log_start_commit(journal_t *journal, tid_t target)
459{
460 /*
461 * The only transaction we can possibly wait upon is the
462 * currently running transaction (if it exists). Otherwise,
463 * the target tid must be an old one.
464 */
465 if (journal->j_commit_request != target &&
466 journal->j_running_transaction &&
467 journal->j_running_transaction->t_tid == target) {
468 /*
469 * We want a new commit: OK, mark the request and wakeup the
470 * commit thread. We do _not_ do the commit ourselves.
471 */
472
473 journal->j_commit_request = target;
474 jbd_debug(1, "JBD: requesting commit %d/%d\n",
475 journal->j_commit_request,
476 journal->j_commit_sequence);
477 wake_up(&journal->j_wait_commit);
478 return 1;
479 } else if (!tid_geq(journal->j_commit_request, target))
480 /* This should never happen, but if it does, preserve
481 the evidence before kjournald goes into a loop and
482 increments j_commit_sequence beyond all recognition. */
483 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
484 journal->j_commit_request, journal->j_commit_sequence,
485 target, journal->j_running_transaction ?
486 journal->j_running_transaction->t_tid : 0);
487 return 0;
488}
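__log_start_commit() and its callers compare transaction IDs with tid_geq()/tid_gt(), which jbd defines as signed comparisons of an unsigned difference so that tid wraparound is handled. A standalone sketch along those lines (the exact definitions are recalled from the header, so treat them as an assumption):

#include <stdio.h>

typedef unsigned int tid_t;

static int tid_gt(tid_t x, tid_t y)
{
	int difference = (int)(x - y);	/* wraps safely modulo 2^32 */
	return difference > 0;
}

static int tid_geq(tid_t x, tid_t y)
{
	int difference = (int)(x - y);
	return difference >= 0;
}

int main(void)
{
	/* A tid that recently wrapped past 0 still compares as "newer". */
	printf("%d %d\n", tid_gt(5u, 0xfffffff0u), tid_geq(0xfffffff0u, 5u));
	return 0;
}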
489
490int log_start_commit(journal_t *journal, tid_t tid)
491{
492 int ret;
493
494 spin_lock(&journal->j_state_lock);
495 ret = __log_start_commit(journal, tid);
496 spin_unlock(&journal->j_state_lock);
497 return ret;
498}
499
500/*
501 * Force and wait upon a commit if the calling process is not within
502 * transaction. This is used for forcing out undo-protected data which contains
503 * bitmaps, when the fs is running out of space.
504 *
505 * We can only force the running transaction if we don't have an active handle;
506 * otherwise, we will deadlock.
507 *
508 * Returns true if a transaction was started.
509 */
510int journal_force_commit_nested(journal_t *journal)
511{
512 transaction_t *transaction = NULL;
513 tid_t tid;
514
515 spin_lock(&journal->j_state_lock);
516 if (journal->j_running_transaction && !current->journal_info) {
517 transaction = journal->j_running_transaction;
518 __log_start_commit(journal, transaction->t_tid);
519 } else if (journal->j_committing_transaction)
520 transaction = journal->j_committing_transaction;
521
522 if (!transaction) {
523 spin_unlock(&journal->j_state_lock);
524 return 0; /* Nothing to retry */
525 }
526
527 tid = transaction->t_tid;
528 spin_unlock(&journal->j_state_lock);
529 log_wait_commit(journal, tid);
530 return 1;
531}
532
533/*
534 * Start a commit of the current running transaction (if any). Returns true
535 * if a transaction is going to be committed (or is currently already
536 * committing), and fills its tid in at *ptid
537 */
538int journal_start_commit(journal_t *journal, tid_t *ptid)
539{
540 int ret = 0;
541
542 spin_lock(&journal->j_state_lock);
543 if (journal->j_running_transaction) {
544 tid_t tid = journal->j_running_transaction->t_tid;
545
546 __log_start_commit(journal, tid);
547 /* There's a running transaction and we've just made sure
548 * its commit has been scheduled. */
549 if (ptid)
550 *ptid = tid;
551 ret = 1;
552 } else if (journal->j_committing_transaction) {
553 /*
554 * If commit has been started, then we have to wait for
555 * completion of that transaction.
556 */
557 if (ptid)
558 *ptid = journal->j_committing_transaction->t_tid;
559 ret = 1;
560 }
561 spin_unlock(&journal->j_state_lock);
562 return ret;
563}
564
565/*
566 * Wait for a specified commit to complete.
567 * The caller may not hold the journal lock.
568 */
569int log_wait_commit(journal_t *journal, tid_t tid)
570{
571 int err = 0;
572
573#ifdef CONFIG_JBD_DEBUG
574 spin_lock(&journal->j_state_lock);
575 if (!tid_geq(journal->j_commit_request, tid)) {
576 printk(KERN_ERR
577 "%s: error: j_commit_request=%d, tid=%d\n",
578 __func__, journal->j_commit_request, tid);
579 }
580 spin_unlock(&journal->j_state_lock);
581#endif
582 spin_lock(&journal->j_state_lock);
583 /*
584 * Not running or committing trans? Must be already committed. This
585 * saves us from waiting for a *long* time when tid overflows.
586 */
587 if (!((journal->j_running_transaction &&
588 journal->j_running_transaction->t_tid == tid) ||
589 (journal->j_committing_transaction &&
590 journal->j_committing_transaction->t_tid == tid)))
591 goto out_unlock;
592
593 if (!tid_geq(journal->j_commit_waited, tid))
594 journal->j_commit_waited = tid;
595 while (tid_gt(tid, journal->j_commit_sequence)) {
596 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
597 tid, journal->j_commit_sequence);
598 wake_up(&journal->j_wait_commit);
599 spin_unlock(&journal->j_state_lock);
600 wait_event(journal->j_wait_done_commit,
601 !tid_gt(tid, journal->j_commit_sequence));
602 spin_lock(&journal->j_state_lock);
603 }
604out_unlock:
605 spin_unlock(&journal->j_state_lock);
606
607 if (unlikely(is_journal_aborted(journal)))
608 err = -EIO;
609 return err;
610}
611
612/*
613 * Return 1 if a given transaction has not yet sent barrier request
614 * connected with a transaction commit. If 0 is returned, transaction
615 * may or may not have sent the barrier. Used to avoid sending barrier
616 * twice in common cases.
617 */
618int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
619{
620 int ret = 0;
621 transaction_t *commit_trans;
622
623 if (!(journal->j_flags & JFS_BARRIER))
624 return 0;
625 spin_lock(&journal->j_state_lock);
626 /* Transaction already committed? */
627 if (tid_geq(journal->j_commit_sequence, tid))
628 goto out;
629 /*
630 * Transaction is being committed and we already proceeded to
631 * writing commit record?
632 */
633 commit_trans = journal->j_committing_transaction;
634 if (commit_trans && commit_trans->t_tid == tid &&
635 commit_trans->t_state >= T_COMMIT_RECORD)
636 goto out;
637 ret = 1;
638out:
639 spin_unlock(&journal->j_state_lock);
640 return ret;
641}
642EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
643
644/*
645 * Log buffer allocation routines:
646 */
647
648int journal_next_log_block(journal_t *journal, unsigned int *retp)
649{
650 unsigned int blocknr;
651
652 spin_lock(&journal->j_state_lock);
653 J_ASSERT(journal->j_free > 1);
654
655 blocknr = journal->j_head;
656 journal->j_head++;
657 journal->j_free--;
658 if (journal->j_head == journal->j_last)
659 journal->j_head = journal->j_first;
660 spin_unlock(&journal->j_state_lock);
661 return journal_bmap(journal, blocknr, retp);
662}
663
664/*
665 * Conversion of logical to physical block numbers for the journal
666 *
667 * On external journals the journal blocks are identity-mapped, so
668 * this is a no-op. If needed, we can use j_blk_offset - everything is
669 * ready.
670 */
671int journal_bmap(journal_t *journal, unsigned int blocknr,
672 unsigned int *retp)
673{
674 int err = 0;
675 unsigned int ret;
676
677 if (journal->j_inode) {
678 ret = bmap(journal->j_inode, blocknr);
679 if (ret)
680 *retp = ret;
681 else {
682 char b[BDEVNAME_SIZE];
683
684 printk(KERN_ALERT "%s: journal block not found "
685 "at offset %u on %s\n",
686 __func__,
687 blocknr,
688 bdevname(journal->j_dev, b));
689 err = -EIO;
690 __journal_abort_soft(journal, err);
691 }
692 } else {
693 *retp = blocknr; /* +journal->j_blk_offset */
694 }
695 return err;
696}
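journal_bmap() has only two cases: an inode-backed journal asks the filesystem via bmap() where each logical journal block lives, while an external journal device is identity-mapped. A toy standalone version (fake_bmap() and its fixed offset are invented purely for illustration):

#include <stdio.h>

/* Pretend the journal inode's blocks live at a fixed offset on disk. */
static unsigned int fake_bmap(unsigned int logical)
{
	return 100000 + logical;
}

static int map_journal_block(int inode_backed, unsigned int logical,
			     unsigned int *physical)
{
	if (inode_backed) {
		unsigned int ret = fake_bmap(logical);

		if (!ret)
			return -5;	/* -EIO: hole in the journal file */
		*physical = ret;
	} else {
		*physical = logical;	/* external journal: identity map */
	}
	return 0;
}

int main(void)
{
	unsigned int phys;

	map_journal_block(1, 7, &phys);
	printf("logical 7 -> physical %u\n", phys);
	return 0;
}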
697
698/*
699 * We play buffer_head aliasing tricks to write data/metadata blocks to
700 * the journal without copying their contents, but for journal
701 * descriptor blocks we do need to generate bona fide buffers.
702 *
703 * After the caller of journal_get_descriptor_buffer() has finished modifying
704 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
705 * But we don't bother doing that, so there will be coherency problems with
706 * mmaps of blockdevs which hold live JBD-controlled filesystems.
707 */
708struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
709{
710 struct buffer_head *bh;
711 unsigned int blocknr;
712 int err;
713
714 err = journal_next_log_block(journal, &blocknr);
715
716 if (err)
717 return NULL;
718
719 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
720 if (!bh)
721 return NULL;
722 lock_buffer(bh);
723 memset(bh->b_data, 0, journal->j_blocksize);
724 set_buffer_uptodate(bh);
725 unlock_buffer(bh);
726 BUFFER_TRACE(bh, "return this buffer");
727 return journal_add_journal_head(bh);
728}
729
730/*
731 * Management for journal control blocks: functions to create and
732 * destroy journal_t structures, and to initialise and read existing
733 * journal blocks from disk. */
734
735/* First: create and setup a journal_t object in memory. We initialise
736 * very few fields yet: that has to wait until we have created the
737 * journal structures from scratch, or loaded them from disk. */
738
739static journal_t * journal_init_common (void)
740{
741 journal_t *journal;
742 int err;
743
744 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
745 if (!journal)
746 goto fail;
747
748 init_waitqueue_head(&journal->j_wait_transaction_locked);
749 init_waitqueue_head(&journal->j_wait_logspace);
750 init_waitqueue_head(&journal->j_wait_done_commit);
751 init_waitqueue_head(&journal->j_wait_checkpoint);
752 init_waitqueue_head(&journal->j_wait_commit);
753 init_waitqueue_head(&journal->j_wait_updates);
754 mutex_init(&journal->j_checkpoint_mutex);
755 spin_lock_init(&journal->j_revoke_lock);
756 spin_lock_init(&journal->j_list_lock);
757 spin_lock_init(&journal->j_state_lock);
758
759 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
760
761 /* The journal is marked for error until we succeed with recovery! */
762 journal->j_flags = JFS_ABORT;
763
764 /* Set up a default-sized revoke table for the new mount. */
765 err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
766 if (err) {
767 kfree(journal);
768 goto fail;
769 }
770 return journal;
771fail:
772 return NULL;
773}
774
775/* journal_init_dev and journal_init_inode:
776 *
777 * Create a journal structure assigned some fixed set of disk blocks to
778 * the journal. We don't actually touch those disk blocks yet, but we
779 * need to set up all of the mapping information to tell the journaling
780 * system where the journal blocks are.
781 *
782 */
783
784/**
785 * journal_t * journal_init_dev() - creates and initialises a journal structure
786 * @bdev: Block device on which to create the journal
787 * @fs_dev: Device which holds the journalled filesystem for this journal.
788 * @start: Block nr Start of journal.
789 * @len: Length of the journal in blocks.
790 * @blocksize: blocksize of journalling device
791 *
792 * Returns: a newly created journal_t *
793 *
794 * journal_init_dev creates a journal which maps a fixed contiguous
795 * range of blocks on an arbitrary block device.
796 *
797 */
798journal_t * journal_init_dev(struct block_device *bdev,
799 struct block_device *fs_dev,
800 int start, int len, int blocksize)
801{
802 journal_t *journal = journal_init_common();
803 struct buffer_head *bh;
804 int n;
805
806 if (!journal)
807 return NULL;
808
809 /* journal descriptor can store up to n blocks -bzzz */
810 journal->j_blocksize = blocksize;
811 n = journal->j_blocksize / sizeof(journal_block_tag_t);
812 journal->j_wbufsize = n;
813 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
814 if (!journal->j_wbuf) {
815 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
816 __func__);
817 goto out_err;
818 }
819 journal->j_dev = bdev;
820 journal->j_fs_dev = fs_dev;
821 journal->j_blk_offset = start;
822 journal->j_maxlen = len;
823
824 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
825 if (!bh) {
826 printk(KERN_ERR
827 "%s: Cannot get buffer for journal superblock\n",
828 __func__);
829 goto out_err;
830 }
831 journal->j_sb_buffer = bh;
832 journal->j_superblock = (journal_superblock_t *)bh->b_data;
833
834 return journal;
835out_err:
836 kfree(journal->j_wbuf);
837 kfree(journal);
838 return NULL;
839}
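The j_wbufsize computed above is the number of journal_block_tag_t entries one descriptor block can hold. Assuming the jbd tag is two __be32 fields (8 bytes), a 4096-byte journal block describes 512 data blocks; a standalone check of that arithmetic:

#include <stdio.h>
#include <stdint.h>

struct fake_block_tag {		/* models journal_block_tag_t */
	uint32_t t_blocknr;
	uint32_t t_flags;
};

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int n = blocksize / sizeof(struct fake_block_tag);

	printf("tags per descriptor block: %u\n", n);	/* 512 */
	return 0;
}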
840
841/**
842 * journal_t * journal_init_inode () - creates a journal which maps to an inode.
843 * @inode: An inode to create the journal in
844 *
845 * journal_init_inode creates a journal which maps an on-disk inode as
846 * the journal. The inode must exist already, must support bmap() and
847 * must have all data blocks preallocated.
848 */
849journal_t * journal_init_inode (struct inode *inode)
850{
851 struct buffer_head *bh;
852 journal_t *journal = journal_init_common();
853 int err;
854 int n;
855 unsigned int blocknr;
856
857 if (!journal)
858 return NULL;
859
860 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
861 journal->j_inode = inode;
862 jbd_debug(1,
863 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
864 journal, inode->i_sb->s_id, inode->i_ino,
865 (long long) inode->i_size,
866 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
867
868 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
869 journal->j_blocksize = inode->i_sb->s_blocksize;
870
871 /* journal descriptor can store up to n blocks -bzzz */
872 n = journal->j_blocksize / sizeof(journal_block_tag_t);
873 journal->j_wbufsize = n;
874 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
875 if (!journal->j_wbuf) {
876 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
877 __func__);
878 goto out_err;
879 }
880
881 err = journal_bmap(journal, 0, &blocknr);
882 /* If that failed, give up */
883 if (err) {
884 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
885 __func__);
886 goto out_err;
887 }
888
889 bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
890 if (!bh) {
891 printk(KERN_ERR
892 "%s: Cannot get buffer for journal superblock\n",
893 __func__);
894 goto out_err;
895 }
896 journal->j_sb_buffer = bh;
897 journal->j_superblock = (journal_superblock_t *)bh->b_data;
898
899 return journal;
900out_err:
901 kfree(journal->j_wbuf);
902 kfree(journal);
903 return NULL;
904}
905
906/*
907 * If the journal init or create aborts, we need to mark the journal
908 * superblock as being NULL to prevent the journal destroy from writing
909 * back a bogus superblock.
910 */
911static void journal_fail_superblock (journal_t *journal)
912{
913 struct buffer_head *bh = journal->j_sb_buffer;
914 brelse(bh);
915 journal->j_sb_buffer = NULL;
916}
917
918/*
919 * Given a journal_t structure, initialise the various fields for
920 * startup of a new journaling session. We use this both when creating
921 * a journal, and after recovering an old journal to reset it for
922 * subsequent use.
923 */
924
925static int journal_reset(journal_t *journal)
926{
927 journal_superblock_t *sb = journal->j_superblock;
928 unsigned int first, last;
929
930 first = be32_to_cpu(sb->s_first);
931 last = be32_to_cpu(sb->s_maxlen);
932 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
933 printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
934 first, last);
935 journal_fail_superblock(journal);
936 return -EINVAL;
937 }
938
939 journal->j_first = first;
940 journal->j_last = last;
941
942 journal->j_head = first;
943 journal->j_tail = first;
944 journal->j_free = last - first;
945
946 journal->j_tail_sequence = journal->j_transaction_sequence;
947 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
948 journal->j_commit_request = journal->j_commit_sequence;
949
950 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
951
952 /*
953 * As a special case, if the on-disk copy is already marked as needing
954 * no recovery (s_start == 0), then we can safely defer the superblock
955 * update until the next commit by setting JFS_FLUSHED. This avoids
956 * attempting a write to a potentially read-only device.
957 */
958 if (sb->s_start == 0) {
959 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
960 "(start %u, seq %d, errno %d)\n",
961 journal->j_tail, journal->j_tail_sequence,
962 journal->j_errno);
963 journal->j_flags |= JFS_FLUSHED;
964 } else {
965 /* Lock here to make assertions happy... */
966 mutex_lock(&journal->j_checkpoint_mutex);
967 /*
968 * Update log tail information. We use WRITE_FUA since new
969 * transaction will start reusing journal space and so we
970 * must make sure information about current log tail is on
971 * disk before that.
972 */
973 journal_update_sb_log_tail(journal,
974 journal->j_tail_sequence,
975 journal->j_tail,
976 WRITE_FUA);
977 mutex_unlock(&journal->j_checkpoint_mutex);
978 }
979 return journal_start_thread(journal);
980}
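journal_reset() rederives the log geometry from the superblock: the usable area spans s_first..s_maxlen-1, j_free starts at last - first, and a single transaction may use at most a quarter of the journal. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned int maxlen = 8192, first = 1, last = maxlen;

	/* free=8191 max_transaction_buffers=2048 */
	printf("free=%u max_transaction_buffers=%u\n",
	       last - first, maxlen / 4);
	return 0;
}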
981
982/**
983 * int journal_create() - Initialise the new journal file
984 * @journal: Journal to create. This structure must have been initialised
985 *
986 * Given a journal_t structure which tells us which disk blocks we can
987 * use, create a new journal superblock and initialise all of the
988 * journal fields from scratch.
989 **/
990int journal_create(journal_t *journal)
991{
992 unsigned int blocknr;
993 struct buffer_head *bh;
994 journal_superblock_t *sb;
995 int i, err;
996
997 if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
998 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
999 journal->j_maxlen);
1000 journal_fail_superblock(journal);
1001 return -EINVAL;
1002 }
1003
1004 if (journal->j_inode == NULL) {
1005 /*
1006 * We don't know what block to start at!
1007 */
1008 printk(KERN_EMERG
1009 "%s: creation of journal on external device!\n",
1010 __func__);
1011 BUG();
1012 }
1013
1014 /* Zero out the entire journal on disk. We cannot afford to
1015 have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
1016 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1017 for (i = 0; i < journal->j_maxlen; i++) {
1018 err = journal_bmap(journal, i, &blocknr);
1019 if (err)
1020 return err;
1021 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1022 if (unlikely(!bh))
1023 return -ENOMEM;
1024 lock_buffer(bh);
1025 memset (bh->b_data, 0, journal->j_blocksize);
1026 BUFFER_TRACE(bh, "marking dirty");
1027 mark_buffer_dirty(bh);
1028 BUFFER_TRACE(bh, "marking uptodate");
1029 set_buffer_uptodate(bh);
1030 unlock_buffer(bh);
1031 __brelse(bh);
1032 }
1033
1034 sync_blockdev(journal->j_dev);
1035 jbd_debug(1, "JBD: journal cleared.\n");
1036
1037 /* OK, fill in the initial static fields in the new superblock */
1038 sb = journal->j_superblock;
1039
1040 sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
1041 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1042
1043 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1044 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1045 sb->s_first = cpu_to_be32(1);
1046
1047 journal->j_transaction_sequence = 1;
1048
1049 journal->j_flags &= ~JFS_ABORT;
1050 journal->j_format_version = 2;
1051
1052 return journal_reset(journal);
1053}
1054
1055static void journal_write_superblock(journal_t *journal, int write_op)
1056{
1057 struct buffer_head *bh = journal->j_sb_buffer;
1058 int ret;
1059
1060 trace_journal_write_superblock(journal, write_op);
1061 if (!(journal->j_flags & JFS_BARRIER))
1062 write_op &= ~(REQ_FUA | REQ_FLUSH);
1063 lock_buffer(bh);
1064 if (buffer_write_io_error(bh)) {
1065 char b[BDEVNAME_SIZE];
1066 /*
1067 * Oh, dear. A previous attempt to write the journal
1068 * superblock failed. This could happen because the
1069 * USB device was yanked out. Or it could happen to
1070 * be a transient write error and maybe the block will
1071 * be remapped. Nothing we can do but to retry the
1072 * write and hope for the best.
1073 */
1074 printk(KERN_ERR "JBD: previous I/O error detected "
1075 "for journal superblock update for %s.\n",
1076 journal_dev_name(journal, b));
1077 clear_buffer_write_io_error(bh);
1078 set_buffer_uptodate(bh);
1079 }
1080
1081 get_bh(bh);
1082 bh->b_end_io = end_buffer_write_sync;
1083 ret = submit_bh(write_op, bh);
1084 wait_on_buffer(bh);
1085 if (buffer_write_io_error(bh)) {
1086 clear_buffer_write_io_error(bh);
1087 set_buffer_uptodate(bh);
1088 ret = -EIO;
1089 }
1090 if (ret) {
1091 char b[BDEVNAME_SIZE];
1092 printk(KERN_ERR "JBD: Error %d detected "
1093 "when updating journal superblock for %s.\n",
1094 ret, journal_dev_name(journal, b));
1095 }
1096}
1097
1098/**
1099 * journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1100 * @journal: The journal to update.
1101 * @tail_tid: TID of the new transaction at the tail of the log
1102 * @tail_block: The first block of the transaction at the tail of the log
1103 * @write_op: With which operation should we write the journal sb
1104 *
1105 * Update a journal's superblock information about log tail and write it to
1106 * disk, waiting for the IO to complete.
1107 */
1108void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1109 unsigned int tail_block, int write_op)
1110{
1111 journal_superblock_t *sb = journal->j_superblock;
1112
1113 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1114 jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
1115 tail_block, tail_tid);
1116
1117 sb->s_sequence = cpu_to_be32(tail_tid);
1118 sb->s_start = cpu_to_be32(tail_block);
1119
1120 journal_write_superblock(journal, write_op);
1121
1122 /* Log is no longer empty */
1123 spin_lock(&journal->j_state_lock);
1124 WARN_ON(!sb->s_sequence);
1125 journal->j_flags &= ~JFS_FLUSHED;
1126 spin_unlock(&journal->j_state_lock);
1127}
1128
1129/**
1130 * mark_journal_empty() - Mark on disk journal as empty.
1131 * @journal: The journal to update.
1132 *
1133 * Update a journal's dynamic superblock fields to show that journal is empty.
1134 * Write updated superblock to disk waiting for IO to complete.
1135 */
1136static void mark_journal_empty(journal_t *journal)
1137{
1138 journal_superblock_t *sb = journal->j_superblock;
1139
1140 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1141 spin_lock(&journal->j_state_lock);
1142 /* Is it already empty? */
1143 if (sb->s_start == 0) {
1144 spin_unlock(&journal->j_state_lock);
1145 return;
1146 }
1147 jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
1148 journal->j_tail_sequence);
1149
1150 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1151 sb->s_start = cpu_to_be32(0);
1152 spin_unlock(&journal->j_state_lock);
1153
1154 journal_write_superblock(journal, WRITE_FUA);
1155
1156 spin_lock(&journal->j_state_lock);
1157 /* Log is empty */
1158 journal->j_flags |= JFS_FLUSHED;
1159 spin_unlock(&journal->j_state_lock);
1160}
1161
1162/**
1163 * journal_update_sb_errno() - Update error in the journal.
1164 * @journal: The journal to update.
1165 *
1166 * Update a journal's errno. Write updated superblock to disk waiting for IO
1167 * to complete.
1168 */
1169static void journal_update_sb_errno(journal_t *journal)
1170{
1171 journal_superblock_t *sb = journal->j_superblock;
1172
1173 spin_lock(&journal->j_state_lock);
1174 jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
1175 journal->j_errno);
1176 sb->s_errno = cpu_to_be32(journal->j_errno);
1177 spin_unlock(&journal->j_state_lock);
1178
1179 journal_write_superblock(journal, WRITE_SYNC);
1180}
1181
1182/*
1183 * Read the superblock for a given journal, performing initial
1184 * validation of the format.
1185 */
1186
1187static int journal_get_superblock(journal_t *journal)
1188{
1189 struct buffer_head *bh;
1190 journal_superblock_t *sb;
1191 int err = -EIO;
1192
1193 bh = journal->j_sb_buffer;
1194
1195 J_ASSERT(bh != NULL);
1196 if (!buffer_uptodate(bh)) {
1197 ll_rw_block(READ, 1, &bh);
1198 wait_on_buffer(bh);
1199 if (!buffer_uptodate(bh)) {
1200 printk (KERN_ERR
1201 "JBD: IO error reading journal superblock\n");
1202 goto out;
1203 }
1204 }
1205
1206 sb = journal->j_superblock;
1207
1208 err = -EINVAL;
1209
1210 if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
1211 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1212 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1213 goto out;
1214 }
1215
1216 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1217 case JFS_SUPERBLOCK_V1:
1218 journal->j_format_version = 1;
1219 break;
1220 case JFS_SUPERBLOCK_V2:
1221 journal->j_format_version = 2;
1222 break;
1223 default:
1224 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1225 goto out;
1226 }
1227
1228 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1229 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1230 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1231 printk (KERN_WARNING "JBD: journal file too short\n");
1232 goto out;
1233 }
1234
1235 if (be32_to_cpu(sb->s_first) == 0 ||
1236 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1237 printk(KERN_WARNING
1238 "JBD: Invalid start block of journal: %u\n",
1239 be32_to_cpu(sb->s_first));
1240 goto out;
1241 }
1242
1243 return 0;
1244
1245out:
1246 journal_fail_superblock(journal);
1247 return err;
1248}
1249
1250/*
1251 * Load the on-disk journal superblock and read the key fields into the
1252 * journal_t.
1253 */
1254
1255static int load_superblock(journal_t *journal)
1256{
1257 int err;
1258 journal_superblock_t *sb;
1259
1260 err = journal_get_superblock(journal);
1261 if (err)
1262 return err;
1263
1264 sb = journal->j_superblock;
1265
1266 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1267 journal->j_tail = be32_to_cpu(sb->s_start);
1268 journal->j_first = be32_to_cpu(sb->s_first);
1269 journal->j_last = be32_to_cpu(sb->s_maxlen);
1270 journal->j_errno = be32_to_cpu(sb->s_errno);
1271
1272 return 0;
1273}
1274
1275
1276/**
1277 * int journal_load() - Read journal from disk.
1278 * @journal: Journal to act on.
1279 *
1280 * Given a journal_t structure which tells us which disk blocks contain
1281 * a journal, read the journal from disk to initialise the in-memory
1282 * structures.
1283 */
1284int journal_load(journal_t *journal)
1285{
1286 int err;
1287 journal_superblock_t *sb;
1288
1289 err = load_superblock(journal);
1290 if (err)
1291 return err;
1292
1293 sb = journal->j_superblock;
1294 /* If this is a V2 superblock, then we have to check the
1295 * feature flags on it. */
1296
1297 if (journal->j_format_version >= 2) {
1298 if ((sb->s_feature_ro_compat &
1299 ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
1300 (sb->s_feature_incompat &
1301 ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
1302 printk (KERN_WARNING
1303 "JBD: Unrecognised features on journal\n");
1304 return -EINVAL;
1305 }
1306 }
1307
1308 /* Let the recovery code check whether it needs to recover any
1309 * data from the journal. */
1310 if (journal_recover(journal))
1311 goto recovery_error;
1312
1313 /* OK, we've finished with the dynamic journal bits:
1314 * reinitialise the dynamic contents of the superblock in memory
1315 * and reset them on disk. */
1316 if (journal_reset(journal))
1317 goto recovery_error;
1318
1319 journal->j_flags &= ~JFS_ABORT;
1320 journal->j_flags |= JFS_LOADED;
1321 return 0;
1322
1323recovery_error:
1324 printk (KERN_WARNING "JBD: recovery failed\n");
1325 return -EIO;
1326}
1327
1328/**
1329 * void journal_destroy() - Release a journal_t structure.
1330 * @journal: Journal to act on.
1331 *
1332 * Release a journal_t structure once it is no longer in use by the
1333 * journaled object.
1334 * Return <0 if we couldn't clean up the journal.
1335 */
1336int journal_destroy(journal_t *journal)
1337{
1338 int err = 0;
1339
1340
1341 /* Wait for the commit thread to wake up and die. */
1342 journal_kill_thread(journal);
1343
1344 /* Force a final log commit */
1345 if (journal->j_running_transaction)
1346 journal_commit_transaction(journal);
1347
1348 /* Force any old transactions to disk */
1349
1350 /* We cannot race with anybody but must keep assertions happy */
1351 mutex_lock(&journal->j_checkpoint_mutex);
1352 /* Totally anal locking here... */
1353 spin_lock(&journal->j_list_lock);
1354 while (journal->j_checkpoint_transactions != NULL) {
1355 spin_unlock(&journal->j_list_lock);
1356 log_do_checkpoint(journal);
1357 spin_lock(&journal->j_list_lock);
1358 }
1359
1360 J_ASSERT(journal->j_running_transaction == NULL);
1361 J_ASSERT(journal->j_committing_transaction == NULL);
1362 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1363 spin_unlock(&journal->j_list_lock);
1364
1365 if (journal->j_sb_buffer) {
1366 if (!is_journal_aborted(journal)) {
1367 journal->j_tail_sequence =
1368 ++journal->j_transaction_sequence;
1369 mark_journal_empty(journal);
1370 } else
1371 err = -EIO;
1372 brelse(journal->j_sb_buffer);
1373 }
1374 mutex_unlock(&journal->j_checkpoint_mutex);
1375
1376 iput(journal->j_inode);
1377 if (journal->j_revoke)
1378 journal_destroy_revoke(journal);
1379 kfree(journal->j_wbuf);
1380 kfree(journal);
1381
1382 return err;
1383}
1384
1385
1386/**
1387 * int journal_check_used_features () - Check if the specified features are used.
1388 * @journal: Journal to check.
1389 * @compat: bitmask of compatible features
1390 * @ro: bitmask of features that force read-only mount
1391 * @incompat: bitmask of incompatible features
1392 *
1393 * Check whether the journal uses all of a given set of
1394 * features. Return true (non-zero) if it does.
1395 **/
1396
1397int journal_check_used_features (journal_t *journal, unsigned long compat,
1398 unsigned long ro, unsigned long incompat)
1399{
1400 journal_superblock_t *sb;
1401
1402 if (!compat && !ro && !incompat)
1403 return 1;
1404 if (journal->j_format_version == 1)
1405 return 0;
1406
1407 sb = journal->j_superblock;
1408
1409 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1410 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1411 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1412 return 1;
1413
1414 return 0;
1415}
1416
1417/**
1418 * int journal_check_available_features() - Check feature set in journalling layer
1419 * @journal: Journal to check.
1420 * @compat: bitmask of compatible features
1421 * @ro: bitmask of features that force read-only mount
1422 * @incompat: bitmask of incompatible features
1423 *
1424 * Check whether the journaling code supports the use of
1425 * all of a given set of features on this journal. Return true
1426 * (non-zero) if it can. */
1427
1428int journal_check_available_features (journal_t *journal, unsigned long compat,
1429 unsigned long ro, unsigned long incompat)
1430{
1431 if (!compat && !ro && !incompat)
1432 return 1;
1433
1434 /* We can support any known requested features iff the
1435 * superblock is in version 2. Otherwise we fail to support any
1436 * extended sb features. */
1437
1438 if (journal->j_format_version != 2)
1439 return 0;
1440
1441 if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
1442 (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
1443 (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
1444 return 1;
1445
1446 return 0;
1447}
1448
1449/**
1450 * int journal_set_features () - Mark a given journal feature in the superblock
1451 * @journal: Journal to act on.
1452 * @compat: bitmask of compatible features
1453 * @ro: bitmask of features that force read-only mount
1454 * @incompat: bitmask of incompatible features
1455 *
1456 * Mark a given journal feature as present on the
1457 * superblock. Returns true if the requested features could be set.
1458 *
1459 */
1460
1461int journal_set_features (journal_t *journal, unsigned long compat,
1462 unsigned long ro, unsigned long incompat)
1463{
1464 journal_superblock_t *sb;
1465
1466 if (journal_check_used_features(journal, compat, ro, incompat))
1467 return 1;
1468
1469 if (!journal_check_available_features(journal, compat, ro, incompat))
1470 return 0;
1471
1472 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1473 compat, ro, incompat);
1474
1475 sb = journal->j_superblock;
1476
1477 sb->s_feature_compat |= cpu_to_be32(compat);
1478 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1479 sb->s_feature_incompat |= cpu_to_be32(incompat);
1480
1481 return 1;
1482}
1483
1484
1485/**
1486 * int journal_update_format () - Update on-disk journal structure.
1487 * @journal: Journal to act on.
1488 *
1489 * Given an initialised but unloaded journal struct, poke about in the
1490 * on-disk structure to update it to the most recent supported version.
1491 */
1492int journal_update_format (journal_t *journal)
1493{
1494 journal_superblock_t *sb;
1495 int err;
1496
1497 err = journal_get_superblock(journal);
1498 if (err)
1499 return err;
1500
1501 sb = journal->j_superblock;
1502
1503 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1504 case JFS_SUPERBLOCK_V2:
1505 return 0;
1506 case JFS_SUPERBLOCK_V1:
1507 return journal_convert_superblock_v1(journal, sb);
1508 default:
1509 break;
1510 }
1511 return -EINVAL;
1512}
1513
1514static int journal_convert_superblock_v1(journal_t *journal,
1515 journal_superblock_t *sb)
1516{
1517 int offset, blocksize;
1518 struct buffer_head *bh;
1519
1520 printk(KERN_WARNING
1521 "JBD: Converting superblock from version 1 to 2.\n");
1522
1523 /* Pre-initialise new fields to zero */
1524 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1525 blocksize = be32_to_cpu(sb->s_blocksize);
1526 memset(&sb->s_feature_compat, 0, blocksize-offset);
1527
1528 sb->s_nr_users = cpu_to_be32(1);
1529 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1530 journal->j_format_version = 2;
1531
1532 bh = journal->j_sb_buffer;
1533 BUFFER_TRACE(bh, "marking dirty");
1534 mark_buffer_dirty(bh);
1535 sync_dirty_buffer(bh);
1536 return 0;
1537}
1538
1539
1540/**
1541 * int journal_flush () - Flush journal
1542 * @journal: Journal to act on.
1543 *
1544 * Flush all data for a given journal to disk and empty the journal.
1545 * Filesystems can use this when remounting readonly to ensure that
1546 * recovery does not need to happen on remount.
1547 */
1548
1549int journal_flush(journal_t *journal)
1550{
1551 int err = 0;
1552 transaction_t *transaction = NULL;
1553
1554 spin_lock(&journal->j_state_lock);
1555
1556 /* Force everything buffered to the log... */
1557 if (journal->j_running_transaction) {
1558 transaction = journal->j_running_transaction;
1559 __log_start_commit(journal, transaction->t_tid);
1560 } else if (journal->j_committing_transaction)
1561 transaction = journal->j_committing_transaction;
1562
1563 /* Wait for the log commit to complete... */
1564 if (transaction) {
1565 tid_t tid = transaction->t_tid;
1566
1567 spin_unlock(&journal->j_state_lock);
1568 log_wait_commit(journal, tid);
1569 } else {
1570 spin_unlock(&journal->j_state_lock);
1571 }
1572
1573 /* ...and flush everything in the log out to disk. */
1574 spin_lock(&journal->j_list_lock);
1575 while (!err && journal->j_checkpoint_transactions != NULL) {
1576 spin_unlock(&journal->j_list_lock);
1577 mutex_lock(&journal->j_checkpoint_mutex);
1578 err = log_do_checkpoint(journal);
1579 mutex_unlock(&journal->j_checkpoint_mutex);
1580 spin_lock(&journal->j_list_lock);
1581 }
1582 spin_unlock(&journal->j_list_lock);
1583
1584 if (is_journal_aborted(journal))
1585 return -EIO;
1586
1587 mutex_lock(&journal->j_checkpoint_mutex);
1588 cleanup_journal_tail(journal);
1589
1590 /* Finally, mark the journal as really needing no recovery.
1591 * This sets s_start==0 in the underlying superblock, which is
1592 * the magic code for a fully-recovered superblock. Any future
1593 * commits of data to the journal will restore the current
1594 * s_start value. */
1595 mark_journal_empty(journal);
1596 mutex_unlock(&journal->j_checkpoint_mutex);
1597 spin_lock(&journal->j_state_lock);
1598 J_ASSERT(!journal->j_running_transaction);
1599 J_ASSERT(!journal->j_committing_transaction);
1600 J_ASSERT(!journal->j_checkpoint_transactions);
1601 J_ASSERT(journal->j_head == journal->j_tail);
1602 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1603 spin_unlock(&journal->j_state_lock);
1604 return 0;
1605}
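journal_flush() proceeds in a fixed order: force out any running transaction, checkpoint everything left in the log, then mark the journal empty so no recovery is needed on the next mount. A toy standalone model of that ordering (names and counters are purely illustrative):

#include <stdio.h>

struct toy_journal {
	int running;		/* buffers in the running transaction */
	int checkpointed;	/* transactions still on the checkpoint list */
	int log_empty;
};

static void toy_flush(struct toy_journal *j)
{
	if (j->running) {		/* 1. force the running transaction */
		printf("commit %d buffers\n", j->running);
		j->running = 0;
		j->checkpointed++;
	}
	while (j->checkpointed) {	/* 2. checkpoint everything in the log */
		printf("checkpoint one transaction\n");
		j->checkpointed--;
	}
	j->log_empty = 1;		/* 3. mark_journal_empty(): s_start = 0 */
}

int main(void)
{
	struct toy_journal j = { .running = 3, .checkpointed = 2 };

	toy_flush(&j);
	printf("log empty: %d\n", j.log_empty);
	return 0;
}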
1606
1607/**
1608 * int journal_wipe() - Wipe journal contents
1609 * @journal: Journal to act on.
1610 * @write: flag (see below)
1611 *
1612 * Wipe out all of the contents of a journal, safely. This will produce
1613 * a warning if the journal contains any valid recovery information.
1614 * Must be called between journal_init_*() and journal_load().
1615 *
1616 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1617 * we merely suppress recovery.
1618 */
1619
1620int journal_wipe(journal_t *journal, int write)
1621{
1622 int err = 0;
1623
1624 J_ASSERT (!(journal->j_flags & JFS_LOADED));
1625
1626 err = load_superblock(journal);
1627 if (err)
1628 return err;
1629
1630 if (!journal->j_tail)
1631 goto no_recovery;
1632
1633 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1634 write ? "Clearing" : "Ignoring");
1635
1636 err = journal_skip_recovery(journal);
1637 if (write) {
1638 /* Lock to make assertions happy... */
1639 mutex_lock(&journal->j_checkpoint_mutex);
1640 mark_journal_empty(journal);
1641 mutex_unlock(&journal->j_checkpoint_mutex);
1642 }
1643
1644 no_recovery:
1645 return err;
1646}
1647
1648/*
1649 * journal_dev_name: format a character string to describe on what
1650 * device this journal is present.
1651 */
1652
1653static const char *journal_dev_name(journal_t *journal, char *buffer)
1654{
1655 struct block_device *bdev;
1656
1657 if (journal->j_inode)
1658 bdev = journal->j_inode->i_sb->s_bdev;
1659 else
1660 bdev = journal->j_dev;
1661
1662 return bdevname(bdev, buffer);
1663}
1664
1665/*
1666 * Journal abort has very specific semantics, which we describe
1667 * below for journal_abort().
1668 *
1669 * Two internal functions, which provide abort to the jbd layer
1670 * itself, are here.
1671 */
1672
1673/*
1674 * Quick version for internal journal use (doesn't lock the journal).
1675 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1676 * and don't attempt to make any other journal updates.
1677 */
1678static void __journal_abort_hard(journal_t *journal)
1679{
1680 transaction_t *transaction;
1681 char b[BDEVNAME_SIZE];
1682
1683 if (journal->j_flags & JFS_ABORT)
1684 return;
1685
1686 printk(KERN_ERR "Aborting journal on device %s.\n",
1687 journal_dev_name(journal, b));
1688
1689 spin_lock(&journal->j_state_lock);
1690 journal->j_flags |= JFS_ABORT;
1691 transaction = journal->j_running_transaction;
1692 if (transaction)
1693 __log_start_commit(journal, transaction->t_tid);
1694 spin_unlock(&journal->j_state_lock);
1695}
1696
1697/* Soft abort: record the abort error status in the journal superblock,
1698 * but don't do any other IO. */
1699static void __journal_abort_soft (journal_t *journal, int errno)
1700{
1701 if (journal->j_flags & JFS_ABORT)
1702 return;
1703
1704 if (!journal->j_errno)
1705 journal->j_errno = errno;
1706
1707 __journal_abort_hard(journal);
1708
1709 if (errno)
1710 journal_update_sb_errno(journal);
1711}
1712
1713/**
1714 * void journal_abort () - Shutdown the journal immediately.
1715 * @journal: the journal to shutdown.
1716 * @errno: an error number to record in the journal indicating
1717 * the reason for the shutdown.
1718 *
1719 * Perform a complete, immediate shutdown of the ENTIRE
1720 * journal (not of a single transaction). This operation cannot be
1721 * undone without closing and reopening the journal.
1722 *
1723 * The journal_abort function is intended to support higher level error
1724 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1725 * mode.
1726 *
1727 * Journal abort has very specific semantics. Any existing dirty,
1728 * unjournaled buffers in the main filesystem will still be written to
1729 * disk by bdflush, but the journaling mechanism will be suspended
1730 * immediately and no further transaction commits will be honoured.
1731 *
1732 * Any dirty, journaled buffers will be written back to disk without
1733 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1734 * filesystem, but we _do_ attempt to leave as much data as possible
1735 * behind for fsck to use for cleanup.
1736 *
1737 * Any attempt to get a new transaction handle on a journal which is in
1738 * ABORT state will just result in an -EROFS error return. A
1739 * journal_stop on an existing handle will return -EIO if we have
1740 * entered abort state during the update.
1741 *
1742 * Recursive transactions are not disturbed by journal abort until the
1743 * final journal_stop, which will receive the -EIO error.
1744 *
1745 * Finally, the journal_abort call allows the caller to supply an errno
1746 * which will be recorded (if possible) in the journal superblock. This
1747 * allows a client to record failure conditions in the middle of a
1748 * transaction without having to complete the transaction to record the
1749 * failure to disk. ext3_error, for example, now uses this
1750 * functionality.
1751 *
1752 * Errors which originate from within the journaling layer will NOT
1753 * supply an errno; a null errno implies that absolutely no further
1754 * writes are done to the journal (unless there are any already in
1755 * progress).
1756 *
1757 */
1758
1759void journal_abort(journal_t *journal, int errno)
1760{
1761 __journal_abort_soft(journal, errno);
1762}
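A minimal caller-side sketch of the error-handling pattern described above (the helper name is hypothetical; only journal_abort() and the documented -EROFS / remount-readonly behaviour come from this file):

	/* Illustrative ext3-style fatal error handler. */
	static void example_handle_fs_error(journal_t *journal,
					    struct super_block *sb, int errno)
	{
		journal_abort(journal, errno);	/* record errno, stop further commits */
		sb->s_flags |= MS_RDONLY;	/* documented remount-readonly error mode */
	}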
1763
1764/**
1765 * int journal_errno () - returns the journal's error state.
1766 * @journal: journal to examine.
1767 *
1768 * This is the errno number set with journal_abort(), the last
1769 * time the journal was mounted - if the journal was stopped
1770 * without calling abort this will be 0.
1771 *
1772 * If the journal has been aborted during this mount, -EROFS will
1773 * be returned.
1774 */
1775int journal_errno(journal_t *journal)
1776{
1777 int err;
1778
1779 spin_lock(&journal->j_state_lock);
1780 if (journal->j_flags & JFS_ABORT)
1781 err = -EROFS;
1782 else
1783 err = journal->j_errno;
1784 spin_unlock(&journal->j_state_lock);
1785 return err;
1786}
1787
1788/**
1789 * int journal_clear_err () - clears the journal's error state
1790 * @journal: journal to act on.
1791 *
1792 * An error must be cleared or Acked to take a FS out of readonly
1793 * mode.
1794 */
1795int journal_clear_err(journal_t *journal)
1796{
1797 int err = 0;
1798
1799 spin_lock(&journal->j_state_lock);
1800 if (journal->j_flags & JFS_ABORT)
1801 err = -EROFS;
1802 else
1803 journal->j_errno = 0;
1804 spin_unlock(&journal->j_state_lock);
1805 return err;
1806}
1807
1808/**
1809 * void journal_ack_err() - Ack journal err.
1810 * @journal: journal to act on.
1811 *
1812 * An error must be cleared or Acked to take a FS out of readonly
1813 * mode.
1814 */
1815void journal_ack_err(journal_t *journal)
1816{
1817 spin_lock(&journal->j_state_lock);
1818 if (journal->j_errno)
1819 journal->j_flags |= JFS_ACK_ERR;
1820 spin_unlock(&journal->j_state_lock);
1821}
1822
1823int journal_blocks_per_page(struct inode *inode)
1824{
1825 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1826}
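For concreteness, a worked example of the shift above (values assumed): with 4 KB pages (PAGE_CACHE_SHIFT == 12) and a 1 KB filesystem block size (s_blocksize_bits == 10), this returns 1 << (12 - 10) = 4 buffers per page.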
1827
1828/*
1829 * Journal_head storage management
1830 */
1831static struct kmem_cache *journal_head_cache;
1832#ifdef CONFIG_JBD_DEBUG
1833static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1834#endif
1835
1836static int journal_init_journal_head_cache(void)
1837{
1838 int retval;
1839
1840 J_ASSERT(journal_head_cache == NULL);
1841 journal_head_cache = kmem_cache_create("journal_head",
1842 sizeof(struct journal_head),
1843 0, /* offset */
1844 SLAB_TEMPORARY, /* flags */
1845 NULL); /* ctor */
1846 retval = 0;
1847 if (!journal_head_cache) {
1848 retval = -ENOMEM;
1849 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1850 }
1851 return retval;
1852}
1853
1854static void journal_destroy_journal_head_cache(void)
1855{
1856 if (journal_head_cache) {
1857 kmem_cache_destroy(journal_head_cache);
1858 journal_head_cache = NULL;
1859 }
1860}
1861
1862/*
1863 * journal_head splicing and dicing
1864 */
1865static struct journal_head *journal_alloc_journal_head(void)
1866{
1867 struct journal_head *ret;
1868
1869#ifdef CONFIG_JBD_DEBUG
1870 atomic_inc(&nr_journal_heads);
1871#endif
1872 ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
1873 if (ret == NULL) {
1874 jbd_debug(1, "out of memory for journal_head\n");
1875 printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1876 __func__);
1877
1878 while (ret == NULL) {
1879 yield();
1880 ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
1881 }
1882 }
1883 return ret;
1884}
1885
1886static void journal_free_journal_head(struct journal_head *jh)
1887{
1888#ifdef CONFIG_JBD_DEBUG
1889 atomic_dec(&nr_journal_heads);
1890 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1891#endif
1892 kmem_cache_free(journal_head_cache, jh);
1893}
1894
1895/*
1896 * A journal_head is attached to a buffer_head whenever JBD has an
1897 * interest in the buffer.
1898 *
1899 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1900 * is set. This bit is tested in core kernel code where we need to take
1901 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1902 * there.
1903 *
1904 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1905 *
1906 * When a buffer has its BH_JBD bit set it is immune from being released by
1907 * core kernel code, mainly via ->b_count.
1908 *
1909 * A journal_head is detached from its buffer_head when the journal_head's
1910 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
1911 * transaction (b_cp_transaction) hold their references to b_jcount.
1912 *
1913 * Various places in the kernel want to attach a journal_head to a buffer_head
1914 * _before_ attaching the journal_head to a transaction. To protect the
1915 * journal_head in this situation, journal_add_journal_head elevates the
1916 * journal_head's b_jcount refcount by one. The caller must call
1917 * journal_put_journal_head() to undo this.
1918 *
1919 * So the typical usage would be:
1920 *
1921 * (Attach a journal_head if needed. Increments b_jcount)
1922 * struct journal_head *jh = journal_add_journal_head(bh);
1923 * ...
1924 * (Get another reference for transaction)
1925 * journal_grab_journal_head(bh);
1926 * jh->b_transaction = xxx;
1927 * (Put original reference)
1928 * journal_put_journal_head(jh);
1929 */
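A compilable rendering of the pattern in the comment above (illustrative sketch; "transaction" stands for whichever transaction the caller is filing the buffer under):

	struct journal_head *jh = journal_add_journal_head(bh);	/* attach jh if needed, b_jcount++ */

	journal_grab_journal_head(bh);		/* extra reference held on behalf of the transaction */
	jh->b_transaction = transaction;
	journal_put_journal_head(jh);		/* drop the setup reference taken above */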
1930
1931/*
1932 * Give a buffer_head a journal_head.
1933 *
1934 * May sleep.
1935 */
1936struct journal_head *journal_add_journal_head(struct buffer_head *bh)
1937{
1938 struct journal_head *jh;
1939 struct journal_head *new_jh = NULL;
1940
1941repeat:
1942 if (!buffer_jbd(bh))
1943 new_jh = journal_alloc_journal_head();
1944
1945 jbd_lock_bh_journal_head(bh);
1946 if (buffer_jbd(bh)) {
1947 jh = bh2jh(bh);
1948 } else {
1949 J_ASSERT_BH(bh,
1950 (atomic_read(&bh->b_count) > 0) ||
1951 (bh->b_page && bh->b_page->mapping));
1952
1953 if (!new_jh) {
1954 jbd_unlock_bh_journal_head(bh);
1955 goto repeat;
1956 }
1957
1958 jh = new_jh;
1959 new_jh = NULL; /* We consumed it */
1960 set_buffer_jbd(bh);
1961 bh->b_private = jh;
1962 jh->b_bh = bh;
1963 get_bh(bh);
1964 BUFFER_TRACE(bh, "added journal_head");
1965 }
1966 jh->b_jcount++;
1967 jbd_unlock_bh_journal_head(bh);
1968 if (new_jh)
1969 journal_free_journal_head(new_jh);
1970 return bh->b_private;
1971}
1972
1973/*
1974 * Grab a ref against this buffer_head's journal_head. If it ended up not
1975 * having a journal_head, return NULL
1976 */
1977struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
1978{
1979 struct journal_head *jh = NULL;
1980
1981 jbd_lock_bh_journal_head(bh);
1982 if (buffer_jbd(bh)) {
1983 jh = bh2jh(bh);
1984 jh->b_jcount++;
1985 }
1986 jbd_unlock_bh_journal_head(bh);
1987 return jh;
1988}
1989
1990static void __journal_remove_journal_head(struct buffer_head *bh)
1991{
1992 struct journal_head *jh = bh2jh(bh);
1993
1994 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1995 J_ASSERT_JH(jh, jh->b_transaction == NULL);
1996 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1997 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
1998 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1999 J_ASSERT_BH(bh, buffer_jbd(bh));
2000 J_ASSERT_BH(bh, jh2bh(jh) == bh);
2001 BUFFER_TRACE(bh, "remove journal_head");
2002 if (jh->b_frozen_data) {
2003 printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
2004 jbd_free(jh->b_frozen_data, bh->b_size);
2005 }
2006 if (jh->b_committed_data) {
2007 printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
2008 jbd_free(jh->b_committed_data, bh->b_size);
2009 }
2010 bh->b_private = NULL;
2011 jh->b_bh = NULL; /* debug, really */
2012 clear_buffer_jbd(bh);
2013 journal_free_journal_head(jh);
2014}
2015
2016/*
2017 * Drop a reference on the passed journal_head. If it fell to zero then
2018 * release the journal_head from the buffer_head.
2019 */
2020void journal_put_journal_head(struct journal_head *jh)
2021{
2022 struct buffer_head *bh = jh2bh(jh);
2023
2024 jbd_lock_bh_journal_head(bh);
2025 J_ASSERT_JH(jh, jh->b_jcount > 0);
2026 --jh->b_jcount;
2027 if (!jh->b_jcount) {
2028 __journal_remove_journal_head(bh);
2029 jbd_unlock_bh_journal_head(bh);
2030 __brelse(bh);
2031 } else
2032 jbd_unlock_bh_journal_head(bh);
2033}
2034
2035/*
2036 * debugfs tunables
2037 */
2038#ifdef CONFIG_JBD_DEBUG
2039
2040u8 journal_enable_debug __read_mostly;
2041EXPORT_SYMBOL(journal_enable_debug);
2042
2043static struct dentry *jbd_debugfs_dir;
2044static struct dentry *jbd_debug;
2045
2046static void __init jbd_create_debugfs_entry(void)
2047{
2048 jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
2049 if (jbd_debugfs_dir)
2050 jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
2051 jbd_debugfs_dir,
2052 &journal_enable_debug);
2053}
2054
2055static void __exit jbd_remove_debugfs_entry(void)
2056{
2057 debugfs_remove(jbd_debug);
2058 debugfs_remove(jbd_debugfs_dir);
2059}
2060
2061#else
2062
2063static inline void jbd_create_debugfs_entry(void)
2064{
2065}
2066
2067static inline void jbd_remove_debugfs_entry(void)
2068{
2069}
2070
2071#endif
2072
2073struct kmem_cache *jbd_handle_cache;
2074
2075static int __init journal_init_handle_cache(void)
2076{
2077 jbd_handle_cache = kmem_cache_create("journal_handle",
2078 sizeof(handle_t),
2079 0, /* offset */
2080 SLAB_TEMPORARY, /* flags */
2081 NULL); /* ctor */
2082 if (jbd_handle_cache == NULL) {
2083 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2084 return -ENOMEM;
2085 }
2086 return 0;
2087}
2088
2089static void journal_destroy_handle_cache(void)
2090{
2091 if (jbd_handle_cache)
2092 kmem_cache_destroy(jbd_handle_cache);
2093}
2094
2095/*
2096 * Module startup and shutdown
2097 */
2098
2099static int __init journal_init_caches(void)
2100{
2101 int ret;
2102
2103 ret = journal_init_revoke_caches();
2104 if (ret == 0)
2105 ret = journal_init_journal_head_cache();
2106 if (ret == 0)
2107 ret = journal_init_handle_cache();
2108 return ret;
2109}
2110
2111static void journal_destroy_caches(void)
2112{
2113 journal_destroy_revoke_caches();
2114 journal_destroy_journal_head_cache();
2115 journal_destroy_handle_cache();
2116}
2117
2118static int __init journal_init(void)
2119{
2120 int ret;
2121
2122 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2123
2124 ret = journal_init_caches();
2125 if (ret != 0)
2126 journal_destroy_caches();
2127 jbd_create_debugfs_entry();
2128 return ret;
2129}
2130
2131static void __exit journal_exit(void)
2132{
2133#ifdef CONFIG_JBD_DEBUG
2134 int n = atomic_read(&nr_journal_heads);
2135 if (n)
2136 printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
2137#endif
2138 jbd_remove_debugfs_entry();
2139 journal_destroy_caches();
2140}
2141
2142MODULE_LICENSE("GPL");
2143module_init(journal_init);
2144module_exit(journal_exit);
2145
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
deleted file mode 100644
index a748fe21465a..000000000000
--- a/fs/jbd/recovery.c
+++ /dev/null
@@ -1,594 +0,0 @@
1/*
2 * linux/fs/jbd/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd.h>
22#include <linux/errno.h>
23#include <linux/blkdev.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49static void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned int blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned int blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(struct buffer_head *bh, int size)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0;
186
187 tagp = &bh->b_data[sizeof(journal_header_t)];
188
189 while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
190 tag = (journal_block_tag_t *) tagp;
191
192 nr++;
193 tagp += sizeof(journal_block_tag_t);
194 if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
195 tagp += 16;
196
197 if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
198 break;
199 }
200
201 return nr;
202}
203
204
205/* Make sure we wrap around the log correctly! */
206#define wrap(journal, var) \
207do { \
208 if (var >= (journal)->j_last) \
209 var -= ((journal)->j_last - (journal)->j_first); \
210} while (0)
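As an illustrative check of the macro above (values assumed): with j_first = 1 and j_last = 1024, an offset that reaches 1024 wraps to 1024 - (1024 - 1) = 1, i.e. back to the first usable log block.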
211
212/**
213 * journal_recover - recovers an on-disk journal
214 * @journal: the journal to recover
215 *
216 * The primary function for recovering the log contents when mounting a
217 * journaled device.
218 *
219 * Recovery is done in three passes. In the first pass, we look for the
220 * end of the log. In the second, we assemble the list of revoke
221 * blocks. In the third and final pass, we replay any un-revoked blocks
222 * in the log.
223 */
224int journal_recover(journal_t *journal)
225{
226 int err, err2;
227 journal_superblock_t * sb;
228
229 struct recovery_info info;
230
231 memset(&info, 0, sizeof(info));
232 sb = journal->j_superblock;
233
234 /*
235 * The journal superblock's s_start field (the current log head)
236 * is always zero if, and only if, the journal was cleanly
237 * unmounted.
238 */
239
240 if (!sb->s_start) {
241 jbd_debug(1, "No recovery required, last transaction %d\n",
242 be32_to_cpu(sb->s_sequence));
243 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
244 return 0;
245 }
246
247 err = do_one_pass(journal, &info, PASS_SCAN);
248 if (!err)
249 err = do_one_pass(journal, &info, PASS_REVOKE);
250 if (!err)
251 err = do_one_pass(journal, &info, PASS_REPLAY);
252
253 jbd_debug(1, "JBD: recovery, exit status %d, "
254 "recovered transactions %u to %u\n",
255 err, info.start_transaction, info.end_transaction);
256 jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
257 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
258
259 /* Restart the log at the next transaction ID, thus invalidating
260 * any existing commit records in the log. */
261 journal->j_transaction_sequence = ++info.end_transaction;
262
263 journal_clear_revoke(journal);
264 err2 = sync_blockdev(journal->j_fs_dev);
265 if (!err)
266 err = err2;
267 /* Flush disk caches to get replayed data on the permanent storage */
268 if (journal->j_flags & JFS_BARRIER) {
269 err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
270 if (!err)
271 err = err2;
272 }
273
274 return err;
275}
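A hedged sketch of the caller side (the mount-path context is assumed and not part of this file; JFS_LOADED is the flag asserted by journal_wipe() earlier in this patch):

	err = journal_recover(journal);		/* replay committed, un-revoked blocks */
	if (!err)
		journal->j_flags |= JFS_LOADED;	/* journal may now accept new handles */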
276
277/**
278 * journal_skip_recovery - Start journal and wipe existing records
279 * @journal: journal to startup
280 *
281 * Locate any valid recovery information from the journal and set up the
282 * journal structures in memory to ignore it (presumably because the
283 * caller has evidence that it is out of date).
284 * This function doesn't appear to be exported.
285 *
286 * We perform one pass over the journal to allow us to tell the user how
287 * much recovery information is being erased, and to let us initialise
288 * the journal transaction sequence numbers to the next unused ID.
289 */
290int journal_skip_recovery(journal_t *journal)
291{
292 int err;
293 struct recovery_info info;
294
295 memset (&info, 0, sizeof(info));
296
297 err = do_one_pass(journal, &info, PASS_SCAN);
298
299 if (err) {
300 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
301 ++journal->j_transaction_sequence;
302 } else {
303#ifdef CONFIG_JBD_DEBUG
304 int dropped = info.end_transaction -
305 be32_to_cpu(journal->j_superblock->s_sequence);
306 jbd_debug(1,
307 "JBD: ignoring %d transaction%s from the journal.\n",
308 dropped, (dropped == 1) ? "" : "s");
309#endif
310 journal->j_transaction_sequence = ++info.end_transaction;
311 }
312
313 journal->j_tail = 0;
314 return err;
315}
316
317static int do_one_pass(journal_t *journal,
318 struct recovery_info *info, enum passtype pass)
319{
320 unsigned int first_commit_ID, next_commit_ID;
321 unsigned int next_log_block;
322 int err, success = 0;
323 journal_superblock_t * sb;
324 journal_header_t * tmp;
325 struct buffer_head * bh;
326 unsigned int sequence;
327 int blocktype;
328
329 /*
330 * First thing is to establish what we expect to find in the log
331 * (in terms of transaction IDs), and where (in terms of log
332 * block offsets): query the superblock.
333 */
334
335 sb = journal->j_superblock;
336 next_commit_ID = be32_to_cpu(sb->s_sequence);
337 next_log_block = be32_to_cpu(sb->s_start);
338
339 first_commit_ID = next_commit_ID;
340 if (pass == PASS_SCAN)
341 info->start_transaction = first_commit_ID;
342
343 jbd_debug(1, "Starting recovery pass %d\n", pass);
344
345 /*
346 * Now we walk through the log, transaction by transaction,
347 * making sure that each transaction has a commit block in the
348 * expected place. Each complete transaction gets replayed back
349 * into the main filesystem.
350 */
351
352 while (1) {
353 int flags;
354 char * tagp;
355 journal_block_tag_t * tag;
356 struct buffer_head * obh;
357 struct buffer_head * nbh;
358
359 cond_resched();
360
361 /* If we already know where to stop the log traversal,
362 * check right now that we haven't gone past the end of
363 * the log. */
364
365 if (pass != PASS_SCAN)
366 if (tid_geq(next_commit_ID, info->end_transaction))
367 break;
368
369 jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
370 next_commit_ID, next_log_block, journal->j_last);
371
372 /* Skip over each chunk of the transaction looking for
373 * either the next descriptor block or the final commit
374 * record. */
375
376 jbd_debug(3, "JBD: checking block %u\n", next_log_block);
377 err = jread(&bh, journal, next_log_block);
378 if (err)
379 goto failed;
380
381 next_log_block++;
382 wrap(journal, next_log_block);
383
384 /* What kind of buffer is it?
385 *
386 * If it is a descriptor block, check that it has the
387 * expected sequence number. Otherwise, we're all done
388 * here. */
389
390 tmp = (journal_header_t *)bh->b_data;
391
392 if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
393 brelse(bh);
394 break;
395 }
396
397 blocktype = be32_to_cpu(tmp->h_blocktype);
398 sequence = be32_to_cpu(tmp->h_sequence);
399 jbd_debug(3, "Found magic %d, sequence %d\n",
400 blocktype, sequence);
401
402 if (sequence != next_commit_ID) {
403 brelse(bh);
404 break;
405 }
406
407 /* OK, we have a valid descriptor block which matches
408 * all of the sequence number checks. What are we going
409 * to do with it? That depends on the pass... */
410
411 switch(blocktype) {
412 case JFS_DESCRIPTOR_BLOCK:
413 /* If it is a valid descriptor block, replay it
414 * in pass REPLAY; otherwise, just skip over the
415 * blocks it describes. */
416 if (pass != PASS_REPLAY) {
417 next_log_block +=
418 count_tags(bh, journal->j_blocksize);
419 wrap(journal, next_log_block);
420 brelse(bh);
421 continue;
422 }
423
424 /* A descriptor block: we can now write all of
425 * the data blocks. Yay, useful work is finally
426 * getting done here! */
427
428 tagp = &bh->b_data[sizeof(journal_header_t)];
429 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
430 <= journal->j_blocksize) {
431 unsigned int io_block;
432
433 tag = (journal_block_tag_t *) tagp;
434 flags = be32_to_cpu(tag->t_flags);
435
436 io_block = next_log_block++;
437 wrap(journal, next_log_block);
438 err = jread(&obh, journal, io_block);
439 if (err) {
440 /* Recover what we can, but
441 * report failure at the end. */
442 success = err;
443 printk (KERN_ERR
444 "JBD: IO error %d recovering "
445 "block %u in log\n",
446 err, io_block);
447 } else {
448 unsigned int blocknr;
449
450 J_ASSERT(obh != NULL);
451 blocknr = be32_to_cpu(tag->t_blocknr);
452
453 /* If the block has been
454 * revoked, then we're all done
455 * here. */
456 if (journal_test_revoke
457 (journal, blocknr,
458 next_commit_ID)) {
459 brelse(obh);
460 ++info->nr_revoke_hits;
461 goto skip_write;
462 }
463
464 /* Find a buffer for the new
465 * data being restored */
466 nbh = __getblk(journal->j_fs_dev,
467 blocknr,
468 journal->j_blocksize);
469 if (nbh == NULL) {
470 printk(KERN_ERR
471 "JBD: Out of memory "
472 "during recovery.\n");
473 err = -ENOMEM;
474 brelse(bh);
475 brelse(obh);
476 goto failed;
477 }
478
479 lock_buffer(nbh);
480 memcpy(nbh->b_data, obh->b_data,
481 journal->j_blocksize);
482 if (flags & JFS_FLAG_ESCAPE) {
483 *((__be32 *)nbh->b_data) =
484 cpu_to_be32(JFS_MAGIC_NUMBER);
485 }
486
487 BUFFER_TRACE(nbh, "marking dirty");
488 set_buffer_uptodate(nbh);
489 mark_buffer_dirty(nbh);
490 BUFFER_TRACE(nbh, "marking uptodate");
491 ++info->nr_replays;
492 /* ll_rw_block(WRITE, 1, &nbh); */
493 unlock_buffer(nbh);
494 brelse(obh);
495 brelse(nbh);
496 }
497
498 skip_write:
499 tagp += sizeof(journal_block_tag_t);
500 if (!(flags & JFS_FLAG_SAME_UUID))
501 tagp += 16;
502
503 if (flags & JFS_FLAG_LAST_TAG)
504 break;
505 }
506
507 brelse(bh);
508 continue;
509
510 case JFS_COMMIT_BLOCK:
511 /* Found an expected commit block: not much to
512 * do other than move on to the next sequence
513 * number. */
514 brelse(bh);
515 next_commit_ID++;
516 continue;
517
518 case JFS_REVOKE_BLOCK:
519 /* If we aren't in the REVOKE pass, then we can
520 * just skip over this block. */
521 if (pass != PASS_REVOKE) {
522 brelse(bh);
523 continue;
524 }
525
526 err = scan_revoke_records(journal, bh,
527 next_commit_ID, info);
528 brelse(bh);
529 if (err)
530 goto failed;
531 continue;
532
533 default:
534 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
535 blocktype);
536 brelse(bh);
537 goto done;
538 }
539 }
540
541 done:
542 /*
543 * We broke out of the log scan loop: either we came to the
544 * known end of the log or we found an unexpected block in the
545 * log. If the latter happened, then we know that the "current"
546 * transaction marks the end of the valid log.
547 */
548
549 if (pass == PASS_SCAN)
550 info->end_transaction = next_commit_ID;
551 else {
552 /* It's really bad news if different passes end up at
553 * different places (but possible due to IO errors). */
554 if (info->end_transaction != next_commit_ID) {
555 printk (KERN_ERR "JBD: recovery pass %d ended at "
556 "transaction %u, expected %u\n",
557 pass, next_commit_ID, info->end_transaction);
558 if (!success)
559 success = -EIO;
560 }
561 }
562
563 return success;
564
565 failed:
566 return err;
567}
568
569
570/* Scan a revoke record, marking all blocks mentioned as revoked. */
571
572static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
573 tid_t sequence, struct recovery_info *info)
574{
575 journal_revoke_header_t *header;
576 int offset, max;
577
578 header = (journal_revoke_header_t *) bh->b_data;
579 offset = sizeof(journal_revoke_header_t);
580 max = be32_to_cpu(header->r_count);
581
582 while (offset < max) {
583 unsigned int blocknr;
584 int err;
585
586 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
587 offset += 4;
588 err = journal_set_revoke(journal, blocknr, sequence);
589 if (err)
590 return err;
591 ++info->nr_revokes;
592 }
593 return 0;
594}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
deleted file mode 100644
index dcead636c33b..000000000000
--- a/fs/jbd/revoke.c
+++ /dev/null
@@ -1,733 +0,0 @@
1/*
2 * linux/fs/jbd/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * We cache revoke status of a buffer in the current transaction in b_states
51 * bits. As the name says, revokevalid flag indicates that the cached revoke
52 * status of a buffer is valid and we can rely on the cached status.
53 *
54 * Revoke information on buffers is a tri-state value:
55 *
56 * RevokeValid clear: no cached revoke status, need to look it up
57 * RevokeValid set, Revoked clear:
58 * buffer has not been revoked, and cancel_revoke
59 * need do nothing.
60 * RevokeValid set, Revoked set:
61 * buffer has been revoked.
62 *
63 * Locking rules:
64 * We keep two hash tables of revoke records. One hashtable belongs to the
65 * running transaction (is pointed to by journal->j_revoke), the other one
66 * belongs to the committing transaction. Accesses to the second hash table
67 * happen only from the kjournald and no other thread touches this table. Also
68 * journal_switch_revoke_table() which switches which hashtable belongs to the
69 * running and which to the committing transaction is called only from
70 * kjournald. Therefore we need no locks when accessing the hashtable belonging
71 * to the committing transaction.
72 *
73 * All users operating on the hash table belonging to the running transaction
74 * have a handle to the transaction. Therefore they are safe from kjournald
75 * switching hash tables under them. For operations on the lists of entries in
76 * the hash table j_revoke_lock is used.
77 *
78 * Finally, also replay code uses the hash tables but at this moment no one else
79 * can touch them (filesystem isn't mounted yet) and hence no locking is
80 * needed.
81 */
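A minimal sketch of how the cached tri-state described above is consumed (this mirrors the test in journal_cancel_revoke() later in this file; bh is assumed to be a journaled buffer_head):

	if (test_set_buffer_revokevalid(bh)) {
		/* RevokeValid was already set: the Revoked bit can be trusted. */
		need_cancel = test_clear_buffer_revoked(bh);
	} else {
		/* No valid cache: we must search the revoke hash table. */
		need_cancel = 1;
		clear_buffer_revoked(bh);
	}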
82
83#ifndef __KERNEL__
84#include "jfs_user.h"
85#else
86#include <linux/time.h>
87#include <linux/fs.h>
88#include <linux/jbd.h>
89#include <linux/errno.h>
90#include <linux/slab.h>
91#include <linux/list.h>
92#include <linux/init.h>
93#include <linux/bio.h>
94#endif
95#include <linux/log2.h>
96#include <linux/hash.h>
97
98static struct kmem_cache *revoke_record_cache;
99static struct kmem_cache *revoke_table_cache;
100
101/* Each revoke record represents one single revoked block. During
102 journal replay, this involves recording the transaction ID of the
103 last transaction to revoke this block. */
104
105struct jbd_revoke_record_s
106{
107 struct list_head hash;
108 tid_t sequence; /* Used for recovery only */
109 unsigned int blocknr;
110};
111
112
113/* The revoke table is just a simple hash table of revoke records. */
114struct jbd_revoke_table_s
115{
116 /* It is conceivable that we might want a larger hash table
117 * for recovery. Must be a power of two. */
118 int hash_size;
119 int hash_shift;
120 struct list_head *hash_table;
121};
122
123
124#ifdef __KERNEL__
125static void write_one_revoke_record(journal_t *, transaction_t *,
126 struct journal_head **, int *,
127 struct jbd_revoke_record_s *, int);
128static void flush_descriptor(journal_t *, struct journal_head *, int, int);
129#endif
130
131/* Utility functions to maintain the revoke table */
132
133static inline int hash(journal_t *journal, unsigned int block)
134{
135 struct jbd_revoke_table_s *table = journal->j_revoke;
136
137 return hash_32(block, table->hash_shift);
138}
139
140static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
141 tid_t seq)
142{
143 struct list_head *hash_list;
144 struct jbd_revoke_record_s *record;
145
146repeat:
147 record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
148 if (!record)
149 goto oom;
150
151 record->sequence = seq;
152 record->blocknr = blocknr;
153 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
154 spin_lock(&journal->j_revoke_lock);
155 list_add(&record->hash, hash_list);
156 spin_unlock(&journal->j_revoke_lock);
157 return 0;
158
159oom:
160 if (!journal_oom_retry)
161 return -ENOMEM;
162 jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
163 yield();
164 goto repeat;
165}
166
167/* Find a revoke record in the journal's hash table. */
168
169static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
170 unsigned int blocknr)
171{
172 struct list_head *hash_list;
173 struct jbd_revoke_record_s *record;
174
175 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
176
177 spin_lock(&journal->j_revoke_lock);
178 record = (struct jbd_revoke_record_s *) hash_list->next;
179 while (&(record->hash) != hash_list) {
180 if (record->blocknr == blocknr) {
181 spin_unlock(&journal->j_revoke_lock);
182 return record;
183 }
184 record = (struct jbd_revoke_record_s *) record->hash.next;
185 }
186 spin_unlock(&journal->j_revoke_lock);
187 return NULL;
188}
189
190void journal_destroy_revoke_caches(void)
191{
192 if (revoke_record_cache) {
193 kmem_cache_destroy(revoke_record_cache);
194 revoke_record_cache = NULL;
195 }
196 if (revoke_table_cache) {
197 kmem_cache_destroy(revoke_table_cache);
198 revoke_table_cache = NULL;
199 }
200}
201
202int __init journal_init_revoke_caches(void)
203{
204 J_ASSERT(!revoke_record_cache);
205 J_ASSERT(!revoke_table_cache);
206
207 revoke_record_cache = kmem_cache_create("revoke_record",
208 sizeof(struct jbd_revoke_record_s),
209 0,
210 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
211 NULL);
212 if (!revoke_record_cache)
213 goto record_cache_failure;
214
215 revoke_table_cache = kmem_cache_create("revoke_table",
216 sizeof(struct jbd_revoke_table_s),
217 0, SLAB_TEMPORARY, NULL);
218 if (!revoke_table_cache)
219 goto table_cache_failure;
220
221 return 0;
222
223table_cache_failure:
224 journal_destroy_revoke_caches();
225record_cache_failure:
226 return -ENOMEM;
227}
228
229static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
230{
231 int i;
232 struct jbd_revoke_table_s *table;
233
234 table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
235 if (!table)
236 goto out;
237
238 table->hash_size = hash_size;
239 table->hash_shift = ilog2(hash_size);
240 table->hash_table =
241 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
242 if (!table->hash_table) {
243 kmem_cache_free(revoke_table_cache, table);
244 table = NULL;
245 goto out;
246 }
247
248 for (i = 0; i < hash_size; i++)
249 INIT_LIST_HEAD(&table->hash_table[i]);
250
251out:
252 return table;
253}
254
255static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
256{
257 int i;
258 struct list_head *hash_list;
259
260 for (i = 0; i < table->hash_size; i++) {
261 hash_list = &table->hash_table[i];
262 J_ASSERT(list_empty(hash_list));
263 }
264
265 kfree(table->hash_table);
266 kmem_cache_free(revoke_table_cache, table);
267}
268
269/* Initialise the revoke table for a given journal to a given size. */
270int journal_init_revoke(journal_t *journal, int hash_size)
271{
272 J_ASSERT(journal->j_revoke_table[0] == NULL);
273 J_ASSERT(is_power_of_2(hash_size));
274
275 journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
276 if (!journal->j_revoke_table[0])
277 goto fail0;
278
279 journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
280 if (!journal->j_revoke_table[1])
281 goto fail1;
282
283 journal->j_revoke = journal->j_revoke_table[1];
284
285 spin_lock_init(&journal->j_revoke_lock);
286
287 return 0;
288
289fail1:
290 journal_destroy_revoke_table(journal->j_revoke_table[0]);
291fail0:
292 return -ENOMEM;
293}
294
295/* Destroy a journal's revoke table. The table must already be empty! */
296void journal_destroy_revoke(journal_t *journal)
297{
298 journal->j_revoke = NULL;
299 if (journal->j_revoke_table[0])
300 journal_destroy_revoke_table(journal->j_revoke_table[0]);
301 if (journal->j_revoke_table[1])
302 journal_destroy_revoke_table(journal->j_revoke_table[1]);
303}
304
305
306#ifdef __KERNEL__
307
308/*
309 * journal_revoke: revoke a given buffer_head from the journal. This
310 * prevents the block from being replayed during recovery if we take a
311 * crash after this current transaction commits. Any subsequent
312 * metadata writes of the buffer in this transaction cancel the
313 * revoke.
314 *
315 * Note that this call may block --- it is up to the caller to make
316 * sure that there are no further calls to journal_write_metadata
317 * before the revoke is complete. In ext3, this implies calling the
318 * revoke before clearing the block bitmap when we are deleting
319 * metadata.
320 *
321 * Revoke performs a journal_forget on any buffer_head passed in as a
322 * parameter, but does _not_ forget the buffer_head if the bh was only
323 * found implicitly.
324 *
325 * bh_in may not be a journalled buffer - it may have come off
326 * the hash tables without an attached journal_head.
327 *
328 * If bh_in is non-zero, journal_revoke() will decrement its b_count
329 * by one.
330 */
331
332int journal_revoke(handle_t *handle, unsigned int blocknr,
333 struct buffer_head *bh_in)
334{
335 struct buffer_head *bh = NULL;
336 journal_t *journal;
337 struct block_device *bdev;
338 int err;
339
340 might_sleep();
341 if (bh_in)
342 BUFFER_TRACE(bh_in, "enter");
343
344 journal = handle->h_transaction->t_journal;
345 if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
346 J_ASSERT (!"Cannot set revoke feature!");
347 return -EINVAL;
348 }
349
350 bdev = journal->j_fs_dev;
351 bh = bh_in;
352
353 if (!bh) {
354 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
355 if (bh)
356 BUFFER_TRACE(bh, "found on hash");
357 }
358#ifdef JBD_EXPENSIVE_CHECKING
359 else {
360 struct buffer_head *bh2;
361
362 /* If there is a different buffer_head lying around in
363 * memory anywhere... */
364 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
365 if (bh2) {
366 /* ... and it has RevokeValid status... */
367 if (bh2 != bh && buffer_revokevalid(bh2))
368 /* ...then it better be revoked too,
369 * since it's illegal to create a revoke
370 * record against a buffer_head which is
371 * not marked revoked --- that would
372 * risk missing a subsequent revoke
373 * cancel. */
374 J_ASSERT_BH(bh2, buffer_revoked(bh2));
375 put_bh(bh2);
376 }
377 }
378#endif
379
380 /* We really ought not ever to revoke twice in a row without
381 first having the revoke cancelled: it's illegal to free a
382 block twice without allocating it in between! */
383 if (bh) {
384 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
385 "inconsistent data on disk")) {
386 if (!bh_in)
387 brelse(bh);
388 return -EIO;
389 }
390 set_buffer_revoked(bh);
391 set_buffer_revokevalid(bh);
392 if (bh_in) {
393 BUFFER_TRACE(bh_in, "call journal_forget");
394 journal_forget(handle, bh_in);
395 } else {
396 BUFFER_TRACE(bh, "call brelse");
397 __brelse(bh);
398 }
399 }
400
401 jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
402 err = insert_revoke_hash(journal, blocknr,
403 handle->h_transaction->t_tid);
404 BUFFER_TRACE(bh_in, "exit");
405 return err;
406}
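A hedged sketch of the caller-side ordering described in the comment above (ext3-style; the bitmap update is only indicated, not implemented):

	/* When deleting metadata inside a transaction: revoke the block first... */
	err = journal_revoke(handle, blocknr, bh);
	if (err)
		goto fail;
	/* ...and only then clear the block in the allocation bitmap. */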
407
408/*
409 * Cancel an outstanding revoke. For use only internally by the
410 * journaling code (called from journal_get_write_access).
411 *
412 * We trust buffer_revoked() on the buffer if the buffer is already
413 * being journaled: if there is no revoke pending on the buffer, then we
414 * don't do anything here.
415 *
416 * This would break if it were possible for a buffer to be revoked and
417 * discarded, and then reallocated within the same transaction. In such
418 * a case we would have lost the revoked bit, but when we arrived here
419 * the second time we would still have a pending revoke to cancel. So,
420 * do not trust the Revoked bit on buffers unless RevokeValid is also
421 * set.
422 */
423int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
424{
425 struct jbd_revoke_record_s *record;
426 journal_t *journal = handle->h_transaction->t_journal;
427 int need_cancel;
428 int did_revoke = 0; /* akpm: debug */
429 struct buffer_head *bh = jh2bh(jh);
430
431 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
432
433 /* Is the existing Revoke bit valid? If so, we trust it, and
434 * only perform the full cancel if the revoke bit is set. If
435 * not, we can't trust the revoke bit, and we need to do the
436 * full search for a revoke record. */
437 if (test_set_buffer_revokevalid(bh)) {
438 need_cancel = test_clear_buffer_revoked(bh);
439 } else {
440 need_cancel = 1;
441 clear_buffer_revoked(bh);
442 }
443
444 if (need_cancel) {
445 record = find_revoke_record(journal, bh->b_blocknr);
446 if (record) {
447 jbd_debug(4, "cancelled existing revoke on "
448 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
449 spin_lock(&journal->j_revoke_lock);
450 list_del(&record->hash);
451 spin_unlock(&journal->j_revoke_lock);
452 kmem_cache_free(revoke_record_cache, record);
453 did_revoke = 1;
454 }
455 }
456
457#ifdef JBD_EXPENSIVE_CHECKING
458 /* There better not be one left behind by now! */
459 record = find_revoke_record(journal, bh->b_blocknr);
460 J_ASSERT_JH(jh, record == NULL);
461#endif
462
463 /* Finally, have we just cleared revoke on an unhashed
464 * buffer_head? If so, we'd better make sure we clear the
465 * revoked status on any hashed alias too, otherwise the revoke
466 * state machine will get very upset later on. */
467 if (need_cancel) {
468 struct buffer_head *bh2;
469 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
470 if (bh2) {
471 if (bh2 != bh)
472 clear_buffer_revoked(bh2);
473 __brelse(bh2);
474 }
475 }
476 return did_revoke;
477}
478
479/*
480 * journal_clear_buffer_revoked_flags clears the revoked flag of buffers in
481 * the revoke table to reflect that there is no revoked buffer in the next
482 * transaction which is going to be started.
483 */
484void journal_clear_buffer_revoked_flags(journal_t *journal)
485{
486 struct jbd_revoke_table_s *revoke = journal->j_revoke;
487 int i = 0;
488
489 for (i = 0; i < revoke->hash_size; i++) {
490 struct list_head *hash_list;
491 struct list_head *list_entry;
492 hash_list = &revoke->hash_table[i];
493
494 list_for_each(list_entry, hash_list) {
495 struct jbd_revoke_record_s *record;
496 struct buffer_head *bh;
497 record = (struct jbd_revoke_record_s *)list_entry;
498 bh = __find_get_block(journal->j_fs_dev,
499 record->blocknr,
500 journal->j_blocksize);
501 if (bh) {
502 clear_buffer_revoked(bh);
503 __brelse(bh);
504 }
505 }
506 }
507}
508
509/* journal_switch_revoke_table() selects j_revoke for the next transaction;
510 * we do not want to suspend any processing until all revokes are
511 * written -bzzz
512 */
513void journal_switch_revoke_table(journal_t *journal)
514{
515 int i;
516
517 if (journal->j_revoke == journal->j_revoke_table[0])
518 journal->j_revoke = journal->j_revoke_table[1];
519 else
520 journal->j_revoke = journal->j_revoke_table[0];
521
522 for (i = 0; i < journal->j_revoke->hash_size; i++)
523 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
524}
525
526/*
527 * Write revoke records to the journal for all entries in the current
528 * revoke hash, deleting the entries as we go.
529 */
530void journal_write_revoke_records(journal_t *journal,
531 transaction_t *transaction, int write_op)
532{
533 struct journal_head *descriptor;
534 struct jbd_revoke_record_s *record;
535 struct jbd_revoke_table_s *revoke;
536 struct list_head *hash_list;
537 int i, offset, count;
538
539 descriptor = NULL;
540 offset = 0;
541 count = 0;
542
543 /* select revoke table for committing transaction */
544 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
545 journal->j_revoke_table[1] : journal->j_revoke_table[0];
546
547 for (i = 0; i < revoke->hash_size; i++) {
548 hash_list = &revoke->hash_table[i];
549
550 while (!list_empty(hash_list)) {
551 record = (struct jbd_revoke_record_s *)
552 hash_list->next;
553 write_one_revoke_record(journal, transaction,
554 &descriptor, &offset,
555 record, write_op);
556 count++;
557 list_del(&record->hash);
558 kmem_cache_free(revoke_record_cache, record);
559 }
560 }
561 if (descriptor)
562 flush_descriptor(journal, descriptor, offset, write_op);
563 jbd_debug(1, "Wrote %d revoke records\n", count);
564}
565
566/*
567 * Write out one revoke record. We need to create a new descriptor
568 * block if the old one is full or if we have not already created one.
569 */
570
571static void write_one_revoke_record(journal_t *journal,
572 transaction_t *transaction,
573 struct journal_head **descriptorp,
574 int *offsetp,
575 struct jbd_revoke_record_s *record,
576 int write_op)
577{
578 struct journal_head *descriptor;
579 int offset;
580 journal_header_t *header;
581
582 /* If we are already aborting, this all becomes a noop. We
583 still need to go round the loop in
584 journal_write_revoke_records in order to free all of the
585 revoke records: only the IO to the journal is omitted. */
586 if (is_journal_aborted(journal))
587 return;
588
589 descriptor = *descriptorp;
590 offset = *offsetp;
591
592 /* Make sure we have a descriptor with space left for the record */
593 if (descriptor) {
594 if (offset == journal->j_blocksize) {
595 flush_descriptor(journal, descriptor, offset, write_op);
596 descriptor = NULL;
597 }
598 }
599
600 if (!descriptor) {
601 descriptor = journal_get_descriptor_buffer(journal);
602 if (!descriptor)
603 return;
604 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
605 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
606 header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
607 header->h_sequence = cpu_to_be32(transaction->t_tid);
608
609 /* Record it so that we can wait for IO completion later */
610 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
611 journal_file_buffer(descriptor, transaction, BJ_LogCtl);
612
613 offset = sizeof(journal_revoke_header_t);
614 *descriptorp = descriptor;
615 }
616
617 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
618 cpu_to_be32(record->blocknr);
619 offset += 4;
620 *offsetp = offset;
621}
622
623/*
624 * Flush a revoke descriptor out to the journal. If we are aborting,
625 * this is a noop; otherwise we are generating a buffer which needs to
626 * be waited for during commit, so it has to go onto the appropriate
627 * journal buffer list.
628 */
629
630static void flush_descriptor(journal_t *journal,
631 struct journal_head *descriptor,
632 int offset, int write_op)
633{
634 journal_revoke_header_t *header;
635 struct buffer_head *bh = jh2bh(descriptor);
636
637 if (is_journal_aborted(journal)) {
638 put_bh(bh);
639 return;
640 }
641
642 header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
643 header->r_count = cpu_to_be32(offset);
644 set_buffer_jwrite(bh);
645 BUFFER_TRACE(bh, "write");
646 set_buffer_dirty(bh);
647 write_dirty_buffer(bh, write_op);
648}
649#endif
650
651/*
652 * Revoke support for recovery.
653 *
654 * Recovery needs to be able to:
655 *
656 * record all revoke records, including the tid of the latest instance
657 * of each revoke in the journal
658 *
659 * check whether a given block in a given transaction should be replayed
660 * (ie. has not been revoked by a revoke record in that or a subsequent
661 * transaction)
662 *
663 * empty the revoke table after recovery.
664 */
665
666/*
667 * First, setting revoke records. We create a new revoke record for
668 * every block ever revoked in the log as we scan it for recovery, and
669 * we update the existing records if we find multiple revokes for a
670 * single block.
671 */
672
673int journal_set_revoke(journal_t *journal,
674 unsigned int blocknr,
675 tid_t sequence)
676{
677 struct jbd_revoke_record_s *record;
678
679 record = find_revoke_record(journal, blocknr);
680 if (record) {
681 /* If we have multiple occurrences, only record the
682 * latest sequence number in the hashed record */
683 if (tid_gt(sequence, record->sequence))
684 record->sequence = sequence;
685 return 0;
686 }
687 return insert_revoke_hash(journal, blocknr, sequence);
688}
689
690/*
691 * Test revoke records. For a given block referenced in the log, has
692 * that block been revoked? A revoke record with a given transaction
693 * sequence number revokes all blocks in that transaction and earlier
694 * ones, but later transactions still need to be replayed.
695 */
696
697int journal_test_revoke(journal_t *journal,
698 unsigned int blocknr,
699 tid_t sequence)
700{
701 struct jbd_revoke_record_s *record;
702
703 record = find_revoke_record(journal, blocknr);
704 if (!record)
705 return 0;
706 if (tid_gt(sequence, record->sequence))
707 return 0;
708 return 1;
709}
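An illustrative example of the rule above (values assumed): suppose block 100 has a revoke record with sequence 7.

	journal_test_revoke(journal, 100, 5);	/* 1: copy logged in txn 5 <= 7, skip replay */
	journal_test_revoke(journal, 100, 9);	/* 0: copy logged in txn 9 > 7, replay it */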
710
711/*
712 * Finally, once recovery is over, we need to clear the revoke table so
713 * that it can be reused by the running filesystem.
714 */
715
716void journal_clear_revoke(journal_t *journal)
717{
718 int i;
719 struct list_head *hash_list;
720 struct jbd_revoke_record_s *record;
721 struct jbd_revoke_table_s *revoke;
722
723 revoke = journal->j_revoke;
724
725 for (i = 0; i < revoke->hash_size; i++) {
726 hash_list = &revoke->hash_table[i];
727 while (!list_empty(hash_list)) {
728 record = (struct jbd_revoke_record_s*) hash_list->next;
729 list_del(&record->hash);
730 kmem_cache_free(revoke_record_cache, record);
731 }
732 }
733}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
deleted file mode 100644
index 1695ba8334a2..000000000000
--- a/fs/jbd/transaction.c
+++ /dev/null
@@ -1,2237 +0,0 @@
1/*
2 * linux/fs/jbd/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
29
30static void __journal_temp_unlink_buffer(struct journal_head *jh);
31
32/*
33 * get_transaction: obtain a new transaction_t object.
34 *
35 * Simply allocate and initialise a new transaction. Create it in
36 * RUNNING state and add it to the current journal (which should not
37 * have an existing running transaction: we only make a new transaction
38 * once we have started to commit the old one).
39 *
40 * Preconditions:
41 * The journal MUST be locked. We don't perform atomic mallocs on the
42 * new transaction and we can't block without protecting against other
43 * processes trying to touch the journal while it is in transition.
44 *
45 * Called under j_state_lock
46 */
47
48static transaction_t *
49get_transaction(journal_t *journal, transaction_t *transaction)
50{
51 transaction->t_journal = journal;
52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
54 transaction->t_tid = journal->j_transaction_sequence++;
55 transaction->t_expires = jiffies + journal->j_commit_interval;
56 spin_lock_init(&transaction->t_handle_lock);
57
58 /* Set up the commit timer for the new transaction. */
59 journal->j_commit_timer.expires =
60 round_jiffies_up(transaction->t_expires);
61 add_timer(&journal->j_commit_timer);
62
63 J_ASSERT(journal->j_running_transaction == NULL);
64 journal->j_running_transaction = transaction;
65
66 return transaction;
67}
68
69/*
70 * Handle management.
71 *
72 * A handle_t is an object which represents a single atomic update to a
73 * filesystem, and which tracks all of the modifications which form part
74 * of that one update.
75 */
76
77/*
78 * start_this_handle: Given a handle, deal with any locking or stalling
79 * needed to make sure that there is enough journal space for the handle
80 * to begin. Attach the handle to a transaction and set up the
81 * transaction's buffer credits.
82 */
83
84static int start_this_handle(journal_t *journal, handle_t *handle)
85{
86 transaction_t *transaction;
87 int needed;
88 int nblocks = handle->h_buffer_credits;
89 transaction_t *new_transaction = NULL;
90 int ret = 0;
91
92 if (nblocks > journal->j_max_transaction_buffers) {
93 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
94 current->comm, nblocks,
95 journal->j_max_transaction_buffers);
96 ret = -ENOSPC;
97 goto out;
98 }
99
100alloc_transaction:
101 if (!journal->j_running_transaction) {
102 new_transaction = kzalloc(sizeof(*new_transaction),
103 GFP_NOFS|__GFP_NOFAIL);
104 if (!new_transaction) {
105 ret = -ENOMEM;
106 goto out;
107 }
108 }
109
110 jbd_debug(3, "New handle %p going live.\n", handle);
111
112repeat:
113
114 /*
115 * We need to hold j_state_lock until t_updates has been incremented,
116 * for proper journal barrier handling
117 */
118 spin_lock(&journal->j_state_lock);
119repeat_locked:
120 if (is_journal_aborted(journal) ||
121 (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
122 spin_unlock(&journal->j_state_lock);
123 ret = -EROFS;
124 goto out;
125 }
126
127 /* Wait on the journal's transaction barrier if necessary */
128 if (journal->j_barrier_count) {
129 spin_unlock(&journal->j_state_lock);
130 wait_event(journal->j_wait_transaction_locked,
131 journal->j_barrier_count == 0);
132 goto repeat;
133 }
134
135 if (!journal->j_running_transaction) {
136 if (!new_transaction) {
137 spin_unlock(&journal->j_state_lock);
138 goto alloc_transaction;
139 }
140 get_transaction(journal, new_transaction);
141 new_transaction = NULL;
142 }
143
144 transaction = journal->j_running_transaction;
145
146 /*
147 * If the current transaction is locked down for commit, wait for the
148 * lock to be released.
149 */
150 if (transaction->t_state == T_LOCKED) {
151 DEFINE_WAIT(wait);
152
153 prepare_to_wait(&journal->j_wait_transaction_locked,
154 &wait, TASK_UNINTERRUPTIBLE);
155 spin_unlock(&journal->j_state_lock);
156 schedule();
157 finish_wait(&journal->j_wait_transaction_locked, &wait);
158 goto repeat;
159 }
160
161 /*
162 * If there is not enough space left in the log to write all potential
163 * buffers requested by this operation, we need to stall pending a log
164 * checkpoint to free some more log space.
165 */
166 spin_lock(&transaction->t_handle_lock);
167 needed = transaction->t_outstanding_credits + nblocks;
168
169 if (needed > journal->j_max_transaction_buffers) {
170 /*
171 * If the current transaction is already too large, then start
172 * to commit it: we can then go back and attach this handle to
173 * a new transaction.
174 */
175 DEFINE_WAIT(wait);
176
177 jbd_debug(2, "Handle %p starting new commit...\n", handle);
178 spin_unlock(&transaction->t_handle_lock);
179 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
180 TASK_UNINTERRUPTIBLE);
181 __log_start_commit(journal, transaction->t_tid);
182 spin_unlock(&journal->j_state_lock);
183 schedule();
184 finish_wait(&journal->j_wait_transaction_locked, &wait);
185 goto repeat;
186 }
187
188 /*
189 * The commit code assumes that it can get enough log space
190 * without forcing a checkpoint. This is *critical* for
191 * correctness: a checkpoint of a buffer which is also
192 * associated with a committing transaction creates a deadlock,
193 * so commit simply cannot force through checkpoints.
194 *
195 * We must therefore ensure the necessary space in the journal
196 * *before* starting to dirty potentially checkpointed buffers
197 * in the new transaction.
198 *
199 * The worst part is, any transaction currently committing can
200 * reduce the free space arbitrarily. Be careful to account for
201 * those buffers when checkpointing.
202 */
203
204 /*
205 * @@@ AKPM: This seems rather over-defensive. We're giving commit
206 * a _lot_ of headroom: 1/4 of the journal plus the size of
207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks.
210 * Also, this test is inconsistent with the matching one in
211 * journal_extend().
212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
214 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
215 spin_unlock(&transaction->t_handle_lock);
216 __log_wait_for_space(journal);
217 goto repeat_locked;
218 }
219
220 /* OK, account for the buffers that this operation expects to
221 * use and add the handle to the running transaction. */
222
223 handle->h_transaction = transaction;
224 transaction->t_outstanding_credits += nblocks;
225 transaction->t_updates++;
226 transaction->t_handle_count++;
227 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
228 handle, nblocks, transaction->t_outstanding_credits,
229 __log_space_left(journal));
230 spin_unlock(&transaction->t_handle_lock);
231 spin_unlock(&journal->j_state_lock);
232
233 lock_map_acquire(&handle->h_lockdep_map);
234out:
235 if (unlikely(new_transaction)) /* It's usually NULL */
236 kfree(new_transaction);
237 return ret;
238}
239
240static struct lock_class_key jbd_handle_key;
241
242/* Allocate a new handle. This should probably be in a slab... */
243static handle_t *new_handle(int nblocks)
244{
245 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
246 if (!handle)
247 return NULL;
248 handle->h_buffer_credits = nblocks;
249 handle->h_ref = 1;
250
251 lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
252
253 return handle;
254}
255
256/**
257 * handle_t *journal_start() - Obtain a new handle.
258 * @journal: Journal to start transaction on.
259 * @nblocks: number of block buffers we might modify
260 *
261 * We make sure that the transaction can guarantee at least nblocks of
262 * modified buffers in the log. We block until the log can guarantee
263 * that much space.
264 *
265 * This function is visible to journal users (like ext3fs), so is not
266 * called with the journal already locked.
267 *
268 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
269 * on failure.
270 */
271handle_t *journal_start(journal_t *journal, int nblocks)
272{
273 handle_t *handle = journal_current_handle();
274 int err;
275
276 if (!journal)
277 return ERR_PTR(-EROFS);
278
279 if (handle) {
280 J_ASSERT(handle->h_transaction->t_journal == journal);
281 handle->h_ref++;
282 return handle;
283 }
284
285 handle = new_handle(nblocks);
286 if (!handle)
287 return ERR_PTR(-ENOMEM);
288
289 current->journal_info = handle;
290
291 err = start_this_handle(journal, handle);
292 if (err < 0) {
293 jbd_free_handle(handle);
294 current->journal_info = NULL;
295 handle = ERR_PTR(err);
296 }
297 return handle;
298}
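(Editorial aside, not part of the original file: the kernel-doc above describes the normal lifetime of a handle. A minimal caller sketch, assuming the usual jbd.h declarations and with the function name example_update_metadata purely hypothetical, would look like an ext3-style metadata update:

static int example_update_metadata(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve one buffer credit; blocks until the log can guarantee it. */
	handle = journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Declare intent to modify the buffer before touching its contents. */
	err = journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = journal_dirty_metadata(handle, bh);
	}

	/* Drop our reference; jbd decides when the transaction commits. */
	journal_stop(handle);
	return err;
}

The credit count passed to journal_start() must cover every buffer the operation may dirty, which is what start_this_handle() checks against j_max_transaction_buffers above.)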
299
300/**
301 * int journal_extend() - extend buffer credits.
302 * @handle: handle to 'extend'
303 * @nblocks: nr blocks to try to extend by.
304 *
305 * Some transactions, such as large extends and truncates, can be done
306 * atomically all at once or in several stages. The operation requests
307 * a credit for a number of buffer modifications in advance, but can
308 * extend its credit if it needs more.
309 *
310 * journal_extend tries to give the running handle more buffer credits.
311 * It does not guarantee the allocation - this is best-effort only.
312 * The calling process MUST be able to deal cleanly with a failure to
313 * extend here.
314 *
315 * Return 0 on success, non-zero on failure.
316 *
317 * return code < 0 implies an error
318 * return code > 0 implies normal transaction-full status.
319 */
320int journal_extend(handle_t *handle, int nblocks)
321{
322 transaction_t *transaction = handle->h_transaction;
323 journal_t *journal = transaction->t_journal;
324 int result;
325 int wanted;
326
327 result = -EIO;
328 if (is_handle_aborted(handle))
329 goto out;
330
331 result = 1;
332
333 spin_lock(&journal->j_state_lock);
334
335 /* Don't extend a locked-down transaction! */
336 if (handle->h_transaction->t_state != T_RUNNING) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction not running\n", handle, nblocks);
339 goto error_out;
340 }
341
342 spin_lock(&transaction->t_handle_lock);
343 wanted = transaction->t_outstanding_credits + nblocks;
344
345 if (wanted > journal->j_max_transaction_buffers) {
346 jbd_debug(3, "denied handle %p %d blocks: "
347 "transaction too large\n", handle, nblocks);
348 goto unlock;
349 }
350
351 if (wanted > __log_space_left(journal)) {
352 jbd_debug(3, "denied handle %p %d blocks: "
353 "insufficient log space\n", handle, nblocks);
354 goto unlock;
355 }
356
357 handle->h_buffer_credits += nblocks;
358 transaction->t_outstanding_credits += nblocks;
359 result = 0;
360
361 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
362unlock:
363 spin_unlock(&transaction->t_handle_lock);
364error_out:
365 spin_unlock(&journal->j_state_lock);
366out:
367 return result;
368}
369
370
371/**
372 * int journal_restart() - restart a handle.
373 * @handle: handle to restart
374 * @nblocks: nr credits requested
375 *
376 * Restart a handle for a multi-transaction filesystem
377 * operation.
378 *
379 * If the journal_extend() call above fails to grant new buffer credits
380 * to a running handle, a call to journal_restart will commit the
381 * handle's transaction so far and reattach the handle to a new
382 * transaction capable of guaranteeing the requested number of
383 * credits.
384 */
385
386int journal_restart(handle_t *handle, int nblocks)
387{
388 transaction_t *transaction = handle->h_transaction;
389 journal_t *journal = transaction->t_journal;
390 int ret;
391
392 /* If we've had an abort of any type, don't even think about
393 * actually doing the restart! */
394 if (is_handle_aborted(handle))
395 return 0;
396
397 /*
398 * First unlink the handle from its current transaction, and start the
399 * commit on that.
400 */
401 J_ASSERT(transaction->t_updates > 0);
402 J_ASSERT(journal_current_handle() == handle);
403
404 spin_lock(&journal->j_state_lock);
405 spin_lock(&transaction->t_handle_lock);
406 transaction->t_outstanding_credits -= handle->h_buffer_credits;
407 transaction->t_updates--;
408
409 if (!transaction->t_updates)
410 wake_up(&journal->j_wait_updates);
411 spin_unlock(&transaction->t_handle_lock);
412
413 jbd_debug(2, "restarting handle %p\n", handle);
414 __log_start_commit(journal, transaction->t_tid);
415 spin_unlock(&journal->j_state_lock);
416
417 lock_map_release(&handle->h_lockdep_map);
418 handle->h_buffer_credits = nblocks;
419 ret = start_this_handle(journal, handle);
420 return ret;
421}
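(Editorial aside, not part of the original file: journal_extend() and journal_restart() are meant to be used together, as the comments above describe. A hedged sketch, with example_get_more_credits and want purely hypothetical names:

static int example_get_more_credits(handle_t *handle, int want)
{
	int err = journal_extend(handle, want);

	if (err > 0) {
		/*
		 * Transaction-full status: commit the work done so far and
		 * reattach the handle to a new transaction that can
		 * guarantee "want" credits.
		 */
		err = journal_restart(handle, want);
	}
	return err;	/* 0 on success, < 0 on error */
}

The caller must be able to cope with journal_restart() committing its partial update, which is why multi-stage operations such as truncate are structured to be restartable.)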
422
423
424/**
425 * void journal_lock_updates () - establish a transaction barrier.
426 * @journal: Journal to establish a barrier on.
427 *
428 * This locks out any further updates from being started, and blocks until all
429 * existing updates have completed, returning only once the journal is in a
430 * quiescent state with no updates running.
431 *
432 * We do not use a simple mutex for synchronization as there are syscalls which
433 * want to return with the filesystem locked and that trips up lockdep. Also,
434 * hibernate needs to lock the filesystem, but a held mutex would then block hibernation.
435 * Since locking the filesystem is a rare operation, we use a simple counter and
436 * waitqueue for locking.
437 */
438void journal_lock_updates(journal_t *journal)
439{
440 DEFINE_WAIT(wait);
441
442wait:
443 /* Wait for previous locked operation to finish */
444 wait_event(journal->j_wait_transaction_locked,
445 journal->j_barrier_count == 0);
446
447 spin_lock(&journal->j_state_lock);
448 /*
449 * Check reliably under the lock whether we are the ones winning the race
450 * and locking the journal
451 */
452 if (journal->j_barrier_count > 0) {
453 spin_unlock(&journal->j_state_lock);
454 goto wait;
455 }
456 ++journal->j_barrier_count;
457
458 /* Wait until there are no running updates */
459 while (1) {
460 transaction_t *transaction = journal->j_running_transaction;
461
462 if (!transaction)
463 break;
464
465 spin_lock(&transaction->t_handle_lock);
466 if (!transaction->t_updates) {
467 spin_unlock(&transaction->t_handle_lock);
468 break;
469 }
470 prepare_to_wait(&journal->j_wait_updates, &wait,
471 TASK_UNINTERRUPTIBLE);
472 spin_unlock(&transaction->t_handle_lock);
473 spin_unlock(&journal->j_state_lock);
474 schedule();
475 finish_wait(&journal->j_wait_updates, &wait);
476 spin_lock(&journal->j_state_lock);
477 }
478 spin_unlock(&journal->j_state_lock);
479}
480
481/**
482 * void journal_unlock_updates (journal_t* journal) - release barrier
483 * @journal: Journal to release the barrier on.
484 *
485 * Release a transaction barrier obtained with journal_lock_updates().
486 */
487void journal_unlock_updates (journal_t *journal)
488{
489 J_ASSERT(journal->j_barrier_count != 0);
490
491 spin_lock(&journal->j_state_lock);
492 --journal->j_barrier_count;
493 spin_unlock(&journal->j_state_lock);
494 wake_up(&journal->j_wait_transaction_locked);
495}
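(Editorial aside, not part of the original file: the barrier pair above is how a freeze path quiesces the journal. A minimal sketch, with example_quiesce_journal a hypothetical name:

static void example_quiesce_journal(journal_t *journal)
{
	/* Block new handles and wait for all running updates to drain. */
	journal_lock_updates(journal);

	/* ... flush or checkpoint the journal, snapshot the device, etc. ... */

	/* Drop the barrier and wake anyone waiting to start a handle. */
	journal_unlock_updates(journal);
}

Nothing new can join a transaction between the two calls, which is what the counter-and-waitqueue scheme described above provides.)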
496
497static void warn_dirty_buffer(struct buffer_head *bh)
498{
499 char b[BDEVNAME_SIZE];
500
501 printk(KERN_WARNING
502 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
503 "There's a risk of filesystem corruption in case of system "
504 "crash.\n",
505 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
506}
507
508/*
509 * If the buffer is already part of the current transaction, then there
510 * is nothing we need to do. If it is already part of a prior
511 * transaction which we are still committing to disk, then we need to
512 * make sure that we do not overwrite the old copy: we do copy-out to
513 * preserve the copy going to disk. We also account the buffer against
514 * the handle's metadata buffer credits (unless the buffer is already
515 * part of the transaction, that is).
516 *
517 */
518static int
519do_get_write_access(handle_t *handle, struct journal_head *jh,
520 int force_copy)
521{
522 struct buffer_head *bh;
523 transaction_t *transaction;
524 journal_t *journal;
525 int error;
526 char *frozen_buffer = NULL;
527 int need_copy = 0;
528
529 if (is_handle_aborted(handle))
530 return -EROFS;
531
532 transaction = handle->h_transaction;
533 journal = transaction->t_journal;
534
535 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
536
537 JBUFFER_TRACE(jh, "entry");
538repeat:
539 bh = jh2bh(jh);
540
541 /* @@@ Need to check for errors here at some point. */
542
543 lock_buffer(bh);
544 jbd_lock_bh_state(bh);
545
546 /* We now hold the buffer lock so it is safe to query the buffer
547 * state. Is the buffer dirty?
548 *
549 * If so, there are two possibilities. The buffer may be
550 * non-journaled, and undergoing a quite legitimate writeback.
551 * Otherwise, it is journaled, and we don't expect dirty buffers
552 * in that state (the buffers should be marked JBD_Dirty
553 * instead.) So either the IO is being done under our own
554 * control and this is a bug, or it's a third party IO such as
555 * dump(8) (which may leave the buffer scheduled for read ---
556 * ie. locked but not dirty) or tune2fs (which may actually have
557 * the buffer dirtied, ugh.) */
558
559 if (buffer_dirty(bh)) {
560 /*
561 * First question: is this buffer already part of the current
562 * transaction or the existing committing transaction?
563 */
564 if (jh->b_transaction) {
565 J_ASSERT_JH(jh,
566 jh->b_transaction == transaction ||
567 jh->b_transaction ==
568 journal->j_committing_transaction);
569 if (jh->b_next_transaction)
570 J_ASSERT_JH(jh, jh->b_next_transaction ==
571 transaction);
572 warn_dirty_buffer(bh);
573 }
574 /*
575 * In any case we need to clean the dirty flag and we must
576 * do it under the buffer lock to be sure we don't race
577 * with running write-out.
578 */
579 JBUFFER_TRACE(jh, "Journalling dirty buffer");
580 clear_buffer_dirty(bh);
581 set_buffer_jbddirty(bh);
582 }
583
584 unlock_buffer(bh);
585
586 error = -EROFS;
587 if (is_handle_aborted(handle)) {
588 jbd_unlock_bh_state(bh);
589 goto out;
590 }
591 error = 0;
592
593 /*
594 * The buffer is already part of this transaction if b_transaction or
595 * b_next_transaction points to it
596 */
597 if (jh->b_transaction == transaction ||
598 jh->b_next_transaction == transaction)
599 goto done;
600
601 /*
602 * this is the first time this transaction is touching this buffer,
603 * reset the modified flag
604 */
605 jh->b_modified = 0;
606
607 /*
608 * If there is already a copy-out version of this buffer, then we don't
609 * need to make another one
610 */
611 if (jh->b_frozen_data) {
612 JBUFFER_TRACE(jh, "has frozen data");
613 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
614 jh->b_next_transaction = transaction;
615 goto done;
616 }
617
618 /* Is there data here we need to preserve? */
619
620 if (jh->b_transaction && jh->b_transaction != transaction) {
621 JBUFFER_TRACE(jh, "owned by older transaction");
622 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
623 J_ASSERT_JH(jh, jh->b_transaction ==
624 journal->j_committing_transaction);
625
626 /* There is one case we have to be very careful about.
627 * If the committing transaction is currently writing
628 * this buffer out to disk and has NOT made a copy-out,
629 * then we cannot modify the buffer contents at all
630 * right now. The essence of copy-out is that it is the
631 * extra copy, not the primary copy, which gets
632 * journaled. If the primary copy is already going to
633 * disk then we cannot do copy-out here. */
634
635 if (jh->b_jlist == BJ_Shadow) {
636 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
637 wait_queue_head_t *wqh;
638
639 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
640
641 JBUFFER_TRACE(jh, "on shadow: sleep");
642 jbd_unlock_bh_state(bh);
643 /* commit wakes up all shadow buffers after IO */
644 for ( ; ; ) {
645 prepare_to_wait(wqh, &wait.wait,
646 TASK_UNINTERRUPTIBLE);
647 if (jh->b_jlist != BJ_Shadow)
648 break;
649 schedule();
650 }
651 finish_wait(wqh, &wait.wait);
652 goto repeat;
653 }
654
655 /* Only do the copy if the currently-owning transaction
656 * still needs it. If it is on the Forget list, the
657 * committing transaction is past that stage. The
658 * buffer had better remain locked during the kmalloc,
659 * but that should be true --- we hold the journal lock
660 * still and the buffer is already on the BUF_JOURNAL
661 * list so won't be flushed.
662 *
663 * Subtle point, though: if this is a get_undo_access,
664 * then we will be relying on the frozen_data to contain
665 * the new value of the committed_data record after the
666 * transaction, so we HAVE to force the frozen_data copy
667 * in that case. */
668
669 if (jh->b_jlist != BJ_Forget || force_copy) {
670 JBUFFER_TRACE(jh, "generate frozen data");
671 if (!frozen_buffer) {
672 JBUFFER_TRACE(jh, "allocate memory for buffer");
673 jbd_unlock_bh_state(bh);
674 frozen_buffer =
675 jbd_alloc(jh2bh(jh)->b_size,
676 GFP_NOFS);
677 if (!frozen_buffer) {
678 printk(KERN_ERR
679 "%s: OOM for frozen_buffer\n",
680 __func__);
681 JBUFFER_TRACE(jh, "oom!");
682 error = -ENOMEM;
683 jbd_lock_bh_state(bh);
684 goto done;
685 }
686 goto repeat;
687 }
688 jh->b_frozen_data = frozen_buffer;
689 frozen_buffer = NULL;
690 need_copy = 1;
691 }
692 jh->b_next_transaction = transaction;
693 }
694
695
696 /*
697 * Finally, if the buffer is not journaled right now, we need to make
698 * sure it doesn't get written to disk before the caller actually
699 * commits the new data
700 */
701 if (!jh->b_transaction) {
702 JBUFFER_TRACE(jh, "no transaction");
703 J_ASSERT_JH(jh, !jh->b_next_transaction);
704 JBUFFER_TRACE(jh, "file as BJ_Reserved");
705 spin_lock(&journal->j_list_lock);
706 __journal_file_buffer(jh, transaction, BJ_Reserved);
707 spin_unlock(&journal->j_list_lock);
708 }
709
710done:
711 if (need_copy) {
712 struct page *page;
713 int offset;
714 char *source;
715
716 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
717 "Possible IO failure.\n");
718 page = jh2bh(jh)->b_page;
719 offset = offset_in_page(jh2bh(jh)->b_data);
720 source = kmap_atomic(page);
721 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
722 kunmap_atomic(source);
723 }
724 jbd_unlock_bh_state(bh);
725
726 /*
727 * If we are about to journal a buffer, then any revoke pending on it is
728 * no longer valid
729 */
730 journal_cancel_revoke(handle, jh);
731
732out:
733 if (unlikely(frozen_buffer)) /* It's usually NULL */
734 jbd_free(frozen_buffer, bh->b_size);
735
736 JBUFFER_TRACE(jh, "exit");
737 return error;
738}
739
740/**
741 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
742 * @handle: transaction to add buffer modifications to
743 * @bh: bh to be used for metadata writes
744 *
745 * Returns an error code or 0 on success.
746 *
747 * In full data journalling mode the buffer may be of type BJ_AsyncData,
748 * because we're write()ing a buffer which is also part of a shared mapping.
749 */
750
751int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
752{
753 struct journal_head *jh = journal_add_journal_head(bh);
754 int rc;
755
756 /* We do not want to get caught playing with fields which the
757 * log thread also manipulates. Make sure that the buffer
758 * completes any outstanding IO before proceeding. */
759 rc = do_get_write_access(handle, jh, 0);
760 journal_put_journal_head(jh);
761 return rc;
762}
763
764
765/*
766 * When the user wants to journal a newly created buffer_head
767 * (ie. getblk() returned a new buffer and we are going to populate it
768 * manually rather than reading off disk), then we need to keep the
769 * buffer_head locked until it has been completely filled with new
770 * data. In this case, we should be able to make the assertion that
771 * the bh is not already part of an existing transaction.
772 *
773 * The buffer should already be locked by the caller by this point.
774 * There is no lock ranking violation: it was a newly created,
775 * unlocked buffer beforehand. */
776
777/**
778 * int journal_get_create_access () - notify intent to use newly created bh
779 * @handle: transaction to add the new buffer to
780 * @bh: new buffer.
781 *
782 * Call this if you create a new bh.
783 */
784int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
785{
786 transaction_t *transaction = handle->h_transaction;
787 journal_t *journal = transaction->t_journal;
788 struct journal_head *jh = journal_add_journal_head(bh);
789 int err;
790
791 jbd_debug(5, "journal_head %p\n", jh);
792 err = -EROFS;
793 if (is_handle_aborted(handle))
794 goto out;
795 err = 0;
796
797 JBUFFER_TRACE(jh, "entry");
798 /*
799 * The buffer may already belong to this transaction due to pre-zeroing
800 * in the filesystem's new_block code. It may also be on the previous,
801 * committing transaction's lists, but it HAS to be in Forget state in
802 * that case: the transaction must have deleted the buffer for it to be
803 * reused here.
804 */
805 jbd_lock_bh_state(bh);
806 spin_lock(&journal->j_list_lock);
807 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
808 jh->b_transaction == NULL ||
809 (jh->b_transaction == journal->j_committing_transaction &&
810 jh->b_jlist == BJ_Forget)));
811
812 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
813 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
814
815 if (jh->b_transaction == NULL) {
816 /*
817 * Previous journal_forget() could have left the buffer
818 * with jbddirty bit set because it was being committed. When
819 * the commit finished, we've filed the buffer for
820 * checkpointing and marked it dirty. Now we are reallocating
821 * the buffer so the transaction freeing it must have
822 * committed and so it's safe to clear the dirty bit.
823 */
824 clear_buffer_dirty(jh2bh(jh));
825
826 /* first access by this transaction */
827 jh->b_modified = 0;
828
829 JBUFFER_TRACE(jh, "file as BJ_Reserved");
830 __journal_file_buffer(jh, transaction, BJ_Reserved);
831 } else if (jh->b_transaction == journal->j_committing_transaction) {
832 /* first access by this transaction */
833 jh->b_modified = 0;
834
835 JBUFFER_TRACE(jh, "set next transaction");
836 jh->b_next_transaction = transaction;
837 }
838 spin_unlock(&journal->j_list_lock);
839 jbd_unlock_bh_state(bh);
840
841 /*
842 * akpm: I added this. ext3_alloc_branch can pick up new indirect
843 * blocks which contain freed but then revoked metadata. We need
844 * to cancel the revoke in case we end up freeing it yet again
845 * and then reallocating it as data - this would cause a second revoke,
846 * which hits an assertion error.
847 */
848 JBUFFER_TRACE(jh, "cancelling revoke");
849 journal_cancel_revoke(handle, jh);
850out:
851 journal_put_journal_head(jh);
852 return err;
853}
854
855/**
856 * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
857 * @handle: transaction
858 * @bh: buffer to undo
859 *
860 * Sometimes there is a need to distinguish between metadata which has
861 * been committed to disk and that which has not. The ext3fs code uses
862 * this for freeing and allocating space, we have to make sure that we
863 * do not reuse freed space until the deallocation has been committed,
864 * since if we overwrote that space we would make the delete
865 * un-rewindable in case of a crash.
866 *
867 * To deal with that, journal_get_undo_access requests write access to a
868 * buffer for parts of non-rewindable operations such as delete
869 * operations on the bitmaps. The journaling code must keep a copy of
870 * the buffer's contents prior to the undo_access call until such time
871 * as we know that the buffer has definitely been committed to disk.
872 *
873 * We never need to know which transaction the committed data is part
874 * of, buffers touched here are guaranteed to be dirtied later and so
875 * will be committed to a new transaction in due course, at which point
876 * we can discard the old committed data pointer.
877 *
878 * Returns error number or 0 on success.
879 */
880int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
881{
882 int err;
883 struct journal_head *jh = journal_add_journal_head(bh);
884 char *committed_data = NULL;
885
886 JBUFFER_TRACE(jh, "entry");
887
888 /*
889 * Do this first --- it can drop the journal lock, so we want to
890 * make sure that obtaining the committed_data is done
891 * atomically wrt. completion of any outstanding commits.
892 */
893 err = do_get_write_access(handle, jh, 1);
894 if (err)
895 goto out;
896
897repeat:
898 if (!jh->b_committed_data) {
899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
900 if (!committed_data) {
901 printk(KERN_ERR "%s: No memory for committed data\n",
902 __func__);
903 err = -ENOMEM;
904 goto out;
905 }
906 }
907
908 jbd_lock_bh_state(bh);
909 if (!jh->b_committed_data) {
910 /* Copy out the current buffer contents into the
911 * preserved, committed copy. */
912 JBUFFER_TRACE(jh, "generate b_committed data");
913 if (!committed_data) {
914 jbd_unlock_bh_state(bh);
915 goto repeat;
916 }
917
918 jh->b_committed_data = committed_data;
919 committed_data = NULL;
920 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
921 }
922 jbd_unlock_bh_state(bh);
923out:
924 journal_put_journal_head(jh);
925 if (unlikely(committed_data))
926 jbd_free(committed_data, bh->b_size);
927 return err;
928}
929
930/**
931 * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
932 * @handle: transaction
933 * @bh: bufferhead to mark
934 *
935 * Description:
936 * Mark a buffer as containing dirty data which needs to be flushed before
937 * we can commit the current transaction.
938 *
939 * The buffer is placed on the transaction's data list and is marked as
940 * belonging to the transaction.
941 *
942 * Returns error number or 0 on success.
943 *
944 * journal_dirty_data() can be called via page_launder->ext3_writepage
945 * by kswapd.
946 */
947int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
948{
949 journal_t *journal = handle->h_transaction->t_journal;
950 int need_brelse = 0;
951 struct journal_head *jh;
952 int ret = 0;
953
954 if (is_handle_aborted(handle))
955 return ret;
956
957 jh = journal_add_journal_head(bh);
958 JBUFFER_TRACE(jh, "entry");
959
960 /*
961 * The buffer could *already* be dirty. Writeout can start
962 * at any time.
963 */
964 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
965
966 /*
967 * What if the buffer is already part of a running transaction?
968 *
969 * There are two cases:
970 * 1) It is part of the current running transaction. Refile it,
971 * just in case we have allocated it as metadata, deallocated
972 * it, then reallocated it as data.
973 * 2) It is part of the previous, still-committing transaction.
974 * If all we want to do is to guarantee that the buffer will be
975 * written to disk before this new transaction commits, then
976 * being sure that the *previous* transaction has this same
977 * property is sufficient for us! Just leave it on its old
978 * transaction.
979 *
980 * In case (2), the buffer must not already exist as metadata
981 * --- that would violate write ordering (a transaction is free
982 * to write its data at any point, even before the previous
983 * committing transaction has committed). The caller must
984 * never, ever allow this to happen: there's nothing we can do
985 * about it in this layer.
986 */
987 jbd_lock_bh_state(bh);
988 spin_lock(&journal->j_list_lock);
989
990 /* Now that we have bh_state locked, are we really still mapped? */
991 if (!buffer_mapped(bh)) {
992 JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
993 goto no_journal;
994 }
995
996 if (jh->b_transaction) {
997 JBUFFER_TRACE(jh, "has transaction");
998 if (jh->b_transaction != handle->h_transaction) {
999 JBUFFER_TRACE(jh, "belongs to older transaction");
1000 J_ASSERT_JH(jh, jh->b_transaction ==
1001 journal->j_committing_transaction);
1002
1003 /* @@@ IS THIS TRUE ? */
1004 /*
1005 * Not any more. Scenario: someone does a write()
1006 * in data=journal mode. The buffer's transaction has
1007 * moved into commit. Then someone does another
1008 * write() to the file. We do the frozen data copyout
1009 * and set b_next_transaction to point to j_running_t.
1010 * And while we're in that state, someone does a
1011 * writepage() in an attempt to pageout the same area
1012 * of the file via a shared mapping. At present that
1013 * calls journal_dirty_data(), and we get right here.
1014 * It may be too late to journal the data. Simply
1015 * falling through to the next test will suffice: the
1016 * data will be dirty and will be checkpointed. The
1017 * ordering comments in the next comment block still
1018 * apply.
1019 */
1020 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1021
1022 /*
1023 * If we're journalling data, and this buffer was
1024 * subject to a write(), it could be metadata, forget
1025 * or shadow against the committing transaction. Now,
1026 * someone has dirtied the same darn page via a mapping
1027 * and it is being writepage()'d.
1028 * We *could* just steal the page from commit, with some
1029 * fancy locking there. Instead, we just skip it -
1030 * don't tie the page's buffers to the new transaction
1031 * at all.
1032 * Implication: if we crash before the writepage() data
1033 * is written into the filesystem, recovery will replay
1034 * the write() data.
1035 */
1036 if (jh->b_jlist != BJ_None &&
1037 jh->b_jlist != BJ_SyncData &&
1038 jh->b_jlist != BJ_Locked) {
1039 JBUFFER_TRACE(jh, "Not stealing");
1040 goto no_journal;
1041 }
1042
1043 /*
1044 * This buffer may be undergoing writeout in commit. We
1045 * can't return from here and let the caller dirty it
1046 * again because that can cause the write-out loop in
1047 * commit to never terminate.
1048 */
1049 if (buffer_dirty(bh)) {
1050 get_bh(bh);
1051 spin_unlock(&journal->j_list_lock);
1052 jbd_unlock_bh_state(bh);
1053 need_brelse = 1;
1054 sync_dirty_buffer(bh);
1055 jbd_lock_bh_state(bh);
1056 spin_lock(&journal->j_list_lock);
1057 /* Since we dropped the lock... */
1058 if (!buffer_mapped(bh)) {
1059 JBUFFER_TRACE(jh, "buffer got unmapped");
1060 goto no_journal;
1061 }
1062 /* The buffer may become locked again at any
1063 time if it is redirtied */
1064 }
1065
1066 /*
1067 * We cannot remove the buffer with io error from the
1068 * committing transaction, because otherwise it would
1069 * miss the error and the commit would not abort.
1070 */
1071 if (unlikely(!buffer_uptodate(bh))) {
1072 ret = -EIO;
1073 goto no_journal;
1074 }
1075 /* We might have slept so buffer could be refiled now */
1076 if (jh->b_transaction != NULL &&
1077 jh->b_transaction != handle->h_transaction) {
1078 JBUFFER_TRACE(jh, "unfile from commit");
1079 __journal_temp_unlink_buffer(jh);
1080 /* It still points to the committing
1081 * transaction; move it to this one so
1082 * that the refile assert checks are
1083 * happy. */
1084 jh->b_transaction = handle->h_transaction;
1085 }
1086 /* The buffer will be refiled below */
1087
1088 }
1089 /*
1090 * Special case --- the buffer might actually have been
1091 * allocated and then immediately deallocated in the previous,
1092 * committing transaction, so might still be left on that
1093 * transaction's metadata lists.
1094 */
1095 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1096 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1097 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1098 JBUFFER_TRACE(jh, "file as data");
1099 __journal_file_buffer(jh, handle->h_transaction,
1100 BJ_SyncData);
1101 }
1102 } else {
1103 JBUFFER_TRACE(jh, "not on a transaction");
1104 __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1105 }
1106no_journal:
1107 spin_unlock(&journal->j_list_lock);
1108 jbd_unlock_bh_state(bh);
1109 if (need_brelse) {
1110 BUFFER_TRACE(bh, "brelse");
1111 __brelse(bh);
1112 }
1113 JBUFFER_TRACE(jh, "exit");
1114 journal_put_journal_head(jh);
1115 return ret;
1116}
1117
1118/**
1119 * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
1120 * @handle: transaction to add buffer to.
1121 * @bh: buffer to mark
1122 *
1123 * Mark dirty metadata which needs to be journaled as part of the current
1124 * transaction.
1125 *
1126 * The buffer is placed on the transaction's metadata list and is marked
1127 * as belonging to the transaction.
1128 *
1129 * Returns error number or 0 on success.
1130 *
1131 * Special care needs to be taken if the buffer already belongs to the
1132 * current committing transaction (in which case we should have frozen
1133 * data present for that commit). In that case, we don't relink the
1134 * buffer: that only gets done when the old transaction finally
1135 * completes its commit.
1136 */
1137int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1138{
1139 transaction_t *transaction = handle->h_transaction;
1140 journal_t *journal = transaction->t_journal;
1141 struct journal_head *jh = bh2jh(bh);
1142
1143 jbd_debug(5, "journal_head %p\n", jh);
1144 JBUFFER_TRACE(jh, "entry");
1145 if (is_handle_aborted(handle))
1146 goto out;
1147
1148 jbd_lock_bh_state(bh);
1149
1150 if (jh->b_modified == 0) {
1151 /*
1152 * This buffer is getting modified and becoming part
1153 * of the transaction. This needs to be done
1154 * once per transaction -bzzz
1155 */
1156 jh->b_modified = 1;
1157 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1158 handle->h_buffer_credits--;
1159 }
1160
1161 /*
1162 * fastpath, to avoid expensive locking. If this buffer is already
1163 * on the running transaction's metadata list there is nothing to do.
1164 * Nobody can take it off again because there is a handle open.
1165 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1166 * result in this test being false, so we go in and take the locks.
1167 */
1168 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1169 JBUFFER_TRACE(jh, "fastpath");
1170 J_ASSERT_JH(jh, jh->b_transaction ==
1171 journal->j_running_transaction);
1172 goto out_unlock_bh;
1173 }
1174
1175 set_buffer_jbddirty(bh);
1176
1177 /*
1178 * Metadata already on the current transaction list doesn't
1179 * need to be filed. Metadata on another transaction's list must
1180 * be committing, and will be refiled once the commit completes:
1181 * leave it alone for now.
1182 */
1183 if (jh->b_transaction != transaction) {
1184 JBUFFER_TRACE(jh, "already on other transaction");
1185 J_ASSERT_JH(jh, jh->b_transaction ==
1186 journal->j_committing_transaction);
1187 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1188 /* And this case is illegal: we can't reuse another
1189 * transaction's data buffer, ever. */
1190 goto out_unlock_bh;
1191 }
1192
1193 /* That test should have eliminated the following case: */
1194 J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1195
1196 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1197 spin_lock(&journal->j_list_lock);
1198 __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1199 spin_unlock(&journal->j_list_lock);
1200out_unlock_bh:
1201 jbd_unlock_bh_state(bh);
1202out:
1203 JBUFFER_TRACE(jh, "exit");
1204 return 0;
1205}
1206
1207/*
1208 * journal_release_buffer: undo a get_write_access without any buffer
1209 * updates, if the update decided in the end that it didn't need access.
1210 *
1211 */
1212void
1213journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1214{
1215 BUFFER_TRACE(bh, "entry");
1216}
1217
1218/**
1219 * void journal_forget() - bforget() for potentially-journaled buffers.
1220 * @handle: transaction handle
1221 * @bh: bh to 'forget'
1222 *
1223 * We can only do the bforget if there are no commits pending against the
1224 * buffer. If the buffer is dirty in the current running transaction we
1225 * can safely unlink it.
1226 *
1227 * bh may not be a journalled buffer at all - it may be a non-JBD
1228 * buffer which came off the hashtable. Check for this.
1229 *
1230 * Decrements bh->b_count by one.
1231 *
1232 * Allow this call even if the handle has aborted --- it may be part of
1233 * the caller's cleanup after an abort.
1234 */
1235int journal_forget (handle_t *handle, struct buffer_head *bh)
1236{
1237 transaction_t *transaction = handle->h_transaction;
1238 journal_t *journal = transaction->t_journal;
1239 struct journal_head *jh;
1240 int drop_reserve = 0;
1241 int err = 0;
1242 int was_modified = 0;
1243
1244 BUFFER_TRACE(bh, "entry");
1245
1246 jbd_lock_bh_state(bh);
1247 spin_lock(&journal->j_list_lock);
1248
1249 if (!buffer_jbd(bh))
1250 goto not_jbd;
1251 jh = bh2jh(bh);
1252
1253 /* Critical error: attempting to delete a bitmap buffer, maybe?
1254 * Don't do any jbd operations, and return an error. */
1255 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1256 "inconsistent data on disk")) {
1257 err = -EIO;
1258 goto not_jbd;
1259 }
1260
1261 /* keep track of whether or not this transaction modified us */
1262 was_modified = jh->b_modified;
1263
1264 /*
1265 * The buffer's going from the transaction, we must drop
1266 * all references -bzzz
1267 */
1268 jh->b_modified = 0;
1269
1270 if (jh->b_transaction == handle->h_transaction) {
1271 J_ASSERT_JH(jh, !jh->b_frozen_data);
1272
1273 /* If we are forgetting a buffer which is already part
1274 * of this transaction, then we can just drop it from
1275 * the transaction immediately. */
1276 clear_buffer_dirty(bh);
1277 clear_buffer_jbddirty(bh);
1278
1279 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1280
1281 /*
1282 * we only want to drop a reference if this transaction
1283 * modified the buffer
1284 */
1285 if (was_modified)
1286 drop_reserve = 1;
1287
1288 /*
1289 * We are no longer going to journal this buffer.
1290 * However, the commit of this transaction is still
1291 * important to the buffer: the delete that we are now
1292 * processing might obsolete an old log entry, so by
1293 * committing, we can satisfy the buffer's checkpoint.
1294 *
1295 * So, if we have a checkpoint on the buffer, we should
1296 * now refile the buffer on our BJ_Forget list so that
1297 * we know to remove the checkpoint after we commit.
1298 */
1299
1300 if (jh->b_cp_transaction) {
1301 __journal_temp_unlink_buffer(jh);
1302 __journal_file_buffer(jh, transaction, BJ_Forget);
1303 } else {
1304 __journal_unfile_buffer(jh);
1305 if (!buffer_jbd(bh)) {
1306 spin_unlock(&journal->j_list_lock);
1307 jbd_unlock_bh_state(bh);
1308 __bforget(bh);
1309 goto drop;
1310 }
1311 }
1312 } else if (jh->b_transaction) {
1313 J_ASSERT_JH(jh, (jh->b_transaction ==
1314 journal->j_committing_transaction));
1315 /* However, if the buffer is still owned by a prior
1316 * (committing) transaction, we can't drop it yet... */
1317 JBUFFER_TRACE(jh, "belongs to older transaction");
1318 /* ... but we CAN drop it from the new transaction if we
1319 * have also modified it since the original commit. */
1320
1321 if (jh->b_next_transaction) {
1322 J_ASSERT(jh->b_next_transaction == transaction);
1323 jh->b_next_transaction = NULL;
1324
1325 /*
1326 * only drop a reference if this transaction modified
1327 * the buffer
1328 */
1329 if (was_modified)
1330 drop_reserve = 1;
1331 }
1332 }
1333
1334not_jbd:
1335 spin_unlock(&journal->j_list_lock);
1336 jbd_unlock_bh_state(bh);
1337 __brelse(bh);
1338drop:
1339 if (drop_reserve) {
1340 /* no need to reserve log space for this block -bzzz */
1341 handle->h_buffer_credits++;
1342 }
1343 return err;
1344}
1345
1346/**
1347 * int journal_stop() - complete a transaction
1348 * @handle: transaction to complete.
1349 *
1350 * All done for a particular handle.
1351 *
1352 * There is not much action needed here. We just return any remaining
1353 * buffer credits to the transaction and remove the handle. The only
1354 * complication is that we need to start a commit operation if the
1355 * filesystem is marked for synchronous update.
1356 *
1357 * journal_stop itself will not usually return an error, but it may
1358 * do so in unusual circumstances. In particular, expect it to
1359 * return -EIO if a journal_abort has been executed since the
1360 * transaction began.
1361 */
1362int journal_stop(handle_t *handle)
1363{
1364 transaction_t *transaction = handle->h_transaction;
1365 journal_t *journal = transaction->t_journal;
1366 int err;
1367 pid_t pid;
1368
1369 J_ASSERT(journal_current_handle() == handle);
1370
1371 if (is_handle_aborted(handle))
1372 err = -EIO;
1373 else {
1374 J_ASSERT(transaction->t_updates > 0);
1375 err = 0;
1376 }
1377
1378 if (--handle->h_ref > 0) {
1379 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1380 handle->h_ref);
1381 return err;
1382 }
1383
1384 jbd_debug(4, "Handle %p going down\n", handle);
1385
1386 /*
1387 * Implement synchronous transaction batching. If the handle
1388 * was synchronous, don't force a commit immediately. Let's
1389 * yield and let another thread piggyback onto this transaction.
1390 * Keep doing that while new threads continue to arrive.
1391 * It doesn't cost much - we're about to run a commit and sleep
1392 * on IO anyway. Speeds up many-threaded, many-dir operations
1393 * by 30x or more...
1394 *
1395 * We try and optimize the sleep time against what the underlying disk
1396 * can do, instead of having a static sleep time. This is useful for
1397 * the case where our storage is so fast that it is more optimal to go
1398 * ahead and force a flush and wait for the transaction to be committed
1399 * than it is to wait for an arbitrary amount of time for new writers to
1400 * join the transaction. We achieve this by measuring how long it takes
1401 * to commit a transaction, and compare it with how long this
1402 * transaction has been running, and if run time < commit time then we
1403 * sleep for the delta and commit. This greatly helps super fast disks
1404 * that would see slowdowns as more threads started doing fsyncs.
1405 *
1406 * But don't do this if this process was the most recent one to
1407 * perform a synchronous write. We do this to detect the case where a
1408 * single process is doing a stream of sync writes. No point in waiting
1409 * for joiners in that case.
1410 */
1411 pid = current->pid;
1412 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1413 u64 commit_time, trans_time;
1414
1415 journal->j_last_sync_writer = pid;
1416
1417 spin_lock(&journal->j_state_lock);
1418 commit_time = journal->j_average_commit_time;
1419 spin_unlock(&journal->j_state_lock);
1420
1421 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1422 transaction->t_start_time));
1423
1424 commit_time = min_t(u64, commit_time,
1425 1000*jiffies_to_usecs(1));
1426
1427 if (trans_time < commit_time) {
1428 ktime_t expires = ktime_add_ns(ktime_get(),
1429 commit_time);
1430 set_current_state(TASK_UNINTERRUPTIBLE);
1431 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1432 }
1433 }
1434
1435 current->journal_info = NULL;
1436 spin_lock(&journal->j_state_lock);
1437 spin_lock(&transaction->t_handle_lock);
1438 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1439 transaction->t_updates--;
1440 if (!transaction->t_updates) {
1441 wake_up(&journal->j_wait_updates);
1442 if (journal->j_barrier_count)
1443 wake_up(&journal->j_wait_transaction_locked);
1444 }
1445
1446 /*
1447 * If the handle is marked SYNC, we need to set another commit
1448 * going! We also want to force a commit if the current
1449 * transaction is occupying too much of the log, or if the
1450 * transaction is too old now.
1451 */
1452 if (handle->h_sync ||
1453 transaction->t_outstanding_credits >
1454 journal->j_max_transaction_buffers ||
1455 time_after_eq(jiffies, transaction->t_expires)) {
1456 /* Do this even for aborted journals: an abort still
1457 * completes the commit thread, it just doesn't write
1458 * anything to disk. */
1459 tid_t tid = transaction->t_tid;
1460
1461 spin_unlock(&transaction->t_handle_lock);
1462 jbd_debug(2, "transaction too old, requesting commit for "
1463 "handle %p\n", handle);
1464 /* This is non-blocking */
1465 __log_start_commit(journal, transaction->t_tid);
1466 spin_unlock(&journal->j_state_lock);
1467
1468 /*
1469 * Special case: JFS_SYNC synchronous updates require us
1470 * to wait for the commit to complete.
1471 */
1472 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1473 err = log_wait_commit(journal, tid);
1474 } else {
1475 spin_unlock(&transaction->t_handle_lock);
1476 spin_unlock(&journal->j_state_lock);
1477 }
1478
1479 lock_map_release(&handle->h_lockdep_map);
1480
1481 jbd_free_handle(handle);
1482 return err;
1483}
1484
1485/**
1486 * int journal_force_commit() - force any uncommitted transactions
1487 * @journal: journal to force
1488 *
1489 * For synchronous operations: force any uncommitted transactions
1490 * to disk. May seem kludgy, but it reuses all the handle batching
1491 * code in a very simple manner.
1492 */
1493int journal_force_commit(journal_t *journal)
1494{
1495 handle_t *handle;
1496 int ret;
1497
1498 handle = journal_start(journal, 1);
1499 if (IS_ERR(handle)) {
1500 ret = PTR_ERR(handle);
1501 } else {
1502 handle->h_sync = 1;
1503 ret = journal_stop(handle);
1504 }
1505 return ret;
1506}
1507
1508/*
1509 *
1510 * List management code snippets: various functions for manipulating the
1511 * transaction buffer lists.
1512 *
1513 */
1514
1515/*
1516 * Append a buffer to a transaction list, given the transaction's list head
1517 * pointer.
1518 *
1519 * j_list_lock is held.
1520 *
1521 * jbd_lock_bh_state(jh2bh(jh)) is held.
1522 */
1523
1524static inline void
1525__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1526{
1527 if (!*list) {
1528 jh->b_tnext = jh->b_tprev = jh;
1529 *list = jh;
1530 } else {
1531 /* Insert at the tail of the list to preserve order */
1532 struct journal_head *first = *list, *last = first->b_tprev;
1533 jh->b_tprev = last;
1534 jh->b_tnext = first;
1535 last->b_tnext = first->b_tprev = jh;
1536 }
1537}
1538
1539/*
1540 * Remove a buffer from a transaction list, given the transaction's list
1541 * head pointer.
1542 *
1543 * Called with j_list_lock held, and the journal may not be locked.
1544 *
1545 * jbd_lock_bh_state(jh2bh(jh)) is held.
1546 */
1547
1548static inline void
1549__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1550{
1551 if (*list == jh) {
1552 *list = jh->b_tnext;
1553 if (*list == jh)
1554 *list = NULL;
1555 }
1556 jh->b_tprev->b_tnext = jh->b_tnext;
1557 jh->b_tnext->b_tprev = jh->b_tprev;
1558}
1559
1560/*
1561 * Remove a buffer from the appropriate transaction list.
1562 *
1563 * Note that this function can *change* the value of
1564 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1565 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1566 * is holding onto a copy of one of these pointers, it could go bad.
1567 * Generally the caller needs to re-read the pointer from the transaction_t.
1568 *
1569 * Called under j_list_lock. The journal may not be locked.
1570 */
1571static void __journal_temp_unlink_buffer(struct journal_head *jh)
1572{
1573 struct journal_head **list = NULL;
1574 transaction_t *transaction;
1575 struct buffer_head *bh = jh2bh(jh);
1576
1577 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1578 transaction = jh->b_transaction;
1579 if (transaction)
1580 assert_spin_locked(&transaction->t_journal->j_list_lock);
1581
1582 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1583 if (jh->b_jlist != BJ_None)
1584 J_ASSERT_JH(jh, transaction != NULL);
1585
1586 switch (jh->b_jlist) {
1587 case BJ_None:
1588 return;
1589 case BJ_SyncData:
1590 list = &transaction->t_sync_datalist;
1591 break;
1592 case BJ_Metadata:
1593 transaction->t_nr_buffers--;
1594 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1595 list = &transaction->t_buffers;
1596 break;
1597 case BJ_Forget:
1598 list = &transaction->t_forget;
1599 break;
1600 case BJ_IO:
1601 list = &transaction->t_iobuf_list;
1602 break;
1603 case BJ_Shadow:
1604 list = &transaction->t_shadow_list;
1605 break;
1606 case BJ_LogCtl:
1607 list = &transaction->t_log_list;
1608 break;
1609 case BJ_Reserved:
1610 list = &transaction->t_reserved_list;
1611 break;
1612 case BJ_Locked:
1613 list = &transaction->t_locked_list;
1614 break;
1615 }
1616
1617 __blist_del_buffer(list, jh);
1618 jh->b_jlist = BJ_None;
1619 if (test_clear_buffer_jbddirty(bh))
1620 mark_buffer_dirty(bh); /* Expose it to the VM */
1621}
1622
1623/*
1624 * Remove buffer from all transactions.
1625 *
1626 * Called with bh_state lock and j_list_lock
1627 *
1628 * jh and bh may be already freed when this function returns.
1629 */
1630void __journal_unfile_buffer(struct journal_head *jh)
1631{
1632 __journal_temp_unlink_buffer(jh);
1633 jh->b_transaction = NULL;
1634 journal_put_journal_head(jh);
1635}
1636
1637void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1638{
1639 struct buffer_head *bh = jh2bh(jh);
1640
1641 /* Get reference so that buffer cannot be freed before we unlock it */
1642 get_bh(bh);
1643 jbd_lock_bh_state(bh);
1644 spin_lock(&journal->j_list_lock);
1645 __journal_unfile_buffer(jh);
1646 spin_unlock(&journal->j_list_lock);
1647 jbd_unlock_bh_state(bh);
1648 __brelse(bh);
1649}
1650
1651/*
1652 * Called from journal_try_to_free_buffers().
1653 *
1654 * Called under jbd_lock_bh_state(bh)
1655 */
1656static void
1657__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1658{
1659 struct journal_head *jh;
1660
1661 jh = bh2jh(bh);
1662
1663 if (buffer_locked(bh) || buffer_dirty(bh))
1664 goto out;
1665
1666 if (jh->b_next_transaction != NULL)
1667 goto out;
1668
1669 spin_lock(&journal->j_list_lock);
1670 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
1671 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1672 /* A written-back ordered data buffer */
1673 JBUFFER_TRACE(jh, "release data");
1674 __journal_unfile_buffer(jh);
1675 }
1676 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1677 /* written-back checkpointed metadata buffer */
1678 if (jh->b_jlist == BJ_None) {
1679 JBUFFER_TRACE(jh, "remove from checkpoint list");
1680 __journal_remove_checkpoint(jh);
1681 }
1682 }
1683 spin_unlock(&journal->j_list_lock);
1684out:
1685 return;
1686}
1687
1688/**
1689 * int journal_try_to_free_buffers() - try to free page buffers.
1690 * @journal: journal for operation
1691 * @page: to try and free
1692 * @gfp_mask: we use the mask to detect how hard we should try to release
1693 * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
1694 * release the buffers.
1695 *
1696 *
1697 * For all the buffers on this page,
1698 * if they are fully written out ordered data, move them onto BUF_CLEAN
1699 * so try_to_free_buffers() can reap them.
1700 *
1701 * This function returns non-zero if we wish try_to_free_buffers()
1702 * to be called. We do this if the page is releasable by try_to_free_buffers().
1703 * We also do it if the page has locked or dirty buffers and the caller wants
1704 * us to perform sync or async writeout.
1705 *
1706 * This complicates JBD locking somewhat. We aren't protected by the
1707 * BKL here. We wish to remove the buffer from its committing or
1708 * running transaction's ->t_datalist via __journal_unfile_buffer.
1709 *
1710 * This may *change* the value of transaction_t->t_datalist, so anyone
1711 * who looks at t_datalist needs to lock against this function.
1712 *
1713 * Even worse, someone may be doing a journal_dirty_data on this
1714 * buffer. So we need to lock against that. journal_dirty_data()
1715 * will come out of the lock with the buffer dirty, which makes it
1716 * ineligible for release here.
1717 *
1718 * Who else is affected by this? hmm... Really the only contender
1719 * is do_get_write_access() - it could be looking at the buffer while
1720 * journal_try_to_free_buffer() is changing its state. But that
1721 * cannot happen because we never reallocate freed data as metadata
1722 * while the data is part of a transaction. Yes?
1723 *
1724 * Return 0 on failure, 1 on success
1725 */
1726int journal_try_to_free_buffers(journal_t *journal,
1727 struct page *page, gfp_t gfp_mask)
1728{
1729 struct buffer_head *head;
1730 struct buffer_head *bh;
1731 int ret = 0;
1732
1733 J_ASSERT(PageLocked(page));
1734
1735 head = page_buffers(page);
1736 bh = head;
1737 do {
1738 struct journal_head *jh;
1739
1740 /*
1741 * We take our own ref against the journal_head here to avoid
1742 * having to add tons of locking around each instance of
1743 * journal_put_journal_head().
1744 */
1745 jh = journal_grab_journal_head(bh);
1746 if (!jh)
1747 continue;
1748
1749 jbd_lock_bh_state(bh);
1750 __journal_try_to_free_buffer(journal, bh);
1751 journal_put_journal_head(jh);
1752 jbd_unlock_bh_state(bh);
1753 if (buffer_jbd(bh))
1754 goto busy;
1755 } while ((bh = bh->b_this_page) != head);
1756
1757 ret = try_to_free_buffers(page);
1758
1759busy:
1760 return ret;
1761}
1762
1763/*
1764 * This buffer is no longer needed. If it is on an older transaction's
1765 * checkpoint list we need to record it on this transaction's forget list
1766 * to pin this buffer (and hence its checkpointing transaction) down until
1767 * this transaction commits. If the buffer isn't on a checkpoint list, we
1768 * release it.
1769 * Returns non-zero if JBD no longer has an interest in the buffer.
1770 *
1771 * Called under j_list_lock.
1772 *
1773 * Called under jbd_lock_bh_state(bh).
1774 */
1775static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1776{
1777 int may_free = 1;
1778 struct buffer_head *bh = jh2bh(jh);
1779
1780 if (jh->b_cp_transaction) {
1781 JBUFFER_TRACE(jh, "on running+cp transaction");
1782 __journal_temp_unlink_buffer(jh);
1783 /*
1784 * We don't want to write the buffer anymore, clear the
1785 * bit so that we don't confuse checks in
1786 * __journal_file_buffer
1787 */
1788 clear_buffer_dirty(bh);
1789 __journal_file_buffer(jh, transaction, BJ_Forget);
1790 may_free = 0;
1791 } else {
1792 JBUFFER_TRACE(jh, "on running transaction");
1793 __journal_unfile_buffer(jh);
1794 }
1795 return may_free;
1796}
1797
1798/*
1799 * journal_invalidatepage
1800 *
1801 * This code is tricky. It has a number of cases to deal with.
1802 *
1803 * There are two invariants which this code relies on:
1804 *
1805 * i_size must be updated on disk before we start calling invalidatepage on the
1806 * data.
1807 *
1808 * This is done in ext3 by defining an ext3_setattr method which
1809 * updates i_size before truncate gets going. By maintaining this
1810 * invariant, we can be sure that it is safe to throw away any buffers
1811 * attached to the current transaction: once the transaction commits,
1812 * we know that the data will not be needed.
1813 *
1814 * Note however that we can *not* throw away data belonging to the
1815 * previous, committing transaction!
1816 *
1817 * Any disk blocks which *are* part of the previous, committing
1818 * transaction (and which therefore cannot be discarded immediately) are
1819 * not going to be reused in the new running transaction
1820 *
1821 * The bitmap committed_data images guarantee this: any block which is
1822 * allocated in one transaction and removed in the next will be marked
1823 * as in-use in the committed_data bitmap, so cannot be reused until
1824 * the next transaction to delete the block commits. This means that
1825 * leaving committing buffers dirty is quite safe: the disk blocks
1826 * cannot be reallocated to a different file and so buffer aliasing is
1827 * not possible.
1828 *
1829 *
1830 * The above applies mainly to ordered data mode. In writeback mode we
1831 * don't make guarantees about the order in which data hits disk --- in
1832 * particular we don't guarantee that new dirty data is flushed before
1833 * transaction commit --- so it is always safe just to discard data
1834 * immediately in that mode. --sct
1835 */
1836
1837/*
1838 * The journal_unmap_buffer helper function returns zero if the buffer
1839 * concerned remains pinned as an anonymous buffer belonging to an older
1840 * transaction.
1841 *
1842 * We're outside-transaction here. Either or both of j_running_transaction
1843 * and j_committing_transaction may be NULL.
1844 */
1845static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
1846 int partial_page)
1847{
1848 transaction_t *transaction;
1849 struct journal_head *jh;
1850 int may_free = 1;
1851
1852 BUFFER_TRACE(bh, "entry");
1853
1854retry:
1855 /*
1856 * It is safe to proceed here without the j_list_lock because the
1857 * buffers cannot be stolen by try_to_free_buffers as long as we are
1858 * holding the page lock. --sct
1859 */
1860
1861 if (!buffer_jbd(bh))
1862 goto zap_buffer_unlocked;
1863
1864 spin_lock(&journal->j_state_lock);
1865 jbd_lock_bh_state(bh);
1866 spin_lock(&journal->j_list_lock);
1867
1868 jh = journal_grab_journal_head(bh);
1869 if (!jh)
1870 goto zap_buffer_no_jh;
1871
1872 /*
1873 * We cannot remove the buffer from checkpoint lists until the
1874 * transaction adding inode to orphan list (let's call it T)
1875 * is committed. Otherwise if the transaction changing the
1876 * buffer would be cleaned from the journal before T is
1877 * committed, a crash would cause the correct contents of
1878 * the buffer to be lost. On the other hand we have to
1879 * clear the buffer dirty bit at latest at the moment when the
1880 * transaction marking the buffer as freed in the filesystem
1881 * structures is committed because from that moment on the
1882 * block can be reallocated and used by a different page.
1883 * Since the block hasn't been freed yet but the inode has
1884 * already been added to orphan list, it is safe for us to add
1885 * the buffer to BJ_Forget list of the newest transaction.
1886 *
1887 * Also we have to clear buffer_mapped flag of a truncated buffer
1888 * because the buffer_head may be attached to the page straddling
1889 * i_size (can happen only when blocksize < pagesize) and thus the
1890 * buffer_head can be reused when the file is extended again. So we end
1891 * up keeping around invalidated buffers attached to transactions'
1892 * BJ_Forget list just to stop checkpointing code from cleaning up
1893 * the transaction this buffer was modified in.
1894 */
1895 transaction = jh->b_transaction;
1896 if (transaction == NULL) {
1897 /* First case: not on any transaction. If it
1898 * has no checkpoint link, then we can zap it:
1899 * it's a writeback-mode buffer so we don't care
1900 * if it hits disk safely. */
1901 if (!jh->b_cp_transaction) {
1902 JBUFFER_TRACE(jh, "not on any transaction: zap");
1903 goto zap_buffer;
1904 }
1905
1906 if (!buffer_dirty(bh)) {
1907 /* bdflush has written it. We can drop it now */
1908 goto zap_buffer;
1909 }
1910
1911 /* OK, it must be in the journal but still not
1912 * written fully to disk: it's metadata or
1913 * journaled data... */
1914
1915 if (journal->j_running_transaction) {
1916 /* ... and once the current transaction has
1917 * committed, the buffer won't be needed any
1918 * longer. */
1919 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1920 may_free = __dispose_buffer(jh,
1921 journal->j_running_transaction);
1922 goto zap_buffer;
1923 } else {
1924 /* There is no currently-running transaction. So the
1925 * orphan record which we wrote for this file must have
1926 * passed into commit. We must attach this buffer to
1927 * the committing transaction, if it exists. */
1928 if (journal->j_committing_transaction) {
1929 JBUFFER_TRACE(jh, "give to committing trans");
1930 may_free = __dispose_buffer(jh,
1931 journal->j_committing_transaction);
1932 goto zap_buffer;
1933 } else {
1934 /* The orphan record's transaction has
1935 * committed. We can cleanse this buffer */
1936 clear_buffer_jbddirty(bh);
1937 goto zap_buffer;
1938 }
1939 }
1940 } else if (transaction == journal->j_committing_transaction) {
1941 JBUFFER_TRACE(jh, "on committing transaction");
1942 if (jh->b_jlist == BJ_Locked) {
1943 /*
1944 * The buffer is on the committing transaction's locked
1945 * list. We have the buffer locked, so I/O has
1946 * completed. So we can nail the buffer now.
1947 */
1948 may_free = __dispose_buffer(jh, transaction);
1949 goto zap_buffer;
1950 }
1951 /*
1952 * The buffer is committing, we simply cannot touch
1953 * it. If the page is straddling i_size we have to wait
1954 * for commit and try again.
1955 */
1956 if (partial_page) {
1957 tid_t tid = journal->j_committing_transaction->t_tid;
1958
1959 journal_put_journal_head(jh);
1960 spin_unlock(&journal->j_list_lock);
1961 jbd_unlock_bh_state(bh);
1962 spin_unlock(&journal->j_state_lock);
1963 unlock_buffer(bh);
1964 log_wait_commit(journal, tid);
1965 lock_buffer(bh);
1966 goto retry;
1967 }
1968 /*
1969 * OK, buffer won't be reachable after truncate. We just set
1970 * b_next_transaction to the running transaction (if there is
1971 * one) and mark buffer as freed so that commit code knows it
1972 * should clear dirty bits when it is done with the buffer.
1973 */
1974 set_buffer_freed(bh);
1975 if (journal->j_running_transaction && buffer_jbddirty(bh))
1976 jh->b_next_transaction = journal->j_running_transaction;
1977 journal_put_journal_head(jh);
1978 spin_unlock(&journal->j_list_lock);
1979 jbd_unlock_bh_state(bh);
1980 spin_unlock(&journal->j_state_lock);
1981 return 0;
1982 } else {
1983 /* Good, the buffer belongs to the running transaction.
1984 * We are writing our own transaction's data, not any
1985 * previous one's, so it is safe to throw it away
1986 * (remember that we expect the filesystem to have set
1987 * i_size already for this truncate so recovery will not
1988 * expose the disk blocks we are discarding here.) */
1989 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1990 JBUFFER_TRACE(jh, "on running transaction");
1991 may_free = __dispose_buffer(jh, transaction);
1992 }
1993
1994zap_buffer:
1995 /*
1996 * This is tricky. Although the buffer is truncated, it may be reused
1997 * if blocksize < pagesize and it is attached to the page straddling
1998 * EOF. Since the buffer might have been added to BJ_Forget list of the
1999 * running transaction, journal_get_write_access() won't clear
2000 * b_modified and credit accounting gets confused. So clear b_modified
2001 * here. */
2002 jh->b_modified = 0;
2003 journal_put_journal_head(jh);
2004zap_buffer_no_jh:
2005 spin_unlock(&journal->j_list_lock);
2006 jbd_unlock_bh_state(bh);
2007 spin_unlock(&journal->j_state_lock);
2008zap_buffer_unlocked:
2009 clear_buffer_dirty(bh);
2010 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2011 clear_buffer_mapped(bh);
2012 clear_buffer_req(bh);
2013 clear_buffer_new(bh);
2014 bh->b_bdev = NULL;
2015 return may_free;
2016}
2017
2018/**
2019 * journal_invalidatepage - invalidate a journal page
2020 * @journal: journal the page's buffers belong to
2021 * @page: page to invalidate
2022 * @offset: offset of the range to invalidate
2023 * @length: length of the range to invalidate
2024 *
2025 * Reap the page's buffers that contain data in the specified range.
2026 */
2027void journal_invalidatepage(journal_t *journal,
2028 struct page *page,
2029 unsigned int offset,
2030 unsigned int length)
2031{
2032 struct buffer_head *head, *bh, *next;
2033 unsigned int stop = offset + length;
2034 unsigned int curr_off = 0;
2035 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2036 int may_free = 1;
2037
2038 if (!PageLocked(page))
2039 BUG();
2040 if (!page_has_buffers(page))
2041 return;
2042
2043 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2044
2045 /* We will potentially be playing with lists other than just the
2046 * data lists (especially for journaled data mode), so be
2047 * cautious in our locking. */
2048
2049 head = bh = page_buffers(page);
2050 do {
2051 unsigned int next_off = curr_off + bh->b_size;
2052 next = bh->b_this_page;
2053
2054 if (next_off > stop)
2055 return;
2056
2057 if (offset <= curr_off) {
2058 /* This block lies wholly within the range being invalidated */
2059 lock_buffer(bh);
2060 may_free &= journal_unmap_buffer(journal, bh,
2061 partial_page);
2062 unlock_buffer(bh);
2063 }
2064 curr_off = next_off;
2065 bh = next;
2066
2067 } while (bh != head);
2068
2069 if (!partial_page) {
2070 if (may_free && try_to_free_buffers(page))
2071 J_ASSERT(!page_has_buffers(page));
2072 }
2073}
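/*
 * Editor's note: illustrative sketch, not part of the original file.
 * A filesystem's ->invalidatepage hook typically just looks up its
 * journal and forwards the range, much as ext3 did.  my_fs_journal()
 * is a placeholder for however the filesystem reaches its journal_t.
 */
static void sketch_invalidatepage(struct page *page, unsigned int offset,
				  unsigned int length)
{
	journal_t *journal = my_fs_journal(page->mapping->host);

	/* On a full-page invalidate the pending dirtying can be forgotten */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		ClearPageChecked(page);

	journal_invalidatepage(journal, page, offset, length);
}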
2074
2075/*
2076 * File a buffer on the given transaction list.
2077 */
2078void __journal_file_buffer(struct journal_head *jh,
2079 transaction_t *transaction, int jlist)
2080{
2081 struct journal_head **list = NULL;
2082 int was_dirty = 0;
2083 struct buffer_head *bh = jh2bh(jh);
2084
2085 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2086 assert_spin_locked(&transaction->t_journal->j_list_lock);
2087
2088 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2089 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
2090 jh->b_transaction == NULL);
2091
2092 if (jh->b_transaction && jh->b_jlist == jlist)
2093 return;
2094
2095 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2096 jlist == BJ_Shadow || jlist == BJ_Forget) {
2097 /*
2098 * For metadata buffers, we track the dirty bit in buffer_jbddirty
2099 * instead of buffer_dirty. We should not see a dirty bit set
2100 * here because we clear it in do_get_write_access(), but e.g.
2101 * tune2fs can modify the sb and set the dirty bit at any time,
2102 * so we try to handle that gracefully.
2103 */
2104 if (buffer_dirty(bh))
2105 warn_dirty_buffer(bh);
2106 if (test_clear_buffer_dirty(bh) ||
2107 test_clear_buffer_jbddirty(bh))
2108 was_dirty = 1;
2109 }
2110
2111 if (jh->b_transaction)
2112 __journal_temp_unlink_buffer(jh);
2113 else
2114 journal_grab_journal_head(bh);
2115 jh->b_transaction = transaction;
2116
2117 switch (jlist) {
2118 case BJ_None:
2119 J_ASSERT_JH(jh, !jh->b_committed_data);
2120 J_ASSERT_JH(jh, !jh->b_frozen_data);
2121 return;
2122 case BJ_SyncData:
2123 list = &transaction->t_sync_datalist;
2124 break;
2125 case BJ_Metadata:
2126 transaction->t_nr_buffers++;
2127 list = &transaction->t_buffers;
2128 break;
2129 case BJ_Forget:
2130 list = &transaction->t_forget;
2131 break;
2132 case BJ_IO:
2133 list = &transaction->t_iobuf_list;
2134 break;
2135 case BJ_Shadow:
2136 list = &transaction->t_shadow_list;
2137 break;
2138 case BJ_LogCtl:
2139 list = &transaction->t_log_list;
2140 break;
2141 case BJ_Reserved:
2142 list = &transaction->t_reserved_list;
2143 break;
2144 case BJ_Locked:
2145 list = &transaction->t_locked_list;
2146 break;
2147 }
2148
2149 __blist_add_buffer(list, jh);
2150 jh->b_jlist = jlist;
2151
2152 if (was_dirty)
2153 set_buffer_jbddirty(bh);
2154}
2155
2156void journal_file_buffer(struct journal_head *jh,
2157 transaction_t *transaction, int jlist)
2158{
2159 jbd_lock_bh_state(jh2bh(jh));
2160 spin_lock(&transaction->t_journal->j_list_lock);
2161 __journal_file_buffer(jh, transaction, jlist);
2162 spin_unlock(&transaction->t_journal->j_list_lock);
2163 jbd_unlock_bh_state(jh2bh(jh));
2164}
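/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Callers that need the buffer state lock for other reasons take both
 * locks themselves and use the __ variant, as journal_dirty_metadata()
 * does when it moves a buffer to the running transaction's metadata
 * list.  Sanity checks and error handling are omitted here.
 */
static void sketch_file_metadata(handle_t *handle, struct buffer_head *bh)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;

	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);
	__journal_file_buffer(bh2jh(bh), transaction, BJ_Metadata);
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
}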
2165
2166/*
2167 * Remove a buffer from its current buffer list in preparation for
2168 * dropping it from its current transaction entirely. If the buffer has
2169 * already started to be used by a subsequent transaction, refile the
2170 * buffer on that transaction's appropriate list.
2171 *
2172 * Called under j_list_lock
2173 * Called under jbd_lock_bh_state(jh2bh(jh))
2174 *
2175 * jh and bh may be already free when this function returns
2176 */
2177void __journal_refile_buffer(struct journal_head *jh)
2178{
2179 int was_dirty, jlist;
2180 struct buffer_head *bh = jh2bh(jh);
2181
2182 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2183 if (jh->b_transaction)
2184 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2185
2186 /* If the buffer is now unused, just drop it. */
2187 if (jh->b_next_transaction == NULL) {
2188 __journal_unfile_buffer(jh);
2189 return;
2190 }
2191
2192 /*
2193 * It has been modified by a later transaction: add it to the new
2194 * transaction's metadata list.
2195 */
2196
2197 was_dirty = test_clear_buffer_jbddirty(bh);
2198 __journal_temp_unlink_buffer(jh);
2199 /*
2200 * We set b_transaction here because b_next_transaction will inherit
2201 * our jh reference and thus __journal_file_buffer() must not take a
2202 * new one.
2203 */
2204 jh->b_transaction = jh->b_next_transaction;
2205 jh->b_next_transaction = NULL;
2206 if (buffer_freed(bh))
2207 jlist = BJ_Forget;
2208 else if (jh->b_modified)
2209 jlist = BJ_Metadata;
2210 else
2211 jlist = BJ_Reserved;
2212 __journal_file_buffer(jh, jh->b_transaction, jlist);
2213 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2214
2215 if (was_dirty)
2216 set_buffer_jbddirty(bh);
2217}
2218
2219/*
2220 * __journal_refile_buffer() with necessary locking added. We take our bh
2221 * reference so that we can safely unlock bh.
2222 *
2223 * The jh and bh may be freed by this call.
2224 */
2225void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2226{
2227 struct buffer_head *bh = jh2bh(jh);
2228
2229 /* Get reference so that buffer cannot be freed before we unlock it */
2230 get_bh(bh);
2231 jbd_lock_bh_state(bh);
2232 spin_lock(&journal->j_list_lock);
2233 __journal_refile_buffer(jh);
2234 jbd_unlock_bh_state(bh);
2235 spin_unlock(&journal->j_list_lock);
2236 __brelse(bh);
2237}
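/*
 * Editor's note: illustrative sketch, not part of the original file.
 * It shows the typical shape of commit-time cleanup that drains a
 * transaction's forget list by refiling every buffer, loosely modeled
 * on journal_commit_transaction().  Checkpoint handling, buffer_head
 * reference counting and error paths are deliberately omitted.
 */
static void sketch_drain_forget_list(journal_t *journal,
				     transaction_t *commit_transaction)
{
	struct journal_head *jh;

	spin_lock(&journal->j_list_lock);
	while ((jh = commit_transaction->t_forget) != NULL) {
		spin_unlock(&journal->j_list_lock);
		/*
		 * Either unfiles the buffer entirely or moves it to the
		 * transaction that has modified it since.
		 */
		journal_refile_buffer(journal, jh);
		spin_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
}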