Diffstat (limited to 'fs/jbd2')
-rw-r--r--  fs/jbd2/Makefile          7
-rw-r--r--  fs/jbd2/checkpoint.c    697
-rw-r--r--  fs/jbd2/commit.c        920
-rw-r--r--  fs/jbd2/journal.c      2084
-rw-r--r--  fs/jbd2/recovery.c      609
-rw-r--r--  fs/jbd2/revoke.c        712
-rw-r--r--  fs/jbd2/transaction.c  2081
7 files changed, 7110 insertions(+), 0 deletions(-)
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..802a3413872a
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the Linux journaling routines.
3#
4
5obj-$(CONFIG_JBD2) += jbd2.o
6
7jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..68039fa9a566
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
1/*
2 * linux/fs/jbd2/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25
26/*
27 * Unlink a buffer from a transaction checkpoint list.
28 *
29 * Called with j_list_lock held.
30 */
31static inline void __buffer_unlink_first(struct journal_head *jh)
32{
33 transaction_t *transaction = jh->b_cp_transaction;
34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext;
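		/* if jh was the only element, b_cpnext still points back at jh: the list is now empty */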
39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL;
41 }
42}
43
44/*
45 * Unlink a buffer from a transaction checkpoint(io) list.
46 *
47 * Called with j_list_lock held.
48 */
49static inline void __buffer_unlink(struct journal_head *jh)
50{
51 transaction_t *transaction = jh->b_cp_transaction;
52
53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL;
58 }
59}
60
61/*
62 * Move a buffer from the checkpoint list to the checkpoint io list
63 *
64 * Called with j_list_lock held
65 */
66static inline void __buffer_relink_io(struct journal_head *jh)
67{
68 transaction_t *transaction = jh->b_cp_transaction;
69
70 __buffer_unlink_first(jh);
71
72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh;
79 }
80 transaction->t_checkpoint_io_list = jh;
81}
82
83/*
84 * Try to release a checkpointed buffer from its transaction.
85 * Returns 1 if we released it and 2 if we also released the
86 * whole transaction.
87 *
88 * Requires j_list_lock
89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90 */
91static int __try_to_free_cp_buf(struct journal_head *jh)
92{
93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh);
95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh);
100 jbd2_journal_remove_journal_head(bh);
101 BUFFER_TRACE(bh, "release");
102 __brelse(bh);
103 } else {
104 jbd_unlock_bh_state(bh);
105 }
106 return ret;
107}
108
109/*
110 * __jbd2_log_wait_for_space: wait until there is space in the journal.
111 *
112 * Called under j_state_lock *only*.  It will be unlocked if we have to wait
113 * for a checkpoint to free up some space in the log.
114 */
115void __jbd2_log_wait_for_space(journal_t *journal)
116{
117 int nblocks;
118 assert_spin_locked(&journal->j_state_lock);
119
120 nblocks = jbd_space_needed(journal);
121 while (__jbd2_log_space_left(journal) < nblocks) {
122 if (journal->j_flags & JBD2_ABORT)
123 return;
124 spin_unlock(&journal->j_state_lock);
125 mutex_lock(&journal->j_checkpoint_mutex);
126
127 /*
128 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock
130 */
131 spin_lock(&journal->j_state_lock);
132 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) {
134 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal);
136 spin_lock(&journal->j_state_lock);
137 }
138 mutex_unlock(&journal->j_checkpoint_mutex);
139 }
140}
141
142/*
143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144 * The caller must restart a list walk. Wait for someone else to run
145 * jbd_unlock_bh_state().
146 */
147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock)
149{
150 get_bh(bh);
151 spin_unlock(&journal->j_list_lock);
152 jbd_lock_bh_state(bh);
153 jbd_unlock_bh_state(bh);
154 put_bh(bh);
155}
156
157/*
158 * Clean up transaction's list of buffers submitted for io.
159 * We wait for any pending IO to complete and remove any clean
160 * buffers. Note that we process the buffers in the reverse of the
161 * order in which they were submitted for IO.
162 *
163 * Called with j_list_lock held.
164 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{
167 struct journal_head *jh;
168 struct buffer_head *bh;
169 tid_t this_tid;
170 int released = 0;
171
172 this_tid = transaction->t_tid;
173restart:
174	/* Did somebody clean up the transaction in the meantime? */
175 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid)
177 return;
178 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh);
181 if (!jbd_trylock_bh_state(bh)) {
182 jbd_sync_bh(journal, bh);
183 spin_lock(&journal->j_list_lock);
184 goto restart;
185 }
186 if (buffer_locked(bh)) {
187 atomic_inc(&bh->b_count);
188 spin_unlock(&journal->j_list_lock);
189 jbd_unlock_bh_state(bh);
190 wait_on_buffer(bh);
191 /* the journal_head may have gone by now */
192 BUFFER_TRACE(bh, "brelse");
193 __brelse(bh);
194 spin_lock(&journal->j_list_lock);
195 goto restart;
196 }
197 /*
198 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list
200 */
201 released = __jbd2_journal_remove_checkpoint(jh);
202 jbd_unlock_bh_state(bh);
203 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh);
205 }
206}
207
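/* Maximum number of checkpoint buffers collected before __flush_batch() submits them in one go */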
208#define NR_BATCH 64
209
210static void
211__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
212{
213 int i;
214
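	/* SWRITE makes ll_rw_block() wait for each buffer lock instead of skipping busy buffers */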
215 ll_rw_block(SWRITE, *batch_count, bhs);
216 for (i = 0; i < *batch_count; i++) {
217 struct buffer_head *bh = bhs[i];
218 clear_buffer_jwrite(bh);
219 BUFFER_TRACE(bh, "brelse");
220 __brelse(bh);
221 }
222 *batch_count = 0;
223}
224
225/*
226 * Try to flush one buffer from the checkpoint list to disk.
227 *
228 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list.
230 *
231 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */
234static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count)
236{
237 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0;
239
240 if (buffer_locked(bh)) {
241 atomic_inc(&bh->b_count);
242 spin_unlock(&journal->j_list_lock);
243 jbd_unlock_bh_state(bh);
244 wait_on_buffer(bh);
245 /* the journal_head may have gone by now */
246 BUFFER_TRACE(bh, "brelse");
247 __brelse(bh);
248 ret = 1;
249 } else if (jh->b_transaction != NULL) {
250 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid;
252
253 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh);
255 jbd2_log_start_commit(journal, tid);
256 jbd2_log_wait_commit(journal, tid);
257 ret = 1;
258 } else if (!buffer_dirty(bh)) {
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint");
261 __jbd2_journal_remove_checkpoint(jh);
262 spin_unlock(&journal->j_list_lock);
263 jbd_unlock_bh_state(bh);
264 jbd2_journal_remove_journal_head(bh);
265 __brelse(bh);
266 ret = 1;
267 } else {
268 /*
269 * Important: we are about to write the buffer, and
270 * possibly block, while still holding the journal lock.
271 * We cannot afford to let the transaction logic start
272 * messing around with this buffer before we write it to
273 * disk, as that would break recoverability.
274 */
275 BUFFER_TRACE(bh, "queue");
276 get_bh(bh);
277 J_ASSERT_BH(bh, !buffer_jwrite(bh));
278 set_buffer_jwrite(bh);
279 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh);
282 (*batch_count)++;
283 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock);
285 __flush_batch(journal, bhs, batch_count);
286 ret = 1;
287 }
288 }
289 return ret;
290}
291
292/*
293 * Perform an actual checkpoint. We take the first transaction on the
294 * list of transactions to be checkpointed and send all its buffers
295 * to disk. We submit larger chunks of data at once.
296 *
297 * The journal should be locked before calling this function.
298 */
299int jbd2_log_do_checkpoint(journal_t *journal)
300{
301 transaction_t *transaction;
302 tid_t this_tid;
303 int result;
304
305 jbd_debug(1, "Start checkpoint\n");
306
307 /*
308 * First thing: if there are any transactions in the log which
309 * don't need checkpointing, just eliminate them from the
310 * journal straight away.
311 */
312 result = jbd2_cleanup_journal_tail(journal);
313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
314 if (result <= 0)
315 return result;
316
317 /*
318 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it.
320 */
321 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions)
323 goto out;
324 transaction = journal->j_checkpoint_transactions;
325 this_tid = transaction->t_tid;
326restart:
327 /*
328 * If someone cleaned up this transaction while we slept, we're
329 * done (maybe it's a new transaction, but it fell at the same
330 * address).
331 */
332 if (journal->j_checkpoint_transactions == transaction &&
333 transaction->t_tid == this_tid) {
334 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh;
337 int retry = 0;
338
339 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh;
341
342 jh = transaction->t_checkpoint_list;
343 bh = jh2bh(jh);
344 if (!jbd_trylock_bh_state(bh)) {
345 jbd_sync_bh(journal, bh);
346 retry = 1;
347 break;
348 }
349			retry = __process_buffer(journal, jh, bhs, &batch_count);
350			if (!retry && lock_need_resched(&journal->j_list_lock)) {
351 spin_unlock(&journal->j_list_lock);
352 retry = 1;
353 break;
354 }
355 }
356
357 if (batch_count) {
358 if (!retry) {
359 spin_unlock(&journal->j_list_lock);
360 retry = 1;
361 }
362 __flush_batch(journal, bhs, &batch_count);
363 }
364
365 if (retry) {
366 spin_lock(&journal->j_list_lock);
367 goto restart;
368 }
369 /*
370 * Now we have cleaned up the first transaction's checkpoint
371		 * list. Now clean up its checkpoint io list as well.
372 */
373 __wait_cp_io(journal, transaction);
374 }
375out:
376 spin_unlock(&journal->j_list_lock);
377 result = jbd2_cleanup_journal_tail(journal);
378 if (result < 0)
379 return result;
380 return 0;
381}
382
383/*
384 * Check the list of checkpoint transactions for the journal to see if
385 * we have already got rid of any since the last update of the log tail
386 * in the journal superblock. If so, we can instantly roll the
387 * superblock forward to remove those transactions from the log.
388 *
389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
390 *
391 * Called with the journal lock held.
392 *
393 * This is the only part of the journaling code which really needs to be
394 * aware of transaction aborts. Checkpointing involves writing to the
395 * main filesystem area rather than to the journal, so it can proceed
396 * even in abort state, but we must not update the journal superblock if
397 * we have an abort error outstanding.
398 */
399
400int jbd2_cleanup_journal_tail(journal_t *journal)
401{
402 transaction_t * transaction;
403 tid_t first_tid;
404 unsigned long blocknr, freed;
405
406 /* OK, work out the oldest transaction remaining in the log, and
407 * the log block it starts at.
408 *
409 * If the log is now empty, we need to work out which is the
410 * next transaction ID we will write, and where it will
411 * start. */
412
413 spin_lock(&journal->j_state_lock);
414 spin_lock(&journal->j_list_lock);
415 transaction = journal->j_checkpoint_transactions;
416 if (transaction) {
417 first_tid = transaction->t_tid;
418 blocknr = transaction->t_log_start;
419 } else if ((transaction = journal->j_committing_transaction) != NULL) {
420 first_tid = transaction->t_tid;
421 blocknr = transaction->t_log_start;
422 } else if ((transaction = journal->j_running_transaction) != NULL) {
423 first_tid = transaction->t_tid;
424 blocknr = journal->j_head;
425 } else {
426 first_tid = journal->j_transaction_sequence;
427 blocknr = journal->j_head;
428 }
429 spin_unlock(&journal->j_list_lock);
430 J_ASSERT(blocknr != 0);
431
432 /* If the oldest pinned transaction is at the tail of the log
433 already then there's not much we can do right now. */
434 if (journal->j_tail_sequence == first_tid) {
435 spin_unlock(&journal->j_state_lock);
436 return 1;
437 }
438
439 /* OK, update the superblock to recover the freed space.
440 * Physical blocks come first: have we wrapped beyond the end of
441 * the log? */
442 freed = blocknr - journal->j_tail;
443 if (blocknr < journal->j_tail)
444 freed = freed + journal->j_last - journal->j_first;
445
446 jbd_debug(1,
447 "Cleaning journal tail from %d to %d (offset %lu), "
448 "freeing %lu\n",
449 journal->j_tail_sequence, first_tid, blocknr, freed);
450
451 journal->j_free += freed;
452 journal->j_tail_sequence = first_tid;
453 journal->j_tail = blocknr;
454 spin_unlock(&journal->j_state_lock);
455 if (!(journal->j_flags & JBD2_ABORT))
456 jbd2_journal_update_superblock(journal, 1);
457 return 0;
458}
459
460
461/* Checkpoint list management */
462
463/*
464 * journal_clean_one_cp_list
465 *
466 * Find all the written-back checkpoint buffers in the given list and release them.
467 *
468 * Called with the journal locked.
469 * Called with j_list_lock held.
470 * Returns number of buffers reaped (for debug)
471 */
472
473static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
474{
475 struct journal_head *last_jh;
476 struct journal_head *next_jh = jh;
477 int ret, freed = 0;
478
479 *released = 0;
480 if (!jh)
481 return 0;
482
483 last_jh = jh->b_cpprev;
484 do {
485 jh = next_jh;
486 next_jh = jh->b_cpnext;
487 /* Use trylock because of the ranking */
488 if (jbd_trylock_bh_state(jh2bh(jh))) {
489 ret = __try_to_free_cp_buf(jh);
490 if (ret) {
491 freed++;
492 if (ret == 2) {
493 *released = 1;
494 return freed;
495 }
496 }
497 }
498 /*
499 * This function only frees up some memory
500		 * if possible so we don't have an obligation
501		 * to finish processing. Bail out if preemption
502		 * is requested:
503 */
504 if (need_resched())
505 return freed;
506 } while (jh != last_jh);
507
508 return freed;
509}
510
511/*
512 * journal_clean_checkpoint_list
513 *
514 * Find all the written-back checkpoint buffers in the journal and release them.
515 *
516 * Called with the journal locked.
517 * Called with j_list_lock held.
518 * Returns number of buffers reaped (for debug)
519 */
520
521int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
522{
523 transaction_t *transaction, *last_transaction, *next_transaction;
524 int ret = 0;
525 int released;
526
527 transaction = journal->j_checkpoint_transactions;
528 if (!transaction)
529 goto out;
530
531 last_transaction = transaction->t_cpprev;
532 next_transaction = transaction;
533 do {
534 transaction = next_transaction;
535 next_transaction = transaction->t_cpnext;
536 ret += journal_clean_one_cp_list(transaction->
537 t_checkpoint_list, &released);
538 /*
539 * This function only frees up some memory if possible so we
540		 * don't have an obligation to finish processing. Bail out if
541		 * preemption is requested:
542 */
543 if (need_resched())
544 goto out;
545 if (released)
546 continue;
547 /*
548 * It is essential that we are as careful as in the case of
549		 * t_checkpoint_list when removing the buffer from the list, as
550		 * we may see buffers on the io_list that have not yet been submitted
551 */
552 ret += journal_clean_one_cp_list(transaction->
553 t_checkpoint_io_list, &released);
554 if (need_resched())
555 goto out;
556 } while (transaction != last_transaction);
557out:
558 return ret;
559}
560
561/*
562 * journal_remove_checkpoint: called after a buffer has been committed
563 * to disk (either by being write-back flushed to disk, or being
564 * committed to the log).
565 *
566 * We cannot safely clean a transaction out of the log until all of the
567 * buffer updates committed in that transaction have safely been stored
568 * elsewhere on disk. To achieve this, all of the buffers in a
569 * transaction need to be maintained on the transaction's checkpoint
570 * lists until they have been rewritten, at which point this function is
571 * called to remove the buffer from the existing transaction's
572 * checkpoint lists.
573 *
574 * The function returns 1 if it frees the transaction, 0 otherwise.
575 *
576 * This function is called with the journal locked.
577 * This function is called with j_list_lock held.
578 * This function is called with jbd_lock_bh_state(jh2bh(jh))
579 */
580
581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
582{
583 transaction_t *transaction;
584 journal_t *journal;
585 int ret = 0;
586
587 JBUFFER_TRACE(jh, "entry");
588
589 if ((transaction = jh->b_cp_transaction) == NULL) {
590 JBUFFER_TRACE(jh, "not on transaction");
591 goto out;
592 }
593 journal = transaction->t_journal;
594
595 __buffer_unlink(jh);
596 jh->b_cp_transaction = NULL;
597
598 if (transaction->t_checkpoint_list != NULL ||
599 transaction->t_checkpoint_io_list != NULL)
600 goto out;
601 JBUFFER_TRACE(jh, "transaction has no more buffers");
602
603 /*
604 * There is one special case to worry about: if we have just pulled the
605 * buffer off a committing transaction's forget list, then even if the
606 * checkpoint list is empty, the transaction obviously cannot be
607 * dropped!
608 *
609 * The locking here around j_committing_transaction is a bit sleazy.
610 * See the comment at the end of jbd2_journal_commit_transaction().
611 */
612 if (transaction == journal->j_committing_transaction) {
613 JBUFFER_TRACE(jh, "belongs to committing transaction");
614 goto out;
615 }
616
617 /* OK, that was the last buffer for the transaction: we can now
618 safely remove this transaction from the log */
619
620 __jbd2_journal_drop_transaction(journal, transaction);
621
622 /* Just in case anybody was waiting for more transactions to be
623 checkpointed... */
624 wake_up(&journal->j_wait_logspace);
625 ret = 1;
626out:
627 JBUFFER_TRACE(jh, "exit");
628 return ret;
629}
630
631/*
632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
633 * list so that we know when it is safe to clean the transaction out of
634 * the log.
635 *
636 * Called with the journal locked.
637 * Called with j_list_lock held.
638 */
639void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
640 transaction_t *transaction)
641{
642 JBUFFER_TRACE(jh, "entry");
643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
645
646 jh->b_cp_transaction = transaction;
647
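	/* link jh into the transaction's circular checkpoint list and make it the new list head */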
648 if (!transaction->t_checkpoint_list) {
649 jh->b_cpnext = jh->b_cpprev = jh;
650 } else {
651 jh->b_cpnext = transaction->t_checkpoint_list;
652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
653 jh->b_cpprev->b_cpnext = jh;
654 jh->b_cpnext->b_cpprev = jh;
655 }
656 transaction->t_checkpoint_list = jh;
657}
658
659/*
660 * We've finished with this transaction structure: adios...
661 *
662 * The transaction must have no links except for the checkpoint by this
663 * point.
664 *
665 * Called with the journal locked.
666 * Called with j_list_lock held.
667 */
668
669void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
670{
671 assert_spin_locked(&journal->j_list_lock);
672 if (transaction->t_cpnext) {
673 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
674 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
675 if (journal->j_checkpoint_transactions == transaction)
676 journal->j_checkpoint_transactions =
677 transaction->t_cpnext;
678 if (journal->j_checkpoint_transactions == transaction)
679 journal->j_checkpoint_transactions = NULL;
680 }
681
682 J_ASSERT(transaction->t_state == T_FINISHED);
683 J_ASSERT(transaction->t_buffers == NULL);
684 J_ASSERT(transaction->t_sync_datalist == NULL);
685 J_ASSERT(transaction->t_forget == NULL);
686 J_ASSERT(transaction->t_iobuf_list == NULL);
687 J_ASSERT(transaction->t_shadow_list == NULL);
688 J_ASSERT(transaction->t_log_list == NULL);
689 J_ASSERT(transaction->t_checkpoint_list == NULL);
690 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
691 J_ASSERT(transaction->t_updates == 0);
692 J_ASSERT(journal->j_committing_transaction != transaction);
693 J_ASSERT(journal->j_running_transaction != transaction);
694
695 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
696 kfree(transaction);
697}
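
The three public entry points above cooperate. As a rough editorial sketch (the helper below is hypothetical and not part of this patch), a caller short on log space could combine them like this, with the same locking that __jbd2_log_wait_for_space() itself uses internally:

/*
 * Editorial sketch, not part of this patch: one plausible way to drive
 * the checkpoint entry points above to reclaim log space.
 */
static void example_reclaim_log_space(journal_t *journal)
{
	/* wait (dropping j_state_lock as needed) until the log has room */
	spin_lock(&journal->j_state_lock);
	__jbd2_log_wait_for_space(journal);
	spin_unlock(&journal->j_state_lock);

	/* flush the oldest checkpoint transaction's buffers to disk */
	mutex_lock(&journal->j_checkpoint_mutex);
	jbd2_log_do_checkpoint(journal);
	mutex_unlock(&journal->j_checkpoint_mutex);

	/* advance the on-disk tail past fully-checkpointed transactions
	 * (jbd2_log_do_checkpoint() also does this internally) */
	jbd2_cleanup_journal_tail(journal);
}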
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..70b2ae1ef281
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,920 @@
1/*
2 * linux/fs/jbd2/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd2.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
25/*
26 * Default IO end handler for temporary BJ_IO buffer_heads.
27 */
28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
29{
30 BUFFER_TRACE(bh, "");
31 if (uptodate)
32 set_buffer_uptodate(bh);
33 else
34 clear_buffer_uptodate(bh);
35 unlock_buffer(bh);
36}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not successfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
51 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
114 descriptor = jbd2_journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120	/* replicate the commit header into each 512-byte sector of the block */
121	for (i = 0; i < bh->b_size; i += 512) {
122		journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
123 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JBD2_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JBD2_BARRIER;
149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 jbd2_journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use-up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Cleanup any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __jbd2_journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
251 }
252 else {
253 BUFFER_TRACE(bh, "writeout complete: unfile");
254 __jbd2_journal_unfile_buffer(jh);
255 jbd_unlock_bh_state(bh);
256 if (locked)
257 unlock_buffer(bh);
258 jbd2_journal_remove_journal_head(bh);
259 /* Once for our safety reference, once for
260 * jbd2_journal_remove_journal_head() */
261 put_bh(bh);
262 put_bh(bh);
263 }
264
265 if (lock_need_resched(&journal->j_list_lock)) {
266 spin_unlock(&journal->j_list_lock);
267 goto write_out_data;
268 }
269 }
270 spin_unlock(&journal->j_list_lock);
271 journal_do_submit_data(wbuf, bufs);
272}
273
274static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
275 unsigned long long block)
276{
277 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
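	/* journals addressed by more than 32 bits store the upper half in t_blocknr_high */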
278 if (tag_bytes > JBD_TAG_SIZE32)
279 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
280}
281
282/*
283 * jbd2_journal_commit_transaction
284 *
285 * The primary function for committing a transaction to the log. This
286 * function is called by the journal thread to begin a complete commit.
287 */
288void jbd2_journal_commit_transaction(journal_t *journal)
289{
290 transaction_t *commit_transaction;
291 struct journal_head *jh, *new_jh, *descriptor;
292 struct buffer_head **wbuf = journal->j_wbuf;
293 int bufs;
294 int flags;
295 int err;
296 unsigned long long blocknr;
297 char *tagp = NULL;
298 journal_header_t *header;
299 journal_block_tag_t *tag = NULL;
300 int space_left = 0;
301 int first_tag = 0;
302 int tag_flag;
303 int i;
304 int tag_bytes = journal_tag_bytes(journal);
305
306 /*
307 * First job: lock down the current transaction and wait for
308 * all outstanding updates to complete.
309 */
310
311#ifdef COMMIT_STATS
312 spin_lock(&journal->j_list_lock);
313 summarise_journal_usage(journal);
314 spin_unlock(&journal->j_list_lock);
315#endif
316
317 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
318 if (journal->j_flags & JBD2_FLUSHED) {
319 jbd_debug(3, "super block updated\n");
320 jbd2_journal_update_superblock(journal, 1);
321 } else {
322 jbd_debug(3, "superblock not updated\n");
323 }
324
325 J_ASSERT(journal->j_running_transaction != NULL);
326 J_ASSERT(journal->j_committing_transaction == NULL);
327
328 commit_transaction = journal->j_running_transaction;
329 J_ASSERT(commit_transaction->t_state == T_RUNNING);
330
331 jbd_debug(1, "JBD: starting commit of transaction %d\n",
332 commit_transaction->t_tid);
333
334 spin_lock(&journal->j_state_lock);
335 commit_transaction->t_state = T_LOCKED;
336
337 spin_lock(&commit_transaction->t_handle_lock);
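	/* standard prepare_to_wait() loop: sleep until the last open handle on this transaction is dropped */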
338 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait);
340
341 prepare_to_wait(&journal->j_wait_updates, &wait,
342 TASK_UNINTERRUPTIBLE);
343 if (commit_transaction->t_updates) {
344 spin_unlock(&commit_transaction->t_handle_lock);
345 spin_unlock(&journal->j_state_lock);
346 schedule();
347 spin_lock(&journal->j_state_lock);
348 spin_lock(&commit_transaction->t_handle_lock);
349 }
350 finish_wait(&journal->j_wait_updates, &wait);
351 }
352 spin_unlock(&commit_transaction->t_handle_lock);
353
354 J_ASSERT (commit_transaction->t_outstanding_credits <=
355 journal->j_max_transaction_buffers);
356
357 /*
358 * First thing we are allowed to do is to discard any remaining
359 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
360 * that there are no such buffers: if a large filesystem
361 * operation like a truncate needs to split itself over multiple
362 * transactions, then it may try to do a jbd2_journal_restart() while
363 * there are still BJ_Reserved buffers outstanding. These must
364 * be released cleanly from the current transaction.
365 *
366 * In this case, the filesystem must still reserve write access
367 * again before modifying the buffer in the new transaction, but
368 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour
370 * that multiple jbd2_journal_get_write_access() calls to the same
371	 * buffer are perfectly permissible.
372 */
373 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list;
375 JBUFFER_TRACE(jh, "reserved, unused: refile");
376 /*
377 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
378 * leave undo-committed data.
379 */
380 if (jh->b_committed_data) {
381 struct buffer_head *bh = jh2bh(jh);
382
383 jbd_lock_bh_state(bh);
384 jbd2_slab_free(jh->b_committed_data, bh->b_size);
385 jh->b_committed_data = NULL;
386 jbd_unlock_bh_state(bh);
387 }
388 jbd2_journal_refile_buffer(journal, jh);
389 }
390
391 /*
392 * Now try to drop any written-back buffers from the journal's
393 * checkpoint lists. We do this *before* commit because it potentially
394 * frees some memory
395 */
396 spin_lock(&journal->j_list_lock);
397 __jbd2_journal_clean_checkpoint_list(journal);
398 spin_unlock(&journal->j_list_lock);
399
400 jbd_debug (3, "JBD: commit phase 1\n");
401
402 /*
403 * Switch to a new revoke table.
404 */
405 jbd2_journal_switch_revoke_table(journal);
406
407 commit_transaction->t_state = T_FLUSH;
408 journal->j_committing_transaction = commit_transaction;
409 journal->j_running_transaction = NULL;
410 commit_transaction->t_log_start = journal->j_head;
411 wake_up(&journal->j_wait_transaction_locked);
412 spin_unlock(&journal->j_state_lock);
413
414 jbd_debug (3, "JBD: commit phase 2\n");
415
416 /*
417 * First, drop modified flag: all accesses to the buffers
418	 * will be tracked for the new transaction only -bzzz
419 */
420 spin_lock(&journal->j_list_lock);
421 if (commit_transaction->t_buffers) {
422 new_jh = jh = commit_transaction->t_buffers->b_tnext;
423 do {
424 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
425 new_jh->b_modified == 0);
426 new_jh->b_modified = 0;
427 new_jh = new_jh->b_tnext;
428 } while (new_jh != jh);
429 }
430 spin_unlock(&journal->j_list_lock);
431
432 /*
433 * Now start flushing things to disk, in the order they appear
434 * on the transaction lists. Data blocks go first.
435 */
436 err = 0;
437 journal_submit_data_buffers(journal, commit_transaction);
438
439 /*
440 * Wait for all previously submitted IO to complete.
441 */
442 spin_lock(&journal->j_list_lock);
443 while (commit_transaction->t_locked_list) {
444 struct buffer_head *bh;
445
446 jh = commit_transaction->t_locked_list->b_tprev;
447 bh = jh2bh(jh);
448 get_bh(bh);
449 if (buffer_locked(bh)) {
450 spin_unlock(&journal->j_list_lock);
451 wait_on_buffer(bh);
452 if (unlikely(!buffer_uptodate(bh)))
453 err = -EIO;
454 spin_lock(&journal->j_list_lock);
455 }
456 if (!inverted_lock(journal, bh)) {
457 put_bh(bh);
458 spin_lock(&journal->j_list_lock);
459 continue;
460 }
461 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
462 __jbd2_journal_unfile_buffer(jh);
463 jbd_unlock_bh_state(bh);
464 jbd2_journal_remove_journal_head(bh);
465 put_bh(bh);
466 } else {
467 jbd_unlock_bh_state(bh);
468 }
469 put_bh(bh);
470 cond_resched_lock(&journal->j_list_lock);
471 }
472 spin_unlock(&journal->j_list_lock);
473
474 if (err)
475 __jbd2_journal_abort_hard(journal);
476
477 jbd2_journal_write_revoke_records(journal, commit_transaction);
478
479 jbd_debug(3, "JBD: commit phase 2\n");
480
481 /*
482 * If we found any dirty or locked buffers, then we should have
483 * looped back up to the write_out_data label. If there weren't
484 * any then journal_clean_data_list should have wiped the list
485 * clean by now, so check that it is in fact empty.
486 */
487 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
488
489 jbd_debug (3, "JBD: commit phase 3\n");
490
491 /*
492 * Way to go: we have now written out all of the data for a
493 * transaction! Now comes the tricky part: we need to write out
494 * metadata. Loop over the transaction's entire buffer list:
495 */
496 commit_transaction->t_state = T_COMMIT;
497
498 descriptor = NULL;
499 bufs = 0;
500 while (commit_transaction->t_buffers) {
501
502 /* Find the next buffer to be journaled... */
503
504 jh = commit_transaction->t_buffers;
505
506 /* If we're in abort mode, we just un-journal the buffer and
507 release it for background writing. */
508
509 if (is_journal_aborted(journal)) {
510 JBUFFER_TRACE(jh, "journal is aborting: refile");
511 jbd2_journal_refile_buffer(journal, jh);
512 /* If that was the last one, we need to clean up
513 * any descriptor buffers which may have been
514 * already allocated, even if we are now
515 * aborting. */
516 if (!commit_transaction->t_buffers)
517 goto start_journal_io;
518 continue;
519 }
520
521 /* Make sure we have a descriptor block in which to
522 record the metadata buffer. */
523
524 if (!descriptor) {
525 struct buffer_head *bh;
526
527 J_ASSERT (bufs == 0);
528
529 jbd_debug(4, "JBD: get descriptor\n");
530
531 descriptor = jbd2_journal_get_descriptor_buffer(journal);
532 if (!descriptor) {
533 __jbd2_journal_abort_hard(journal);
534 continue;
535 }
536
537 bh = jh2bh(descriptor);
538 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
539 (unsigned long long)bh->b_blocknr, bh->b_data);
540 header = (journal_header_t *)&bh->b_data[0];
541 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
542 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
543 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
544
545 tagp = &bh->b_data[sizeof(journal_header_t)];
546 space_left = bh->b_size - sizeof(journal_header_t);
547 first_tag = 1;
548 set_buffer_jwrite(bh);
549 set_buffer_dirty(bh);
550 wbuf[bufs++] = bh;
551
552 /* Record it so that we can wait for IO
553 completion later */
554 BUFFER_TRACE(bh, "ph3: file as descriptor");
555 jbd2_journal_file_buffer(descriptor, commit_transaction,
556 BJ_LogCtl);
557 }
558
559 /* Where is the buffer to be written? */
560
561 err = jbd2_journal_next_log_block(journal, &blocknr);
562 /* If the block mapping failed, just abandon the buffer
563 and repeat this loop: we'll fall into the
564 refile-on-abort condition above. */
565 if (err) {
566 __jbd2_journal_abort_hard(journal);
567 continue;
568 }
569
570 /*
571 * start_this_handle() uses t_outstanding_credits to determine
572 * the free space in the log, but this counter is changed
573 * by jbd2_journal_next_log_block() also.
574 */
575 commit_transaction->t_outstanding_credits--;
576
577 /* Bump b_count to prevent truncate from stumbling over
578 the shadowed buffer! @@@ This can go if we ever get
579 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
580 atomic_inc(&jh2bh(jh)->b_count);
581
582 /* Make a temporary IO buffer with which to write it out
583 (this will requeue both the metadata buffer and the
584 temporary IO buffer). new_bh goes on BJ_IO*/
585
586 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
587 /*
588 * akpm: jbd2_journal_write_metadata_buffer() sets
589 * new_bh->b_transaction to commit_transaction.
590 * We need to clean this up before we release new_bh
591 * (which is of type BJ_IO)
592 */
593 JBUFFER_TRACE(jh, "ph3: write metadata");
594 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
595 jh, &new_jh, blocknr);
596 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
597 wbuf[bufs++] = jh2bh(new_jh);
598
599 /* Record the new block's tag in the current descriptor
600 buffer */
601
602 tag_flag = 0;
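		/* bit 0 of the flags returned by jbd2_journal_write_metadata_buffer() marks an escaped block */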
603 if (flags & 1)
604 tag_flag |= JBD2_FLAG_ESCAPE;
605 if (!first_tag)
606 tag_flag |= JBD2_FLAG_SAME_UUID;
607
608 tag = (journal_block_tag_t *) tagp;
609 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
610 tag->t_flags = cpu_to_be32(tag_flag);
611 tagp += tag_bytes;
612 space_left -= tag_bytes;
613
614 if (first_tag) {
615 memcpy (tagp, journal->j_uuid, 16);
616 tagp += 16;
617 space_left -= 16;
618 first_tag = 0;
619 }
620
621 /* If there's no more to do, or if the descriptor is full,
622 let the IO rip! */
623
624 if (bufs == journal->j_wbufsize ||
625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) {
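			/* the extra 16 bytes reserve room for the UUID that follows a first tag */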
627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
629
630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to
632 the last tag we set up. */
633
634 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
635
636start_journal_io:
637 for (i = 0; i < bufs; i++) {
638 struct buffer_head *bh = wbuf[i];
639 lock_buffer(bh);
640 clear_buffer_dirty(bh);
641 set_buffer_uptodate(bh);
642 bh->b_end_io = journal_end_buffer_io_sync;
643 submit_bh(WRITE, bh);
644 }
645 cond_resched();
646
647 /* Force a new descriptor to be generated next
648 time round the loop. */
649 descriptor = NULL;
650 bufs = 0;
651 }
652 }
653
654 /* Lo and behold: we have just managed to send a transaction to
655 the log. Before we can commit it, wait for the IO so far to
656 complete. Control buffers being written are on the
657 transaction's t_log_list queue, and metadata buffers are on
658 the t_iobuf_list queue.
659
660 Wait for the buffers in reverse order. That way we are
661 less likely to be woken up until all IOs have completed, and
662 so we incur less scheduling load.
663 */
664
665 jbd_debug(3, "JBD: commit phase 4\n");
666
667 /*
668 * akpm: these are BJ_IO, and j_list_lock is not needed.
669 * See __journal_try_to_free_buffer.
670 */
671wait_for_iobuf:
672 while (commit_transaction->t_iobuf_list != NULL) {
673 struct buffer_head *bh;
674
675 jh = commit_transaction->t_iobuf_list->b_tprev;
676 bh = jh2bh(jh);
677 if (buffer_locked(bh)) {
678 wait_on_buffer(bh);
679 goto wait_for_iobuf;
680 }
681 if (cond_resched())
682 goto wait_for_iobuf;
683
684 if (unlikely(!buffer_uptodate(bh)))
685 err = -EIO;
686
687 clear_buffer_jwrite(bh);
688
689 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
690 jbd2_journal_unfile_buffer(journal, jh);
691
692 /*
693 * ->t_iobuf_list should contain only dummy buffer_heads
694 * which were created by jbd2_journal_write_metadata_buffer().
695 */
696 BUFFER_TRACE(bh, "dumping temporary bh");
697 jbd2_journal_put_journal_head(jh);
698 __brelse(bh);
699 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
700 free_buffer_head(bh);
701
702 /* We also have to unlock and free the corresponding
703 shadowed buffer */
704 jh = commit_transaction->t_shadow_list->b_tprev;
705 bh = jh2bh(jh);
706 clear_bit(BH_JWrite, &bh->b_state);
707 J_ASSERT_BH(bh, buffer_jbddirty(bh));
708
709 /* The metadata is now released for reuse, but we need
710 to remember it against this transaction so that when
711 we finally commit, we can do any checkpointing
712 required. */
713 JBUFFER_TRACE(jh, "file as BJ_Forget");
714 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
715 /* Wake up any transactions which were waiting for this
716 IO to complete */
717 wake_up_bit(&bh->b_state, BH_Unshadow);
718 JBUFFER_TRACE(jh, "brelse shadowed buffer");
719 __brelse(bh);
720 }
721
722 J_ASSERT (commit_transaction->t_shadow_list == NULL);
723
724 jbd_debug(3, "JBD: commit phase 5\n");
725
726 /* Here we wait for the revoke record and descriptor record buffers */
727 wait_for_ctlbuf:
728 while (commit_transaction->t_log_list != NULL) {
729 struct buffer_head *bh;
730
731 jh = commit_transaction->t_log_list->b_tprev;
732 bh = jh2bh(jh);
733 if (buffer_locked(bh)) {
734 wait_on_buffer(bh);
735 goto wait_for_ctlbuf;
736 }
737 if (cond_resched())
738 goto wait_for_ctlbuf;
739
740 if (unlikely(!buffer_uptodate(bh)))
741 err = -EIO;
742
743 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
744 clear_buffer_jwrite(bh);
745 jbd2_journal_unfile_buffer(journal, jh);
746 jbd2_journal_put_journal_head(jh);
747 __brelse(bh); /* One for getblk */
748 /* AKPM: bforget here */
749 }
750
751 jbd_debug(3, "JBD: commit phase 6\n");
752
753 if (journal_write_commit_record(journal, commit_transaction))
754 err = -EIO;
755
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758
759 /* End of a transaction! Finally, we can do checkpoint
760 processing: any buffers committed as a result of this
761 transaction can be removed from any checkpoint list it was on
762 before. */
763
764 jbd_debug(3, "JBD: commit phase 7\n");
765
766 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
767 J_ASSERT(commit_transaction->t_buffers == NULL);
768 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
769 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
770 J_ASSERT(commit_transaction->t_shadow_list == NULL);
771 J_ASSERT(commit_transaction->t_log_list == NULL);
772
773restart_loop:
774 /*
775 * As there are other places (journal_unmap_buffer()) adding buffers
776 * to this list we have to be careful and hold the j_list_lock.
777 */
778 spin_lock(&journal->j_list_lock);
779 while (commit_transaction->t_forget) {
780 transaction_t *cp_transaction;
781 struct buffer_head *bh;
782
783 jh = commit_transaction->t_forget;
784 spin_unlock(&journal->j_list_lock);
785 bh = jh2bh(jh);
786 jbd_lock_bh_state(bh);
787 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
788 jh->b_transaction == journal->j_running_transaction);
789
790 /*
791 * If there is undo-protected committed data against
792 * this buffer, then we can remove it now. If it is a
793 * buffer needing such protection, the old frozen_data
794 * field now points to a committed version of the
795 * buffer, so rotate that field to the new committed
796 * data.
797 *
798 * Otherwise, we can just throw away the frozen data now.
799 */
800 if (jh->b_committed_data) {
801 jbd2_slab_free(jh->b_committed_data, bh->b_size);
802 jh->b_committed_data = NULL;
803 if (jh->b_frozen_data) {
804 jh->b_committed_data = jh->b_frozen_data;
805 jh->b_frozen_data = NULL;
806 }
807 } else if (jh->b_frozen_data) {
808 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
809 jh->b_frozen_data = NULL;
810 }
811
812 spin_lock(&journal->j_list_lock);
813 cp_transaction = jh->b_cp_transaction;
814 if (cp_transaction) {
815 JBUFFER_TRACE(jh, "remove from old cp transaction");
816 __jbd2_journal_remove_checkpoint(jh);
817 }
818
819 /* Only re-checkpoint the buffer_head if it is marked
820 * dirty. If the buffer was added to the BJ_Forget list
821 * by jbd2_journal_forget, it may no longer be dirty and
822 * there's no point in keeping a checkpoint record for
823 * it. */
824
825 /* A buffer which has been freed while still being
826 * journaled by a previous transaction may end up still
827 * being dirty here, but we want to avoid writing back
828 * that buffer in the future now that the last use has
829 * been committed. That's not only a performance gain,
830 * it also stops aliasing problems if the buffer is left
831 * behind for writeback and gets reallocated for another
832 * use in a different page. */
833 if (buffer_freed(bh)) {
834 clear_buffer_freed(bh);
835 clear_buffer_jbddirty(bh);
836 }
837
838 if (buffer_jbddirty(bh)) {
839 JBUFFER_TRACE(jh, "add to new checkpointing trans");
840 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
841 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
842 __jbd2_journal_refile_buffer(jh);
843 jbd_unlock_bh_state(bh);
844 } else {
845 J_ASSERT_BH(bh, !buffer_dirty(bh));
846 /* The buffer on BJ_Forget list and not jbddirty means
847 * it has been freed by this transaction and hence it
848 * could not have been reallocated until this
849 * transaction has committed. *BUT* it could be
850 * reallocated once we have written all the data to
851 * disk and before we process the buffer on BJ_Forget
852 * list. */
853 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
854 __jbd2_journal_refile_buffer(jh);
855 if (!jh->b_transaction) {
856 jbd_unlock_bh_state(bh);
857 /* needs a brelse */
858 jbd2_journal_remove_journal_head(bh);
859 release_buffer_page(bh);
860 } else
861 jbd_unlock_bh_state(bh);
862 }
863 cond_resched_lock(&journal->j_list_lock);
864 }
865 spin_unlock(&journal->j_list_lock);
866 /*
867 * This is a bit sleazy. We borrow j_list_lock to protect
868 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
869 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
870 * it's a bit of a hassle to hold that across __jbd2_journal_remove_checkpoint
871 */
872 spin_lock(&journal->j_state_lock);
873 spin_lock(&journal->j_list_lock);
874 /*
875 * Now recheck if some buffers did not get attached to the transaction
876 * while the lock was dropped...
877 */
878 if (commit_transaction->t_forget) {
879 spin_unlock(&journal->j_list_lock);
880 spin_unlock(&journal->j_state_lock);
881 goto restart_loop;
882 }
883
884 /* Done with this transaction! */
885
886 jbd_debug(3, "JBD: commit phase 8\n");
887
888 J_ASSERT(commit_transaction->t_state == T_COMMIT);
889
890 commit_transaction->t_state = T_FINISHED;
891 J_ASSERT(commit_transaction == journal->j_committing_transaction);
892 journal->j_commit_sequence = commit_transaction->t_tid;
893 journal->j_committing_transaction = NULL;
894 spin_unlock(&journal->j_state_lock);
895
896 if (commit_transaction->t_checkpoint_list == NULL) {
897 __jbd2_journal_drop_transaction(journal, commit_transaction);
898 } else {
899 if (journal->j_checkpoint_transactions == NULL) {
900 journal->j_checkpoint_transactions = commit_transaction;
901 commit_transaction->t_cpnext = commit_transaction;
902 commit_transaction->t_cpprev = commit_transaction;
903 } else {
904 commit_transaction->t_cpnext =
905 journal->j_checkpoint_transactions;
906 commit_transaction->t_cpprev =
907 commit_transaction->t_cpnext->t_cpprev;
908 commit_transaction->t_cpnext->t_cpprev =
909 commit_transaction;
910 commit_transaction->t_cpprev->t_cpnext =
911 commit_transaction;
912 }
913 }
914 spin_unlock(&journal->j_list_lock);
915
916 jbd_debug(1, "JBD: commit %d complete, head %d\n",
917 journal->j_commit_sequence, journal->j_tail_sequence);
918
919 wake_up(&journal->j_wait_done_commit);
920}
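
Before the commit path continues in journal.c, it may help to see how the descriptor blocks assembled in phase 3 above decode on disk. The walker below is an editorial sketch only (recovery.c contains the real parser); it assumes the tag conventions established above: a 16-byte UUID follows any tag without JBD2_FLAG_SAME_UUID, and JBD2_FLAG_LAST_TAG ends the block.

/*
 * Editorial sketch, not part of this patch: decoding a descriptor
 * block written by the commit code above.
 */
static void example_walk_descriptor(journal_t *journal, char *data, int size)
{
	journal_header_t *hdr = (journal_header_t *)data;
	char *tagp = data + sizeof(journal_header_t);
	int tag_bytes = journal_tag_bytes(journal);

	if (be32_to_cpu(hdr->h_blocktype) != JBD2_DESCRIPTOR_BLOCK)
		return;		/* not a descriptor block */
	while (tagp + tag_bytes <= data + size) {
		journal_block_tag_t *tag = (journal_block_tag_t *)tagp;
		unsigned int flags = be32_to_cpu(tag->t_flags);

		tagp += tag_bytes;
		if (!(flags & JBD2_FLAG_SAME_UUID))
			tagp += 16;	/* this tag is followed by a UUID */
		if (flags & JBD2_FLAG_LAST_TAG)
			break;		/* end-of-descriptor marker */
	}
}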
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..c60f378b0f76
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2084 @@
1/*
2 * linux/fs/jbd2/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd2.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/mm.h>
34#include <linux/suspend.h>
35#include <linux/pagemap.h>
36#include <linux/kthread.h>
37#include <linux/poison.h>
38#include <linux/proc_fs.h>
39
40#include <asm/uaccess.h>
41#include <asm/page.h>
42
43EXPORT_SYMBOL(jbd2_journal_start);
44EXPORT_SYMBOL(jbd2_journal_restart);
45EXPORT_SYMBOL(jbd2_journal_extend);
46EXPORT_SYMBOL(jbd2_journal_stop);
47EXPORT_SYMBOL(jbd2_journal_lock_updates);
48EXPORT_SYMBOL(jbd2_journal_unlock_updates);
49EXPORT_SYMBOL(jbd2_journal_get_write_access);
50EXPORT_SYMBOL(jbd2_journal_get_create_access);
51EXPORT_SYMBOL(jbd2_journal_get_undo_access);
52EXPORT_SYMBOL(jbd2_journal_dirty_data);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget);
56#if 0
57EXPORT_SYMBOL(journal_sync_buffer);
58#endif
59EXPORT_SYMBOL(jbd2_journal_flush);
60EXPORT_SYMBOL(jbd2_journal_revoke);
61
62EXPORT_SYMBOL(jbd2_journal_init_dev);
63EXPORT_SYMBOL(jbd2_journal_init_inode);
64EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_update_superblock);
72EXPORT_SYMBOL(jbd2_journal_abort);
73EXPORT_SYMBOL(jbd2_journal_errno);
74EXPORT_SYMBOL(jbd2_journal_ack_err);
75EXPORT_SYMBOL(jbd2_journal_clear_err);
76EXPORT_SYMBOL(jbd2_log_wait_commit);
77EXPORT_SYMBOL(jbd2_journal_start_commit);
78EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
79EXPORT_SYMBOL(jbd2_journal_wipe);
80EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
81EXPORT_SYMBOL(jbd2_journal_invalidatepage);
82EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
83EXPORT_SYMBOL(jbd2_journal_force_commit);
84
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno);
87static int jbd2_journal_create_jbd_slab(size_t slab_size);
88
89/*
90 * Helper function used to manage commit timeouts
91 */
92
93static void commit_timeout(unsigned long __data)
94{
95 struct task_struct * p = (struct task_struct *) __data;
96
97 wake_up_process(p);
98}
99
100/*
101 * kjournald2: The main thread function used to manage a logging device
102 * journal.
103 *
104 * This kernel thread is responsible for two things:
105 *
106 * 1) COMMIT: Every so often we need to commit the current state of the
107 * filesystem to disk. The journal thread is responsible for writing
108 * all of the metadata buffers to disk.
109 *
110 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
111 * of the data in that part of the log has been rewritten elsewhere on
112 * the disk. Flushing these old buffers to reclaim space in the log is
113 * known as checkpointing, and this thread is responsible for that job.
114 */
115
116static int kjournald2(void *arg)
117{
118 journal_t *journal = arg;
119 transaction_t *transaction;
120
121 /*
122 * Set up an interval timer which can be used to trigger a commit wakeup
123 * after the commit interval expires
124 */
125 setup_timer(&journal->j_commit_timer, commit_timeout,
126 (unsigned long)current);
127
128 /* Record that the journal thread is running */
129 journal->j_task = current;
130 wake_up(&journal->j_wait_done_commit);
131
132 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n",
133 journal->j_commit_interval / HZ);
134
135 /*
136 * And now, wait forever for commit wakeup events.
137 */
138 spin_lock(&journal->j_state_lock);
139
140loop:
141 if (journal->j_flags & JBD2_UNMOUNT)
142 goto end_loop;
143
144 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
145 journal->j_commit_sequence, journal->j_commit_request);
146
147 if (journal->j_commit_sequence != journal->j_commit_request) {
148 jbd_debug(1, "OK, requests differ\n");
149 spin_unlock(&journal->j_state_lock);
150 del_timer_sync(&journal->j_commit_timer);
151 jbd2_journal_commit_transaction(journal);
152 spin_lock(&journal->j_state_lock);
153 goto loop;
154 }
155
156 wake_up(&journal->j_wait_done_commit);
157 if (freezing(current)) {
158 /*
159		 * The simpler the better. Flushing the journal isn't a
160		 * good idea, because that depends on threads that may
161		 * already be stopped.
162 */
163 jbd_debug(1, "Now suspending kjournald2\n");
164 spin_unlock(&journal->j_state_lock);
165 refrigerator();
166 spin_lock(&journal->j_state_lock);
167 } else {
168 /*
169 * We assume on resume that commits are already there,
170 * so we don't sleep
171 */
172 DEFINE_WAIT(wait);
173 int should_sleep = 1;
174
175 prepare_to_wait(&journal->j_wait_commit, &wait,
176 TASK_INTERRUPTIBLE);
177 if (journal->j_commit_sequence != journal->j_commit_request)
178 should_sleep = 0;
179 transaction = journal->j_running_transaction;
180 if (transaction && time_after_eq(jiffies,
181 transaction->t_expires))
182 should_sleep = 0;
183 if (journal->j_flags & JBD2_UNMOUNT)
184 should_sleep = 0;
185 if (should_sleep) {
186 spin_unlock(&journal->j_state_lock);
187 schedule();
188 spin_lock(&journal->j_state_lock);
189 }
190 finish_wait(&journal->j_wait_commit, &wait);
191 }
192
193 jbd_debug(1, "kjournald2 wakes\n");
194
195 /*
196 * Were we woken up by a commit wakeup event?
197 */
198 transaction = journal->j_running_transaction;
199 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
200 journal->j_commit_request = transaction->t_tid;
201 jbd_debug(1, "woke because of timeout\n");
202 }
203 goto loop;
204
205end_loop:
206 spin_unlock(&journal->j_state_lock);
207 del_timer_sync(&journal->j_commit_timer);
208 journal->j_task = NULL;
209 wake_up(&journal->j_wait_done_commit);
210 jbd_debug(1, "Journal thread exiting.\n");
211 return 0;
212}
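
/*
 * A sketch (illustrative, not part of this file's control flow) of the
 * other side of the handshake above: a caller records a commit request
 * under j_state_lock and then pokes j_wait_commit, which is exactly
 * what __jbd2_log_start_commit() below does:
 *
 *	spin_lock(&journal->j_state_lock);
 *	journal->j_commit_request = target;
 *	wake_up(&journal->j_wait_commit);
 *	spin_unlock(&journal->j_state_lock);
 *
 * kjournald2 then performs the commit and announces completion on
 * j_wait_done_commit, which jbd2_log_wait_commit() sleeps on.
 */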
213
214static void jbd2_journal_start_thread(journal_t *journal)
215{
216 kthread_run(kjournald2, journal, "kjournald2");
217	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
218}
219
220static void journal_kill_thread(journal_t *journal)
221{
222 spin_lock(&journal->j_state_lock);
223 journal->j_flags |= JBD2_UNMOUNT;
224
225 while (journal->j_task) {
226 wake_up(&journal->j_wait_commit);
227 spin_unlock(&journal->j_state_lock);
228		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
229 spin_lock(&journal->j_state_lock);
230 }
231 spin_unlock(&journal->j_state_lock);
232}
233
234/*
235 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
236 *
237 * Writes a metadata buffer to a given disk block. The actual IO is not
238 * performed but a new buffer_head is constructed which labels the data
239 * to be written with the correct destination disk block.
240 *
241 * Any magic-number escaping which needs to be done will cause a
242 * copy-out here. If the buffer happens to start with the
243 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
244 * magic number is only written to the log for descriptor blocks. In
245 * this case, we copy the data and replace the first word with 0, and we
246 * return a result code which indicates that this buffer needs to be
247 * marked as an escaped buffer in the corresponding log descriptor
248 * block. The missing word can then be restored when the block is read
249 * during recovery.
250 *
251 * If the source buffer has already been modified by a new transaction
252 * since we took the last commit snapshot, we use the frozen copy of
253 * that data for IO. If we end up using the existing buffer_head's data
254 * for the write, then we *have* to lock the buffer to prevent anyone
255 * else from using and possibly modifying it while the IO is in
256 * progress.
257 *
258 * The journal_head to be used for IO is passed back in the jh_out argument.
259 *
260 * We assume that the journal has already been locked in this function.
261 *
262 * Return value:
263 * <0: Error
264 * >=0: Finished OK
265 *
266 * On success:
267 * Bit 0 set == escape performed on the data
268 * Bit 1 set == buffer copy-out performed (free the data with jbd2_slab_free after IO)
269 */
270
271int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
272 struct journal_head *jh_in,
273 struct journal_head **jh_out,
274 unsigned long long blocknr)
275{
276 int need_copy_out = 0;
277 int done_copy_out = 0;
278 int do_escape = 0;
279 char *mapped_data;
280 struct buffer_head *new_bh;
281 struct journal_head *new_jh;
282 struct page *new_page;
283 unsigned int new_offset;
284 struct buffer_head *bh_in = jh2bh(jh_in);
285
286 /*
287 * The buffer really shouldn't be locked: only the current committing
288 * transaction is allowed to write it, so nobody else is allowed
289 * to do any IO.
290 *
291 * akpm: except if we're journalling data, and write() output is
292 * also part of a shared mapping, and another thread has
293 * decided to launch a writepage() against this buffer.
294 */
295 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
296
297 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
298
299 /*
300 * If a new transaction has already done a buffer copy-out, then
301 * we use that version of the data for the commit.
302 */
303 jbd_lock_bh_state(bh_in);
304repeat:
305 if (jh_in->b_frozen_data) {
306 done_copy_out = 1;
307 new_page = virt_to_page(jh_in->b_frozen_data);
308 new_offset = offset_in_page(jh_in->b_frozen_data);
309 } else {
310 new_page = jh2bh(jh_in)->b_page;
311 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
312 }
313
314 mapped_data = kmap_atomic(new_page, KM_USER0);
315 /*
316 * Check for escaping
317 */
318 if (*((__be32 *)(mapped_data + new_offset)) ==
319 cpu_to_be32(JBD2_MAGIC_NUMBER)) {
320 need_copy_out = 1;
321 do_escape = 1;
322 }
323 kunmap_atomic(mapped_data, KM_USER0);
324
325 /*
326 * Do we need to do a data copy?
327 */
328 if (need_copy_out && !done_copy_out) {
329 char *tmp;
330
331 jbd_unlock_bh_state(bh_in);
332 tmp = jbd2_slab_alloc(bh_in->b_size, GFP_NOFS);
333 jbd_lock_bh_state(bh_in);
334 if (jh_in->b_frozen_data) {
335 jbd2_slab_free(tmp, bh_in->b_size);
336 goto repeat;
337 }
338
339 jh_in->b_frozen_data = tmp;
340 mapped_data = kmap_atomic(new_page, KM_USER0);
341 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
342 kunmap_atomic(mapped_data, KM_USER0);
343
344 new_page = virt_to_page(tmp);
345 new_offset = offset_in_page(tmp);
346 done_copy_out = 1;
347 }
348
349 /*
350	 * Did we need to escape the block? Now we've done all the
351 * copying, we can finally do so.
352 */
353 if (do_escape) {
354 mapped_data = kmap_atomic(new_page, KM_USER0);
355 *((unsigned int *)(mapped_data + new_offset)) = 0;
356 kunmap_atomic(mapped_data, KM_USER0);
357 }
358
359 /* keep subsequent assertions sane */
360 new_bh->b_state = 0;
361 init_buffer(new_bh, NULL, NULL);
362 atomic_set(&new_bh->b_count, 1);
363 jbd_unlock_bh_state(bh_in);
364
365 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
366
367 set_bh_page(new_bh, new_page, new_offset);
368 new_jh->b_transaction = NULL;
369 new_bh->b_size = jh2bh(jh_in)->b_size;
370 new_bh->b_bdev = transaction->t_journal->j_dev;
371 new_bh->b_blocknr = blocknr;
372 set_buffer_mapped(new_bh);
373 set_buffer_dirty(new_bh);
374
375 *jh_out = new_jh;
376
377 /*
378 * The to-be-written buffer needs to get moved to the io queue,
379 * and the original buffer whose contents we are shadowing or
380 * copying is moved to the transaction's shadow queue.
381 */
382 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
383 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
384 JBUFFER_TRACE(new_jh, "file as BJ_IO");
385 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
386
387 return do_escape | (done_copy_out << 1);
388}
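
/*
 * A sketch of how the commit code consumes the return value (variable
 * names here are illustrative; see fs/jbd2/commit.c for the real
 * caller):
 *
 *	flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 *					jh, &new_jh, blocknr);
 *	if (flags < 0)
 *		...abort the journal...
 *	if (flags & 1)
 *		...record the escape in the descriptor block's tag...
 *
 * Bit 1 (done_copy_out) means frozen data was allocated and must be
 * freed once the jh is released.
 */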
389
390/*
391 * Allocation code for the journal file. Manage the space left in the
392 * journal, so that we can begin checkpointing when appropriate.
393 */
394
395/*
396 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
397 *
398 * Called with the journal already locked.
399 *
400 * Called under j_state_lock
401 */
402
403int __jbd2_log_space_left(journal_t *journal)
404{
405 int left = journal->j_free;
406
407 assert_spin_locked(&journal->j_state_lock);
408
409 /*
410 * Be pessimistic here about the number of those free blocks which
411 * might be required for log descriptor control blocks.
412 */
413
414#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
415
416 left -= MIN_LOG_RESERVED_BLOCKS;
417
418 if (left <= 0)
419 return 0;
420 left -= (left >> 3);
421 return left;
422}
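
/*
 * Worked example with illustrative numbers: if j_free is 1000, we
 * first set aside MIN_LOG_RESERVED_BLOCKS, leaving 968, then knock
 * off a further eighth (968 >> 3 == 121), reporting 847 free blocks.
 */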
423
424/*
425 * Called under j_state_lock. Returns true if a transaction was started.
426 */
427int __jbd2_log_start_commit(journal_t *journal, tid_t target)
428{
429 /*
430 * Are we already doing a recent enough commit?
431 */
432 if (!tid_geq(journal->j_commit_request, target)) {
433 /*
434		 * We want a new commit: OK, mark the request and wake up the
435 * commit thread. We do _not_ do the commit ourselves.
436 */
437
438 journal->j_commit_request = target;
439 jbd_debug(1, "JBD: requesting commit %d/%d\n",
440 journal->j_commit_request,
441 journal->j_commit_sequence);
442 wake_up(&journal->j_wait_commit);
443 return 1;
444 }
445 return 0;
446}
447
448int jbd2_log_start_commit(journal_t *journal, tid_t tid)
449{
450 int ret;
451
452 spin_lock(&journal->j_state_lock);
453 ret = __jbd2_log_start_commit(journal, tid);
454 spin_unlock(&journal->j_state_lock);
455 return ret;
456}
457
458/*
459 * Force and wait upon a commit if the calling process is not within
460 * a transaction. This is used for forcing out undo-protected data which contains
461 * bitmaps, when the fs is running out of space.
462 *
463 * We can only force the running transaction if we don't have an active handle;
464 * otherwise, we will deadlock.
465 *
466 * Returns true if a transaction was started.
467 */
468int jbd2_journal_force_commit_nested(journal_t *journal)
469{
470 transaction_t *transaction = NULL;
471 tid_t tid;
472
473 spin_lock(&journal->j_state_lock);
474 if (journal->j_running_transaction && !current->journal_info) {
475 transaction = journal->j_running_transaction;
476 __jbd2_log_start_commit(journal, transaction->t_tid);
477 } else if (journal->j_committing_transaction)
478 transaction = journal->j_committing_transaction;
479
480 if (!transaction) {
481 spin_unlock(&journal->j_state_lock);
482 return 0; /* Nothing to retry */
483 }
484
485 tid = transaction->t_tid;
486 spin_unlock(&journal->j_state_lock);
487 jbd2_log_wait_commit(journal, tid);
488 return 1;
489}
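
/*
 * A hypothetical caller sketch (not lifted from any real filesystem):
 * an allocator that has run out of space and holds no handle of its
 * own can retry until committing buys nothing more, e.g.
 *
 *	while (my_allocation_fails(sb)) {
 *		if (!jbd2_journal_force_commit_nested(journal))
 *			break;	(nothing to commit: truly full)
 *	}
 */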
490
491/*
492 * Start a commit of the current running transaction (if any). Returns true
493 * if a transaction was started; its tid is stored at *ptid.
494 */
495int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
496{
497 int ret = 0;
498
499 spin_lock(&journal->j_state_lock);
500 if (journal->j_running_transaction) {
501 tid_t tid = journal->j_running_transaction->t_tid;
502
503 ret = __jbd2_log_start_commit(journal, tid);
504 if (ret && ptid)
505 *ptid = tid;
506 } else if (journal->j_committing_transaction && ptid) {
507 /*
508 * If ext3_write_super() recently started a commit, then we
509 * have to wait for completion of that transaction
510 */
511 *ptid = journal->j_committing_transaction->t_tid;
512 ret = 1;
513 }
514 spin_unlock(&journal->j_state_lock);
515 return ret;
516}
517
518/*
519 * Wait for a specified commit to complete.
520 * The caller may not hold the journal lock.
521 */
522int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
523{
524 int err = 0;
525
526#ifdef CONFIG_JBD_DEBUG
527 spin_lock(&journal->j_state_lock);
528 if (!tid_geq(journal->j_commit_request, tid)) {
529 printk(KERN_EMERG
530 "%s: error: j_commit_request=%d, tid=%d\n",
531 __FUNCTION__, journal->j_commit_request, tid);
532 }
533 spin_unlock(&journal->j_state_lock);
534#endif
535 spin_lock(&journal->j_state_lock);
536 while (tid_gt(tid, journal->j_commit_sequence)) {
537 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
538 tid, journal->j_commit_sequence);
539 wake_up(&journal->j_wait_commit);
540 spin_unlock(&journal->j_state_lock);
541 wait_event(journal->j_wait_done_commit,
542 !tid_gt(tid, journal->j_commit_sequence));
543 spin_lock(&journal->j_state_lock);
544 }
545 spin_unlock(&journal->j_state_lock);
546
547 if (unlikely(is_journal_aborted(journal))) {
548 printk(KERN_EMERG "journal commit I/O error\n");
549 err = -EIO;
550 }
551 return err;
552}
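
/*
 * The two calls above are usually paired; a minimal caller sketch
 * (hypothetical code, not from this commit):
 *
 *	tid_t tid;
 *
 *	if (jbd2_journal_start_commit(journal, &tid))
 *		err = jbd2_log_wait_commit(journal, tid);
 */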
553
554/*
555 * Log buffer allocation routines:
556 */
557
558int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
559{
560 unsigned long blocknr;
561
562 spin_lock(&journal->j_state_lock);
563 J_ASSERT(journal->j_free > 1);
564
565 blocknr = journal->j_head;
566 journal->j_head++;
567 journal->j_free--;
568 if (journal->j_head == journal->j_last)
569 journal->j_head = journal->j_first;
570 spin_unlock(&journal->j_state_lock);
571 return jbd2_journal_bmap(journal, blocknr, retp);
572}
573
574/*
575 * Conversion of logical to physical block numbers for the journal
576 *
577 * On external journals the journal blocks are identity-mapped, so
578 * this is a no-op. If needed, we can use j_blk_offset - everything is
579 * ready.
580 */
581int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
582 unsigned long long *retp)
583{
584 int err = 0;
585 unsigned long long ret;
586
587 if (journal->j_inode) {
588 ret = bmap(journal->j_inode, blocknr);
589 if (ret)
590 *retp = ret;
591 else {
592 char b[BDEVNAME_SIZE];
593
594 printk(KERN_ALERT "%s: journal block not found "
595 "at offset %lu on %s\n",
596 __FUNCTION__,
597 blocknr,
598 bdevname(journal->j_dev, b));
599 err = -EIO;
600 __journal_abort_soft(journal, err);
601 }
602 } else {
603 *retp = blocknr; /* +journal->j_blk_offset */
604 }
605 return err;
606}
607
608/*
609 * We play buffer_head aliasing tricks to write data/metadata blocks to
610 * the journal without copying their contents, but for journal
611 * descriptor blocks we do need to generate bona fide buffers.
612 *
613 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
614 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
615 * But we don't bother doing that, so there will be coherency problems with
616 * mmaps of blockdevs which hold live JBD-controlled filesystems.
617 */
618struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
619{
620 struct buffer_head *bh;
621 unsigned long long blocknr;
622 int err;
623
624 err = jbd2_journal_next_log_block(journal, &blocknr);
625
626 if (err)
627 return NULL;
628
629 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
630 lock_buffer(bh);
631 memset(bh->b_data, 0, journal->j_blocksize);
632 set_buffer_uptodate(bh);
633 unlock_buffer(bh);
634 BUFFER_TRACE(bh, "return this buffer");
635 return jbd2_journal_add_journal_head(bh);
636}
637
638/*
639 * Management for journal control blocks: functions to create and
640 * destroy journal_t structures, and to initialise and read existing
641 * journal blocks from disk. */
642
643/* First: create and setup a journal_t object in memory. We initialise
644 * very few fields yet: that has to wait until we have created the
645 * journal structures from scratch, or loaded them from disk. */
646
647static journal_t * journal_init_common (void)
648{
649 journal_t *journal;
650 int err;
651
652 journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
653 if (!journal)
654 goto fail;
655 memset(journal, 0, sizeof(*journal));
656
657 init_waitqueue_head(&journal->j_wait_transaction_locked);
658 init_waitqueue_head(&journal->j_wait_logspace);
659 init_waitqueue_head(&journal->j_wait_done_commit);
660 init_waitqueue_head(&journal->j_wait_checkpoint);
661 init_waitqueue_head(&journal->j_wait_commit);
662 init_waitqueue_head(&journal->j_wait_updates);
663 mutex_init(&journal->j_barrier);
664 mutex_init(&journal->j_checkpoint_mutex);
665 spin_lock_init(&journal->j_revoke_lock);
666 spin_lock_init(&journal->j_list_lock);
667 spin_lock_init(&journal->j_state_lock);
668
669 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
670
671 /* The journal is marked for error until we succeed with recovery! */
672 journal->j_flags = JBD2_ABORT;
673
674 /* Set up a default-sized revoke table for the new mount. */
675 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
676 if (err) {
677 kfree(journal);
678 goto fail;
679 }
680 return journal;
681fail:
682 return NULL;
683}
684
685/* jbd2_journal_init_dev and jbd2_journal_init_inode:
686 *
687 * Create a journal structure assigned some fixed set of disk blocks to
688 * the journal. We don't actually touch those disk blocks yet, but we
689 * need to set up all of the mapping information to tell the journaling
690 * system where the journal blocks are.
691 *
692 */
693
694/**
695 * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
696 * @bdev: Block device on which to create the journal
697 * @fs_dev: Device which holds the journalled filesystem for this journal.
698 * @start: Block nr at which the journal starts.
699 * @len: Length of the journal in blocks.
700 * @blocksize: blocksize of journalling device
701 * @returns: a newly created journal_t *
702 *
703 * jbd2_journal_init_dev creates a journal which maps a fixed contiguous
704 * range of blocks on an arbitrary block device.
705 *
706 */
707journal_t * jbd2_journal_init_dev(struct block_device *bdev,
708 struct block_device *fs_dev,
709 unsigned long long start, int len, int blocksize)
710{
711 journal_t *journal = journal_init_common();
712 struct buffer_head *bh;
713 int n;
714
715 if (!journal)
716 return NULL;
717
718 /* journal descriptor can store up to n blocks -bzzz */
719 journal->j_blocksize = blocksize;
720 n = journal->j_blocksize / sizeof(journal_block_tag_t);
721 journal->j_wbufsize = n;
722 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
723 if (!journal->j_wbuf) {
724		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
725 __FUNCTION__);
726 kfree(journal);
727 journal = NULL;
728 goto out;
729 }
730 journal->j_dev = bdev;
731 journal->j_fs_dev = fs_dev;
732 journal->j_blk_offset = start;
733 journal->j_maxlen = len;
734
735 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
736 J_ASSERT(bh != NULL);
737 journal->j_sb_buffer = bh;
738 journal->j_superblock = (journal_superblock_t *)bh->b_data;
739out:
740 return journal;
741}
742
743/**
744 * journal_t * jbd2_journal_init_inode () - creates a journal which maps to an inode.
745 * @inode: An inode to create the journal in
746 *
747 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
748 * the journal. The inode must exist already, must support bmap() and
749 * must have all data blocks preallocated.
750 */
751journal_t * jbd2_journal_init_inode (struct inode *inode)
752{
753 struct buffer_head *bh;
754 journal_t *journal = journal_init_common();
755 int err;
756 int n;
757 unsigned long long blocknr;
758
759 if (!journal)
760 return NULL;
761
762 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
763 journal->j_inode = inode;
764 jbd_debug(1,
765 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
766 journal, inode->i_sb->s_id, inode->i_ino,
767 (long long) inode->i_size,
768 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
769
770 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
771 journal->j_blocksize = inode->i_sb->s_blocksize;
772
773 /* journal descriptor can store up to n blocks -bzzz */
774 n = journal->j_blocksize / sizeof(journal_block_tag_t);
775 journal->j_wbufsize = n;
776 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
777 if (!journal->j_wbuf) {
778		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
779 __FUNCTION__);
780 kfree(journal);
781 return NULL;
782 }
783
784 err = jbd2_journal_bmap(journal, 0, &blocknr);
785 /* If that failed, give up */
786 if (err) {
787		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
788 __FUNCTION__);
789 kfree(journal);
790 return NULL;
791 }
792
793 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
794 J_ASSERT(bh != NULL);
795 journal->j_sb_buffer = bh;
796 journal->j_superblock = (journal_superblock_t *)bh->b_data;
797
798 return journal;
799}
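
/*
 * A sketch of the expected lifecycle for an in-inode journal (error
 * handling omitted; obtaining journal_inode is the caller's problem):
 *
 *	journal = jbd2_journal_init_inode(journal_inode);
 *	if (journal && !jbd2_journal_load(journal))
 *		...journal recovered and ready for use...
 *	...
 *	jbd2_journal_destroy(journal);	(at unmount)
 */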
800
801/*
802 * If the journal init or create aborts, we need to mark the journal
803 * superblock as being NULL to prevent the journal destroy from writing
804 * back a bogus superblock.
805 */
806static void journal_fail_superblock (journal_t *journal)
807{
808 struct buffer_head *bh = journal->j_sb_buffer;
809 brelse(bh);
810 journal->j_sb_buffer = NULL;
811}
812
813/*
814 * Given a journal_t structure, initialise the various fields for
815 * startup of a new journaling session. We use this both when creating
816 * a journal, and after recovering an old journal to reset it for
817 * subsequent use.
818 */
819
820static int journal_reset(journal_t *journal)
821{
822 journal_superblock_t *sb = journal->j_superblock;
823 unsigned long long first, last;
824
825 first = be32_to_cpu(sb->s_first);
826 last = be32_to_cpu(sb->s_maxlen);
827
828 journal->j_first = first;
829 journal->j_last = last;
830
831 journal->j_head = first;
832 journal->j_tail = first;
833 journal->j_free = last - first;
834
835 journal->j_tail_sequence = journal->j_transaction_sequence;
836 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
837 journal->j_commit_request = journal->j_commit_sequence;
838
839 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
840
841 /* Add the dynamic fields and write it to disk. */
842 jbd2_journal_update_superblock(journal, 1);
843 jbd2_journal_start_thread(journal);
844 return 0;
845}
846
847/**
848 * int jbd2_journal_create() - Initialise the new journal file
849 * @journal: Journal to create. This structure must have been initialised
850 *
851 * Given a journal_t structure which tells us which disk blocks we can
852 * use, create a new journal superblock and initialise all of the
853 * journal fields from scratch.
854 **/
855int jbd2_journal_create(journal_t *journal)
856{
857 unsigned long long blocknr;
858 struct buffer_head *bh;
859 journal_superblock_t *sb;
860 int i, err;
861
862 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
863 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
864 journal->j_maxlen);
865 journal_fail_superblock(journal);
866 return -EINVAL;
867 }
868
869 if (journal->j_inode == NULL) {
870 /*
871 * We don't know what block to start at!
872 */
873 printk(KERN_EMERG
874 "%s: creation of journal on external device!\n",
875 __FUNCTION__);
876 BUG();
877 }
878
879 /* Zero out the entire journal on disk. We cannot afford to
880 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
881 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
882 for (i = 0; i < journal->j_maxlen; i++) {
883 err = jbd2_journal_bmap(journal, i, &blocknr);
884 if (err)
885 return err;
886 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
887 lock_buffer(bh);
888 memset (bh->b_data, 0, journal->j_blocksize);
889 BUFFER_TRACE(bh, "marking dirty");
890 mark_buffer_dirty(bh);
891 BUFFER_TRACE(bh, "marking uptodate");
892 set_buffer_uptodate(bh);
893 unlock_buffer(bh);
894 __brelse(bh);
895 }
896
897 sync_blockdev(journal->j_dev);
898 jbd_debug(1, "JBD: journal cleared.\n");
899
900 /* OK, fill in the initial static fields in the new superblock */
901 sb = journal->j_superblock;
902
903 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
904 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
905
906 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
907 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
908 sb->s_first = cpu_to_be32(1);
909
910 journal->j_transaction_sequence = 1;
911
912 journal->j_flags &= ~JBD2_ABORT;
913 journal->j_format_version = 2;
914
915 return journal_reset(journal);
916}
917
918/**
919 * void jbd2_journal_update_superblock() - Update journal sb on disk.
920 * @journal: The journal to update.
921 * @wait: Set to '0' if you don't want to wait for IO completion.
922 *
923 * Update a journal's dynamic superblock fields and write it to disk,
924 * optionally waiting for the IO to complete.
925 */
926void jbd2_journal_update_superblock(journal_t *journal, int wait)
927{
928 journal_superblock_t *sb = journal->j_superblock;
929 struct buffer_head *bh = journal->j_sb_buffer;
930
931 /*
932 * As a special case, if the on-disk copy is already marked as needing
933 * no recovery (s_start == 0) and there are no outstanding transactions
934 * in the filesystem, then we can safely defer the superblock update
935 * until the next commit by setting JBD2_FLUSHED. This avoids
936	 * attempting a write to a potentially read-only device.
937 */
938 if (sb->s_start == 0 && journal->j_tail_sequence ==
939 journal->j_transaction_sequence) {
940 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
941 "(start %ld, seq %d, errno %d)\n",
942 journal->j_tail, journal->j_tail_sequence,
943 journal->j_errno);
944 goto out;
945 }
946
947 spin_lock(&journal->j_state_lock);
948 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
949 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
950
951 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
952 sb->s_start = cpu_to_be32(journal->j_tail);
953 sb->s_errno = cpu_to_be32(journal->j_errno);
954 spin_unlock(&journal->j_state_lock);
955
956 BUFFER_TRACE(bh, "marking dirty");
957 mark_buffer_dirty(bh);
958 if (wait)
959 sync_dirty_buffer(bh);
960 else
961 ll_rw_block(SWRITE, 1, &bh);
962
963out:
964 /* If we have just flushed the log (by marking s_start==0), then
965 * any future commit will have to be careful to update the
966 * superblock again to re-record the true start of the log. */
967
968 spin_lock(&journal->j_state_lock);
969 if (sb->s_start)
970 journal->j_flags &= ~JBD2_FLUSHED;
971 else
972 journal->j_flags |= JBD2_FLUSHED;
973 spin_unlock(&journal->j_state_lock);
974}
975
976/*
977 * Read the superblock for a given journal, performing initial
978 * validation of the format.
979 */
980
981static int journal_get_superblock(journal_t *journal)
982{
983 struct buffer_head *bh;
984 journal_superblock_t *sb;
985 int err = -EIO;
986
987 bh = journal->j_sb_buffer;
988
989 J_ASSERT(bh != NULL);
990 if (!buffer_uptodate(bh)) {
991 ll_rw_block(READ, 1, &bh);
992 wait_on_buffer(bh);
993 if (!buffer_uptodate(bh)) {
994 printk (KERN_ERR
995 "JBD: IO error reading journal superblock\n");
996 goto out;
997 }
998 }
999
1000 sb = journal->j_superblock;
1001
1002 err = -EINVAL;
1003
1004 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1005 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1006 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1007 goto out;
1008 }
1009
1010 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1011 case JBD2_SUPERBLOCK_V1:
1012 journal->j_format_version = 1;
1013 break;
1014 case JBD2_SUPERBLOCK_V2:
1015 journal->j_format_version = 2;
1016 break;
1017 default:
1018 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1019 goto out;
1020 }
1021
1022 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1023 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1024 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1025 printk (KERN_WARNING "JBD: journal file too short\n");
1026 goto out;
1027 }
1028
1029 return 0;
1030
1031out:
1032 journal_fail_superblock(journal);
1033 return err;
1034}
1035
1036/*
1037 * Load the on-disk journal superblock and read the key fields into the
1038 * journal_t.
1039 */
1040
1041static int load_superblock(journal_t *journal)
1042{
1043 int err;
1044 journal_superblock_t *sb;
1045
1046 err = journal_get_superblock(journal);
1047 if (err)
1048 return err;
1049
1050 sb = journal->j_superblock;
1051
1052 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1053 journal->j_tail = be32_to_cpu(sb->s_start);
1054 journal->j_first = be32_to_cpu(sb->s_first);
1055 journal->j_last = be32_to_cpu(sb->s_maxlen);
1056 journal->j_errno = be32_to_cpu(sb->s_errno);
1057
1058 return 0;
1059}
1060
1061
1062/**
1063 * int jbd2_journal_load() - Read journal from disk.
1064 * @journal: Journal to act on.
1065 *
1066 * Given a journal_t structure which tells us which disk blocks contain
1067 * a journal, read the journal from disk to initialise the in-memory
1068 * structures.
1069 */
1070int jbd2_journal_load(journal_t *journal)
1071{
1072 int err;
1073 journal_superblock_t *sb;
1074
1075 err = load_superblock(journal);
1076 if (err)
1077 return err;
1078
1079 sb = journal->j_superblock;
1080 /* If this is a V2 superblock, then we have to check the
1081 * features flags on it. */
1082
1083 if (journal->j_format_version >= 2) {
1084 if ((sb->s_feature_ro_compat &
1085 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1086 (sb->s_feature_incompat &
1087 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1088 printk (KERN_WARNING
1089 "JBD: Unrecognised features on journal\n");
1090 return -EINVAL;
1091 }
1092 }
1093
1094 /*
1095 * Create a slab for this blocksize
1096 */
1097 err = jbd2_journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
1098 if (err)
1099 return err;
1100
1101 /* Let the recovery code check whether it needs to recover any
1102 * data from the journal. */
1103 if (jbd2_journal_recover(journal))
1104 goto recovery_error;
1105
1106 /* OK, we've finished with the dynamic journal bits:
1107 * reinitialise the dynamic contents of the superblock in memory
1108 * and reset them on disk. */
1109 if (journal_reset(journal))
1110 goto recovery_error;
1111
1112 journal->j_flags &= ~JBD2_ABORT;
1113 journal->j_flags |= JBD2_LOADED;
1114 return 0;
1115
1116recovery_error:
1117 printk (KERN_WARNING "JBD: recovery failed\n");
1118 return -EIO;
1119}
1120
1121/**
1122 * void jbd2_journal_destroy() - Release a journal_t structure.
1123 * @journal: Journal to act on.
1124 *
1125 * Release a journal_t structure once it is no longer in use by the
1126 * journaled object.
1127 */
1128void jbd2_journal_destroy(journal_t *journal)
1129{
1130 /* Wait for the commit thread to wake up and die. */
1131 journal_kill_thread(journal);
1132
1133 /* Force a final log commit */
1134 if (journal->j_running_transaction)
1135 jbd2_journal_commit_transaction(journal);
1136
1137 /* Force any old transactions to disk */
1138
1139 /* Totally anal locking here... */
1140 spin_lock(&journal->j_list_lock);
1141 while (journal->j_checkpoint_transactions != NULL) {
1142 spin_unlock(&journal->j_list_lock);
1143 jbd2_log_do_checkpoint(journal);
1144 spin_lock(&journal->j_list_lock);
1145 }
1146
1147 J_ASSERT(journal->j_running_transaction == NULL);
1148 J_ASSERT(journal->j_committing_transaction == NULL);
1149 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1150 spin_unlock(&journal->j_list_lock);
1151
1152 /* We can now mark the journal as empty. */
1153 journal->j_tail = 0;
1154 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1155 if (journal->j_sb_buffer) {
1156 jbd2_journal_update_superblock(journal, 1);
1157 brelse(journal->j_sb_buffer);
1158 }
1159
1160 if (journal->j_inode)
1161 iput(journal->j_inode);
1162 if (journal->j_revoke)
1163 jbd2_journal_destroy_revoke(journal);
1164 kfree(journal->j_wbuf);
1165 kfree(journal);
1166}
1167
1168
1169/**
1170 * int jbd2_journal_check_used_features() - Check if features specified are used.
1171 * @journal: Journal to check.
1172 * @compat: bitmask of compatible features
1173 * @ro: bitmask of features that force read-only mount
1174 * @incompat: bitmask of incompatible features
1175 *
1176 * Check whether the journal uses all of a given set of
1177 * features. Return true (non-zero) if it does.
1178 **/
1179
1180int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1181 unsigned long ro, unsigned long incompat)
1182{
1183 journal_superblock_t *sb;
1184
1185 if (!compat && !ro && !incompat)
1186 return 1;
1187 if (journal->j_format_version == 1)
1188 return 0;
1189
1190 sb = journal->j_superblock;
1191
1192 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1193 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1194 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1195 return 1;
1196
1197 return 0;
1198}
1199
1200/**
1201 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1202 * @journal: Journal to check.
1203 * @compat: bitmask of compatible features
1204 * @ro: bitmask of features that force read-only mount
1205 * @incompat: bitmask of incompatible features
1206 *
1207 * Check whether the journaling code supports the use of
1208 * all of a given set of features on this journal. Return true
1209 * (non-zero) if it can. */
1210
1211int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1212 unsigned long ro, unsigned long incompat)
1213{
1214 journal_superblock_t *sb;
1215
1216 if (!compat && !ro && !incompat)
1217 return 1;
1218
1219 sb = journal->j_superblock;
1220
1221 /* We can support any known requested features iff the
1222 * superblock is in version 2. Otherwise we fail to support any
1223 * extended sb features. */
1224
1225 if (journal->j_format_version != 2)
1226 return 0;
1227
1228 if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1229 (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1230 (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1231 return 1;
1232
1233 return 0;
1234}
1235
1236/**
1237 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
1238 * @journal: Journal to act on.
1239 * @compat: bitmask of compatible features
1240 * @ro: bitmask of features that force read-only mount
1241 * @incompat: bitmask of incompatible features
1242 *
1243 * Mark a given journal feature as present on the
1244 * superblock. Returns true if the requested features could be set.
1245 *
1246 */
1247
1248int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1249 unsigned long ro, unsigned long incompat)
1250{
1251 journal_superblock_t *sb;
1252
1253 if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1254 return 1;
1255
1256 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1257 return 0;
1258
1259 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1260 compat, ro, incompat);
1261
1262 sb = journal->j_superblock;
1263
1264 sb->s_feature_compat |= cpu_to_be32(compat);
1265 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1266 sb->s_feature_incompat |= cpu_to_be32(incompat);
1267
1268 return 1;
1269}
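
/*
 * Example (sketch): a client that wants 64-bit block numbers in the
 * journal asks for the corresponding incompat bit, and must cope with
 * refusal on a v1 superblock:
 *
 *	if (!jbd2_journal_set_features(journal, 0, 0,
 *				JBD2_FEATURE_INCOMPAT_64BIT))
 *		...fall back to 32-bit block numbers...
 */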
1270
1271
1272/**
1273 * int jbd2_journal_update_format () - Update on-disk journal structure.
1274 * @journal: Journal to act on.
1275 *
1276 * Given an initialised but unloaded journal struct, poke about in the
1277 * on-disk structure to update it to the most recent supported version.
1278 */
1279int jbd2_journal_update_format (journal_t *journal)
1280{
1281 journal_superblock_t *sb;
1282 int err;
1283
1284 err = journal_get_superblock(journal);
1285 if (err)
1286 return err;
1287
1288 sb = journal->j_superblock;
1289
1290 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1291 case JBD2_SUPERBLOCK_V2:
1292 return 0;
1293 case JBD2_SUPERBLOCK_V1:
1294 return journal_convert_superblock_v1(journal, sb);
1295 default:
1296 break;
1297 }
1298 return -EINVAL;
1299}
1300
1301static int journal_convert_superblock_v1(journal_t *journal,
1302 journal_superblock_t *sb)
1303{
1304 int offset, blocksize;
1305 struct buffer_head *bh;
1306
1307 printk(KERN_WARNING
1308 "JBD: Converting superblock from version 1 to 2.\n");
1309
1310 /* Pre-initialise new fields to zero */
1311 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1312 blocksize = be32_to_cpu(sb->s_blocksize);
1313 memset(&sb->s_feature_compat, 0, blocksize-offset);
1314
1315 sb->s_nr_users = cpu_to_be32(1);
1316 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1317 journal->j_format_version = 2;
1318
1319 bh = journal->j_sb_buffer;
1320 BUFFER_TRACE(bh, "marking dirty");
1321 mark_buffer_dirty(bh);
1322 sync_dirty_buffer(bh);
1323 return 0;
1324}
1325
1326
1327/**
1328 * int jbd2_journal_flush () - Flush journal
1329 * @journal: Journal to act on.
1330 *
1331 * Flush all data for a given journal to disk and empty the journal.
1332 * Filesystems can use this when remounting readonly to ensure that
1333 * recovery does not need to happen on remount.
1334 */
1335
1336int jbd2_journal_flush(journal_t *journal)
1337{
1338 int err = 0;
1339 transaction_t *transaction = NULL;
1340 unsigned long old_tail;
1341
1342 spin_lock(&journal->j_state_lock);
1343
1344 /* Force everything buffered to the log... */
1345 if (journal->j_running_transaction) {
1346 transaction = journal->j_running_transaction;
1347 __jbd2_log_start_commit(journal, transaction->t_tid);
1348 } else if (journal->j_committing_transaction)
1349 transaction = journal->j_committing_transaction;
1350
1351 /* Wait for the log commit to complete... */
1352 if (transaction) {
1353 tid_t tid = transaction->t_tid;
1354
1355 spin_unlock(&journal->j_state_lock);
1356 jbd2_log_wait_commit(journal, tid);
1357 } else {
1358 spin_unlock(&journal->j_state_lock);
1359 }
1360
1361 /* ...and flush everything in the log out to disk. */
1362 spin_lock(&journal->j_list_lock);
1363 while (!err && journal->j_checkpoint_transactions != NULL) {
1364 spin_unlock(&journal->j_list_lock);
1365 err = jbd2_log_do_checkpoint(journal);
1366 spin_lock(&journal->j_list_lock);
1367 }
1368 spin_unlock(&journal->j_list_lock);
1369 jbd2_cleanup_journal_tail(journal);
1370
1371 /* Finally, mark the journal as really needing no recovery.
1372 * This sets s_start==0 in the underlying superblock, which is
1373 * the magic code for a fully-recovered superblock. Any future
1374 * commits of data to the journal will restore the current
1375 * s_start value. */
1376 spin_lock(&journal->j_state_lock);
1377 old_tail = journal->j_tail;
1378 journal->j_tail = 0;
1379 spin_unlock(&journal->j_state_lock);
1380 jbd2_journal_update_superblock(journal, 1);
1381 spin_lock(&journal->j_state_lock);
1382 journal->j_tail = old_tail;
1383
1384 J_ASSERT(!journal->j_running_transaction);
1385 J_ASSERT(!journal->j_committing_transaction);
1386 J_ASSERT(!journal->j_checkpoint_transactions);
1387 J_ASSERT(journal->j_head == journal->j_tail);
1388 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1389 spin_unlock(&journal->j_state_lock);
1390 return err;
1391}
1392
1393/**
1394 * int jbd2_journal_wipe() - Wipe journal contents
1395 * @journal: Journal to act on.
1396 * @write: flag (see below)
1397 *
1398 * Wipe out all of the contents of a journal, safely. This will produce
1399 * a warning if the journal contains any valid recovery information.
1400 * Must be called between journal_init_*() and jbd2_journal_load().
1401 *
1402 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1403 * we merely suppress recovery.
1404 */
1405
1406int jbd2_journal_wipe(journal_t *journal, int write)
1407{
1408 journal_superblock_t *sb;
1409 int err = 0;
1410
1411 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
1412
1413 err = load_superblock(journal);
1414 if (err)
1415 return err;
1416
1417 sb = journal->j_superblock;
1418
1419 if (!journal->j_tail)
1420 goto no_recovery;
1421
1422 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1423 write ? "Clearing" : "Ignoring");
1424
1425 err = jbd2_journal_skip_recovery(journal);
1426 if (write)
1427 jbd2_journal_update_superblock(journal, 1);
1428
1429 no_recovery:
1430 return err;
1431}
1432
1433/*
1434 * journal_dev_name: format a character string to describe on what
1435 * device this journal is present.
1436 */
1437
1438static const char *journal_dev_name(journal_t *journal, char *buffer)
1439{
1440 struct block_device *bdev;
1441
1442 if (journal->j_inode)
1443 bdev = journal->j_inode->i_sb->s_bdev;
1444 else
1445 bdev = journal->j_dev;
1446
1447 return bdevname(bdev, buffer);
1448}
1449
1450/*
1451 * Journal abort has very specific semantics, which we describe in the
1452 * documentation for jbd2_journal_abort() below.
1453 *
1454 * Two internal functions, which provide abort to the jbd layer
1455 * itself, are here.
1456 */
1457
1458/*
1459 * Quick version for internal journal use (doesn't lock the journal).
1460 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1461 * and don't attempt to make any other journal updates.
1462 */
1463void __jbd2_journal_abort_hard(journal_t *journal)
1464{
1465 transaction_t *transaction;
1466 char b[BDEVNAME_SIZE];
1467
1468 if (journal->j_flags & JBD2_ABORT)
1469 return;
1470
1471 printk(KERN_ERR "Aborting journal on device %s.\n",
1472 journal_dev_name(journal, b));
1473
1474 spin_lock(&journal->j_state_lock);
1475 journal->j_flags |= JBD2_ABORT;
1476 transaction = journal->j_running_transaction;
1477 if (transaction)
1478 __jbd2_log_start_commit(journal, transaction->t_tid);
1479 spin_unlock(&journal->j_state_lock);
1480}
1481
1482/* Soft abort: record the abort error status in the journal superblock,
1483 * but don't do any other IO. */
1484static void __journal_abort_soft (journal_t *journal, int errno)
1485{
1486 if (journal->j_flags & JBD2_ABORT)
1487 return;
1488
1489 if (!journal->j_errno)
1490 journal->j_errno = errno;
1491
1492 __jbd2_journal_abort_hard(journal);
1493
1494 if (errno)
1495 jbd2_journal_update_superblock(journal, 1);
1496}
1497
1498/**
1499 * void jbd2_journal_abort () - Shutdown the journal immediately.
1500 * @journal: the journal to shutdown.
1501 * @errno: an error number to record in the journal indicating
1502 * the reason for the shutdown.
1503 *
1504 * Perform a complete, immediate shutdown of the ENTIRE
1505 * journal (not of a single transaction). This operation cannot be
1506 * undone without closing and reopening the journal.
1507 *
1508 * The jbd2_journal_abort function is intended to support higher level error
1509 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1510 * mode.
1511 *
1512 * Journal abort has very specific semantics. Any existing dirty,
1513 * unjournaled buffers in the main filesystem will still be written to
1514 * disk by bdflush, but the journaling mechanism will be suspended
1515 * immediately and no further transaction commits will be honoured.
1516 *
1517 * Any dirty, journaled buffers will be written back to disk without
1518 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1519 * filesystem, but we _do_ attempt to leave as much data as possible
1520 * behind for fsck to use for cleanup.
1521 *
1522 * Any attempt to get a new transaction handle on a journal which is in
1523 * ABORT state will just result in an -EROFS error return. A
1524 * jbd2_journal_stop on an existing handle will return -EIO if we have
1525 * entered abort state during the update.
1526 *
1527 * Recursive transactions are not disturbed by journal abort until the
1528 * final jbd2_journal_stop, which will receive the -EIO error.
1529 *
1530 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
1531 * which will be recorded (if possible) in the journal superblock. This
1532 * allows a client to record failure conditions in the middle of a
1533 * transaction without having to complete the transaction to record the
1534 * failure to disk. ext3_error, for example, now uses this
1535 * functionality.
1536 *
1537 * Errors which originate from within the journaling layer will NOT
1538 * supply an errno; a null errno implies that absolutely no further
1539 * writes are done to the journal (unless there are any already in
1540 * progress).
1541 *
1542 */
1543
1544void jbd2_journal_abort(journal_t *journal, int errno)
1545{
1546 __journal_abort_soft(journal, errno);
1547}
1548
1549/**
1550 * int jbd2_journal_errno () - returns the journal's error state.
1551 * @journal: journal to examine.
1552 *
1553 * This is the errno number set with jbd2_journal_abort(), the last
1554 * time the journal was mounted - if the journal was stopped
1555 * without calling abort this will be 0.
1556 *
1557 * If the journal has been aborted during this mount, -EROFS will
1558 * be returned.
1559 */
1560int jbd2_journal_errno(journal_t *journal)
1561{
1562 int err;
1563
1564 spin_lock(&journal->j_state_lock);
1565 if (journal->j_flags & JBD2_ABORT)
1566 err = -EROFS;
1567 else
1568 err = journal->j_errno;
1569 spin_unlock(&journal->j_state_lock);
1570 return err;
1571}
1572
1573/**
1574 * int jbd2_journal_clear_err () - clears the journal's error state
1575 * @journal: journal to act on.
1576 *
1577 * An error must be cleared or Acked to take a FS out of readonly
1578 * mode.
1579 */
1580int jbd2_journal_clear_err(journal_t *journal)
1581{
1582 int err = 0;
1583
1584 spin_lock(&journal->j_state_lock);
1585 if (journal->j_flags & JBD2_ABORT)
1586 err = -EROFS;
1587 else
1588 journal->j_errno = 0;
1589 spin_unlock(&journal->j_state_lock);
1590 return err;
1591}
1592
1593/**
1594 * void jbd2_journal_ack_err() - Ack journal err.
1595 * @journal: journal to act on.
1596 *
1597 * An error must be cleared or Acked to take a FS out of readonly
1598 * mode.
1599 */
1600void jbd2_journal_ack_err(journal_t *journal)
1601{
1602 spin_lock(&journal->j_state_lock);
1603 if (journal->j_errno)
1604 journal->j_flags |= JBD2_ACK_ERR;
1605 spin_unlock(&journal->j_state_lock);
1606}
1607
1608int jbd2_journal_blocks_per_page(struct inode *inode)
1609{
1610 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1611}
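
/*
 * Example: with 4k pages (PAGE_CACHE_SHIFT == 12) and 1k filesystem
 * blocks (s_blocksize_bits == 10), this returns 1 << 2 == 4.
 */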
1612
1613/*
1614 * helper functions to deal with 32 or 64bit block numbers.
1615 */
1616size_t journal_tag_bytes(journal_t *journal)
1617{
1618 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
1619 return JBD_TAG_SIZE64;
1620 else
1621 return JBD_TAG_SIZE32;
1622}
1623
1624/*
1625 * Simple support for retrying memory allocations. Introduced to help to
1626 * debug different VM deadlock avoidance strategies.
1627 */
1628void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1629{
1630 return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
1631}
1632
1633/*
1634 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
1635 * and allocate frozen and commit buffers from these slabs.
1636 *
1637 * The reason for doing this is to avoid SLAB_DEBUG, since it could
1638 * cause a bh to cross a page boundary.
1639 */
1640
1641#define JBD_MAX_SLABS 5
1642#define JBD_SLAB_INDEX(size) (size >> 11)
1643
1644static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
1645static const char *jbd_slab_names[JBD_MAX_SLABS] = {
1646 "jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
1647};
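
/*
 * JBD_SLAB_INDEX maps a blocksize to a slot by shifting out the low
 * eleven bits: 1k -> 0, 2k -> 1, 4k -> 2, 8k -> 4.  Index 3 would
 * correspond to a 6k blocksize, which never occurs, hence the NULL
 * entry in the name table above.
 */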
1648
1649static void jbd2_journal_destroy_jbd_slabs(void)
1650{
1651 int i;
1652
1653 for (i = 0; i < JBD_MAX_SLABS; i++) {
1654 if (jbd_slab[i])
1655 kmem_cache_destroy(jbd_slab[i]);
1656 jbd_slab[i] = NULL;
1657 }
1658}
1659
1660static int jbd2_journal_create_jbd_slab(size_t slab_size)
1661{
1662 int i = JBD_SLAB_INDEX(slab_size);
1663
1664 BUG_ON(i >= JBD_MAX_SLABS);
1665
1666 /*
1667 * Check if we already have a slab created for this size
1668 */
1669 if (jbd_slab[i])
1670 return 0;
1671
1672 /*
1673	 * Create a slab and force alignment to be the same as the slab size -
1674 * this will make sure that allocations won't cross the page
1675 * boundary.
1676 */
1677 jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
1678 slab_size, slab_size, 0, NULL, NULL);
1679 if (!jbd_slab[i]) {
1680 printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
1681 return -ENOMEM;
1682 }
1683 return 0;
1684}
1685
1686void * jbd2_slab_alloc(size_t size, gfp_t flags)
1687{
1688 int idx;
1689
1690 idx = JBD_SLAB_INDEX(size);
1691 BUG_ON(jbd_slab[idx] == NULL);
1692 return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
1693}
1694
1695void jbd2_slab_free(void *ptr, size_t size)
1696{
1697 int idx;
1698
1699 idx = JBD_SLAB_INDEX(size);
1700 BUG_ON(jbd_slab[idx] == NULL);
1701 kmem_cache_free(jbd_slab[idx], ptr);
1702}
1703
1704/*
1705 * Journal_head storage management
1706 */
1707static kmem_cache_t *jbd2_journal_head_cache;
1708#ifdef CONFIG_JBD_DEBUG
1709static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1710#endif
1711
1712static int journal_init_jbd2_journal_head_cache(void)
1713{
1714 int retval;
1715
1716	J_ASSERT(jbd2_journal_head_cache == NULL);
1717 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
1718 sizeof(struct journal_head),
1719 0, /* offset */
1720 0, /* flags */
1721 NULL, /* ctor */
1722 NULL); /* dtor */
1723 retval = 0;
1724	if (jbd2_journal_head_cache == NULL) {
1725 retval = -ENOMEM;
1726 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1727 }
1728 return retval;
1729}
1730
1731static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1732{
1733 J_ASSERT(jbd2_journal_head_cache != NULL);
1734 kmem_cache_destroy(jbd2_journal_head_cache);
1735 jbd2_journal_head_cache = NULL;
1736}
1737
1738/*
1739 * journal_head splicing and dicing
1740 */
1741static struct journal_head *journal_alloc_journal_head(void)
1742{
1743 struct journal_head *ret;
1744 static unsigned long last_warning;
1745
1746#ifdef CONFIG_JBD_DEBUG
1747 atomic_inc(&nr_journal_heads);
1748#endif
1749 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1750	if (ret == NULL) {
1751 jbd_debug(1, "out of memory for journal_head\n");
1752 if (time_after(jiffies, last_warning + 5*HZ)) {
1753 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1754 __FUNCTION__);
1755 last_warning = jiffies;
1756 }
1757		while (ret == NULL) {
1758 yield();
1759 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1760 }
1761 }
1762 return ret;
1763}
1764
1765static void journal_free_journal_head(struct journal_head *jh)
1766{
1767#ifdef CONFIG_JBD_DEBUG
1768 atomic_dec(&nr_journal_heads);
1769 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1770#endif
1771 kmem_cache_free(jbd2_journal_head_cache, jh);
1772}
1773
1774/*
1775 * A journal_head is attached to a buffer_head whenever JBD has an
1776 * interest in the buffer.
1777 *
1778 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1779 * is set. This bit is tested in core kernel code where we need to take
1780 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1781 * there.
1782 *
1783 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1784 *
1785 * When a buffer has its BH_JBD bit set it is immune from being released by
1786 * core kernel code, mainly via ->b_count.
1787 *
1788 * A journal_head may be detached from its buffer_head when the journal_head's
1789 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1790 * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
1791 * journal_head can be dropped if needed.
1792 *
1793 * Various places in the kernel want to attach a journal_head to a buffer_head
1794 * _before_ attaching the journal_head to a transaction. To protect the
1795 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
1796 * journal_head's b_jcount refcount by one. The caller must call
1797 * jbd2_journal_put_journal_head() to undo this.
1798 *
1799 * So the typical usage would be:
1800 *
1801 * (Attach a journal_head if needed. Increments b_jcount)
1802 * struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1803 * ...
1804 * jh->b_transaction = xxx;
1805 * jbd2_journal_put_journal_head(jh);
1806 *
1807 * Now, the journal_head's b_jcount is zero, but it is safe from being released
1808 * because it has a non-zero b_transaction.
1809 */
1810
1811/*
1812 * Give a buffer_head a journal_head.
1813 *
1814 * Doesn't need the journal lock.
1815 * May sleep.
1816 */
1817struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
1818{
1819 struct journal_head *jh;
1820 struct journal_head *new_jh = NULL;
1821
1822repeat:
1823 if (!buffer_jbd(bh)) {
1824 new_jh = journal_alloc_journal_head();
1825 memset(new_jh, 0, sizeof(*new_jh));
1826 }
1827
1828 jbd_lock_bh_journal_head(bh);
1829 if (buffer_jbd(bh)) {
1830 jh = bh2jh(bh);
1831 } else {
1832 J_ASSERT_BH(bh,
1833 (atomic_read(&bh->b_count) > 0) ||
1834 (bh->b_page && bh->b_page->mapping));
1835
1836 if (!new_jh) {
1837 jbd_unlock_bh_journal_head(bh);
1838 goto repeat;
1839 }
1840
1841 jh = new_jh;
1842 new_jh = NULL; /* We consumed it */
1843 set_buffer_jbd(bh);
1844 bh->b_private = jh;
1845 jh->b_bh = bh;
1846 get_bh(bh);
1847 BUFFER_TRACE(bh, "added journal_head");
1848 }
1849 jh->b_jcount++;
1850 jbd_unlock_bh_journal_head(bh);
1851 if (new_jh)
1852 journal_free_journal_head(new_jh);
1853 return bh->b_private;
1854}
1855
1856/*
1857 * Grab a ref against this buffer_head's journal_head. If it ended up not
1858 * having a journal_head, return NULL
1859 */
1860struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
1861{
1862 struct journal_head *jh = NULL;
1863
1864 jbd_lock_bh_journal_head(bh);
1865 if (buffer_jbd(bh)) {
1866 jh = bh2jh(bh);
1867 jh->b_jcount++;
1868 }
1869 jbd_unlock_bh_journal_head(bh);
1870 return jh;
1871}
1872
1873static void __journal_remove_journal_head(struct buffer_head *bh)
1874{
1875 struct journal_head *jh = bh2jh(bh);
1876
1877 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1878
1879 get_bh(bh);
1880 if (jh->b_jcount == 0) {
1881 if (jh->b_transaction == NULL &&
1882 jh->b_next_transaction == NULL &&
1883 jh->b_cp_transaction == NULL) {
1884 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1885 J_ASSERT_BH(bh, buffer_jbd(bh));
1886 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1887 BUFFER_TRACE(bh, "remove journal_head");
1888 if (jh->b_frozen_data) {
1889 printk(KERN_WARNING "%s: freeing "
1890 "b_frozen_data\n",
1891 __FUNCTION__);
1892 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
1893 }
1894 if (jh->b_committed_data) {
1895 printk(KERN_WARNING "%s: freeing "
1896 "b_committed_data\n",
1897 __FUNCTION__);
1898 jbd2_slab_free(jh->b_committed_data, bh->b_size);
1899 }
1900 bh->b_private = NULL;
1901 jh->b_bh = NULL; /* debug, really */
1902 clear_buffer_jbd(bh);
1903 __brelse(bh);
1904 journal_free_journal_head(jh);
1905 } else {
1906 BUFFER_TRACE(bh, "journal_head was locked");
1907 }
1908 }
1909}
1910
1911/*
1912 * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
1913 * and has a zero b_jcount then remove and release its journal_head. If we did
1914 * see that the buffer is not used by any transaction we also "logically"
1915 * decrement ->b_count.
1916 *
1917 * We in fact take an additional increment on ->b_count as a convenience,
1918 * because the caller usually wants to do additional things with the bh
1919 * after calling here.
1920 * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
1921 * time. Once the caller has run __brelse(), the buffer is eligible for
1922 * reaping by try_to_free_buffers().
1923 */
1924void jbd2_journal_remove_journal_head(struct buffer_head *bh)
1925{
1926 jbd_lock_bh_journal_head(bh);
1927 __journal_remove_journal_head(bh);
1928 jbd_unlock_bh_journal_head(bh);
1929}
1930
1931/*
1932 * Drop a reference on the passed journal_head. If it fell to zero then try to
1933 * release the journal_head from the buffer_head.
1934 */
1935void jbd2_journal_put_journal_head(struct journal_head *jh)
1936{
1937 struct buffer_head *bh = jh2bh(jh);
1938
1939 jbd_lock_bh_journal_head(bh);
1940 J_ASSERT_JH(jh, jh->b_jcount > 0);
1941 --jh->b_jcount;
1942 if (!jh->b_jcount && !jh->b_transaction) {
1943 __journal_remove_journal_head(bh);
1944 __brelse(bh);
1945 }
1946 jbd_unlock_bh_journal_head(bh);
1947}
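
/*
 * Reference pairing sketch (hypothetical caller): a grabbed
 * journal_head must always be put back:
 *
 *	jh = jbd2_journal_grab_journal_head(bh);
 *	if (jh) {
 *		...examine jh under the appropriate locks...
 *		jbd2_journal_put_journal_head(jh);
 *	}
 */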
1948
1949/*
1950 * /proc tunables
1951 */
1952#if defined(CONFIG_JBD_DEBUG)
1953int jbd2_journal_enable_debug;
1954EXPORT_SYMBOL(jbd2_journal_enable_debug);
1955#endif
1956
1957#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
1958
1959static struct proc_dir_entry *proc_jbd_debug;
1960
1961static int read_jbd_debug(char *page, char **start, off_t off,
1962 int count, int *eof, void *data)
1963{
1964 int ret;
1965
1966 ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
1967 *eof = 1;
1968 return ret;
1969}
1970
1971static int write_jbd_debug(struct file *file, const char __user *buffer,
1972 unsigned long count, void *data)
1973{
1974 char buf[32];
1975
1976 if (count > ARRAY_SIZE(buf) - 1)
1977 count = ARRAY_SIZE(buf) - 1;
1978 if (copy_from_user(buf, buffer, count))
1979 return -EFAULT;
1980 buf[ARRAY_SIZE(buf) - 1] = '\0';
1981 jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
1982 return count;
1983}
1984
1985#define JBD_PROC_NAME "sys/fs/jbd2-debug"
1986
1987static void __init create_jbd_proc_entry(void)
1988{
1989 proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
1990 if (proc_jbd_debug) {
1991 /* Why is this so hard? */
1992 proc_jbd_debug->read_proc = read_jbd_debug;
1993 proc_jbd_debug->write_proc = write_jbd_debug;
1994 }
1995}
1996
1997static void __exit jbd2_remove_jbd_proc_entry(void)
1998{
1999 if (proc_jbd_debug)
2000 remove_proc_entry(JBD_PROC_NAME, NULL);
2001}
2002
2003#else
2004
2005#define create_jbd_proc_entry() do {} while (0)
2006#define jbd2_remove_jbd_proc_entry() do {} while (0)
2007
2008#endif
2009
2010kmem_cache_t *jbd2_handle_cache;
2011
2012static int __init journal_init_handle_cache(void)
2013{
2014 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
2015 sizeof(handle_t),
2016 0, /* offset */
2017 0, /* flags */
2018 NULL, /* ctor */
2019 NULL); /* dtor */
2020 if (jbd2_handle_cache == NULL) {
2021 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2022 return -ENOMEM;
2023 }
2024 return 0;
2025}
2026
2027static void jbd2_journal_destroy_handle_cache(void)
2028{
2029 if (jbd2_handle_cache)
2030 kmem_cache_destroy(jbd2_handle_cache);
2031}
2032
2033/*
2034 * Module startup and shutdown
2035 */
2036
2037static int __init journal_init_caches(void)
2038{
2039 int ret;
2040
2041 ret = jbd2_journal_init_revoke_caches();
2042 if (ret == 0)
2043 ret = journal_init_jbd2_journal_head_cache();
2044 if (ret == 0)
2045 ret = journal_init_handle_cache();
2046 return ret;
2047}
2048
2049static void jbd2_journal_destroy_caches(void)
2050{
2051 jbd2_journal_destroy_revoke_caches();
2052 jbd2_journal_destroy_jbd2_journal_head_cache();
2053 jbd2_journal_destroy_handle_cache();
2054 jbd2_journal_destroy_jbd_slabs();
2055}
2056
2057static int __init journal_init(void)
2058{
2059 int ret;
2060
2061 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2062
2063 ret = journal_init_caches();
2064 if (ret != 0)
2065 jbd2_journal_destroy_caches();
2066 create_jbd_proc_entry();
2067 return ret;
2068}
2069
2070static void __exit journal_exit(void)
2071{
2072#ifdef CONFIG_JBD_DEBUG
2073 int n = atomic_read(&nr_journal_heads);
2074 if (n)
2075 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
2076#endif
2077 jbd2_remove_jbd_proc_entry();
2078 jbd2_journal_destroy_caches();
2079}
2080
2081MODULE_LICENSE("GPL");
2082module_init(journal_init);
2083module_exit(journal_exit);
2084
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..9f10acafaf70
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,609 @@
1/*
2 * linux/fs/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49static void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned long long blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = jbd2_journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned long long blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(journal_t *journal, struct buffer_head *bh)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0, size = journal->j_blocksize;
186 int tag_bytes = journal_tag_bytes(journal);
187
188 tagp = &bh->b_data[sizeof(journal_header_t)];
189
190 while ((tagp - bh->b_data + tag_bytes) <= size) {
191 tag = (journal_block_tag_t *) tagp;
192
193 nr++;
194 tagp += tag_bytes;
195 if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
196 tagp += 16;
197
198 if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
199 break;
200 }
201
202 return nr;
203}
204
205
206/* Make sure we wrap around the log correctly! */
207#define wrap(journal, var) \
208do { \
209 if (var >= (journal)->j_last) \
210 var -= ((journal)->j_last - (journal)->j_first); \
211} while (0)
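/*
 * Worked example (numbers are illustrative): with j_first == 1 and
 * j_last == 1024, a cursor that reaches 1024 wraps back to
 * 1024 - (1024 - 1) == 1, i.e. the first usable log block:
 *
 *	next_log_block = 1024;
 *	wrap(journal, next_log_block);	now next_log_block == 1
 */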
212
213/**
214 * jbd2_journal_recover - recovers an on-disk journal
215 * @journal: the journal to recover
216 *
217 * The primary function for recovering the log contents when mounting a
218 * journaled device.
219 *
220 * Recovery is done in three passes. In the first pass, we look for the
221 * end of the log. In the second, we assemble the list of revoke
222 * blocks. In the third and final pass, we replay any un-revoked blocks
223 * in the log.
224 */
225int jbd2_journal_recover(journal_t *journal)
226{
227 int err;
228 journal_superblock_t * sb;
229
230 struct recovery_info info;
231
232 memset(&info, 0, sizeof(info));
233 sb = journal->j_superblock;
234
235 /*
236 * The journal superblock's s_start field (the current log head)
237 * is always zero if, and only if, the journal was cleanly
238 * unmounted.
239 */
240
241 if (!sb->s_start) {
242 jbd_debug(1, "No recovery required, last transaction %d\n",
243 be32_to_cpu(sb->s_sequence));
244 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
245 return 0;
246 }
247
248 err = do_one_pass(journal, &info, PASS_SCAN);
249 if (!err)
250 err = do_one_pass(journal, &info, PASS_REVOKE);
251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY);
253
254 jbd_debug(0, "JBD: recovery, exit status %d, "
255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259
260 /* Restart the log at the next transaction ID, thus invalidating
261 * any existing commit records in the log. */
262 journal->j_transaction_sequence = ++info.end_transaction;
263
264 jbd2_journal_clear_revoke(journal);
265 sync_blockdev(journal->j_fs_dev);
266 return err;
267}
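/*
 * Filesystems normally reach this function through jbd2_journal_load()
 * at mount time rather than calling it directly. A mount-path sketch
 * (error handling elided; journal_inode is the caller's journal inode):
 *
 *	journal_t *journal = jbd2_journal_init_inode(journal_inode);
 *	err = jbd2_journal_load(journal);	replays the log if needed
 */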
268
269/**
270 * jbd2_journal_skip_recovery - Start journal and wipe existing records
271 * @journal: journal to startup
272 *
273 * Locate any valid recovery information from the journal and set up the
274 * journal structures in memory to ignore it (presumably because the
275 * caller has evidence that it is out of date).
276 * This function doesn't appear to be exported.
277 *
278 * We perform one pass over the journal to allow us to tell the user how
279 * much recovery information is being erased, and to let us initialise
280 * the journal transaction sequence numbers to the next unused ID.
281 */
282int jbd2_journal_skip_recovery(journal_t *journal)
283{
284 int err;
285 journal_superblock_t * sb;
286
287 struct recovery_info info;
288
289 memset (&info, 0, sizeof(info));
290 sb = journal->j_superblock;
291
292 err = do_one_pass(journal, &info, PASS_SCAN);
293
294 if (err) {
295 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
296 ++journal->j_transaction_sequence;
297 } else {
298#ifdef CONFIG_JBD_DEBUG
299 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
300#endif
301 jbd_debug(0,
302 "JBD: ignoring %d transaction%s from the journal.\n",
303 dropped, (dropped == 1) ? "" : "s");
304 journal->j_transaction_sequence = ++info.end_transaction;
305 }
306
307 journal->j_tail = 0;
308 return err;
309}
310
311static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
312{
313 unsigned long long block = be32_to_cpu(tag->t_blocknr);
314 if (tag_bytes > JBD_TAG_SIZE32)
315 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
316 return block;
317}
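/*
 * Layout assumed here: a tag wider than JBD_TAG_SIZE32 stores the low
 * 32 bits of the block number in t_blocknr and the high 32 bits in
 * t_blocknr_high. For example, block 0x100000001 is stored as
 * t_blocknr_high = 1, t_blocknr = 1 and reassembled above as
 * ((u64)1 << 32) | 1.
 */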
318
319static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass)
321{
322 unsigned int first_commit_ID, next_commit_ID;
323 unsigned long next_log_block;
324 int err, success = 0;
325 journal_superblock_t * sb;
326 journal_header_t * tmp;
327 struct buffer_head * bh;
328 unsigned int sequence;
329 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal);
331
332 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC;
334 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
335 / tag_bytes);
336
337 /*
338 * First thing is to establish what we expect to find in the log
339 * (in terms of transaction IDs), and where (in terms of log
340 * block offsets): query the superblock.
341 */
342
343 sb = journal->j_superblock;
344 next_commit_ID = be32_to_cpu(sb->s_sequence);
345 next_log_block = be32_to_cpu(sb->s_start);
346
347 first_commit_ID = next_commit_ID;
348 if (pass == PASS_SCAN)
349 info->start_transaction = first_commit_ID;
350
351 jbd_debug(1, "Starting recovery pass %d\n", pass);
352
353 /*
354 * Now we walk through the log, transaction by transaction,
355 * making sure that each transaction has a commit block in the
356 * expected place. Each complete transaction gets replayed back
357 * into the main filesystem.
358 */
359
360 while (1) {
361 int flags;
362 char * tagp;
363 journal_block_tag_t * tag;
364 struct buffer_head * obh;
365 struct buffer_head * nbh;
366
367 cond_resched(); /* We're under lock_kernel() */
368
369 /* If we already know where to stop the log traversal,
370 * check right now that we haven't gone past the end of
371 * the log. */
372
373 if (pass != PASS_SCAN)
374 if (tid_geq(next_commit_ID, info->end_transaction))
375 break;
376
377 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
378 next_commit_ID, next_log_block, journal->j_last);
379
380		/* Skip over each chunk of the transaction looking for
381 * either the next descriptor block or the final commit
382 * record. */
383
384 jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
385 err = jread(&bh, journal, next_log_block);
386 if (err)
387 goto failed;
388
389 next_log_block++;
390 wrap(journal, next_log_block);
391
392 /* What kind of buffer is it?
393 *
394 * If it is a descriptor block, check that it has the
395 * expected sequence number. Otherwise, we're all done
396 * here. */
397
398 tmp = (journal_header_t *)bh->b_data;
399
400 if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
401 brelse(bh);
402 break;
403 }
404
405 blocktype = be32_to_cpu(tmp->h_blocktype);
406 sequence = be32_to_cpu(tmp->h_sequence);
407 jbd_debug(3, "Found magic %d, sequence %d\n",
408 blocktype, sequence);
409
410 if (sequence != next_commit_ID) {
411 brelse(bh);
412 break;
413 }
414
415 /* OK, we have a valid descriptor block which matches
416 * all of the sequence number checks. What are we going
417 * to do with it? That depends on the pass... */
418
419 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the
423 * blocks it describes. */
424 if (pass != PASS_REPLAY) {
425 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block);
427 brelse(bh);
428 continue;
429 }
430
431 /* A descriptor block: we can now write all of
432 * the data blocks. Yay, useful work is finally
433 * getting done here! */
434
435 tagp = &bh->b_data[sizeof(journal_header_t)];
436 while ((tagp - bh->b_data + tag_bytes)
437 <= journal->j_blocksize) {
438 unsigned long io_block;
439
440 tag = (journal_block_tag_t *) tagp;
441 flags = be32_to_cpu(tag->t_flags);
442
443 io_block = next_log_block++;
444 wrap(journal, next_log_block);
445 err = jread(&obh, journal, io_block);
446 if (err) {
447 /* Recover what we can, but
448 * report failure at the end. */
449 success = err;
450 printk (KERN_ERR
451 "JBD: IO error %d recovering "
452 "block %ld in log\n",
453 err, io_block);
454 } else {
455 unsigned long long blocknr;
456
457 J_ASSERT(obh != NULL);
458 blocknr = read_tag_block(tag_bytes,
459 tag);
460
461 /* If the block has been
462 * revoked, then we're all done
463 * here. */
464 if (jbd2_journal_test_revoke
465 (journal, blocknr,
466 next_commit_ID)) {
467 brelse(obh);
468 ++info->nr_revoke_hits;
469 goto skip_write;
470 }
471
472 /* Find a buffer for the new
473 * data being restored */
474 nbh = __getblk(journal->j_fs_dev,
475 blocknr,
476 journal->j_blocksize);
477 if (nbh == NULL) {
478 printk(KERN_ERR
479 "JBD: Out of memory "
480 "during recovery.\n");
481 err = -ENOMEM;
482 brelse(bh);
483 brelse(obh);
484 goto failed;
485 }
486
487 lock_buffer(nbh);
488 memcpy(nbh->b_data, obh->b_data,
489 journal->j_blocksize);
490 if (flags & JBD2_FLAG_ESCAPE) {
491					*((__be32 *)nbh->b_data) =
492 cpu_to_be32(JBD2_MAGIC_NUMBER);
493 }
494
495 BUFFER_TRACE(nbh, "marking dirty");
496 set_buffer_uptodate(nbh);
497 mark_buffer_dirty(nbh);
498 BUFFER_TRACE(nbh, "marking uptodate");
499 ++info->nr_replays;
500 /* ll_rw_block(WRITE, 1, &nbh); */
501 unlock_buffer(nbh);
502 brelse(obh);
503 brelse(nbh);
504 }
505
506 skip_write:
507 tagp += tag_bytes;
508 if (!(flags & JBD2_FLAG_SAME_UUID))
509 tagp += 16;
510
511 if (flags & JBD2_FLAG_LAST_TAG)
512 break;
513 }
514
515 brelse(bh);
516 continue;
517
518 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to
520 * do other than move on to the next sequence
521 * number. */
522 brelse(bh);
523 next_commit_ID++;
524 continue;
525
526 case JBD2_REVOKE_BLOCK:
527 /* If we aren't in the REVOKE pass, then we can
528 * just skip over this block. */
529 if (pass != PASS_REVOKE) {
530 brelse(bh);
531 continue;
532 }
533
534 err = scan_revoke_records(journal, bh,
535 next_commit_ID, info);
536 brelse(bh);
537 if (err)
538 goto failed;
539 continue;
540
541 default:
542 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
543 blocktype);
544 brelse(bh);
545 goto done;
546 }
547 }
548
549 done:
550 /*
551 * We broke out of the log scan loop: either we came to the
552 * known end of the log or we found an unexpected block in the
553 * log. If the latter happened, then we know that the "current"
554 * transaction marks the end of the valid log.
555 */
556
557 if (pass == PASS_SCAN)
558 info->end_transaction = next_commit_ID;
559 else {
560 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) {
563 printk (KERN_ERR "JBD: recovery pass %d ended at "
564 "transaction %u, expected %u\n",
565 pass, next_commit_ID, info->end_transaction);
566 if (!success)
567 success = -EIO;
568 }
569 }
570
571 return success;
572
573 failed:
574 return err;
575}
576
577
578/* Scan a revoke record, marking all blocks mentioned as revoked. */
579
580static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 tid_t sequence, struct recovery_info *info)
582{
583 jbd2_journal_revoke_header_t *header;
584 int offset, max;
585 int record_len = 4;
586
587 header = (jbd2_journal_revoke_header_t *) bh->b_data;
588 offset = sizeof(jbd2_journal_revoke_header_t);
589 max = be32_to_cpu(header->r_count);
590
591 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
592 record_len = 8;
593
594 while (offset + record_len <= max) {
595 unsigned long long blocknr;
596 int err;
597
598 if (record_len == 4)
599 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
600 else
601 blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
602 offset += record_len;
603 err = jbd2_journal_set_revoke(journal, blocknr, sequence);
604 if (err)
605 return err;
606 ++info->nr_revokes;
607 }
608 return 0;
609}
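/*
 * On-disk shape assumed by the loop above: a revoke block starts with a
 * jbd2_journal_revoke_header_t, whose r_count gives the number of valid
 * bytes in the block (header included), followed by a packed array of
 * big-endian block numbers, 4 bytes each, or 8 bytes each with
 * JBD2_FEATURE_INCOMPAT_64BIT:
 *
 *	+---------------+------+------+------+----
 *	| revoke header | blk0 | blk1 | blk2 | ...
 *	+---------------+------+------+------+----
 */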
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..380d19917f37
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,712 @@
1/*
2 * linux/fs/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * Revoke information on buffers is a tri-state value:
51 *
52 * RevokeValid clear: no cached revoke status, need to look it up
53 * RevokeValid set, Revoked clear:
54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing.
56 * RevokeValid set, Revoked set:
57 * buffer has been revoked.
58 */
59
60#ifndef __KERNEL__
61#include "jfs_user.h"
62#else
63#include <linux/time.h>
64#include <linux/fs.h>
65#include <linux/jbd2.h>
66#include <linux/errno.h>
67#include <linux/slab.h>
68#include <linux/list.h>
69#include <linux/smp_lock.h>
70#include <linux/init.h>
71#endif
72
73static kmem_cache_t *jbd2_revoke_record_cache;
74static kmem_cache_t *jbd2_revoke_table_cache;
75
76/* Each revoke record represents one single revoked block. During
77 journal replay, this involves recording the transaction ID of the
78 last transaction to revoke this block. */
79
80struct jbd2_revoke_record_s
81{
82 struct list_head hash;
83 tid_t sequence; /* Used for recovery only */
84 unsigned long long blocknr;
85};
86
87
88/* The revoke table is just a simple hash table of revoke records. */
89struct jbd2_revoke_table_s
90{
91 /* It is conceivable that we might want a larger hash table
92 * for recovery. Must be a power of two. */
93 int hash_size;
94 int hash_shift;
95 struct list_head *hash_table;
96};
97
98
99#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *,
102 struct jbd2_revoke_record_s *);
103static void flush_descriptor(journal_t *, struct journal_head *, int);
104#endif
105
106/* Utility functions to maintain the revoke table */
107
108/* Borrowed from buffer.c: this is a tried and tested block hash function */
109static inline int hash(journal_t *journal, unsigned long long block)
110{
111 struct jbd2_revoke_table_s *table = journal->j_revoke;
112 int hash_shift = table->hash_shift;
113 int hash = (int)block ^ (int)((block >> 31) >> 1);
114
115 return ((hash << (hash_shift - 6)) ^
116 (hash >> 13) ^
117 (hash << (hash_shift - 12))) & (table->hash_size - 1);
118}
119
120static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
121 tid_t seq)
122{
123 struct list_head *hash_list;
124 struct jbd2_revoke_record_s *record;
125
126repeat:
127 record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
128 if (!record)
129 goto oom;
130
131 record->sequence = seq;
132 record->blocknr = blocknr;
133 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
134 spin_lock(&journal->j_revoke_lock);
135 list_add(&record->hash, hash_list);
136 spin_unlock(&journal->j_revoke_lock);
137 return 0;
138
139oom:
140 if (!journal_oom_retry)
141 return -ENOMEM;
142 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
143 yield();
144 goto repeat;
145}
146
147/* Find a revoke record in the journal's hash table. */
148
149static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
150 unsigned long long blocknr)
151{
152 struct list_head *hash_list;
153 struct jbd2_revoke_record_s *record;
154
155 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
156
157 spin_lock(&journal->j_revoke_lock);
158 record = (struct jbd2_revoke_record_s *) hash_list->next;
159 while (&(record->hash) != hash_list) {
160 if (record->blocknr == blocknr) {
161 spin_unlock(&journal->j_revoke_lock);
162 return record;
163 }
164 record = (struct jbd2_revoke_record_s *) record->hash.next;
165 }
166 spin_unlock(&journal->j_revoke_lock);
167 return NULL;
168}
169
170int __init jbd2_journal_init_revoke_caches(void)
171{
172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
173 sizeof(struct jbd2_revoke_record_s),
174 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
175	if (!jbd2_revoke_record_cache)
176 return -ENOMEM;
177
178 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
179 sizeof(struct jbd2_revoke_table_s),
180 0, 0, NULL, NULL);
181	if (!jbd2_revoke_table_cache) {
182 kmem_cache_destroy(jbd2_revoke_record_cache);
183 jbd2_revoke_record_cache = NULL;
184 return -ENOMEM;
185 }
186 return 0;
187}
188
189void jbd2_journal_destroy_revoke_caches(void)
190{
191 kmem_cache_destroy(jbd2_revoke_record_cache);
192 jbd2_revoke_record_cache = NULL;
193 kmem_cache_destroy(jbd2_revoke_table_cache);
194 jbd2_revoke_table_cache = NULL;
195}
196
197/* Initialise the revoke table for a given journal to a given size. */
198
199int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
200{
201 int shift, tmp;
202
203 J_ASSERT (journal->j_revoke_table[0] == NULL);
204
205 shift = 0;
206 tmp = hash_size;
207 while((tmp >>= 1UL) != 0UL)
208 shift++;
209
210 journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
211 if (!journal->j_revoke_table[0])
212 return -ENOMEM;
213 journal->j_revoke = journal->j_revoke_table[0];
214
215 /* Check that the hash_size is a power of two */
216 J_ASSERT ((hash_size & (hash_size-1)) == 0);
217
218 journal->j_revoke->hash_size = hash_size;
219
220 journal->j_revoke->hash_shift = shift;
221
222 journal->j_revoke->hash_table =
223 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
224 if (!journal->j_revoke->hash_table) {
225 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
226 journal->j_revoke = NULL;
227 return -ENOMEM;
228 }
229
230 for (tmp = 0; tmp < hash_size; tmp++)
231 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
232
233 journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
234 if (!journal->j_revoke_table[1]) {
235 kfree(journal->j_revoke_table[0]->hash_table);
236 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
237 return -ENOMEM;
238 }
239
240 journal->j_revoke = journal->j_revoke_table[1];
241
242 /* Check that the hash_size is a power of two */
243 J_ASSERT ((hash_size & (hash_size-1)) == 0);
244
245 journal->j_revoke->hash_size = hash_size;
246
247 journal->j_revoke->hash_shift = shift;
248
249 journal->j_revoke->hash_table =
250 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
251 if (!journal->j_revoke->hash_table) {
252 kfree(journal->j_revoke_table[0]->hash_table);
253 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
254 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]);
255 journal->j_revoke = NULL;
256 return -ENOMEM;
257 }
258
259 for (tmp = 0; tmp < hash_size; tmp++)
260 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
261
262 spin_lock_init(&journal->j_revoke_lock);
263
264 return 0;
265}
266
267/* Destroy a journal's revoke table. The table must already be empty! */
268
269void jbd2_journal_destroy_revoke(journal_t *journal)
270{
271 struct jbd2_revoke_table_s *table;
272 struct list_head *hash_list;
273 int i;
274
275 table = journal->j_revoke_table[0];
276 if (!table)
277 return;
278
279 for (i=0; i<table->hash_size; i++) {
280 hash_list = &table->hash_table[i];
281 J_ASSERT (list_empty(hash_list));
282 }
283
284 kfree(table->hash_table);
285 kmem_cache_free(jbd2_revoke_table_cache, table);
286 journal->j_revoke = NULL;
287
288 table = journal->j_revoke_table[1];
289 if (!table)
290 return;
291
292 for (i=0; i<table->hash_size; i++) {
293 hash_list = &table->hash_table[i];
294 J_ASSERT (list_empty(hash_list));
295 }
296
297 kfree(table->hash_table);
298 kmem_cache_free(jbd2_revoke_table_cache, table);
299 journal->j_revoke = NULL;
300}
301
302
303#ifdef __KERNEL__
304
305/*
306 * jbd2_journal_revoke: revoke a given buffer_head from the journal. This
307 * prevents the block from being replayed during recovery if we take a
308 * crash after this current transaction commits. Any subsequent
309 * metadata writes of the buffer in this transaction cancel the
310 * revoke.
311 *
312 * Note that this call may block --- it is up to the caller to make
313 * sure that there are no further calls to journal_write_metadata
314 * before the revoke is complete. In ext3, this implies calling the
315 * revoke before clearing the block bitmap when we are deleting
316 * metadata.
317 *
318 * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
319 * parameter, but does _not_ forget the buffer_head if the bh was only
320 * found implicitly.
321 *
322 * bh_in may not be a journalled buffer - it may have come off
323 * the hash tables without an attached journal_head.
324 *
325 * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
326 * by one.
327 */
328
329int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
330 struct buffer_head *bh_in)
331{
332 struct buffer_head *bh = NULL;
333 journal_t *journal;
334 struct block_device *bdev;
335 int err;
336
337 might_sleep();
338 if (bh_in)
339 BUFFER_TRACE(bh_in, "enter");
340
341 journal = handle->h_transaction->t_journal;
342 if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
343 J_ASSERT (!"Cannot set revoke feature!");
344 return -EINVAL;
345 }
346
347 bdev = journal->j_fs_dev;
348 bh = bh_in;
349
350 if (!bh) {
351 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
352 if (bh)
353 BUFFER_TRACE(bh, "found on hash");
354 }
355#ifdef JBD_EXPENSIVE_CHECKING
356 else {
357 struct buffer_head *bh2;
358
359 /* If there is a different buffer_head lying around in
360 * memory anywhere... */
361 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
362 if (bh2) {
363 /* ... and it has RevokeValid status... */
364 if (bh2 != bh && buffer_revokevalid(bh2))
365 /* ...then it better be revoked too,
366 * since it's illegal to create a revoke
367 * record against a buffer_head which is
368 * not marked revoked --- that would
369 * risk missing a subsequent revoke
370 * cancel. */
371 J_ASSERT_BH(bh2, buffer_revoked(bh2));
372 put_bh(bh2);
373 }
374 }
375#endif
376
377 /* We really ought not ever to revoke twice in a row without
378 first having the revoke cancelled: it's illegal to free a
379 block twice without allocating it in between! */
380 if (bh) {
381 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
382 "inconsistent data on disk")) {
383 if (!bh_in)
384 brelse(bh);
385 return -EIO;
386 }
387 set_buffer_revoked(bh);
388 set_buffer_revokevalid(bh);
389 if (bh_in) {
390 BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
391 jbd2_journal_forget(handle, bh_in);
392 } else {
393 BUFFER_TRACE(bh, "call brelse");
394 __brelse(bh);
395 }
396 }
397
398	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n", blocknr, bh_in);
399 err = insert_revoke_hash(journal, blocknr,
400 handle->h_transaction->t_tid);
401 BUFFER_TRACE(bh_in, "exit");
402 return err;
403}
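/*
 * A caller sketch from a hypothetical filesystem delete path (the
 * helper name is illustrative, not part of this API): when freeing a
 * metadata block, revoke it so a crash cannot replay stale contents:
 *
 *	static int fs_free_metadata_block(handle_t *handle,
 *					  unsigned long long blocknr,
 *					  struct buffer_head *bh)
 *	{
 *		return jbd2_journal_revoke(handle, blocknr, bh);
 *	}
 */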
404
405/*
406 * Cancel an outstanding revoke. For use only internally by the
407 * journaling code (called from jbd2_journal_get_write_access).
408 *
409 * We trust buffer_revoked() on the buffer if the buffer is already
410 * being journaled: if there is no revoke pending on the buffer, then we
411 * don't do anything here.
412 *
413 * This would break if it were possible for a buffer to be revoked and
414 * discarded, and then reallocated within the same transaction. In such
415 * a case we would have lost the revoked bit, but when we arrived here
416 * the second time we would still have a pending revoke to cancel. So,
417 * do not trust the Revoked bit on buffers unless RevokeValid is also
418 * set.
419 *
420 * The caller must have the journal locked.
421 */
422int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
423{
424 struct jbd2_revoke_record_s *record;
425 journal_t *journal = handle->h_transaction->t_journal;
426 int need_cancel;
427 int did_revoke = 0; /* akpm: debug */
428 struct buffer_head *bh = jh2bh(jh);
429
430 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
431
432 /* Is the existing Revoke bit valid? If so, we trust it, and
433 * only perform the full cancel if the revoke bit is set. If
434 * not, we can't trust the revoke bit, and we need to do the
435 * full search for a revoke record. */
436 if (test_set_buffer_revokevalid(bh)) {
437 need_cancel = test_clear_buffer_revoked(bh);
438 } else {
439 need_cancel = 1;
440 clear_buffer_revoked(bh);
441 }
442
443 if (need_cancel) {
444 record = find_revoke_record(journal, bh->b_blocknr);
445 if (record) {
446 jbd_debug(4, "cancelled existing revoke on "
447 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
448 spin_lock(&journal->j_revoke_lock);
449 list_del(&record->hash);
450 spin_unlock(&journal->j_revoke_lock);
451 kmem_cache_free(jbd2_revoke_record_cache, record);
452 did_revoke = 1;
453 }
454 }
455
456#ifdef JBD_EXPENSIVE_CHECKING
457 /* There better not be one left behind by now! */
458 record = find_revoke_record(journal, bh->b_blocknr);
459 J_ASSERT_JH(jh, record == NULL);
460#endif
461
462 /* Finally, have we just cleared revoke on an unhashed
463 * buffer_head? If so, we'd better make sure we clear the
464 * revoked status on any hashed alias too, otherwise the revoke
465 * state machine will get very upset later on. */
466 if (need_cancel) {
467 struct buffer_head *bh2;
468 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
469 if (bh2) {
470 if (bh2 != bh)
471 clear_buffer_revoked(bh2);
472 __brelse(bh2);
473 }
474 }
475 return did_revoke;
476}
477
478/* jbd2_journal_switch_revoke_table: select j_revoke for the next
479 * transaction; we do not want to suspend any processing until all
480 * revokes are written. -bzzz
481 */
482void jbd2_journal_switch_revoke_table(journal_t *journal)
483{
484 int i;
485
486 if (journal->j_revoke == journal->j_revoke_table[0])
487 journal->j_revoke = journal->j_revoke_table[1];
488 else
489 journal->j_revoke = journal->j_revoke_table[0];
490
491 for (i = 0; i < journal->j_revoke->hash_size; i++)
492 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
493}
494
495/*
496 * Write revoke records to the journal for all entries in the current
497 * revoke hash, deleting the entries as we go.
498 *
499 * Called with the journal lock held.
500 */
501
502void jbd2_journal_write_revoke_records(journal_t *journal,
503 transaction_t *transaction)
504{
505 struct journal_head *descriptor;
506 struct jbd2_revoke_record_s *record;
507 struct jbd2_revoke_table_s *revoke;
508 struct list_head *hash_list;
509 int i, offset, count;
510
511 descriptor = NULL;
512 offset = 0;
513 count = 0;
514
515 /* select revoke table for committing transaction */
516 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
517 journal->j_revoke_table[1] : journal->j_revoke_table[0];
518
519 for (i = 0; i < revoke->hash_size; i++) {
520 hash_list = &revoke->hash_table[i];
521
522 while (!list_empty(hash_list)) {
523 record = (struct jbd2_revoke_record_s *)
524 hash_list->next;
525 write_one_revoke_record(journal, transaction,
526 &descriptor, &offset,
527 record);
528 count++;
529 list_del(&record->hash);
530 kmem_cache_free(jbd2_revoke_record_cache, record);
531 }
532 }
533 if (descriptor)
534 flush_descriptor(journal, descriptor, offset);
535 jbd_debug(1, "Wrote %d revoke records\n", count);
536}
537
538/*
539 * Write out one revoke record. We need to create a new descriptor
540 * block if the old one is full or if we have not already created one.
541 */
542
543static void write_one_revoke_record(journal_t *journal,
544 transaction_t *transaction,
545 struct journal_head **descriptorp,
546 int *offsetp,
547 struct jbd2_revoke_record_s *record)
548{
549 struct journal_head *descriptor;
550 int offset;
551 journal_header_t *header;
552
553 /* If we are already aborting, this all becomes a noop. We
554 still need to go round the loop in
555 jbd2_journal_write_revoke_records in order to free all of the
556 revoke records: only the IO to the journal is omitted. */
557 if (is_journal_aborted(journal))
558 return;
559
560 descriptor = *descriptorp;
561 offset = *offsetp;
562
563 /* Make sure we have a descriptor with space left for the record */
564 if (descriptor) {
565 if (offset == journal->j_blocksize) {
566 flush_descriptor(journal, descriptor, offset);
567 descriptor = NULL;
568 }
569 }
570
571 if (!descriptor) {
572 descriptor = jbd2_journal_get_descriptor_buffer(journal);
573 if (!descriptor)
574 return;
575 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
576 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
577 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
578 header->h_sequence = cpu_to_be32(transaction->t_tid);
579
580 /* Record it so that we can wait for IO completion later */
581 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
582 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
583
584 offset = sizeof(jbd2_journal_revoke_header_t);
585 *descriptorp = descriptor;
586 }
587
588 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
589 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
590 cpu_to_be64(record->blocknr);
591 offset += 8;
592
593 } else {
594 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
595 cpu_to_be32(record->blocknr);
596 offset += 4;
597 }
598
599 *offsetp = offset;
600}
601
602/*
603 * Flush a revoke descriptor out to the journal. If we are aborting,
604 * this is a noop; otherwise we are generating a buffer which needs to
605 * be waited for during commit, so it has to go onto the appropriate
606 * journal buffer list.
607 */
608
609static void flush_descriptor(journal_t *journal,
610 struct journal_head *descriptor,
611 int offset)
612{
613 jbd2_journal_revoke_header_t *header;
614 struct buffer_head *bh = jh2bh(descriptor);
615
616 if (is_journal_aborted(journal)) {
617 put_bh(bh);
618 return;
619 }
620
621 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
622 header->r_count = cpu_to_be32(offset);
623 set_buffer_jwrite(bh);
624 BUFFER_TRACE(bh, "write");
625 set_buffer_dirty(bh);
626 ll_rw_block(SWRITE, 1, &bh);
627}
628#endif
629
630/*
631 * Revoke support for recovery.
632 *
633 * Recovery needs to be able to:
634 *
635 * record all revoke records, including the tid of the latest instance
636 * of each revoke in the journal
637 *
638 * check whether a given block in a given transaction should be replayed
639 * (ie. has not been revoked by a revoke record in that or a subsequent
640 * transaction)
641 *
642 * empty the revoke table after recovery.
643 */
644
645/*
646 * First, setting revoke records. We create a new revoke record for
647 * every block ever revoked in the log as we scan it for recovery, and
648 * we update the existing records if we find multiple revokes for a
649 * single block.
650 */
651
652int jbd2_journal_set_revoke(journal_t *journal,
653 unsigned long long blocknr,
654 tid_t sequence)
655{
656 struct jbd2_revoke_record_s *record;
657
658 record = find_revoke_record(journal, blocknr);
659 if (record) {
660 /* If we have multiple occurrences, only record the
661 * latest sequence number in the hashed record */
662 if (tid_gt(sequence, record->sequence))
663 record->sequence = sequence;
664 return 0;
665 }
666 return insert_revoke_hash(journal, blocknr, sequence);
667}
668
669/*
670 * Test revoke records. For a given block referenced in the log, has
671 * that block been revoked? A revoke record with a given transaction
672 * sequence number revokes all blocks in that transaction and earlier
673 * ones, but later transactions still need to be replayed.
674 */
675
676int jbd2_journal_test_revoke(journal_t *journal,
677 unsigned long long blocknr,
678 tid_t sequence)
679{
680 struct jbd2_revoke_record_s *record;
681
682 record = find_revoke_record(journal, blocknr);
683 if (!record)
684 return 0;
685 if (tid_gt(sequence, record->sequence))
686 return 0;
687 return 1;
688}
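/*
 * Example of the tid comparison above: if a block carries a revoke
 * record with sequence 12, then a copy of it logged in transaction 10
 * or 12 is skipped during replay, while a copy logged in transaction
 * 13 (later than the revoke) is still written back.
 */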
689
690/*
691 * Finally, once recovery is over, we need to clear the revoke table so
692 * that it can be reused by the running filesystem.
693 */
694
695void jbd2_journal_clear_revoke(journal_t *journal)
696{
697 int i;
698 struct list_head *hash_list;
699 struct jbd2_revoke_record_s *record;
700 struct jbd2_revoke_table_s *revoke;
701
702 revoke = journal->j_revoke;
703
704 for (i = 0; i < revoke->hash_size; i++) {
705 hash_list = &revoke->hash_table[i];
706 while (!list_empty(hash_list)) {
707 record = (struct jbd2_revoke_record_s*) hash_list->next;
708 list_del(&record->hash);
709 kmem_cache_free(jbd2_revoke_record_cache, record);
710 }
711 }
712}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..b6cf2be845a1
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2081 @@
1/*
2 * linux/fs/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/smp_lock.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29
30/*
31 * jbd2_get_transaction: obtain a new transaction_t object.
32 *
33 * Simply allocate and initialise a new transaction. Create it in
34 * RUNNING state and add it to the current journal (which should not
35 * have an existing running transaction: we only make a new transaction
36 * once we have started to commit the old one).
37 *
38 * Preconditions:
39 * The journal MUST be locked. We don't perform atomic mallocs on the
40 * new transaction and we can't block without protecting against other
41 * processes trying to touch the journal while it is in transition.
42 *
43 * Called under j_state_lock
44 */
45
46static transaction_t *
47jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{
49 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING;
51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock);
54
55 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer.expires = transaction->t_expires;
57 add_timer(&journal->j_commit_timer);
58
59 J_ASSERT(journal->j_running_transaction == NULL);
60 journal->j_running_transaction = transaction;
61
62 return transaction;
63}
64
65/*
66 * Handle management.
67 *
68 * A handle_t is an object which represents a single atomic update to a
69 * filesystem, and which tracks all of the modifications which form part
70 * of that one update.
71 */
72
73/*
74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits.
78 */
79
80static int start_this_handle(journal_t *journal, handle_t *handle)
81{
82 transaction_t *transaction;
83 int needed;
84 int nblocks = handle->h_buffer_credits;
85 transaction_t *new_transaction = NULL;
86 int ret = 0;
87
88 if (nblocks > journal->j_max_transaction_buffers) {
89 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
90 current->comm, nblocks,
91 journal->j_max_transaction_buffers);
92 ret = -ENOSPC;
93 goto out;
94 }
95
96alloc_transaction:
97 if (!journal->j_running_transaction) {
98 new_transaction = jbd_kmalloc(sizeof(*new_transaction),
99 GFP_NOFS);
100 if (!new_transaction) {
101 ret = -ENOMEM;
102 goto out;
103 }
104 memset(new_transaction, 0, sizeof(*new_transaction));
105 }
106
107 jbd_debug(3, "New handle %p going live.\n", handle);
108
109repeat:
110
111 /*
112 * We need to hold j_state_lock until t_updates has been incremented,
113 * for proper journal barrier handling
114 */
115 spin_lock(&journal->j_state_lock);
116repeat_locked:
117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS;
121 goto out;
122 }
123
124 /* Wait on the journal's transaction barrier if necessary */
125 if (journal->j_barrier_count) {
126 spin_unlock(&journal->j_state_lock);
127 wait_event(journal->j_wait_transaction_locked,
128 journal->j_barrier_count == 0);
129 goto repeat;
130 }
131
132 if (!journal->j_running_transaction) {
133 if (!new_transaction) {
134 spin_unlock(&journal->j_state_lock);
135 goto alloc_transaction;
136 }
137 jbd2_get_transaction(journal, new_transaction);
138 new_transaction = NULL;
139 }
140
141 transaction = journal->j_running_transaction;
142
143 /*
144 * If the current transaction is locked down for commit, wait for the
145 * lock to be released.
146 */
147 if (transaction->t_state == T_LOCKED) {
148 DEFINE_WAIT(wait);
149
150 prepare_to_wait(&journal->j_wait_transaction_locked,
151 &wait, TASK_UNINTERRUPTIBLE);
152 spin_unlock(&journal->j_state_lock);
153 schedule();
154 finish_wait(&journal->j_wait_transaction_locked, &wait);
155 goto repeat;
156 }
157
158 /*
159 * If there is not enough space left in the log to write all potential
160 * buffers requested by this operation, we need to stall pending a log
161 * checkpoint to free some more log space.
162 */
163 spin_lock(&transaction->t_handle_lock);
164 needed = transaction->t_outstanding_credits + nblocks;
165
166 if (needed > journal->j_max_transaction_buffers) {
167 /*
168 * If the current transaction is already too large, then start
169 * to commit it: we can then go back and attach this handle to
170 * a new transaction.
171 */
172 DEFINE_WAIT(wait);
173
174 jbd_debug(2, "Handle %p starting new commit...\n", handle);
175 spin_unlock(&transaction->t_handle_lock);
176 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
177 TASK_UNINTERRUPTIBLE);
178 __jbd2_log_start_commit(journal, transaction->t_tid);
179 spin_unlock(&journal->j_state_lock);
180 schedule();
181 finish_wait(&journal->j_wait_transaction_locked, &wait);
182 goto repeat;
183 }
184
185 /*
186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also
189 * associated with a committing transaction creates a deadlock,
190 * so commit simply cannot force through checkpoints.
191 *
192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction.
195 *
196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for
198 * those buffers when checkpointing.
199 */
200
201 /*
202 * @@@ AKPM: This seems rather over-defensive. We're giving commit
203 * a _lot_ of headroom: 1/4 of the journal plus the size of
204 * the committing transaction. Really, we only need to give it
205 * committing_transaction->t_outstanding_credits plus "enough" for
206 * the log control blocks.
207	 * Also, this test is inconsistent with the matching one in
208 * jbd2_journal_extend().
209 */
210 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
211 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
212 spin_unlock(&transaction->t_handle_lock);
213 __jbd2_log_wait_for_space(journal);
214 goto repeat_locked;
215 }
216
217 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */
219
220 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++;
223 transaction->t_handle_count++;
224 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
225 handle, nblocks, transaction->t_outstanding_credits,
226 __jbd2_log_space_left(journal));
227 spin_unlock(&transaction->t_handle_lock);
228 spin_unlock(&journal->j_state_lock);
229out:
230 if (unlikely(new_transaction)) /* It's usually NULL */
231 kfree(new_transaction);
232 return ret;
233}
234
235/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks)
237{
238 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
239 if (!handle)
240 return NULL;
241 memset(handle, 0, sizeof(*handle));
242 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1;
244
245 return handle;
246}
247
248/**
249 * handle_t *jbd2_journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffer we might modify
252 *
253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee
255 * that much space.
256 *
257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked.
259 *
260 * Return a pointer to a newly allocated handle, or NULL on failure
261 */
262handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
263{
264 handle_t *handle = journal_current_handle();
265 int err;
266
267 if (!journal)
268 return ERR_PTR(-EROFS);
269
270 if (handle) {
271 J_ASSERT(handle->h_transaction->t_journal == journal);
272 handle->h_ref++;
273 return handle;
274 }
275
276 handle = new_handle(nblocks);
277 if (!handle)
278 return ERR_PTR(-ENOMEM);
279
280 current->journal_info = handle;
281
282 err = start_this_handle(journal, handle);
283 if (err < 0) {
284 jbd_free_handle(handle);
285 current->journal_info = NULL;
286 handle = ERR_PTR(err);
287 }
288 return handle;
289}
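/*
 * Canonical usage (a sketch; update_fs_metadata() is hypothetical and
 * the credit count depends on the filesystem):
 *
 *	handle_t *handle = jbd2_journal_start(journal, 8);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	err = update_fs_metadata(handle);	hypothetical FS work
 *	err2 = jbd2_journal_stop(handle);	always stop the handle
 */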
290
291/**
292 * int jbd2_journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by.
295 *
296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modifications in advance, but can
299 * extend its credit if it needs more.
300 *
301 * jbd2_journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee the allocation; this is best-effort only.
303 * The calling process MUST be able to deal cleanly with a failure to
304 * extend here.
305 *
306 * Return 0 on success, non-zero on failure.
307 *
308 * return code < 0 implies an error
309 * return code > 0 implies normal transaction-full status.
310 */
311int jbd2_journal_extend(handle_t *handle, int nblocks)
312{
313 transaction_t *transaction = handle->h_transaction;
314 journal_t *journal = transaction->t_journal;
315 int result;
316 int wanted;
317
318 result = -EIO;
319 if (is_handle_aborted(handle))
320 goto out;
321
322 result = 1;
323
324 spin_lock(&journal->j_state_lock);
325
326 /* Don't extend a locked-down transaction! */
327 if (handle->h_transaction->t_state != T_RUNNING) {
328 jbd_debug(3, "denied handle %p %d blocks: "
329 "transaction not running\n", handle, nblocks);
330 goto error_out;
331 }
332
333 spin_lock(&transaction->t_handle_lock);
334 wanted = transaction->t_outstanding_credits + nblocks;
335
336 if (wanted > journal->j_max_transaction_buffers) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction too large\n", handle, nblocks);
339 goto unlock;
340 }
341
342 if (wanted > __jbd2_log_space_left(journal)) {
343 jbd_debug(3, "denied handle %p %d blocks: "
344 "insufficient log space\n", handle, nblocks);
345 goto unlock;
346 }
347
348 handle->h_buffer_credits += nblocks;
349 transaction->t_outstanding_credits += nblocks;
350 result = 0;
351
352 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
353unlock:
354 spin_unlock(&transaction->t_handle_lock);
355error_out:
356 spin_unlock(&journal->j_state_lock);
357out:
358 return result;
359}
360
361
362/**
363 * int jbd2_journal_restart() - restart a handle.
364 * @handle: handle to restart
365 * @nblocks: nr credits requested
366 *
367 * Restart a handle for a multi-transaction filesystem
368 * operation.
369 *
370 * If the jbd2_journal_extend() call above fails to grant new buffer credits
371 * to a running handle, a call to jbd2_journal_restart will commit the
372 * handle's transaction so far and reattach the handle to a new
373 * transaction capable of guaranteeing the requested number of
374 * credits.
375 */
376
377int jbd2_journal_restart(handle_t *handle, int nblocks)
378{
379 transaction_t *transaction = handle->h_transaction;
380 journal_t *journal = transaction->t_journal;
381 int ret;
382
383 /* If we've had an abort of any type, don't even think about
384 * actually doing the restart! */
385 if (is_handle_aborted(handle))
386 return 0;
387
388 /*
389 * First unlink the handle from its current transaction, and start the
390 * commit on that.
391 */
392 J_ASSERT(transaction->t_updates > 0);
393 J_ASSERT(journal_current_handle() == handle);
394
395 spin_lock(&journal->j_state_lock);
396 spin_lock(&transaction->t_handle_lock);
397 transaction->t_outstanding_credits -= handle->h_buffer_credits;
398 transaction->t_updates--;
399
400 if (!transaction->t_updates)
401 wake_up(&journal->j_wait_updates);
402 spin_unlock(&transaction->t_handle_lock);
403
404 jbd_debug(2, "restarting handle %p\n", handle);
405 __jbd2_log_start_commit(journal, transaction->t_tid);
406 spin_unlock(&journal->j_state_lock);
407
408 handle->h_buffer_credits = nblocks;
409 ret = start_this_handle(journal, handle);
410 return ret;
411}
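/*
 * The usual extend-or-restart idiom built from the two calls above
 * (a sketch; a restart commits the work done so far, so callers must
 * only do this at a point where that is safe):
 *
 *	err = jbd2_journal_extend(handle, nblocks);
 *	if (err > 0)
 *		err = jbd2_journal_restart(handle, nblocks);
 *	if (err)
 *		goto fail;
 */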
412
413
414/**
415 * void jbd2_journal_lock_updates () - establish a transaction barrier.
416 * @journal: Journal to establish a barrier on.
417 *
418 * This locks out any further updates from being started, and blocks
419 * until all existing updates have completed, returning only once the
420 * journal is in a quiescent state with no updates running.
421 *
422 * The journal lock should not be held on entry.
423 */
424void jbd2_journal_lock_updates(journal_t *journal)
425{
426 DEFINE_WAIT(wait);
427
428 spin_lock(&journal->j_state_lock);
429 ++journal->j_barrier_count;
430
431 /* Wait until there are no running updates */
432 while (1) {
433 transaction_t *transaction = journal->j_running_transaction;
434
435 if (!transaction)
436 break;
437
438 spin_lock(&transaction->t_handle_lock);
439 if (!transaction->t_updates) {
440 spin_unlock(&transaction->t_handle_lock);
441 break;
442 }
443 prepare_to_wait(&journal->j_wait_updates, &wait,
444 TASK_UNINTERRUPTIBLE);
445 spin_unlock(&transaction->t_handle_lock);
446 spin_unlock(&journal->j_state_lock);
447 schedule();
448 finish_wait(&journal->j_wait_updates, &wait);
449 spin_lock(&journal->j_state_lock);
450 }
451 spin_unlock(&journal->j_state_lock);
452
453 /*
454 * We have now established a barrier against other normal updates, but
455 * we also need to barrier against other jbd2_journal_lock_updates() calls
456 * to make sure that we serialise special journal-locked operations
457 * too.
458 */
459 mutex_lock(&journal->j_barrier);
460}
461
462/**
463 * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
464 * @journal: Journal to release the barrier on.
465 *
466 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
467 *
468 * Should be called without the journal lock held.
469 */
470void jbd2_journal_unlock_updates (journal_t *journal)
471{
472 J_ASSERT(journal->j_barrier_count != 0);
473
474 mutex_unlock(&journal->j_barrier);
475 spin_lock(&journal->j_state_lock);
476 --journal->j_barrier_count;
477 spin_unlock(&journal->j_state_lock);
478 wake_up(&journal->j_wait_transaction_locked);
479}
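/*
 * Barrier usage sketch (e.g. for quiescing the journal around an
 * operation that must not race with running transactions):
 *
 *	jbd2_journal_lock_updates(journal);
 *	... journal is quiescent here: no handles are running ...
 *	jbd2_journal_unlock_updates(journal);
 */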
480
481/*
482 * Report any unexpected dirty buffers which turn up. Normally those
483 * indicate an error, but they can occur if the user is running (say)
484 * tune2fs to modify the live filesystem, so we need the option of
485 * continuing as gracefully as possible.
486 *
487 * The caller should already hold the journal lock and
488 * j_list_lock spinlock: most callers will need those anyway
489 * in order to probe the buffer's journaling state safely.
490 */
491static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
492{
493 int jlist;
494
495 /* If this buffer is one which might reasonably be dirty
496 * --- ie. data, or not part of this journal --- then
497 * we're OK to leave it alone, but otherwise we need to
498 * move the dirty bit to the journal's own internal
499 * JBDDirty bit. */
500 jlist = jh->b_jlist;
501
502 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
503 jlist == BJ_Shadow || jlist == BJ_Forget) {
504 struct buffer_head *bh = jh2bh(jh);
505
506 if (test_clear_buffer_dirty(bh))
507 set_buffer_jbddirty(bh);
508 }
509}
510
511/*
512 * If the buffer is already part of the current transaction, then there
513 * is nothing we need to do. If it is already part of a prior
514 * transaction which we are still committing to disk, then we need to
515 * make sure that we do not overwrite the old copy: we do copy-out to
516 * preserve the copy going to disk. We also account the buffer against
517 * the handle's metadata buffer credits (unless the buffer is already
518 * part of the transaction, that is).
519 *
520 */
521static int
522do_get_write_access(handle_t *handle, struct journal_head *jh,
523 int force_copy)
524{
525 struct buffer_head *bh;
526 transaction_t *transaction;
527 journal_t *journal;
528 int error;
529 char *frozen_buffer = NULL;
530 int need_copy = 0;
531
532 if (is_handle_aborted(handle))
533 return -EROFS;
534
535 transaction = handle->h_transaction;
536 journal = transaction->t_journal;
537
538 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
539
540 JBUFFER_TRACE(jh, "entry");
541repeat:
542 bh = jh2bh(jh);
543
544 /* @@@ Need to check for errors here at some point. */
545
546 lock_buffer(bh);
547 jbd_lock_bh_state(bh);
548
549 /* We now hold the buffer lock so it is safe to query the buffer
550 * state. Is the buffer dirty?
551 *
552 * If so, there are two possibilities. The buffer may be
553 * non-journaled, and undergoing a quite legitimate writeback.
554 * Otherwise, it is journaled, and we don't expect dirty buffers
555 * in that state (the buffers should be marked JBDDirty
556 * instead.) So either the IO is being done under our own
557 * control and this is a bug, or it's a third party IO such as
558 * dump(8) (which may leave the buffer scheduled for read ---
559 * ie. locked but not dirty) or tune2fs (which may actually have
560 * the buffer dirtied, ugh.) */
561
562 if (buffer_dirty(bh)) {
563 /*
564 * First question: is this buffer already part of the current
565 * transaction or the existing committing transaction?
566 */
567 if (jh->b_transaction) {
568 J_ASSERT_JH(jh,
569 jh->b_transaction == transaction ||
570 jh->b_transaction ==
571 journal->j_committing_transaction);
572 if (jh->b_next_transaction)
573 J_ASSERT_JH(jh, jh->b_next_transaction ==
574 transaction);
575 }
576 /*
577 * In any case we need to clean the dirty flag and we must
578 * do it under the buffer lock to be sure we don't race
579 * with running write-out.
580 */
581 JBUFFER_TRACE(jh, "Unexpected dirty buffer");
582 jbd_unexpected_dirty_buffer(jh);
583 }
584
585 unlock_buffer(bh);
586
587 error = -EROFS;
588 if (is_handle_aborted(handle)) {
589 jbd_unlock_bh_state(bh);
590 goto out;
591 }
592 error = 0;
593
594 /*
595 * The buffer is already part of this transaction if b_transaction or
596 * b_next_transaction points to it
597 */
598 if (jh->b_transaction == transaction ||
599 jh->b_next_transaction == transaction)
600 goto done;
601
602 /*
603 * If there is already a copy-out version of this buffer, then we don't
604 * need to make another one
605 */
606 if (jh->b_frozen_data) {
607 JBUFFER_TRACE(jh, "has frozen data");
608 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
609 jh->b_next_transaction = transaction;
610 goto done;
611 }
612
613 /* Is there data here we need to preserve? */
614
615 if (jh->b_transaction && jh->b_transaction != transaction) {
616 JBUFFER_TRACE(jh, "owned by older transaction");
617 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
618 J_ASSERT_JH(jh, jh->b_transaction ==
619 journal->j_committing_transaction);
620
621 /* There is one case we have to be very careful about.
622 * If the committing transaction is currently writing
623 * this buffer out to disk and has NOT made a copy-out,
624 * then we cannot modify the buffer contents at all
625 * right now. The essence of copy-out is that it is the
626 * extra copy, not the primary copy, which gets
627 * journaled. If the primary copy is already going to
628 * disk then we cannot do copy-out here. */
629
630 if (jh->b_jlist == BJ_Shadow) {
631 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
632 wait_queue_head_t *wqh;
633
634 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
635
636 JBUFFER_TRACE(jh, "on shadow: sleep");
637 jbd_unlock_bh_state(bh);
638 /* commit wakes up all shadow buffers after IO */
639 for ( ; ; ) {
640 prepare_to_wait(wqh, &wait.wait,
641 TASK_UNINTERRUPTIBLE);
642 if (jh->b_jlist != BJ_Shadow)
643 break;
644 schedule();
645 }
646 finish_wait(wqh, &wait.wait);
647 goto repeat;
648 }
649
650 /* Only do the copy if the currently-owning transaction
651 * still needs it. If it is on the Forget list, the
652 * committing transaction is past that stage. The
653 * buffer had better remain locked during the kmalloc,
654 * but that should be true --- we hold the journal lock
655 * still and the buffer is already on the BUF_JOURNAL
656 * list so won't be flushed.
657 *
658 * Subtle point, though: if this is a get_undo_access,
659 * then we will be relying on the frozen_data to contain
660 * the new value of the committed_data record after the
661 * transaction, so we HAVE to force the frozen_data copy
662 * in that case. */
663
664 if (jh->b_jlist != BJ_Forget || force_copy) {
665 JBUFFER_TRACE(jh, "generate frozen data");
666 if (!frozen_buffer) {
667 JBUFFER_TRACE(jh, "allocate memory for buffer");
668 jbd_unlock_bh_state(bh);
669 frozen_buffer =
670 jbd2_slab_alloc(jh2bh(jh)->b_size,
671 GFP_NOFS);
672 if (!frozen_buffer) {
673 printk(KERN_EMERG
674 "%s: OOM for frozen_buffer\n",
675 __FUNCTION__);
676 JBUFFER_TRACE(jh, "oom!");
677 error = -ENOMEM;
678 jbd_lock_bh_state(bh);
679 goto done;
680 }
681 goto repeat;
682 }
683 jh->b_frozen_data = frozen_buffer;
684 frozen_buffer = NULL;
685 need_copy = 1;
686 }
687 jh->b_next_transaction = transaction;
688 }
689
690
691 /*
692 * Finally, if the buffer is not journaled right now, we need to make
693 * sure it doesn't get written to disk before the caller actually
694 * commits the new data
695 */
696 if (!jh->b_transaction) {
697 JBUFFER_TRACE(jh, "no transaction");
698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699 jh->b_transaction = transaction;
700 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 spin_lock(&journal->j_list_lock);
702 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
703 spin_unlock(&journal->j_list_lock);
704 }
705
706done:
707 if (need_copy) {
708 struct page *page;
709 int offset;
710 char *source;
711
712 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
713 "Possible IO failure.\n");
714 page = jh2bh(jh)->b_page;
715 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
716 source = kmap_atomic(page, KM_USER0);
717 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
718 kunmap_atomic(source, KM_USER0);
719 }
720 jbd_unlock_bh_state(bh);
721
722 /*
723 * If we are about to journal a buffer, then any revoke pending on it is
724 * no longer valid
725 */
726 jbd2_journal_cancel_revoke(handle, jh);
727
728out:
729 if (unlikely(frozen_buffer)) /* It's usually NULL */
730 jbd2_slab_free(frozen_buffer, bh->b_size);
731
732 JBUFFER_TRACE(jh, "exit");
733 return error;
734}
735
736/**
737 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
738 * @handle: transaction to add buffer modifications to
739 * @bh: bh to be used for metadata writes
740 *
741 *
742 * Returns an error code or 0 on success.
743 *
744 * In full data journalling mode the buffer may be of type BJ_AsyncData,
745 * because we're write()ing a buffer which is also part of a shared mapping.
746 */
747
748int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
749{
750 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
751 int rc;
752
753 /* We do not want to get caught playing with fields which the
754 * log thread also manipulates. Make sure that the buffer
755 * completes any outstanding IO before proceeding. */
756 rc = do_get_write_access(handle, jh, 0);
757 jbd2_journal_put_journal_head(jh);
758 return rc;
759}
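
Putting the pieces together, the canonical caller sequence is sketched below; modify_block() is a hypothetical stand-in for the actual metadata update:

        #include <linux/jbd2.h>
        #include <linux/err.h>

        static int update_metadata_block(journal_t *journal, struct buffer_head *bh)
        {
                handle_t *handle;
                int err, stop_err;

                handle = jbd2_journal_start(journal, 1); /* reserve one credit */
                if (IS_ERR(handle))
                        return PTR_ERR(handle);

                err = jbd2_journal_get_write_access(handle, bh);
                if (!err) {
                        modify_block(bh);                /* hypothetical update */
                        err = jbd2_journal_dirty_metadata(handle, bh);
                }
                stop_err = jbd2_journal_stop(handle);    /* may report an abort */
                return err ? err : stop_err;
        }
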
760
761
762/*
763 * When the user wants to journal a newly created buffer_head
764 * (ie. getblk() returned a new buffer and we are going to populate it
765 * manually rather than reading off disk), then we need to keep the
766 * buffer_head locked until it has been completely filled with new
767 * data. In this case, we should be able to make the assertion that
768 * the bh is not already part of an existing transaction.
769 *
770 * The buffer should already be locked by the caller by this point.
771 * There is no lock ranking violation: it was a newly created,
772 * unlocked buffer beforehand. */
773
774/**
775 * int jbd2_journal_get_create_access () - notify intent to use newly created bh
776 * @handle: transaction to add the new buffer to
777 * @bh: new buffer.
778 *
779 * Call this if you create a new bh.
780 */
781int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
782{
783 transaction_t *transaction = handle->h_transaction;
784 journal_t *journal = transaction->t_journal;
785 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
786 int err;
787
788 jbd_debug(5, "journal_head %p\n", jh);
789 err = -EROFS;
790 if (is_handle_aborted(handle))
791 goto out;
792 err = 0;
793
794 JBUFFER_TRACE(jh, "entry");
795 /*
796 * The buffer may already belong to this transaction due to pre-zeroing
797 * in the filesystem's new_block code. It may also be on the previous,
798 * committing transaction's lists, but it HAS to be in Forget state in
799 * that case: the transaction must have deleted the buffer for it to be
800 * reused here.
801 */
802 jbd_lock_bh_state(bh);
803 spin_lock(&journal->j_list_lock);
804 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
805 jh->b_transaction == NULL ||
806 (jh->b_transaction == journal->j_committing_transaction &&
807 jh->b_jlist == BJ_Forget)));
808
809 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
810 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
811
812 if (jh->b_transaction == NULL) {
813 jh->b_transaction = transaction;
814 JBUFFER_TRACE(jh, "file as BJ_Reserved");
815 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
816 } else if (jh->b_transaction == journal->j_committing_transaction) {
817 JBUFFER_TRACE(jh, "set next transaction");
818 jh->b_next_transaction = transaction;
819 }
820 spin_unlock(&journal->j_list_lock);
821 jbd_unlock_bh_state(bh);
822
823 /*
824 * akpm: I added this. ext3_alloc_branch can pick up new indirect
825 * blocks which contain freed but then revoked metadata. We need
826 * to cancel the revoke in case we end up freeing it yet again
827 * and the reallocating as data - this would cause a second revoke,
828 * which hits an assertion error.
829 */
830 JBUFFER_TRACE(jh, "cancelling revoke");
831 jbd2_journal_cancel_revoke(handle, jh);
832 jbd2_journal_put_journal_head(jh);
833out:
834 return err;
835}
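
A sketch of the intended sequence for a block that getblk() just returned; the buffer stays locked until it is fully populated, matching the rule described above:

        #include <linux/jbd2.h>

        static int init_new_metadata_block(handle_t *handle, struct buffer_head *bh)
        {
                int err;

                lock_buffer(bh);                 /* new block: never read from disk */
                err = jbd2_journal_get_create_access(handle, bh);
                if (err) {
                        unlock_buffer(bh);
                        return err;
                }
                memset(bh->b_data, 0, bh->b_size); /* populate manually */
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                return jbd2_journal_dirty_metadata(handle, bh);
        }
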
836
837/**
838 * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
839 * non-rewindable consequences
840 * @handle: transaction
841 * @bh: buffer to undo
842 *
843 *
844 * Sometimes there is a need to distinguish between metadata which has
845 * been committed to disk and that which has not. The ext3fs code uses
846 * this for freeing and allocating space, we have to make sure that we
847 * do not reuse freed space until the deallocation has been committed,
848 * since if we overwrote that space we would make the delete
849 * un-rewindable in case of a crash.
850 *
851 * To deal with that, jbd2_journal_get_undo_access requests write access to a
852 * buffer for parts of non-rewindable operations such as delete
853 * operations on the bitmaps. The journaling code must keep a copy of
854 * the buffer's contents prior to the undo_access call until such time
855 * as we know that the buffer has definitely been committed to disk.
856 *
857 * We never need to know which transaction the committed data is part
858 * of, buffers touched here are guaranteed to be dirtied later and so
859 * will be committed to a new transaction in due course, at which point
860 * we can discard the old committed data pointer.
861 *
862 * Returns error number or 0 on success.
863 */
864int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
865{
866 int err;
867 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
868 char *committed_data = NULL;
869
870 JBUFFER_TRACE(jh, "entry");
871
872 /*
873 * Do this first --- it can drop the journal lock, so we want to
874 * make sure that obtaining the committed_data is done
875 * atomically wrt. completion of any outstanding commits.
876 */
877 err = do_get_write_access(handle, jh, 1);
878 if (err)
879 goto out;
880
881repeat:
882 if (!jh->b_committed_data) {
883 committed_data = jbd2_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
884 if (!committed_data) {
885 printk(KERN_EMERG "%s: No memory for committed data\n",
886 __FUNCTION__);
887 err = -ENOMEM;
888 goto out;
889 }
890 }
891
892 jbd_lock_bh_state(bh);
893 if (!jh->b_committed_data) {
894 /* Copy out the current buffer contents into the
895 * preserved, committed copy. */
896 JBUFFER_TRACE(jh, "generate b_committed data");
897 if (!committed_data) {
898 jbd_unlock_bh_state(bh);
899 goto repeat;
900 }
901
902 jh->b_committed_data = committed_data;
903 committed_data = NULL;
904 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
905 }
906 jbd_unlock_bh_state(bh);
907out:
908 jbd2_journal_put_journal_head(jh);
909 if (unlikely(committed_data))
910 jbd2_slab_free(committed_data, bh->b_size);
911 return err;
912}
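
The classic user is bitmap deallocation, as described above. A sketch of the calling side; clear_block_bit() is a hypothetical helper that clears the allocation bit:

        #include <linux/jbd2.h>

        static int free_block_in_bitmap(handle_t *handle,
                                        struct buffer_head *bitmap_bh, int bit)
        {
                int err;

                /* Snapshot the committed image before touching the bitmap. */
                err = jbd2_journal_get_undo_access(handle, bitmap_bh);
                if (err)
                        return err;

                clear_block_bit(bitmap_bh, bit); /* hypothetical bit clear */
                return jbd2_journal_dirty_metadata(handle, bitmap_bh);
        }
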
913
914/**
915 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
916 * needs to be flushed before we can commit the
917 * current transaction.
918 * @handle: transaction
919 * @bh: bufferhead to mark
920 *
921 * The buffer is placed on the transaction's data list and is marked as
922 * belonging to the transaction.
923 *
924 * Returns error number or 0 on success.
925 *
926 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
927 * by kswapd.
928 */
929int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
930{
931 journal_t *journal = handle->h_transaction->t_journal;
932 int need_brelse = 0;
933 struct journal_head *jh;
934
935 if (is_handle_aborted(handle))
936 return 0;
937
938 jh = jbd2_journal_add_journal_head(bh);
939 JBUFFER_TRACE(jh, "entry");
940
941 /*
942 * The buffer could *already* be dirty. Writeout can start
943 * at any time.
944 */
945 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
946
947 /*
948 * What if the buffer is already part of a running transaction?
949 *
950 * There are two cases:
951 * 1) It is part of the current running transaction. Refile it,
952 * just in case we have allocated it as metadata, deallocated
953 * it, then reallocated it as data.
954 * 2) It is part of the previous, still-committing transaction.
955 * If all we want to do is to guarantee that the buffer will be
956 * written to disk before this new transaction commits, then
957 * being sure that the *previous* transaction has this same
958 * property is sufficient for us! Just leave it on its old
959 * transaction.
960 *
961 * In case (2), the buffer must not already exist as metadata
962 * --- that would violate write ordering (a transaction is free
963 * to write its data at any point, even before the previous
964 * committing transaction has committed). The caller must
965 * never, ever allow this to happen: there's nothing we can do
966 * about it in this layer.
967 */
968 jbd_lock_bh_state(bh);
969 spin_lock(&journal->j_list_lock);
970 if (jh->b_transaction) {
971 JBUFFER_TRACE(jh, "has transaction");
972 if (jh->b_transaction != handle->h_transaction) {
973 JBUFFER_TRACE(jh, "belongs to older transaction");
974 J_ASSERT_JH(jh, jh->b_transaction ==
975 journal->j_committing_transaction);
976
977 /* @@@ IS THIS TRUE ? */
978 /*
979 * Not any more. Scenario: someone does a write()
980 * in data=journal mode. The buffer's transaction has
981 * moved into commit. Then someone does another
982 * write() to the file. We do the frozen data copyout
983 * and set b_next_transaction to point to j_running_t.
984 * And while we're in that state, someone does a
985 * writepage() in an attempt to pageout the same area
986 * of the file via a shared mapping. At present that
987 * calls jbd2_journal_dirty_data(), and we get right here.
988 * It may be too late to journal the data. Simply
989 * falling through to the next test will suffice: the
990 * data will be dirty and will be checkpointed. The
991 * ordering comments in the next comment block still
992 * apply.
993 */
994 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
995
996 /*
997 * If we're journalling data, and this buffer was
998 * subject to a write(), it could be metadata, forget
999 * or shadow against the committing transaction. Now,
1000 * someone has dirtied the same darn page via a mapping
1001 * and it is being writepage()'d.
1002 * We *could* just steal the page from commit, with some
1003 * fancy locking there. Instead, we just skip it -
1004 * don't tie the page's buffers to the new transaction
1005 * at all.
1006 * Implication: if we crash before the writepage() data
1007 * is written into the filesystem, recovery will replay
1008 * the write() data.
1009 */
1010 if (jh->b_jlist != BJ_None &&
1011 jh->b_jlist != BJ_SyncData &&
1012 jh->b_jlist != BJ_Locked) {
1013 JBUFFER_TRACE(jh, "Not stealing");
1014 goto no_journal;
1015 }
1016
1017 /*
1018 * This buffer may be undergoing writeout in commit. We
1019 * can't return from here and let the caller dirty it
1020 * again because that can cause the write-out loop in
1021 * commit to never terminate.
1022 */
1023 if (buffer_dirty(bh)) {
1024 get_bh(bh);
1025 spin_unlock(&journal->j_list_lock);
1026 jbd_unlock_bh_state(bh);
1027 need_brelse = 1;
1028 sync_dirty_buffer(bh);
1029 jbd_lock_bh_state(bh);
1030 spin_lock(&journal->j_list_lock);
1031 /* The buffer may become locked again at any
1032 time if it is redirtied */
1033 }
1034
1035 /* journal_clean_data_list() may have got there first */
1036 if (jh->b_transaction != NULL) {
1037 JBUFFER_TRACE(jh, "unfile from commit");
1038 __jbd2_journal_temp_unlink_buffer(jh);
1039 /* It still points to the committing
1040 * transaction; move it to this one so
1041 * that the refile assert checks are
1042 * happy. */
1043 jh->b_transaction = handle->h_transaction;
1044 }
1045 /* The buffer will be refiled below */
1046
1047 }
1048 /*
1049 * Special case --- the buffer might actually have been
1050 * allocated and then immediately deallocated in the previous,
1051 * committing transaction, so might still be left on that
1052 * transaction's metadata lists.
1053 */
1054 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1055 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1056 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1057 __jbd2_journal_temp_unlink_buffer(jh);
1058 jh->b_transaction = handle->h_transaction;
1059 JBUFFER_TRACE(jh, "file as data");
1060 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1061 BJ_SyncData);
1062 }
1063 } else {
1064 JBUFFER_TRACE(jh, "not on a transaction");
1065 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1066 }
1067no_journal:
1068 spin_unlock(&journal->j_list_lock);
1069 jbd_unlock_bh_state(bh);
1070 if (need_brelse) {
1071 BUFFER_TRACE(bh, "brelse");
1072 __brelse(bh);
1073 }
1074 JBUFFER_TRACE(jh, "exit");
1075 jbd2_journal_put_journal_head(jh);
1076 return 0;
1077}
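
In ordered mode a filesystem ties each data buffer of a write to the running transaction with this call; a sketch of the per-page step, walking the buffer ring the way an ordered writepage path does:

        #include <linux/jbd2.h>

        static int order_page_buffers(handle_t *handle, struct page *page)
        {
                struct buffer_head *head = page_buffers(page);
                struct buffer_head *bh = head;
                int err = 0;

                do {
                        /* Files bh as BJ_SyncData: flushed before commit. */
                        err = jbd2_journal_dirty_data(handle, bh);
                        if (err)
                                break;
                        bh = bh->b_this_page;
                } while (bh != head);

                return err;
        }
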
1078
1079/**
1080 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1081 * @handle: transaction to add buffer to.
1082 * @bh: buffer to mark
1083 *
1084 * mark dirty metadata which needs to be journaled as part of the current
1085 * transaction.
1086 *
1087 * The buffer is placed on the transaction's metadata list and is marked
1088 * as belonging to the transaction.
1089 *
1090 * Returns error number or 0 on success.
1091 *
1092 * Special care needs to be taken if the buffer already belongs to the
1093 * current committing transaction (in which case we should have frozen
1094 * data present for that commit). In that case, we don't relink the
1095 * buffer: that only gets done when the old transaction finally
1096 * completes its commit.
1097 */
1098int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1099{
1100 transaction_t *transaction = handle->h_transaction;
1101 journal_t *journal = transaction->t_journal;
1102 struct journal_head *jh = bh2jh(bh);
1103
1104 jbd_debug(5, "journal_head %p\n", jh);
1105 JBUFFER_TRACE(jh, "entry");
1106 if (is_handle_aborted(handle))
1107 goto out;
1108
1109 jbd_lock_bh_state(bh);
1110
1111 if (jh->b_modified == 0) {
1112		 * The buffer has been modified and is becoming part
1113		 * of the transaction. The credit accounting needs to
1114		 * be done only once per transaction -bzzz
1115 * once a transaction -bzzz
1116 */
1117 jh->b_modified = 1;
1118 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1119 handle->h_buffer_credits--;
1120 }
1121
1122 /*
1123 * fastpath, to avoid expensive locking. If this buffer is already
1124 * on the running transaction's metadata list there is nothing to do.
1125 * Nobody can take it off again because there is a handle open.
1126 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1127 * result in this test being false, so we go in and take the locks.
1128 */
1129 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1130 JBUFFER_TRACE(jh, "fastpath");
1131 J_ASSERT_JH(jh, jh->b_transaction ==
1132 journal->j_running_transaction);
1133 goto out_unlock_bh;
1134 }
1135
1136 set_buffer_jbddirty(bh);
1137
1138 /*
1139 * Metadata already on the current transaction list doesn't
1140 * need to be filed. Metadata on another transaction's list must
1141 * be committing, and will be refiled once the commit completes:
1142 * leave it alone for now.
1143 */
1144 if (jh->b_transaction != transaction) {
1145 JBUFFER_TRACE(jh, "already on other transaction");
1146 J_ASSERT_JH(jh, jh->b_transaction ==
1147 journal->j_committing_transaction);
1148 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1149 /* And this case is illegal: we can't reuse another
1150 * transaction's data buffer, ever. */
1151 goto out_unlock_bh;
1152 }
1153
1154 /* That test should have eliminated the following case: */
1155	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1156
1157 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1158 spin_lock(&journal->j_list_lock);
1159 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1160 spin_unlock(&journal->j_list_lock);
1161out_unlock_bh:
1162 jbd_unlock_bh_state(bh);
1163out:
1164 JBUFFER_TRACE(jh, "exit");
1165 return 0;
1166}
1167
1168/*
1169 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1170 * updates, if the update decided in the end that it didn't need access.
1171 *
1172 */
1173void
1174jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1175{
1176 BUFFER_TRACE(bh, "entry");
1177}
1178
1179/**
1180 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1181 * @handle: transaction handle
1182 * @bh: bh to 'forget'
1183 *
1184 * We can only do the bforget if there are no commits pending against the
1185 * buffer. If the buffer is dirty in the current running transaction we
1186 * can safely unlink it.
1187 *
1188 * bh may not be a journalled buffer at all - it may be a non-JBD
1189 * buffer which came off the hashtable. Check for this.
1190 *
1191 * Decrements bh->b_count by one.
1192 *
1193 * Allow this call even if the handle has aborted --- it may be part of
1194 * the caller's cleanup after an abort.
1195 */
1196int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1197{
1198 transaction_t *transaction = handle->h_transaction;
1199 journal_t *journal = transaction->t_journal;
1200 struct journal_head *jh;
1201 int drop_reserve = 0;
1202 int err = 0;
1203
1204 BUFFER_TRACE(bh, "entry");
1205
1206 jbd_lock_bh_state(bh);
1207 spin_lock(&journal->j_list_lock);
1208
1209 if (!buffer_jbd(bh))
1210 goto not_jbd;
1211 jh = bh2jh(bh);
1212
1213 /* Critical error: attempting to delete a bitmap buffer, maybe?
1214 * Don't do any jbd operations, and return an error. */
1215 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1216 "inconsistent data on disk")) {
1217 err = -EIO;
1218 goto not_jbd;
1219 }
1220
1221 /*
1222 * The buffer's going from the transaction, we must drop
1223 * all references -bzzz
1224 */
1225 jh->b_modified = 0;
1226
1227 if (jh->b_transaction == handle->h_transaction) {
1228 J_ASSERT_JH(jh, !jh->b_frozen_data);
1229
1230 /* If we are forgetting a buffer which is already part
1231 * of this transaction, then we can just drop it from
1232 * the transaction immediately. */
1233 clear_buffer_dirty(bh);
1234 clear_buffer_jbddirty(bh);
1235
1236 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1237
1238 drop_reserve = 1;
1239
1240 /*
1241 * We are no longer going to journal this buffer.
1242 * However, the commit of this transaction is still
1243 * important to the buffer: the delete that we are now
1244 * processing might obsolete an old log entry, so by
1245 * committing, we can satisfy the buffer's checkpoint.
1246 *
1247 * So, if we have a checkpoint on the buffer, we should
1248 * now refile the buffer on our BJ_Forget list so that
1249 * we know to remove the checkpoint after we commit.
1250 */
1251
1252 if (jh->b_cp_transaction) {
1253 __jbd2_journal_temp_unlink_buffer(jh);
1254 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1255 } else {
1256 __jbd2_journal_unfile_buffer(jh);
1257 jbd2_journal_remove_journal_head(bh);
1258 __brelse(bh);
1259 if (!buffer_jbd(bh)) {
1260 spin_unlock(&journal->j_list_lock);
1261 jbd_unlock_bh_state(bh);
1262 __bforget(bh);
1263 goto drop;
1264 }
1265 }
1266 } else if (jh->b_transaction) {
1267 J_ASSERT_JH(jh, (jh->b_transaction ==
1268 journal->j_committing_transaction));
1269 /* However, if the buffer is still owned by a prior
1270 * (committing) transaction, we can't drop it yet... */
1271 JBUFFER_TRACE(jh, "belongs to older transaction");
1272 /* ... but we CAN drop it from the new transaction if we
1273 * have also modified it since the original commit. */
1274
1275 if (jh->b_next_transaction) {
1276 J_ASSERT(jh->b_next_transaction == transaction);
1277 jh->b_next_transaction = NULL;
1278 drop_reserve = 1;
1279 }
1280 }
1281
1282not_jbd:
1283 spin_unlock(&journal->j_list_lock);
1284 jbd_unlock_bh_state(bh);
1285 __brelse(bh);
1286drop:
1287 if (drop_reserve) {
1288 /* no need to reserve log space for this block -bzzz */
1289 handle->h_buffer_credits++;
1290 }
1291 return err;
1292}
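
The calling side (ext3_forget() is the model) chooses between forgetting the buffer and writing a revoke record. A sketch, where the might_be_replayed predicate condenses the journaling-mode checks a real filesystem performs:

        #include <linux/jbd2.h>

        static int discard_block(handle_t *handle, struct buffer_head *bh,
                                 unsigned long long blocknr, int might_be_replayed)
        {
                if (might_be_replayed)
                        /* Old journal copies must not be replayed over reused data. */
                        return jbd2_journal_revoke(handle, blocknr, bh);

                /* Safe to simply drop the buffer from the transaction. */
                return jbd2_journal_forget(handle, bh);
        }
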
1293
1294/**
1295 * int jbd2_journal_stop() - complete a transaction
1296 * @handle: transaction to complete.
1297 *
1298 * All done for a particular handle.
1299 *
1300 * There is not much action needed here. We just return any remaining
1301 * buffer credits to the transaction and remove the handle. The only
1302 * complication is that we need to start a commit operation if the
1303 * filesystem is marked for synchronous update.
1304 *
1305 * jbd2_journal_stop itself will not usually return an error, but it may
1306 * do so in unusual circumstances. In particular, expect it to
1307 * return -EIO if a jbd2_journal_abort has been executed since the
1308 * transaction began.
1309 */
1310int jbd2_journal_stop(handle_t *handle)
1311{
1312 transaction_t *transaction = handle->h_transaction;
1313 journal_t *journal = transaction->t_journal;
1314 int old_handle_count, err;
1315 pid_t pid;
1316
1317 J_ASSERT(journal_current_handle() == handle);
1318
1319 if (is_handle_aborted(handle))
1320 err = -EIO;
1321 else {
1322 J_ASSERT(transaction->t_updates > 0);
1323 err = 0;
1324 }
1325
1326 if (--handle->h_ref > 0) {
1327 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1328 handle->h_ref);
1329 return err;
1330 }
1331
1332 jbd_debug(4, "Handle %p going down\n", handle);
1333
1334 /*
1335 * Implement synchronous transaction batching. If the handle
1336 * was synchronous, don't force a commit immediately. Let's
1337 * yield and let another thread piggyback onto this transaction.
1338 * Keep doing that while new threads continue to arrive.
1339 * It doesn't cost much - we're about to run a commit and sleep
1340 * on IO anyway. Speeds up many-threaded, many-dir operations
1341 * by 30x or more...
1342 *
1343 * But don't do this if this process was the most recent one to
1344 * perform a synchronous write. We do this to detect the case where a
1345 * single process is doing a stream of sync writes. No point in waiting
1346 * for joiners in that case.
1347 */
1348 pid = current->pid;
1349 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1350 journal->j_last_sync_writer = pid;
1351 do {
1352 old_handle_count = transaction->t_handle_count;
1353 schedule_timeout_uninterruptible(1);
1354 } while (old_handle_count != transaction->t_handle_count);
1355 }
1356
1357 current->journal_info = NULL;
1358 spin_lock(&journal->j_state_lock);
1359 spin_lock(&transaction->t_handle_lock);
1360 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1361 transaction->t_updates--;
1362 if (!transaction->t_updates) {
1363 wake_up(&journal->j_wait_updates);
1364 if (journal->j_barrier_count)
1365 wake_up(&journal->j_wait_transaction_locked);
1366 }
1367
1368 /*
1369 * If the handle is marked SYNC, we need to set another commit
1370 * going! We also want to force a commit if the current
1371 * transaction is occupying too much of the log, or if the
1372 * transaction is too old now.
1373 */
1374 if (handle->h_sync ||
1375 transaction->t_outstanding_credits >
1376 journal->j_max_transaction_buffers ||
1377 time_after_eq(jiffies, transaction->t_expires)) {
1378 /* Do this even for aborted journals: an abort still
1379 * completes the commit thread, it just doesn't write
1380 * anything to disk. */
1381 tid_t tid = transaction->t_tid;
1382
1383 spin_unlock(&transaction->t_handle_lock);
1384 jbd_debug(2, "transaction too old, requesting commit for "
1385 "handle %p\n", handle);
1386 /* This is non-blocking */
1387 __jbd2_log_start_commit(journal, transaction->t_tid);
1388 spin_unlock(&journal->j_state_lock);
1389
1390 /*
1391 * Special case: JBD2_SYNC synchronous updates require us
1392 * to wait for the commit to complete.
1393 */
1394 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1395 err = jbd2_log_wait_commit(journal, tid);
1396 } else {
1397 spin_unlock(&transaction->t_handle_lock);
1398 spin_unlock(&journal->j_state_lock);
1399 }
1400
1401 jbd_free_handle(handle);
1402 return err;
1403}
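
For fsync()-style callers the pattern is to mark the handle synchronous before stopping it, so the stop waits for the commit and reports its fate; a minimal sketch:

        #include <linux/jbd2.h>

        static int stop_handle_sync(handle_t *handle)
        {
                handle->h_sync = 1;               /* stop will wait for the commit */
                return jbd2_journal_stop(handle); /* -EIO if the journal aborted */
        }
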
1404
1405/**
 * int jbd2_journal_force_commit() - force any uncommitted transactions
1406 * @journal: journal to force
1407 *
1408 * For synchronous operations: force any uncommitted transactions
1409 * to disk. May seem kludgy, but it reuses all the handle batching
1410 * code in a very simple manner.
1411 */
1412int jbd2_journal_force_commit(journal_t *journal)
1413{
1414 handle_t *handle;
1415 int ret;
1416
1417 handle = jbd2_journal_start(journal, 1);
1418 if (IS_ERR(handle)) {
1419 ret = PTR_ERR(handle);
1420 } else {
1421 handle->h_sync = 1;
1422 ret = jbd2_journal_stop(handle);
1423 }
1424 return ret;
1425}
1426
1427/*
1428 *
1429 * List management code snippets: various functions for manipulating the
1430 * transaction buffer lists.
1431 *
1432 */
1433
1434/*
1435 * Append a buffer to a transaction list, given the transaction's list head
1436 * pointer.
1437 *
1438 * j_list_lock is held.
1439 *
1440 * jbd_lock_bh_state(jh2bh(jh)) is held.
1441 */
1442
1443static inline void
1444__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1445{
1446 if (!*list) {
1447 jh->b_tnext = jh->b_tprev = jh;
1448 *list = jh;
1449 } else {
1450 /* Insert at the tail of the list to preserve order */
1451 struct journal_head *first = *list, *last = first->b_tprev;
1452 jh->b_tprev = last;
1453 jh->b_tnext = first;
1454 last->b_tnext = first->b_tprev = jh;
1455 }
1456}
1457
1458/*
1459 * Remove a buffer from a transaction list, given the transaction's list
1460 * head pointer.
1461 *
1462 * Called with j_list_lock held, and the journal may not be locked.
1463 *
1464 * jbd_lock_bh_state(jh2bh(jh)) is held.
1465 */
1466
1467static inline void
1468__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1469{
1470 if (*list == jh) {
1471 *list = jh->b_tnext;
1472 if (*list == jh)
1473 *list = NULL;
1474 }
1475 jh->b_tprev->b_tnext = jh->b_tnext;
1476 jh->b_tnext->b_tprev = jh->b_tprev;
1477}
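
The two helpers above implement an unsorted circular doubly-linked list threaded through the journal_head. Their invariants (a singleton points at itself; the list head goes NULL when the last element leaves) can be exercised in a pared-down, standalone mock; this userspace sketch mirrors the kernel logic:

        #include <assert.h>
        #include <stddef.h>

        struct jh {                     /* pared-down journal_head: t-list links only */
                struct jh *b_tnext, *b_tprev;
        };

        static void blist_add(struct jh **list, struct jh *jh)
        {
                if (!*list) {
                        jh->b_tnext = jh->b_tprev = jh; /* singleton points at itself */
                        *list = jh;
                } else {
                        struct jh *first = *list, *last = first->b_tprev;
                        jh->b_tprev = last;
                        jh->b_tnext = first;
                        last->b_tnext = first->b_tprev = jh; /* append at tail */
                }
        }

        static void blist_del(struct jh **list, struct jh *jh)
        {
                if (*list == jh) {
                        *list = jh->b_tnext;
                        if (*list == jh)
                                *list = NULL;   /* last element gone */
                }
                jh->b_tprev->b_tnext = jh->b_tnext;
                jh->b_tnext->b_tprev = jh->b_tprev;
        }

        int main(void)
        {
                struct jh a, b, *list = NULL;

                blist_add(&list, &a);
                blist_add(&list, &b);
                assert(list == &a && a.b_tnext == &b && b.b_tnext == &a);
                blist_del(&list, &a);
                assert(list == &b && b.b_tnext == &b); /* back to a singleton */
                blist_del(&list, &b);
                assert(list == NULL);
                return 0;
        }
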
1478
1479/*
1480 * Remove a buffer from the appropriate transaction list.
1481 *
1482 * Note that this function can *change* the value of
1483 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1484 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1485 * is holding onto a copy of one of these pointers, it could go bad.
1486 * Generally the caller needs to re-read the pointer from the transaction_t.
1487 *
1488 * Called under j_list_lock. The journal may not be locked.
1489 */
1490void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1491{
1492 struct journal_head **list = NULL;
1493 transaction_t *transaction;
1494 struct buffer_head *bh = jh2bh(jh);
1495
1496 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1497 transaction = jh->b_transaction;
1498 if (transaction)
1499 assert_spin_locked(&transaction->t_journal->j_list_lock);
1500
1501 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1502 if (jh->b_jlist != BJ_None)
1503		J_ASSERT_JH(jh, transaction != NULL);
1504
1505 switch (jh->b_jlist) {
1506 case BJ_None:
1507 return;
1508 case BJ_SyncData:
1509 list = &transaction->t_sync_datalist;
1510 break;
1511 case BJ_Metadata:
1512 transaction->t_nr_buffers--;
1513 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1514 list = &transaction->t_buffers;
1515 break;
1516 case BJ_Forget:
1517 list = &transaction->t_forget;
1518 break;
1519 case BJ_IO:
1520 list = &transaction->t_iobuf_list;
1521 break;
1522 case BJ_Shadow:
1523 list = &transaction->t_shadow_list;
1524 break;
1525 case BJ_LogCtl:
1526 list = &transaction->t_log_list;
1527 break;
1528 case BJ_Reserved:
1529 list = &transaction->t_reserved_list;
1530 break;
1531 case BJ_Locked:
1532 list = &transaction->t_locked_list;
1533 break;
1534 }
1535
1536 __blist_del_buffer(list, jh);
1537 jh->b_jlist = BJ_None;
1538 if (test_clear_buffer_jbddirty(bh))
1539 mark_buffer_dirty(bh); /* Expose it to the VM */
1540}
1541
1542void __jbd2_journal_unfile_buffer(struct journal_head *jh)
1543{
1544 __jbd2_journal_temp_unlink_buffer(jh);
1545 jh->b_transaction = NULL;
1546}
1547
1548void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1549{
1550 jbd_lock_bh_state(jh2bh(jh));
1551 spin_lock(&journal->j_list_lock);
1552 __jbd2_journal_unfile_buffer(jh);
1553 spin_unlock(&journal->j_list_lock);
1554 jbd_unlock_bh_state(jh2bh(jh));
1555}
1556
1557/*
1558 * Called from jbd2_journal_try_to_free_buffers().
1559 *
1560 * Called under jbd_lock_bh_state(bh)
1561 */
1562static void
1563__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1564{
1565 struct journal_head *jh;
1566
1567 jh = bh2jh(bh);
1568
1569 if (buffer_locked(bh) || buffer_dirty(bh))
1570 goto out;
1571
1572	if (jh->b_next_transaction != NULL)
1573 goto out;
1574
1575 spin_lock(&journal->j_list_lock);
1576	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
1577 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1578 /* A written-back ordered data buffer */
1579 JBUFFER_TRACE(jh, "release data");
1580 __jbd2_journal_unfile_buffer(jh);
1581 jbd2_journal_remove_journal_head(bh);
1582 __brelse(bh);
1583 }
1584	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1585 /* written-back checkpointed metadata buffer */
1586 if (jh->b_jlist == BJ_None) {
1587 JBUFFER_TRACE(jh, "remove from checkpoint list");
1588 __jbd2_journal_remove_checkpoint(jh);
1589 jbd2_journal_remove_journal_head(bh);
1590 __brelse(bh);
1591 }
1592 }
1593 spin_unlock(&journal->j_list_lock);
1594out:
1595 return;
1596}
1597
1598
1599/**
1600 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1601 * @journal: journal for operation
1602 * @page: to try and free
1603 * @unused_gfp_mask: unused
1604 *
1605 *
1606 * For all the buffers on this page,
1607 * if they are fully written out ordered data, move them onto BUF_CLEAN
1608 * so try_to_free_buffers() can reap them.
1609 *
1610 * This function returns non-zero if we wish try_to_free_buffers()
1611 * to be called. We do this if the page is releasable by try_to_free_buffers().
1612 * We also do it if the page has locked or dirty buffers and the caller wants
1613 * us to perform sync or async writeout.
1614 *
1615 * This complicates JBD locking somewhat. We aren't protected by the
1616 * BKL here. We wish to remove the buffer from its committing or
1617 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
1618 *
1619 * This may *change* the value of transaction_t->t_datalist, so anyone
1620 * who looks at t_datalist needs to lock against this function.
1621 *
1622 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
1623 * buffer. So we need to lock against that. jbd2_journal_dirty_data()
1624 * will come out of the lock with the buffer dirty, which makes it
1625 * ineligible for release here.
1626 *
1627 * Who else is affected by this? hmm... Really the only contender
1628 * is do_get_write_access() - it could be looking at the buffer while
1629 * __journal_try_to_free_buffer() is changing its state. But that
1630 * cannot happen because we never reallocate freed data as metadata
1631 * while the data is part of a transaction. Yes?
1632 */
1633int jbd2_journal_try_to_free_buffers(journal_t *journal,
1634 struct page *page, gfp_t unused_gfp_mask)
1635{
1636 struct buffer_head *head;
1637 struct buffer_head *bh;
1638 int ret = 0;
1639
1640 J_ASSERT(PageLocked(page));
1641
1642 head = page_buffers(page);
1643 bh = head;
1644 do {
1645 struct journal_head *jh;
1646
1647 /*
1648 * We take our own ref against the journal_head here to avoid
1649 * having to add tons of locking around each instance of
1650 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
1651 */
1652 jh = jbd2_journal_grab_journal_head(bh);
1653 if (!jh)
1654 continue;
1655
1656 jbd_lock_bh_state(bh);
1657 __journal_try_to_free_buffer(journal, bh);
1658 jbd2_journal_put_journal_head(jh);
1659 jbd_unlock_bh_state(bh);
1660 if (buffer_jbd(bh))
1661 goto busy;
1662 } while ((bh = bh->b_this_page) != head);
1663 ret = try_to_free_buffers(page);
1664busy:
1665 return ret;
1666}
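
A filesystem hooks this in through its address_space releasepage operation; a sketch of the glue, where get_journal() is a hypothetical mapping from the page's inode to its journal (ext3 keeps the equivalent in its superblock info):

        #include <linux/jbd2.h>
        #include <linux/pagemap.h>

        static int fs_releasepage(struct page *page, gfp_t gfp)
        {
                journal_t *journal = get_journal(page->mapping->host); /* hypothetical */

                return jbd2_journal_try_to_free_buffers(journal, page, gfp);
        }
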
1667
1668/*
1669 * This buffer is no longer needed. If it is on an older transaction's
1670 * checkpoint list we need to record it on this transaction's forget list
1671 * to pin this buffer (and hence its checkpointing transaction) down until
1672 * this transaction commits. If the buffer isn't on a checkpoint list, we
1673 * release it.
1674 * Returns non-zero if JBD no longer has an interest in the buffer.
1675 *
1676 * Called under j_list_lock.
1677 *
1678 * Called under jbd_lock_bh_state(bh).
1679 */
1680static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1681{
1682 int may_free = 1;
1683 struct buffer_head *bh = jh2bh(jh);
1684
1685 __jbd2_journal_unfile_buffer(jh);
1686
1687 if (jh->b_cp_transaction) {
1688 JBUFFER_TRACE(jh, "on running+cp transaction");
1689 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1690 clear_buffer_jbddirty(bh);
1691 may_free = 0;
1692 } else {
1693 JBUFFER_TRACE(jh, "on running transaction");
1694 jbd2_journal_remove_journal_head(bh);
1695 __brelse(bh);
1696 }
1697 return may_free;
1698}
1699
1700/*
1701 * jbd2_journal_invalidatepage
1702 *
1703 * This code is tricky. It has a number of cases to deal with.
1704 *
1705 * There are two invariants which this code relies on:
1706 *
1707 * i_size must be updated on disk before we start calling invalidatepage on the
1708 * data.
1709 *
1710 * This is done in ext3 by defining an ext3_setattr method which
1711 * updates i_size before truncate gets going. By maintaining this
1712 * invariant, we can be sure that it is safe to throw away any buffers
1713 * attached to the current transaction: once the transaction commits,
1714 * we know that the data will not be needed.
1715 *
1716 * Note however that we can *not* throw away data belonging to the
1717 * previous, committing transaction!
1718 *
1719 * Any disk blocks which *are* part of the previous, committing
1720 * transaction (and which therefore cannot be discarded immediately) are
1721 * not going to be reused in the new running transaction.
1722 *
1723 * The bitmap committed_data images guarantee this: any block which is
1724 * allocated in one transaction and removed in the next will be marked
1725 * as in-use in the committed_data bitmap, so cannot be reused until
1726 * the next transaction to delete the block commits. This means that
1727 * leaving committing buffers dirty is quite safe: the disk blocks
1728 * cannot be reallocated to a different file and so buffer aliasing is
1729 * not possible.
1730 *
1731 *
1732 * The above applies mainly to ordered data mode. In writeback mode we
1733 * don't make guarantees about the order in which data hits disk --- in
1734 * particular we don't guarantee that new dirty data is flushed before
1735 * transaction commit --- so it is always safe just to discard data
1736 * immediately in that mode. --sct
1737 */
1738
1739/*
1740 * The journal_unmap_buffer helper function returns zero if the buffer
1741 * concerned remains pinned as an anonymous buffer belonging to an older
1742 * transaction.
1743 *
1744 * We're outside-transaction here. Either or both of j_running_transaction
1745 * and j_committing_transaction may be NULL.
1746 */
1747static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1748{
1749 transaction_t *transaction;
1750 struct journal_head *jh;
1751 int may_free = 1;
1752 int ret;
1753
1754 BUFFER_TRACE(bh, "entry");
1755
1756 /*
1757 * It is safe to proceed here without the j_list_lock because the
1758 * buffers cannot be stolen by try_to_free_buffers as long as we are
1759 * holding the page lock. --sct
1760 */
1761
1762 if (!buffer_jbd(bh))
1763 goto zap_buffer_unlocked;
1764
1765 spin_lock(&journal->j_state_lock);
1766 jbd_lock_bh_state(bh);
1767 spin_lock(&journal->j_list_lock);
1768
1769 jh = jbd2_journal_grab_journal_head(bh);
1770 if (!jh)
1771 goto zap_buffer_no_jh;
1772
1773 transaction = jh->b_transaction;
1774 if (transaction == NULL) {
1775 /* First case: not on any transaction. If it
1776 * has no checkpoint link, then we can zap it:
1777 * it's a writeback-mode buffer so we don't care
1778 * if it hits disk safely. */
1779 if (!jh->b_cp_transaction) {
1780 JBUFFER_TRACE(jh, "not on any transaction: zap");
1781 goto zap_buffer;
1782 }
1783
1784 if (!buffer_dirty(bh)) {
1785 /* bdflush has written it. We can drop it now */
1786 goto zap_buffer;
1787 }
1788
1789 /* OK, it must be in the journal but still not
1790 * written fully to disk: it's metadata or
1791 * journaled data... */
1792
1793 if (journal->j_running_transaction) {
1794 /* ... and once the current transaction has
1795 * committed, the buffer won't be needed any
1796 * longer. */
1797 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1798 ret = __dispose_buffer(jh,
1799 journal->j_running_transaction);
1800 jbd2_journal_put_journal_head(jh);
1801 spin_unlock(&journal->j_list_lock);
1802 jbd_unlock_bh_state(bh);
1803 spin_unlock(&journal->j_state_lock);
1804 return ret;
1805 } else {
1806 /* There is no currently-running transaction. So the
1807 * orphan record which we wrote for this file must have
1808 * passed into commit. We must attach this buffer to
1809 * the committing transaction, if it exists. */
1810 if (journal->j_committing_transaction) {
1811 JBUFFER_TRACE(jh, "give to committing trans");
1812 ret = __dispose_buffer(jh,
1813 journal->j_committing_transaction);
1814 jbd2_journal_put_journal_head(jh);
1815 spin_unlock(&journal->j_list_lock);
1816 jbd_unlock_bh_state(bh);
1817 spin_unlock(&journal->j_state_lock);
1818 return ret;
1819 } else {
1820 /* The orphan record's transaction has
1821 * committed. We can cleanse this buffer */
1822 clear_buffer_jbddirty(bh);
1823 goto zap_buffer;
1824 }
1825 }
1826 } else if (transaction == journal->j_committing_transaction) {
1827 if (jh->b_jlist == BJ_Locked) {
1828 /*
1829 * The buffer is on the committing transaction's locked
1830 * list. We have the buffer locked, so I/O has
1831 * completed. So we can nail the buffer now.
1832 */
1833 may_free = __dispose_buffer(jh, transaction);
1834 goto zap_buffer;
1835 }
1836 /*
1837 * If it is committing, we simply cannot touch it. We
1838		 * can remove its next_transaction pointer from the
1839 * running transaction if that is set, but nothing
1840 * else. */
1841 JBUFFER_TRACE(jh, "on committing transaction");
1842 set_buffer_freed(bh);
1843 if (jh->b_next_transaction) {
1844 J_ASSERT(jh->b_next_transaction ==
1845 journal->j_running_transaction);
1846 jh->b_next_transaction = NULL;
1847 }
1848 jbd2_journal_put_journal_head(jh);
1849 spin_unlock(&journal->j_list_lock);
1850 jbd_unlock_bh_state(bh);
1851 spin_unlock(&journal->j_state_lock);
1852 return 0;
1853 } else {
1854 /* Good, the buffer belongs to the running transaction.
1855 * We are writing our own transaction's data, not any
1856 * previous one's, so it is safe to throw it away
1857 * (remember that we expect the filesystem to have set
1858 * i_size already for this truncate so recovery will not
1859 * expose the disk blocks we are discarding here.) */
1860 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1861 may_free = __dispose_buffer(jh, transaction);
1862 }
1863
1864zap_buffer:
1865 jbd2_journal_put_journal_head(jh);
1866zap_buffer_no_jh:
1867 spin_unlock(&journal->j_list_lock);
1868 jbd_unlock_bh_state(bh);
1869 spin_unlock(&journal->j_state_lock);
1870zap_buffer_unlocked:
1871 clear_buffer_dirty(bh);
1872 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
1873 clear_buffer_mapped(bh);
1874 clear_buffer_req(bh);
1875 clear_buffer_new(bh);
1876 bh->b_bdev = NULL;
1877 return may_free;
1878}
1879
1880/**
1881 * void jbd2_journal_invalidatepage()
1882 * @journal: journal to use for flush...
1883 * @page: page to flush
1884 * @offset: length of page to invalidate.
1885 *
1886 * Reap page buffers containing data after offset in page.
1887 *
1888 */
1889void jbd2_journal_invalidatepage(journal_t *journal,
1890 struct page *page,
1891 unsigned long offset)
1892{
1893 struct buffer_head *head, *bh, *next;
1894 unsigned int curr_off = 0;
1895 int may_free = 1;
1896
1897	BUG_ON(!PageLocked(page));
1899 if (!page_has_buffers(page))
1900 return;
1901
1902 /* We will potentially be playing with lists other than just the
1903 * data lists (especially for journaled data mode), so be
1904 * cautious in our locking. */
1905
1906 head = bh = page_buffers(page);
1907 do {
1908 unsigned int next_off = curr_off + bh->b_size;
1909 next = bh->b_this_page;
1910
1911 if (offset <= curr_off) {
1912 /* This block is wholly outside the truncation point */
1913 lock_buffer(bh);
1914 may_free &= journal_unmap_buffer(journal, bh);
1915 unlock_buffer(bh);
1916 }
1917 curr_off = next_off;
1918 bh = next;
1919
1920 } while (bh != head);
1921
1922 if (!offset) {
1923 if (may_free && try_to_free_buffers(page))
1924 J_ASSERT(!page_has_buffers(page));
1925 }
1926}
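
The matching truncate-side glue, again with the hypothetical get_journal() from the previous sketch; the real caller must respect the i_size ordering invariant documented above:

        #include <linux/jbd2.h>
        #include <linux/pagemap.h>

        static void fs_invalidatepage(struct page *page, unsigned long offset)
        {
                journal_t *journal = get_journal(page->mapping->host); /* hypothetical */

                jbd2_journal_invalidatepage(journal, page, offset);
        }
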
1927
1928/*
1929 * File a buffer on the given transaction list.
1930 */
1931void __jbd2_journal_file_buffer(struct journal_head *jh,
1932 transaction_t *transaction, int jlist)
1933{
1934 struct journal_head **list = NULL;
1935 int was_dirty = 0;
1936 struct buffer_head *bh = jh2bh(jh);
1937
1938 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1939 assert_spin_locked(&transaction->t_journal->j_list_lock);
1940
1941 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1942 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1943				jh->b_transaction == NULL);
1944
1945 if (jh->b_transaction && jh->b_jlist == jlist)
1946 return;
1947
1948 /* The following list of buffer states needs to be consistent
1949	 * with jbd_unexpected_dirty_buffer()'s handling of dirty
1950 * state. */
1951
1952 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1953 jlist == BJ_Shadow || jlist == BJ_Forget) {
1954 if (test_clear_buffer_dirty(bh) ||
1955 test_clear_buffer_jbddirty(bh))
1956 was_dirty = 1;
1957 }
1958
1959 if (jh->b_transaction)
1960 __jbd2_journal_temp_unlink_buffer(jh);
1961 jh->b_transaction = transaction;
1962
1963 switch (jlist) {
1964 case BJ_None:
1965 J_ASSERT_JH(jh, !jh->b_committed_data);
1966 J_ASSERT_JH(jh, !jh->b_frozen_data);
1967 return;
1968 case BJ_SyncData:
1969 list = &transaction->t_sync_datalist;
1970 break;
1971 case BJ_Metadata:
1972 transaction->t_nr_buffers++;
1973 list = &transaction->t_buffers;
1974 break;
1975 case BJ_Forget:
1976 list = &transaction->t_forget;
1977 break;
1978 case BJ_IO:
1979 list = &transaction->t_iobuf_list;
1980 break;
1981 case BJ_Shadow:
1982 list = &transaction->t_shadow_list;
1983 break;
1984 case BJ_LogCtl:
1985 list = &transaction->t_log_list;
1986 break;
1987 case BJ_Reserved:
1988 list = &transaction->t_reserved_list;
1989 break;
1990 case BJ_Locked:
1991 list = &transaction->t_locked_list;
1992 break;
1993 }
1994
1995 __blist_add_buffer(list, jh);
1996 jh->b_jlist = jlist;
1997
1998 if (was_dirty)
1999 set_buffer_jbddirty(bh);
2000}
2001
2002void jbd2_journal_file_buffer(struct journal_head *jh,
2003 transaction_t *transaction, int jlist)
2004{
2005 jbd_lock_bh_state(jh2bh(jh));
2006 spin_lock(&transaction->t_journal->j_list_lock);
2007 __jbd2_journal_file_buffer(jh, transaction, jlist);
2008 spin_unlock(&transaction->t_journal->j_list_lock);
2009 jbd_unlock_bh_state(jh2bh(jh));
2010}
2011
2012/*
2013 * Remove a buffer from its current buffer list in preparation for
2014 * dropping it from its current transaction entirely. If the buffer has
2015 * already started to be used by a subsequent transaction, refile the
2016 * buffer on that transaction's metadata list.
2017 *
2018 * Called under journal->j_list_lock
2019 *
2020 * Called under jbd_lock_bh_state(jh2bh(jh))
2021 */
2022void __jbd2_journal_refile_buffer(struct journal_head *jh)
2023{
2024 int was_dirty;
2025 struct buffer_head *bh = jh2bh(jh);
2026
2027 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2028 if (jh->b_transaction)
2029 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2030
2031 /* If the buffer is now unused, just drop it. */
2032 if (jh->b_next_transaction == NULL) {
2033 __jbd2_journal_unfile_buffer(jh);
2034 return;
2035 }
2036
2037 /*
2038 * It has been modified by a later transaction: add it to the new
2039 * transaction's metadata list.
2040 */
2041
2042 was_dirty = test_clear_buffer_jbddirty(bh);
2043 __jbd2_journal_temp_unlink_buffer(jh);
2044 jh->b_transaction = jh->b_next_transaction;
2045 jh->b_next_transaction = NULL;
2046 __jbd2_journal_file_buffer(jh, jh->b_transaction,
2047 was_dirty ? BJ_Metadata : BJ_Reserved);
2048 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2049
2050 if (was_dirty)
2051 set_buffer_jbddirty(bh);
2052}
2053
2054/*
2055 * For the unlocked version of this call, also make sure that any
2056 * hanging journal_head is cleaned up if necessary.
2057 *
2058 * __jbd2_journal_refile_buffer is usually called as part of a single locked
2059 * operation on a buffer_head, in which the caller is probably going to
2060 * be hooking the journal_head onto other lists. In that case it is up
2061 * to the caller to remove the journal_head if necessary. For the
2062 * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
2063 * doing anything else to the buffer so we need to do the cleanup
2064 * ourselves to avoid a jh leak.
2065 *
2066 * *** The journal_head may be freed by this call! ***
2067 */
2068void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2069{
2070 struct buffer_head *bh = jh2bh(jh);
2071
2072 jbd_lock_bh_state(bh);
2073 spin_lock(&journal->j_list_lock);
2074
2075 __jbd2_journal_refile_buffer(jh);
2076 jbd_unlock_bh_state(bh);
2077 jbd2_journal_remove_journal_head(bh);
2078
2079 spin_unlock(&journal->j_list_lock);
2080 __brelse(bh);
2081}