aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd2')
-rw-r--r--fs/jbd2/Makefile7
-rw-r--r--fs/jbd2/checkpoint.c697
-rw-r--r--fs/jbd2/commit.c911
-rw-r--r--fs/jbd2/journal.c2072
-rw-r--r--fs/jbd2/recovery.c592
-rw-r--r--fs/jbd2/revoke.c703
-rw-r--r--fs/jbd2/transaction.c2080
7 files changed, 7062 insertions, 0 deletions
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..54aca4868a36
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
#
# Makefile for the linux journaling routines (jbd2 fork).
#

# This directory is fs/jbd2: build the module as jbd2.o under its own
# Kconfig symbol so it can coexist with the original fs/jbd version.
# (Building jbd.o from CONFIG_JBD here would collide with fs/jbd.)
obj-$(CONFIG_JBD2) += jbd2.o

jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..0208cc7ac5d0
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
1/*
2 * linux/fs/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25
26/*
27 * Unlink a buffer from a transaction checkpoint list.
28 *
29 * Called with j_list_lock held.
30 */
31static inline void __buffer_unlink_first(struct journal_head *jh)
32{
33 transaction_t *transaction = jh->b_cp_transaction;
34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL;
41 }
42}
43
44/*
45 * Unlink a buffer from a transaction checkpoint(io) list.
46 *
47 * Called with j_list_lock held.
48 */
49static inline void __buffer_unlink(struct journal_head *jh)
50{
51 transaction_t *transaction = jh->b_cp_transaction;
52
53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL;
58 }
59}
60
61/*
62 * Move a buffer from the checkpoint list to the checkpoint io list
63 *
64 * Called with j_list_lock held
65 */
66static inline void __buffer_relink_io(struct journal_head *jh)
67{
68 transaction_t *transaction = jh->b_cp_transaction;
69
70 __buffer_unlink_first(jh);
71
72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh;
79 }
80 transaction->t_checkpoint_io_list = jh;
81}
82
/*
 * Try to release a checkpointed buffer from its transaction.
 * Returns 1 if we released it and 2 if we also released the
 * whole transaction.
 *
 * Requires j_list_lock
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
static int __try_to_free_cp_buf(struct journal_head *jh)
{
        int ret = 0;
        struct buffer_head *bh = jh2bh(jh);

        /* Only release a fully quiescent buffer: on no transaction list,
         * not under IO, and with no dirty data left to write. */
        if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
                JBUFFER_TRACE(jh, "remove from checkpoint list");
                /* __journal_remove_checkpoint() returns 1 if it freed the
                 * whole transaction, hence the +1 mapping to 1 or 2. */
                ret = __journal_remove_checkpoint(jh) + 1;
                jbd_unlock_bh_state(bh);
                journal_remove_journal_head(bh);
                BUFFER_TRACE(bh, "release");
                /* Drop the reference the checkpoint list was holding. */
                __brelse(bh);
        } else {
                jbd_unlock_bh_state(bh);
        }
        return ret;
}
108
/*
 * __log_wait_for_space: wait until there is space in the journal.
 *
 * Called under j-state_lock *only*.  It will be unlocked if we have to wait
 * for a checkpoint to free up some space in the log.
 */
void __log_wait_for_space(journal_t *journal)
{
        int nblocks;
        assert_spin_locked(&journal->j_state_lock);

        nblocks = jbd_space_needed(journal);
        while (__log_space_left(journal) < nblocks) {
                /* An aborted journal can never make progress: bail out. */
                if (journal->j_flags & JFS_ABORT)
                        return;
                spin_unlock(&journal->j_state_lock);
                mutex_lock(&journal->j_checkpoint_mutex);

                /*
                 * Test again, another process may have checkpointed while we
                 * were waiting for the checkpoint lock
                 */
                spin_lock(&journal->j_state_lock);
                nblocks = jbd_space_needed(journal);
                if (__log_space_left(journal) < nblocks) {
                        /* Still short of space: run a checkpoint pass with
                         * j_state_lock dropped, then re-evaluate. */
                        spin_unlock(&journal->j_state_lock);
                        log_do_checkpoint(journal);
                        spin_lock(&journal->j_state_lock);
                }
                mutex_unlock(&journal->j_checkpoint_mutex);
        }
}
141
/*
 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
 * The caller must restart a list walk.  Wait for someone else to run
 * jbd_unlock_bh_state().
 */
static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
        __releases(journal->j_list_lock)
{
        /* Pin the buffer so it cannot vanish once j_list_lock is gone. */
        get_bh(bh);
        spin_unlock(&journal->j_list_lock);
        /* Block until the current bh_state holder releases the lock;
         * we learn nothing from holding it ourselves, so drop it at once. */
        jbd_lock_bh_state(bh);
        jbd_unlock_bh_state(bh);
        put_bh(bh);
}
156
/*
 * Clean up transaction's list of buffers submitted for io.
 * We wait for any pending IO to complete and remove any clean
 * buffers. Note that we take the buffers in the opposite ordering
 * from the one in which they were submitted for IO.
 *
 * Called with j_list_lock held.
 */
static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        tid_t this_tid;
        int released = 0;

        /* Remember the tid so we can detect the transaction being freed
         * and its memory reused while locks were dropped. */
        this_tid = transaction->t_tid;
restart:
        /* Did somebody clean up the transaction in the meanwhile? */
        if (journal->j_checkpoint_transactions != transaction ||
                        transaction->t_tid != this_tid)
                return;
        /* Stop as soon as the transaction itself has been released. */
        while (!released && transaction->t_checkpoint_io_list) {
                jh = transaction->t_checkpoint_io_list;
                bh = jh2bh(jh);
                if (!jbd_trylock_bh_state(bh)) {
                        /* Lock-order inversion: wait for the bh_state
                         * holder, then rescan from the top. */
                        jbd_sync_bh(journal, bh);
                        spin_lock(&journal->j_list_lock);
                        goto restart;
                }
                if (buffer_locked(bh)) {
                        /* IO still in flight: wait for it with all locks
                         * dropped, holding only a buffer reference. */
                        atomic_inc(&bh->b_count);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
                        wait_on_buffer(bh);
                        /* the journal_head may have gone by now */
                        BUFFER_TRACE(bh, "brelse");
                        __brelse(bh);
                        spin_lock(&journal->j_list_lock);
                        goto restart;
                }
                /*
                 * Now in whatever state the buffer currently is, we know that
                 * it has been written out and so we can drop it from the list
                 */
                released = __journal_remove_checkpoint(jh);
                jbd_unlock_bh_state(bh);
                journal_remove_journal_head(bh);
                __brelse(bh);
        }
}
207
/* Max buffers queued by __process_buffer() before a forced submit. */
#define NR_BATCH 64

/*
 * Submit a batch of checkpoint buffers for write-out and drop the
 * reference that was taken when each buffer was queued.
 *
 * Callers drop j_list_lock before invoking this (submission can block).
 */
static void
__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
        int i;

        /* SWRITE: like WRITE, for buffers we have already locked —
         * see ll_rw_block(). */
        ll_rw_block(SWRITE, *batch_count, bhs);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = bhs[i];
                /* jwrite flag was set when the buffer was queued. */
                clear_buffer_jwrite(bh);
                BUFFER_TRACE(bh, "brelse");
                __brelse(bh);
        }
        *batch_count = 0;
}
224
/*
 * Try to flush one buffer from the checkpoint list to disk.
 *
 * Return 1 if something happened which requires us to abort the current
 * scan of the checkpoint list.
 *
 * Called with j_list_lock held and drops it if 1 is returned
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
static int __process_buffer(journal_t *journal, struct journal_head *jh,
                        struct buffer_head **bhs, int *batch_count)
{
        struct buffer_head *bh = jh2bh(jh);
        int ret = 0;

        if (buffer_locked(bh)) {
                /* Already under IO: wait for completion with all locks
                 * dropped, holding only a buffer reference. */
                atomic_inc(&bh->b_count);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                wait_on_buffer(bh);
                /* the journal_head may have gone by now */
                BUFFER_TRACE(bh, "brelse");
                __brelse(bh);
                ret = 1;
        } else if (jh->b_transaction != NULL) {
                transaction_t *t = jh->b_transaction;
                tid_t tid = t->t_tid;

                /* Buffer belongs to a later, live transaction: committing
                 * that transaction is what will clean it for us. */
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                log_start_commit(journal, tid);
                log_wait_commit(journal, tid);
                ret = 1;
        } else if (!buffer_dirty(bh)) {
                /* Already written back: drop it from the checkpoint list. */
                J_ASSERT_JH(jh, !buffer_jbddirty(bh));
                BUFFER_TRACE(bh, "remove from checkpoint");
                __journal_remove_checkpoint(jh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                journal_remove_journal_head(bh);
                __brelse(bh);
                ret = 1;
        } else {
                /*
                 * Important: we are about to write the buffer, and
                 * possibly block, while still holding the journal lock.
                 * We cannot afford to let the transaction logic start
                 * messing around with this buffer before we write it to
                 * disk, as that would break recoverability.
                 */
                BUFFER_TRACE(bh, "queue");
                get_bh(bh);
                J_ASSERT_BH(bh, !buffer_jwrite(bh));
                set_buffer_jwrite(bh);
                bhs[*batch_count] = bh;
                /* Move to the io list so __wait_cp_io() will reap it. */
                __buffer_relink_io(jh);
                jbd_unlock_bh_state(bh);
                (*batch_count)++;
                if (*batch_count == NR_BATCH) {
                        /* Batch full: submit now and force a list rescan. */
                        spin_unlock(&journal->j_list_lock);
                        __flush_batch(journal, bhs, batch_count);
                        ret = 1;
                }
        }
        return ret;
}
291
/*
 * Perform an actual checkpoint. We take the first transaction on the
 * list of transactions to be checkpointed and send all its buffers
 * to disk. We submit larger chunks of data at once.
 *
 * The journal should be locked before calling this function.
 */
int log_do_checkpoint(journal_t *journal)
{
        transaction_t *transaction;
        tid_t this_tid;
        int result;

        jbd_debug(1, "Start checkpoint\n");

        /*
         * First thing: if there are any transactions in the log which
         * don't need checkpointing, just eliminate them from the
         * journal straight away.
         */
        result = cleanup_journal_tail(journal);
        jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
        if (result <= 0)
                return result;

        /*
         * OK, we need to start writing disk blocks.  Take one transaction
         * and write it.
         */
        spin_lock(&journal->j_list_lock);
        if (!journal->j_checkpoint_transactions)
                goto out;
        transaction = journal->j_checkpoint_transactions;
        this_tid = transaction->t_tid;
restart:
        /*
         * If someone cleaned up this transaction while we slept, we're
         * done (maybe it's a new transaction, but it fell at the same
         * address).
         */
        if (journal->j_checkpoint_transactions == transaction &&
                        transaction->t_tid == this_tid) {
                int batch_count = 0;
                struct buffer_head *bhs[NR_BATCH];
                struct journal_head *jh;
                int retry = 0;

                /* Walk t_checkpoint_list until empty, or until a lock/IO
                 * event (retry) forces us back through restart. */
                while (!retry && transaction->t_checkpoint_list) {
                        struct buffer_head *bh;

                        jh = transaction->t_checkpoint_list;
                        bh = jh2bh(jh);
                        if (!jbd_trylock_bh_state(bh)) {
                                jbd_sync_bh(journal, bh);
                                retry = 1;
                                break;
                        }
                        retry = __process_buffer(journal, jh, bhs,&batch_count);
                        if (!retry && lock_need_resched(&journal->j_list_lock)){
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                                break;
                        }
                }

                if (batch_count) {
                        /* __flush_batch() must run without j_list_lock.
                         * If nothing above dropped it, drop it here and
                         * remember to go back through the restart path. */
                        if (!retry) {
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                        }
                        __flush_batch(journal, bhs, &batch_count);
                }

                if (retry) {
                        spin_lock(&journal->j_list_lock);
                        goto restart;
                }
                /*
                 * Now we have cleaned up the first transaction's checkpoint
                 * list. Let's clean up the second one
                 */
                __wait_cp_io(journal, transaction);
        }
out:
        spin_unlock(&journal->j_list_lock);
        /* Roll the journal tail forward past whatever we just freed. */
        result = cleanup_journal_tail(journal);
        if (result < 0)
                return result;
        return 0;
}
382
/*
 * Check the list of checkpoint transactions for the journal to see if
 * we have already got rid of any since the last update of the log tail
 * in the journal superblock.  If so, we can instantly roll the
 * superblock forward to remove those transactions from the log.
 *
 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
 *
 * Called with the journal lock held.
 *
 * This is the only part of the journaling code which really needs to be
 * aware of transaction aborts.  Checkpointing involves writing to the
 * main filesystem area rather than to the journal, so it can proceed
 * even in abort state, but we must not update the journal superblock if
 * we have an abort error outstanding.
 */

int cleanup_journal_tail(journal_t *journal)
{
        transaction_t * transaction;
        tid_t first_tid;
        unsigned long blocknr, freed;

        /* OK, work out the oldest transaction remaining in the log, and
         * the log block it starts at.
         *
         * If the log is now empty, we need to work out which is the
         * next transaction ID we will write, and where it will
         * start. */

        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /* The oldest pinned transaction, in order of preference:
         * checkpointing, then committing, then running. */
        transaction = journal->j_checkpoint_transactions;
        if (transaction) {
                first_tid = transaction->t_tid;
                blocknr = transaction->t_log_start;
        } else if ((transaction = journal->j_committing_transaction) != NULL) {
                first_tid = transaction->t_tid;
                blocknr = transaction->t_log_start;
        } else if ((transaction = journal->j_running_transaction) != NULL) {
                first_tid = transaction->t_tid;
                /* Not yet in the log: the tail would move up to j_head. */
                blocknr = journal->j_head;
        } else {
                first_tid = journal->j_transaction_sequence;
                blocknr = journal->j_head;
        }
        spin_unlock(&journal->j_list_lock);
        J_ASSERT(blocknr != 0);

        /* If the oldest pinned transaction is at the tail of the log
           already then there's not much we can do right now. */
        if (journal->j_tail_sequence == first_tid) {
                spin_unlock(&journal->j_state_lock);
                return 1;
        }

        /* OK, update the superblock to recover the freed space.
         * Physical blocks come first: have we wrapped beyond the end of
         * the log? */
        freed = blocknr - journal->j_tail;
        if (blocknr < journal->j_tail)
                freed = freed + journal->j_last - journal->j_first;

        jbd_debug(1,
                  "Cleaning journal tail from %d to %d (offset %lu), "
                  "freeing %lu\n",
                  journal->j_tail_sequence, first_tid, blocknr, freed);

        journal->j_free += freed;
        journal->j_tail_sequence = first_tid;
        journal->j_tail = blocknr;
        spin_unlock(&journal->j_state_lock);
        /* Never touch the superblock of an aborted journal. */
        if (!(journal->j_flags & JFS_ABORT))
                journal_update_superblock(journal, 1);
        return 0;
}
459
460
461/* Checkpoint list management */
462
/*
 * journal_clean_one_cp_list
 *
 * Find all the written-back checkpoint buffers in the given list and release them.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 * Returns number of buffers reaped (for debug)
 */

static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
        struct journal_head *last_jh;
        struct journal_head *next_jh = jh;
        int ret, freed = 0;

        *released = 0;
        if (!jh)
                return 0;

        /* The list is circular: remember the tail so we know when the
         * walk has wrapped all the way around. */
        last_jh = jh->b_cpprev;
        do {
                jh = next_jh;
                next_jh = jh->b_cpnext;
                /* Use trylock because of the ranking */
                if (jbd_trylock_bh_state(jh2bh(jh))) {
                        ret = __try_to_free_cp_buf(jh);
                        if (ret) {
                                freed++;
                                if (ret == 2) {
                                        /* Whole transaction was released:
                                         * this list no longer exists. */
                                        *released = 1;
                                        return freed;
                                }
                        }
                }
                /*
                 * This function only frees up some memory
                 * if possible so we dont have an obligation
                 * to finish processing. Bail out if preemption
                 * requested:
                 */
                if (need_resched())
                        return freed;
        } while (jh != last_jh);

        return freed;
}
510
/*
 * journal_clean_checkpoint_list
 *
 * Find all the written-back checkpoint buffers in the journal and release them.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 * Returns number of buffers reaped (for debug)
 */

int __journal_clean_checkpoint_list(journal_t *journal)
{
        transaction_t *transaction, *last_transaction, *next_transaction;
        int ret = 0;
        int released;

        transaction = journal->j_checkpoint_transactions;
        if (!transaction)
                goto out;

        /* The transaction list is also circular: remember its tail. */
        last_transaction = transaction->t_cpprev;
        next_transaction = transaction;
        do {
                transaction = next_transaction;
                next_transaction = transaction->t_cpnext;
                ret += journal_clean_one_cp_list(transaction->
                                t_checkpoint_list, &released);
                /*
                 * This function only frees up some memory if possible so we
                 * dont have an obligation to finish processing. Bail out if
                 * preemption requested:
                 */
                if (need_resched())
                        goto out;
                /* released means the transaction itself is gone; its
                 * io_list must not be touched. */
                if (released)
                        continue;
                /*
                 * It is essential that we are as careful as in the case of
                 * t_checkpoint_list with removing the buffer from the list as
                 * we can possibly see not yet submitted buffers on io_list
                 */
                ret += journal_clean_one_cp_list(transaction->
                                t_checkpoint_io_list, &released);
                if (need_resched())
                        goto out;
        } while (transaction != last_transaction);
out:
        return ret;
}
560
/*
 * journal_remove_checkpoint: called after a buffer has been committed
 * to disk (either by being write-back flushed to disk, or being
 * committed to the log).
 *
 * We cannot safely clean a transaction out of the log until all of the
 * buffer updates committed in that transaction have safely been stored
 * elsewhere on disk.  To achieve this, all of the buffers in a
 * transaction need to be maintained on the transaction's checkpoint
 * lists until they have been rewritten, at which point this function is
 * called to remove the buffer from the existing transaction's
 * checkpoint lists.
 *
 * The function returns 1 if it frees the transaction, 0 otherwise.
 *
 * This function is called with the journal locked.
 * This function is called with j_list_lock held.
 * This function is called with jbd_lock_bh_state(jh2bh(jh))
 */

int __journal_remove_checkpoint(struct journal_head *jh)
{
        transaction_t *transaction;
        journal_t *journal;
        int ret = 0;

        JBUFFER_TRACE(jh, "entry");

        if ((transaction = jh->b_cp_transaction) == NULL) {
                JBUFFER_TRACE(jh, "not on transaction");
                goto out;
        }
        journal = transaction->t_journal;

        /* Detach from both possible checkpoint lists. */
        __buffer_unlink(jh);
        jh->b_cp_transaction = NULL;

        /* The transaction survives while either list is non-empty. */
        if (transaction->t_checkpoint_list != NULL ||
            transaction->t_checkpoint_io_list != NULL)
                goto out;
        JBUFFER_TRACE(jh, "transaction has no more buffers");

        /*
         * There is one special case to worry about: if we have just pulled the
         * buffer off a committing transaction's forget list, then even if the
         * checkpoint list is empty, the transaction obviously cannot be
         * dropped!
         *
         * The locking here around j_committing_transaction is a bit sleazy.
         * See the comment at the end of journal_commit_transaction().
         */
        if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "belongs to committing transaction");
                goto out;
        }

        /* OK, that was the last buffer for the transaction: we can now
           safely remove this transaction from the log */

        __journal_drop_transaction(journal, transaction);

        /* Just in case anybody was waiting for more transactions to be
           checkpointed... */
        wake_up(&journal->j_wait_logspace);
        ret = 1;
out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
}
630
631/*
632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
633 * list so that we know when it is safe to clean the transaction out of
634 * the log.
635 *
636 * Called with the journal locked.
637 * Called with j_list_lock held.
638 */
639void __journal_insert_checkpoint(struct journal_head *jh,
640 transaction_t *transaction)
641{
642 JBUFFER_TRACE(jh, "entry");
643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
645
646 jh->b_cp_transaction = transaction;
647
648 if (!transaction->t_checkpoint_list) {
649 jh->b_cpnext = jh->b_cpprev = jh;
650 } else {
651 jh->b_cpnext = transaction->t_checkpoint_list;
652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
653 jh->b_cpprev->b_cpnext = jh;
654 jh->b_cpnext->b_cpprev = jh;
655 }
656 transaction->t_checkpoint_list = jh;
657}
658
/*
 * We've finished with this transaction structure: adios...
 *
 * The transaction must have no links except for the checkpoint by this
 * point.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 */

void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
        assert_spin_locked(&journal->j_list_lock);
        if (transaction->t_cpnext) {
                /* Unlink from the circular list of checkpoint transactions,
                 * fixing up the journal's head pointer as needed. */
                transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
                transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
                if (journal->j_checkpoint_transactions == transaction)
                        journal->j_checkpoint_transactions =
                                transaction->t_cpnext;
                if (journal->j_checkpoint_transactions == transaction)
                        journal->j_checkpoint_transactions = NULL;
        }

        /* The transaction must be completely detached before freeing. */
        J_ASSERT(transaction->t_state == T_FINISHED);
        J_ASSERT(transaction->t_buffers == NULL);
        J_ASSERT(transaction->t_sync_datalist == NULL);
        J_ASSERT(transaction->t_forget == NULL);
        J_ASSERT(transaction->t_iobuf_list == NULL);
        J_ASSERT(transaction->t_shadow_list == NULL);
        J_ASSERT(transaction->t_log_list == NULL);
        J_ASSERT(transaction->t_checkpoint_list == NULL);
        J_ASSERT(transaction->t_checkpoint_io_list == NULL);
        J_ASSERT(transaction->t_updates == 0);
        J_ASSERT(journal->j_committing_transaction != transaction);
        J_ASSERT(journal->j_running_transaction != transaction);

        jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
        kfree(transaction);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..10be51290a27
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,911 @@
1/*
2 * linux/fs/jbd/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
/*
 * Default IO completion handler for temporary BJ_IO buffer_heads:
 * record the final uptodate state, then unlock the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);
        unlock_buffer(bh);
}
37
/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        /* Only try the strip for a clean buffer, to which we hold the
         * sole reference, on a truncated (mapping-less) page. */
        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

        /* Hold the page across the brelse so it cannot go away early. */
        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        /* Could not strip the page: just drop the caller's buffer ref. */
        __brelse(bh);
}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
114 descriptor = journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120 /* AKPM: buglet - add `i' to tmp! */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t*)bh->b_data;
123 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JFS_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JFS_BARRIER;
149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use-up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
/*
 * Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        /* inverted_lock() dropped j_list_lock: take the
                         * state lock blocking, then reacquire the list lock. */
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        /* Dirty and now locked by us: queue for submission
                         * and move it to BJ_Locked for the commit phase. */
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                }
                else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                /* Periodically drop the lock if it is contended or a
                 * reschedule is due, then restart the scan. */
                if (lock_need_resched(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        /* Submit whatever is still queued in the batch array. */
        journal_do_submit_data(wbuf, bufs);
}
273
274/*
275 * journal_commit_transaction
276 *
277 * The primary function for committing a transaction to the log. This
278 * function is called by the journal thread to begin a complete commit.
279 */
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	/*
	 * Wait for all handles on this transaction to be released
	 * (t_updates dropping to zero).  The recheck of t_updates after
	 * prepare_to_wait() closes the race with journal_stop() waking us
	 * between the test and the sleep.  Both locks are dropped around
	 * schedule() and retaken afterwards, preserving the j_state_lock
	 * -> t_handle_lock ordering.
	 */
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissable.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	/*
	 * Transition the transaction to T_FLUSH and make it the journal's
	 * committing transaction; from here on new handles start a fresh
	 * running transaction.  Waiters on j_wait_transaction_locked can
	 * now proceed.
	 */
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * First, drop modified flag: all accesses to the buffers
	 * will be tracked for a new trasaction only -bzzz
	 */
	spin_lock(&journal->j_list_lock);
	if (commit_transaction->t_buffers) {
		new_jh = jh = commit_transaction->t_buffers->b_tnext;
		do {
			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
					new_jh->b_modified == 0);
			new_jh->b_modified = 0;
			new_jh = new_jh->b_tnext;
		} while (new_jh != jh);
	}
	spin_unlock(&journal->j_list_lock);

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = 0;
	journal_submit_data_buffers(journal, commit_transaction);

	/*
	 * Wait for all previously submitted IO to complete.
	 * Buffers are walked from the tail of t_locked_list; the extra
	 * get_bh() keeps each buffer alive while j_list_lock is dropped
	 * for wait_on_buffer().
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			spin_lock(&journal->j_list_lock);
		}
		/* inverted_lock() drops j_list_lock to take the bh state
		 * lock in the correct order; on failure, retry the loop. */
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			journal_remove_journal_head(bh);
			put_bh(bh);
		} else {
			jbd_unlock_bh_state(bh);
		}
		put_bh(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err)
		__journal_abort_hard(journal);

	journal_write_revoke_records(journal, commit_transaction);

	/*
	 * NOTE(review): this debug string repeats "phase 2" from above;
	 * the phase numbering in the messages appears off by one relative
	 * to the later phases -- confirm before renumbering.
	 */
	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	commit_transaction->t_state = T_COMMIT;

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				__journal_abort_hard(journal);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

			/* Tags follow immediately after the descriptor
			 * header within the block. */
			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			__journal_abort_hard(journal);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO*/

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		/* Bit 0 of flags == escape performed; see
		 * journal_write_metadata_buffer(). */
		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		/* Only the first tag of a descriptor carries the 16-byte
		 * journal UUID; later tags set JFS_FLAG_SAME_UUID. */
		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 6\n");

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		__journal_abort_hard(journal);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		/* j_list_lock is dropped while we hold the bh state lock;
		 * retaken below before touching the checkpoint lists. */
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_slab_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We borrow j_list_lock to protect
	 * journal->j_committing_transaction in __journal_remove_checkpoint.
	 * Really, __journal_remove_checkpoint should be using j_state_lock but
	 * it's a bit hassle to hold that across __journal_remove_checkpoint
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

	/* Link the transaction onto the journal's circular checkpoint
	 * list (or drop it entirely if nothing needs checkpointing).
	 * Still under j_list_lock from above. */
	if (commit_transaction->t_checkpoint_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..c518dd8fe60a
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2072 @@
1/*
2 * linux/fs/jbd/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/mm.h>
34#include <linux/suspend.h>
35#include <linux/pagemap.h>
36#include <linux/kthread.h>
37#include <linux/poison.h>
38#include <linux/proc_fs.h>
39
40#include <asm/uaccess.h>
41#include <asm/page.h>
42
43EXPORT_SYMBOL(journal_start);
44EXPORT_SYMBOL(journal_restart);
45EXPORT_SYMBOL(journal_extend);
46EXPORT_SYMBOL(journal_stop);
47EXPORT_SYMBOL(journal_lock_updates);
48EXPORT_SYMBOL(journal_unlock_updates);
49EXPORT_SYMBOL(journal_get_write_access);
50EXPORT_SYMBOL(journal_get_create_access);
51EXPORT_SYMBOL(journal_get_undo_access);
52EXPORT_SYMBOL(journal_dirty_data);
53EXPORT_SYMBOL(journal_dirty_metadata);
54EXPORT_SYMBOL(journal_release_buffer);
55EXPORT_SYMBOL(journal_forget);
56#if 0
57EXPORT_SYMBOL(journal_sync_buffer);
58#endif
59EXPORT_SYMBOL(journal_flush);
60EXPORT_SYMBOL(journal_revoke);
61
62EXPORT_SYMBOL(journal_init_dev);
63EXPORT_SYMBOL(journal_init_inode);
64EXPORT_SYMBOL(journal_update_format);
65EXPORT_SYMBOL(journal_check_used_features);
66EXPORT_SYMBOL(journal_check_available_features);
67EXPORT_SYMBOL(journal_set_features);
68EXPORT_SYMBOL(journal_create);
69EXPORT_SYMBOL(journal_load);
70EXPORT_SYMBOL(journal_destroy);
71EXPORT_SYMBOL(journal_update_superblock);
72EXPORT_SYMBOL(journal_abort);
73EXPORT_SYMBOL(journal_errno);
74EXPORT_SYMBOL(journal_ack_err);
75EXPORT_SYMBOL(journal_clear_err);
76EXPORT_SYMBOL(log_wait_commit);
77EXPORT_SYMBOL(journal_start_commit);
78EXPORT_SYMBOL(journal_force_commit_nested);
79EXPORT_SYMBOL(journal_wipe);
80EXPORT_SYMBOL(journal_blocks_per_page);
81EXPORT_SYMBOL(journal_invalidatepage);
82EXPORT_SYMBOL(journal_try_to_free_buffers);
83EXPORT_SYMBOL(journal_force_commit);
84
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno);
87static int journal_create_jbd_slab(size_t slab_size);
88
89/*
90 * Helper function used to manage commit timeouts
91 */
92
/*
 * Commit-interval timer callback: the timer payload is the kjournald
 * task pointer, which we simply wake so it re-evaluates its commit
 * condition.
 */
static void commit_timeout(unsigned long data)
{
	wake_up_process((struct task_struct *)data);
}
99
100/*
101 * kjournald: The main thread function used to manage a logging device
102 * journal.
103 *
104 * This kernel thread is responsible for two things:
105 *
106 * 1) COMMIT: Every so often we need to commit the current state of the
107 * filesystem to disk. The journal thread is responsible for writing
108 * all of the metadata buffers to disk.
109 *
110 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
111 * of the data in that part of the log has been rewritten elsewhere on
112 * the disk. Flushing these old buffers to reclaim space in the log is
113 * known as checkpointing, and this thread is responsible for that job.
114 */
115
static int kjournald(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	setup_timer(&journal->j_commit_timer, commit_timeout,
			(unsigned long)current);

	/* Record that the journal thread is running */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
			journal->j_commit_interval / HZ);

	/*
	 * And now, wait forever for commit wakeup events.
	 *
	 * Invariant: j_state_lock is held at the top of every loop
	 * iteration and across the loop's tests; it is dropped only
	 * around journal_commit_transaction(), schedule() and the
	 * refrigerator() call.
	 */
	spin_lock(&journal->j_state_lock);

loop:
	if (journal->j_flags & JFS_UNMOUNT)
		goto end_loop;

	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
		journal->j_commit_sequence, journal->j_commit_request);

	/* A commit has been requested that we have not yet performed:
	 * cancel the interval timer and run the commit now. */
	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd_debug(1, "OK, requests differ\n");
		spin_unlock(&journal->j_state_lock);
		del_timer_sync(&journal->j_commit_timer);
		journal_commit_transaction(journal);
		spin_lock(&journal->j_state_lock);
		goto loop;
	}

	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd_debug(1, "Now suspending kjournald\n");
		spin_unlock(&journal->j_state_lock);
		refrigerator();
		spin_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		/* Re-test all wakeup conditions after queueing on the
		 * waitqueue so we cannot miss a wakeup that arrives
		 * between the tests and schedule(). */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		if (journal->j_commit_sequence != journal->j_commit_request)
			should_sleep = 0;
		transaction = journal->j_running_transaction;
		if (transaction && time_after_eq(jiffies,
						transaction->t_expires))
			should_sleep = 0;
		if (journal->j_flags & JFS_UNMOUNT)
			should_sleep = 0;
		if (should_sleep) {
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd_debug(1, "kjournald wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		journal->j_commit_request = transaction->t_tid;
		jbd_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	/* JFS_UNMOUNT was set: tear down the timer, clear j_task so
	 * journal_kill_thread() sees us gone, and exit. */
	spin_unlock(&journal->j_state_lock);
	del_timer_sync(&journal->j_commit_timer);
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	return 0;
}
213
214static void journal_start_thread(journal_t *journal)
215{
216 kthread_run(kjournald, journal, "kjournald");
217 wait_event(journal->j_wait_done_commit, journal->j_task != 0);
218}
219
220static void journal_kill_thread(journal_t *journal)
221{
222 spin_lock(&journal->j_state_lock);
223 journal->j_flags |= JFS_UNMOUNT;
224
225 while (journal->j_task) {
226 wake_up(&journal->j_wait_commit);
227 spin_unlock(&journal->j_state_lock);
228 wait_event(journal->j_wait_done_commit, journal->j_task == 0);
229 spin_lock(&journal->j_state_lock);
230 }
231 spin_unlock(&journal->j_state_lock);
232}
233
234/*
235 * journal_write_metadata_buffer: write a metadata buffer to the journal.
236 *
237 * Writes a metadata buffer to a given disk block. The actual IO is not
238 * performed but a new buffer_head is constructed which labels the data
239 * to be written with the correct destination disk block.
240 *
241 * Any magic-number escaping which needs to be done will cause a
242 * copy-out here. If the buffer happens to start with the
243 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
244 * magic number is only written to the log for descripter blocks. In
245 * this case, we copy the data and replace the first word with 0, and we
246 * return a result code which indicates that this buffer needs to be
247 * marked as an escaped buffer in the corresponding log descriptor
248 * block. The missing word can then be restored when the block is read
249 * during recovery.
250 *
251 * If the source buffer has already been modified by a new transaction
252 * since we took the last commit snapshot, we use the frozen copy of
253 * that data for IO. If we end up using the existing buffer_head's data
254 * for the write, then we *have* to lock the buffer to prevent anyone
255 * else from using and possibly modifying it while the IO is in
256 * progress.
257 *
258 * The function returns a pointer to the buffer_heads to be used for IO.
259 *
260 * We assume that the journal has already been locked in this function.
261 *
262 * Return value:
263 * <0: Error
264 * >=0: Finished OK
265 *
266 * On success:
267 * Bit 0 set == escape performed on the data
268 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
269 */
270
int journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head *jh_in,
				  struct journal_head **jh_out,
				  unsigned long blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct journal_head *new_jh;
	struct page *new_page;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	/* __GFP_NOFAIL: may block but never returns NULL. */
	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	jbd_lock_bh_state(bh_in);
repeat:
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = offset_in_page(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
	}

	mapped_data = kmap_atomic(new_page, KM_USER0);
	/*
	 * Check for escaping: a data block that begins with the journal
	 * magic must not be written to the log verbatim.
	 */
	if (*((__be32 *)(mapped_data + new_offset)) ==
				cpu_to_be32(JFS_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}
	kunmap_atomic(mapped_data, KM_USER0);

	/*
	 * Do we need to do a data copy?
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		/* Drop the bh state lock across the (possibly blocking)
		 * allocation, then recheck b_frozen_data: another thread
		 * may have installed a frozen copy in the meantime, in
		 * which case we discard ours and go round again. */
		jbd_unlock_bh_state(bh_in);
		tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
		jbd_lock_bh_state(bh_in);
		if (jh_in->b_frozen_data) {
			jbd_slab_free(tmp, bh_in->b_size);
			goto repeat;
		}

		jh_in->b_frozen_data = tmp;
		mapped_data = kmap_atomic(new_page, KM_USER0);
		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
		kunmap_atomic(mapped_data, KM_USER0);

		new_page = virt_to_page(tmp);
		new_offset = offset_in_page(tmp);
		done_copy_out = 1;
	}

	/*
	 * Did we need to do an escaping? Now we've done all the
	 * copying, we can finally do so.  The first word is zeroed
	 * here and restored from the descriptor tag during recovery.
	 */
	if (do_escape) {
		mapped_data = kmap_atomic(new_page, KM_USER0);
		*((unsigned int *)(mapped_data + new_offset)) = 0;
		kunmap_atomic(mapped_data, KM_USER0);
	}

	/* keep subsequent assertions sane */
	new_bh->b_state = 0;
	init_buffer(new_bh, NULL, NULL);
	atomic_set(&new_bh->b_count, 1);
	jbd_unlock_bh_state(bh_in);

	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */

	/* Label the IO buffer with its destination block in the log. */
	set_bh_page(new_bh, new_page, new_offset);
	new_jh->b_transaction = NULL;
	new_bh->b_size = jh2bh(jh_in)->b_size;
	new_bh->b_bdev = transaction->t_journal->j_dev;
	new_bh->b_blocknr = blocknr;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*jh_out = new_jh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	journal_file_buffer(jh_in, transaction, BJ_Shadow);
	JBUFFER_TRACE(new_jh, "file as BJ_IO");
	journal_file_buffer(new_jh, transaction, BJ_IO);

	/* Return value: bit 0 = escaped, bit 1 = copy-out performed. */
	return do_escape | (done_copy_out << 1);
}
389
390/*
391 * Allocation code for the journal file. Manage the space left in the
392 * journal, so that we can begin checkpointing when appropriate.
393 */
394
395/*
396 * __log_space_left: Return the number of free blocks left in the journal.
397 *
398 * Called with the journal already locked.
399 *
400 * Called under j_state_lock
401 */
402
403int __log_space_left(journal_t *journal)
404{
405 int left = journal->j_free;
406
407 assert_spin_locked(&journal->j_state_lock);
408
409 /*
410 * Be pessimistic here about the number of those free blocks which
411 * might be required for log descriptor control blocks.
412 */
413
414#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
415
416 left -= MIN_LOG_RESERVED_BLOCKS;
417
418 if (left <= 0)
419 return 0;
420 left -= (left >> 3);
421 return left;
422}
423
424/*
425 * Called under j_state_lock. Returns true if a transaction was started.
426 */
427int __log_start_commit(journal_t *journal, tid_t target)
428{
429 /*
430 * Are we already doing a recent enough commit?
431 */
432 if (!tid_geq(journal->j_commit_request, target)) {
433 /*
434 * We want a new commit: OK, mark the request and wakup the
435 * commit thread. We do _not_ do the commit ourselves.
436 */
437
438 journal->j_commit_request = target;
439 jbd_debug(1, "JBD: requesting commit %d/%d\n",
440 journal->j_commit_request,
441 journal->j_commit_sequence);
442 wake_up(&journal->j_wait_commit);
443 return 1;
444 }
445 return 0;
446}
447
448int log_start_commit(journal_t *journal, tid_t tid)
449{
450 int ret;
451
452 spin_lock(&journal->j_state_lock);
453 ret = __log_start_commit(journal, tid);
454 spin_unlock(&journal->j_state_lock);
455 return ret;
456}
457
458/*
459 * Force and wait upon a commit if the calling process is not within
460 * transaction. This is used for forcing out undo-protected data which contains
461 * bitmaps, when the fs is running out of space.
462 *
463 * We can only force the running transaction if we don't have an active handle;
464 * otherwise, we will deadlock.
465 *
466 * Returns true if a transaction was started.
467 */
468int journal_force_commit_nested(journal_t *journal)
469{
470 transaction_t *transaction = NULL;
471 tid_t tid;
472
473 spin_lock(&journal->j_state_lock);
474 if (journal->j_running_transaction && !current->journal_info) {
475 transaction = journal->j_running_transaction;
476 __log_start_commit(journal, transaction->t_tid);
477 } else if (journal->j_committing_transaction)
478 transaction = journal->j_committing_transaction;
479
480 if (!transaction) {
481 spin_unlock(&journal->j_state_lock);
482 return 0; /* Nothing to retry */
483 }
484
485 tid = transaction->t_tid;
486 spin_unlock(&journal->j_state_lock);
487 log_wait_commit(journal, tid);
488 return 1;
489}
490
491/*
492 * Start a commit of the current running transaction (if any). Returns true
493 * if a transaction was started, and fills its tid in at *ptid
494 */
495int journal_start_commit(journal_t *journal, tid_t *ptid)
496{
497 int ret = 0;
498
499 spin_lock(&journal->j_state_lock);
500 if (journal->j_running_transaction) {
501 tid_t tid = journal->j_running_transaction->t_tid;
502
503 ret = __log_start_commit(journal, tid);
504 if (ret && ptid)
505 *ptid = tid;
506 } else if (journal->j_committing_transaction && ptid) {
507 /*
508 * If ext3_write_super() recently started a commit, then we
509 * have to wait for completion of that transaction
510 */
511 *ptid = journal->j_committing_transaction->t_tid;
512 ret = 1;
513 }
514 spin_unlock(&journal->j_state_lock);
515 return ret;
516}
517
/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 *
 * Returns 0 on success, or -EIO if the journal was aborted (in which
 * case the commit may not have reached stable storage).
 */
int log_wait_commit(journal_t *journal, tid_t tid)
{
	int err = 0;

#ifdef CONFIG_JBD_DEBUG
	/* Sanity check: somebody should already have requested this tid. */
	spin_lock(&journal->j_state_lock);
	if (!tid_geq(journal->j_commit_request, tid)) {
		printk(KERN_EMERG
		       "%s: error: j_commit_request=%d, tid=%d\n",
		       __FUNCTION__, journal->j_commit_request, tid);
	}
	spin_unlock(&journal->j_state_lock);
#endif
	spin_lock(&journal->j_state_lock);
	while (tid_gt(tid, journal->j_commit_sequence)) {
		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
				  tid, journal->j_commit_sequence);
		/* Kick the commit thread, then sleep until it reports
		 * completion; the spinlock is dropped across the wait. */
		wake_up(&journal->j_wait_commit);
		spin_unlock(&journal->j_state_lock);
		wait_event(journal->j_wait_done_commit,
				!tid_gt(tid, journal->j_commit_sequence));
		spin_lock(&journal->j_state_lock);
	}
	spin_unlock(&journal->j_state_lock);

	if (unlikely(is_journal_aborted(journal))) {
		printk(KERN_EMERG "journal commit I/O error\n");
		err = -EIO;
	}
	return err;
}
553
554/*
555 * Log buffer allocation routines:
556 */
557
558int journal_next_log_block(journal_t *journal, unsigned long *retp)
559{
560 unsigned long blocknr;
561
562 spin_lock(&journal->j_state_lock);
563 J_ASSERT(journal->j_free > 1);
564
565 blocknr = journal->j_head;
566 journal->j_head++;
567 journal->j_free--;
568 if (journal->j_head == journal->j_last)
569 journal->j_head = journal->j_first;
570 spin_unlock(&journal->j_state_lock);
571 return journal_bmap(journal, blocknr, retp);
572}
573
574/*
575 * Conversion of logical to physical block numbers for the journal
576 *
577 * On external journals the journal blocks are identity-mapped, so
578 * this is a no-op. If needed, we can use j_blk_offset - everything is
579 * ready.
580 */
581int journal_bmap(journal_t *journal, unsigned long blocknr,
582 unsigned long *retp)
583{
584 int err = 0;
585 unsigned long ret;
586
587 if (journal->j_inode) {
588 ret = bmap(journal->j_inode, blocknr);
589 if (ret)
590 *retp = ret;
591 else {
592 char b[BDEVNAME_SIZE];
593
594 printk(KERN_ALERT "%s: journal block not found "
595 "at offset %lu on %s\n",
596 __FUNCTION__,
597 blocknr,
598 bdevname(journal->j_dev, b));
599 err = -EIO;
600 __journal_abort_soft(journal, err);
601 }
602 } else {
603 *retp = blocknr; /* +journal->j_blk_offset */
604 }
605 return err;
606}
607
608/*
609 * We play buffer_head aliasing tricks to write data/metadata blocks to
610 * the journal without copying their contents, but for journal
611 * descriptor blocks we do need to generate bona fide buffers.
612 *
613 * After the caller of journal_get_descriptor_buffer() has finished modifying
614 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
615 * But we don't bother doing that, so there will be coherency problems with
616 * mmaps of blockdevs which hold live JBD-controlled filesystems.
617 */
618struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
619{
620 struct buffer_head *bh;
621 unsigned long blocknr;
622 int err;
623
624 err = journal_next_log_block(journal, &blocknr);
625
626 if (err)
627 return NULL;
628
629 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
630 lock_buffer(bh);
631 memset(bh->b_data, 0, journal->j_blocksize);
632 set_buffer_uptodate(bh);
633 unlock_buffer(bh);
634 BUFFER_TRACE(bh, "return this buffer");
635 return journal_add_journal_head(bh);
636}
637
638/*
639 * Management for journal control blocks: functions to create and
640 * destroy journal_t structures, and to initialise and read existing
641 * journal blocks from disk. */
642
643/* First: create and setup a journal_t object in memory. We initialise
644 * very few fields yet: that has to wait until we have created the
645 * journal structures from from scratch, or loaded them from disk. */
646
647static journal_t * journal_init_common (void)
648{
649 journal_t *journal;
650 int err;
651
652 journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
653 if (!journal)
654 goto fail;
655 memset(journal, 0, sizeof(*journal));
656
657 init_waitqueue_head(&journal->j_wait_transaction_locked);
658 init_waitqueue_head(&journal->j_wait_logspace);
659 init_waitqueue_head(&journal->j_wait_done_commit);
660 init_waitqueue_head(&journal->j_wait_checkpoint);
661 init_waitqueue_head(&journal->j_wait_commit);
662 init_waitqueue_head(&journal->j_wait_updates);
663 mutex_init(&journal->j_barrier);
664 mutex_init(&journal->j_checkpoint_mutex);
665 spin_lock_init(&journal->j_revoke_lock);
666 spin_lock_init(&journal->j_list_lock);
667 spin_lock_init(&journal->j_state_lock);
668
669 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
670
671 /* The journal is marked for error until we succeed with recovery! */
672 journal->j_flags = JFS_ABORT;
673
674 /* Set up a default-sized revoke table for the new mount. */
675 err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
676 if (err) {
677 kfree(journal);
678 goto fail;
679 }
680 return journal;
681fail:
682 return NULL;
683}
684
685/* journal_init_dev and journal_init_inode:
686 *
687 * Create a journal structure assigned some fixed set of disk blocks to
688 * the journal. We don't actually touch those disk blocks yet, but we
689 * need to set up all of the mapping information to tell the journaling
690 * system where the journal blocks are.
691 *
692 */
693
694/**
695 * journal_t * journal_init_dev() - creates an initialises a journal structure
696 * @bdev: Block device on which to create the journal
697 * @fs_dev: Device which hold journalled filesystem for this journal.
698 * @start: Block nr Start of journal.
699 * @len: Length of the journal in blocks.
700 * @blocksize: blocksize of journalling device
701 * @returns: a newly created journal_t *
702 *
703 * journal_init_dev creates a journal which maps a fixed contiguous
704 * range of blocks on an arbitrary block device.
705 *
706 */
707journal_t * journal_init_dev(struct block_device *bdev,
708 struct block_device *fs_dev,
709 int start, int len, int blocksize)
710{
711 journal_t *journal = journal_init_common();
712 struct buffer_head *bh;
713 int n;
714
715 if (!journal)
716 return NULL;
717
718 /* journal descriptor can store up to n blocks -bzzz */
719 journal->j_blocksize = blocksize;
720 n = journal->j_blocksize / sizeof(journal_block_tag_t);
721 journal->j_wbufsize = n;
722 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
723 if (!journal->j_wbuf) {
724 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
725 __FUNCTION__);
726 kfree(journal);
727 journal = NULL;
728 }
729 journal->j_dev = bdev;
730 journal->j_fs_dev = fs_dev;
731 journal->j_blk_offset = start;
732 journal->j_maxlen = len;
733
734 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
735 J_ASSERT(bh != NULL);
736 journal->j_sb_buffer = bh;
737 journal->j_superblock = (journal_superblock_t *)bh->b_data;
738
739 return journal;
740}
741
742/**
743 * journal_t * journal_init_inode () - creates a journal which maps to a inode.
744 * @inode: An inode to create the journal in
745 *
746 * journal_init_inode creates a journal which maps an on-disk inode as
747 * the journal. The inode must exist already, must support bmap() and
748 * must have all data blocks preallocated.
749 */
750journal_t * journal_init_inode (struct inode *inode)
751{
752 struct buffer_head *bh;
753 journal_t *journal = journal_init_common();
754 int err;
755 int n;
756 unsigned long blocknr;
757
758 if (!journal)
759 return NULL;
760
761 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
762 journal->j_inode = inode;
763 jbd_debug(1,
764 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
765 journal, inode->i_sb->s_id, inode->i_ino,
766 (long long) inode->i_size,
767 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
768
769 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
770 journal->j_blocksize = inode->i_sb->s_blocksize;
771
772 /* journal descriptor can store up to n blocks -bzzz */
773 n = journal->j_blocksize / sizeof(journal_block_tag_t);
774 journal->j_wbufsize = n;
775 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
776 if (!journal->j_wbuf) {
777 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
778 __FUNCTION__);
779 kfree(journal);
780 return NULL;
781 }
782
783 err = journal_bmap(journal, 0, &blocknr);
784 /* If that failed, give up */
785 if (err) {
786 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
787 __FUNCTION__);
788 kfree(journal);
789 return NULL;
790 }
791
792 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
793 J_ASSERT(bh != NULL);
794 journal->j_sb_buffer = bh;
795 journal->j_superblock = (journal_superblock_t *)bh->b_data;
796
797 return journal;
798}
799
800/*
801 * If the journal init or create aborts, we need to mark the journal
802 * superblock as being NULL to prevent the journal destroy from writing
803 * back a bogus superblock.
804 */
805static void journal_fail_superblock (journal_t *journal)
806{
807 struct buffer_head *bh = journal->j_sb_buffer;
808 brelse(bh);
809 journal->j_sb_buffer = NULL;
810}
811
/*
 * Given a journal_t structure, initialise the various fields for
 * startup of a new journaling session.  We use this both when creating
 * a journal, and after recovering an old journal to reset it for
 * subsequent use.
 *
 * Reads the static log geometry out of the superblock, resets the
 * dynamic head/tail/sequence state to describe an empty log, then
 * writes the superblock back and starts the commit thread.
 */

static int journal_reset(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	unsigned long first, last;

	first = be32_to_cpu(sb->s_first);
	last = be32_to_cpu(sb->s_maxlen);

	journal->j_first = first;
	journal->j_last = last;

	/* Empty log: head and tail both start at the first usable block. */
	journal->j_head = first;
	journal->j_tail = first;
	journal->j_free = last - first;

	/* Oldest transaction in the log is the next one to be committed;
	 * the last committed one is therefore its predecessor. */
	journal->j_tail_sequence = journal->j_transaction_sequence;
	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
	journal->j_commit_request = journal->j_commit_sequence;

	journal->j_max_transaction_buffers = journal->j_maxlen / 4;

	/* Add the dynamic fields and write it to disk. */
	journal_update_superblock(journal, 1);
	journal_start_thread(journal);
	return 0;
}
845
/**
 * int journal_create() - Initialise the new journal file
 * @journal: Journal to create. This structure must have been initialised
 *
 * Given a journal_t structure which tells us which disk blocks we can
 * use, create a new journal superblock and initialise all of the
 * journal fields from scratch.
 *
 * Returns 0 on success, -EINVAL if the journal is too short, or a
 * negative errno propagated from journal_bmap()/journal_reset().
 **/
int journal_create(journal_t *journal)
{
	unsigned long blocknr;
	struct buffer_head *bh;
	journal_superblock_t *sb;
	int i, err;

	if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
			journal->j_maxlen);
		journal_fail_superblock(journal);
		return -EINVAL;
	}

	if (journal->j_inode == NULL) {
		/*
		 * We don't know what block to start at!
		 */
		printk(KERN_EMERG
		       "%s: creation of journal on external device!\n",
		       __FUNCTION__);
		BUG();
	}

	/* Zero out the entire journal on disk.  We cannot afford to
	   have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
	for (i = 0; i < journal->j_maxlen; i++) {
		err = journal_bmap(journal, i, &blocknr);
		if (err)
			return err;
		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
		lock_buffer(bh);
		memset (bh->b_data, 0, journal->j_blocksize);
		BUFFER_TRACE(bh, "marking dirty");
		mark_buffer_dirty(bh);
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);
		__brelse(bh);
	}

	/* Push the zeroed blocks out before writing the superblock. */
	sync_blockdev(journal->j_dev);
	jbd_debug(1, "JBD: journal cleared.\n");

	/* OK, fill in the initial static fields in the new superblock */
	sb = journal->j_superblock;

	sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);

	sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
	sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
	/* Block 0 holds the superblock itself; the log proper starts at 1. */
	sb->s_first = cpu_to_be32(1);

	journal->j_transaction_sequence = 1;

	journal->j_flags &= ~JFS_ABORT;
	journal->j_format_version = 2;

	/* journal_reset() writes the dynamic fields and starts the
	 * commit thread. */
	return journal_reset(journal);
}
916
/**
 * void journal_update_superblock() - Update journal sb on disk.
 * @journal: The journal to update.
 * @wait: Set to '0' if you don't want to wait for IO completion.
 *
 * Update a journal's dynamic superblock fields and write it to disk,
 * optionally waiting for the IO to complete.
 */
void journal_update_superblock(journal_t *journal, int wait)
{
	journal_superblock_t *sb = journal->j_superblock;
	struct buffer_head *bh = journal->j_sb_buffer;

	/*
	 * As a special case, if the on-disk copy is already marked as needing
	 * no recovery (s_start == 0) and there are no outstanding transactions
	 * in the filesystem, then we can safely defer the superblock update
	 * until the next commit by setting JFS_FLUSHED.  This avoids
	 * attempting a write to a potential-readonly device.
	 */
	if (sb->s_start == 0 && journal->j_tail_sequence ==
				journal->j_transaction_sequence) {
		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
			"(start %ld, seq %d, errno %d)\n",
			journal->j_tail, journal->j_tail_sequence,
			journal->j_errno);
		goto out;
	}

	/* Copy the dynamic fields under j_state_lock; the IO itself is
	 * issued after the lock is dropped. */
	spin_lock(&journal->j_state_lock);
	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);

	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
	sb->s_start    = cpu_to_be32(journal->j_tail);
	sb->s_errno    = cpu_to_be32(journal->j_errno);
	spin_unlock(&journal->j_state_lock);

	BUFFER_TRACE(bh, "marking dirty");
	mark_buffer_dirty(bh);
	if (wait)
		sync_dirty_buffer(bh);
	else
		ll_rw_block(SWRITE, 1, &bh);

out:
	/* If we have just flushed the log (by marking s_start==0), then
	 * any future commit will have to be careful to update the
	 * superblock again to re-record the true start of the log. */

	spin_lock(&journal->j_state_lock);
	if (sb->s_start)
		journal->j_flags &= ~JFS_FLUSHED;
	else
		journal->j_flags |= JFS_FLUSHED;
	spin_unlock(&journal->j_state_lock);
}
974
/*
 * Read the superblock for a given journal, performing initial
 * validation of the format.
 *
 * Returns 0 on success; on any failure the superblock buffer is
 * released via journal_fail_superblock() and -EIO/-EINVAL is returned.
 */

static int journal_get_superblock(journal_t *journal)
{
	struct buffer_head *bh;
	journal_superblock_t *sb;
	int err = -EIO;

	bh = journal->j_sb_buffer;

	J_ASSERT(bh != NULL);
	if (!buffer_uptodate(bh)) {
		/* Not yet in memory: do a synchronous read. */
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			printk (KERN_ERR
				"JBD: IO error reading journal superblock\n");
			goto out;
		}
	}

	sb = journal->j_superblock;

	err = -EINVAL;

	/* Both the magic number and the blocksize must match before we
	 * trust anything else in the block. */
	if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
		goto out;
	}

	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
	case JFS_SUPERBLOCK_V1:
		journal->j_format_version = 1;
		break;
	case JFS_SUPERBLOCK_V2:
		journal->j_format_version = 2;
		break;
	default:
		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
		goto out;
	}

	/* The on-disk journal may be smaller than the area we were given,
	 * but must never claim to be larger. */
	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
		printk (KERN_WARNING "JBD: journal file too short\n");
		goto out;
	}

	return 0;

out:
	journal_fail_superblock(journal);
	return err;
}
1034
1035/*
1036 * Load the on-disk journal superblock and read the key fields into the
1037 * journal_t.
1038 */
1039
1040static int load_superblock(journal_t *journal)
1041{
1042 int err;
1043 journal_superblock_t *sb;
1044
1045 err = journal_get_superblock(journal);
1046 if (err)
1047 return err;
1048
1049 sb = journal->j_superblock;
1050
1051 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1052 journal->j_tail = be32_to_cpu(sb->s_start);
1053 journal->j_first = be32_to_cpu(sb->s_first);
1054 journal->j_last = be32_to_cpu(sb->s_maxlen);
1055 journal->j_errno = be32_to_cpu(sb->s_errno);
1056
1057 return 0;
1058}
1059
1060
/**
 * int journal_load() - Read journal from disk.
 * @journal: Journal to act on.
 *
 * Given a journal_t structure which tells us which disk blocks contain
 * a journal, read the journal from disk to initialise the in-memory
 * structures.
 *
 * Returns 0 on success, -EINVAL for unknown feature flags, or -EIO if
 * recovery or reset fails.
 */
int journal_load(journal_t *journal)
{
	int err;
	journal_superblock_t *sb;

	err = load_superblock(journal);
	if (err)
		return err;

	sb = journal->j_superblock;
	/* If this is a V2 superblock, then we have to check the
	 * features flags on it. */

	if (journal->j_format_version >= 2) {
		/* Refuse to touch a journal carrying ro-compat or incompat
		 * features we do not understand. */
		if ((sb->s_feature_ro_compat &
		     ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
		    (sb->s_feature_incompat &
		     ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
			printk (KERN_WARNING
				"JBD: Unrecognised features on journal\n");
			return -EINVAL;
		}
	}

	/*
	 * Create a slab for this blocksize
	 */
	err = journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
	if (err)
		return err;

	/* Let the recovery code check whether it needs to recover any
	 * data from the journal. */
	if (journal_recover(journal))
		goto recovery_error;

	/* OK, we've finished with the dynamic journal bits:
	 * reinitialise the dynamic contents of the superblock in memory
	 * and reset them on disk. */
	if (journal_reset(journal))
		goto recovery_error;

	/* Recovery succeeded: clear the error state set at init time. */
	journal->j_flags &= ~JFS_ABORT;
	journal->j_flags |= JFS_LOADED;
	return 0;

recovery_error:
	printk (KERN_WARNING "JBD: recovery failed\n");
	return -EIO;
}
1119
/**
 * void journal_destroy() - Release a journal_t structure.
 * @journal: Journal to act on.
 *
 * Release a journal_t structure once it is no longer in use by the
 * journaled object.  Commits any running transaction, drains all
 * checkpoint transactions, marks the log empty on disk and frees all
 * in-core state.
 */
void journal_destroy(journal_t *journal)
{
	/* Wait for the commit thread to wake up and die. */
	journal_kill_thread(journal);

	/* Force a final log commit */
	if (journal->j_running_transaction)
		journal_commit_transaction(journal);

	/* Force any old transactions to disk */

	/* Totally anal locking here... */
	spin_lock(&journal->j_list_lock);
	while (journal->j_checkpoint_transactions != NULL) {
		/* log_do_checkpoint() takes its own locks, so drop ours
		 * across the call and recheck the list afterwards. */
		spin_unlock(&journal->j_list_lock);
		log_do_checkpoint(journal);
		spin_lock(&journal->j_list_lock);
	}

	J_ASSERT(journal->j_running_transaction == NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);
	J_ASSERT(journal->j_checkpoint_transactions == NULL);
	spin_unlock(&journal->j_list_lock);

	/* We can now mark the journal as empty. */
	journal->j_tail = 0;
	journal->j_tail_sequence = ++journal->j_transaction_sequence;
	if (journal->j_sb_buffer) {
		journal_update_superblock(journal, 1);
		brelse(journal->j_sb_buffer);
	}

	if (journal->j_inode)
		iput(journal->j_inode);
	if (journal->j_revoke)
		journal_destroy_revoke(journal);
	kfree(journal->j_wbuf);
	kfree(journal);
}
1166
1167
1168/**
1169 *int journal_check_used_features () - Check if features specified are used.
1170 * @journal: Journal to check.
1171 * @compat: bitmask of compatible features
1172 * @ro: bitmask of features that force read-only mount
1173 * @incompat: bitmask of incompatible features
1174 *
1175 * Check whether the journal uses all of a given set of
1176 * features. Return true (non-zero) if it does.
1177 **/
1178
1179int journal_check_used_features (journal_t *journal, unsigned long compat,
1180 unsigned long ro, unsigned long incompat)
1181{
1182 journal_superblock_t *sb;
1183
1184 if (!compat && !ro && !incompat)
1185 return 1;
1186 if (journal->j_format_version == 1)
1187 return 0;
1188
1189 sb = journal->j_superblock;
1190
1191 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1192 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1193 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1194 return 1;
1195
1196 return 0;
1197}
1198
1199/**
1200 * int journal_check_available_features() - Check feature set in journalling layer
1201 * @journal: Journal to check.
1202 * @compat: bitmask of compatible features
1203 * @ro: bitmask of features that force read-only mount
1204 * @incompat: bitmask of incompatible features
1205 *
1206 * Check whether the journaling code supports the use of
1207 * all of a given set of features on this journal. Return true
1208 * (non-zero) if it can. */
1209
1210int journal_check_available_features (journal_t *journal, unsigned long compat,
1211 unsigned long ro, unsigned long incompat)
1212{
1213 journal_superblock_t *sb;
1214
1215 if (!compat && !ro && !incompat)
1216 return 1;
1217
1218 sb = journal->j_superblock;
1219
1220 /* We can support any known requested features iff the
1221 * superblock is in version 2. Otherwise we fail to support any
1222 * extended sb features. */
1223
1224 if (journal->j_format_version != 2)
1225 return 0;
1226
1227 if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
1228 (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
1229 (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
1230 return 1;
1231
1232 return 0;
1233}
1234
1235/**
1236 * int journal_set_features () - Mark a given journal feature in the superblock
1237 * @journal: Journal to act on.
1238 * @compat: bitmask of compatible features
1239 * @ro: bitmask of features that force read-only mount
1240 * @incompat: bitmask of incompatible features
1241 *
1242 * Mark a given journal feature as present on the
1243 * superblock. Returns true if the requested features could be set.
1244 *
1245 */
1246
1247int journal_set_features (journal_t *journal, unsigned long compat,
1248 unsigned long ro, unsigned long incompat)
1249{
1250 journal_superblock_t *sb;
1251
1252 if (journal_check_used_features(journal, compat, ro, incompat))
1253 return 1;
1254
1255 if (!journal_check_available_features(journal, compat, ro, incompat))
1256 return 0;
1257
1258 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1259 compat, ro, incompat);
1260
1261 sb = journal->j_superblock;
1262
1263 sb->s_feature_compat |= cpu_to_be32(compat);
1264 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1265 sb->s_feature_incompat |= cpu_to_be32(incompat);
1266
1267 return 1;
1268}
1269
1270
1271/**
1272 * int journal_update_format () - Update on-disk journal structure.
1273 * @journal: Journal to act on.
1274 *
1275 * Given an initialised but unloaded journal struct, poke about in the
1276 * on-disk structure to update it to the most recent supported version.
1277 */
1278int journal_update_format (journal_t *journal)
1279{
1280 journal_superblock_t *sb;
1281 int err;
1282
1283 err = journal_get_superblock(journal);
1284 if (err)
1285 return err;
1286
1287 sb = journal->j_superblock;
1288
1289 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1290 case JFS_SUPERBLOCK_V2:
1291 return 0;
1292 case JFS_SUPERBLOCK_V1:
1293 return journal_convert_superblock_v1(journal, sb);
1294 default:
1295 break;
1296 }
1297 return -EINVAL;
1298}
1299
1300static int journal_convert_superblock_v1(journal_t *journal,
1301 journal_superblock_t *sb)
1302{
1303 int offset, blocksize;
1304 struct buffer_head *bh;
1305
1306 printk(KERN_WARNING
1307 "JBD: Converting superblock from version 1 to 2.\n");
1308
1309 /* Pre-initialise new fields to zero */
1310 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1311 blocksize = be32_to_cpu(sb->s_blocksize);
1312 memset(&sb->s_feature_compat, 0, blocksize-offset);
1313
1314 sb->s_nr_users = cpu_to_be32(1);
1315 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1316 journal->j_format_version = 2;
1317
1318 bh = journal->j_sb_buffer;
1319 BUFFER_TRACE(bh, "marking dirty");
1320 mark_buffer_dirty(bh);
1321 sync_dirty_buffer(bh);
1322 return 0;
1323}
1324
1325
/**
 * int journal_flush () - Flush journal
 * @journal: Journal to act on.
 *
 * Flush all data for a given journal to disk and empty the journal.
 * Filesystems can use this when remounting readonly to ensure that
 * recovery does not need to happen on remount.
 *
 * Returns 0 on success, or the first error from log_do_checkpoint().
 */

int journal_flush(journal_t *journal)
{
	int err = 0;
	transaction_t *transaction = NULL;
	unsigned long old_tail;

	spin_lock(&journal->j_state_lock);

	/* Force everything buffered to the log... */
	if (journal->j_running_transaction) {
		transaction = journal->j_running_transaction;
		__log_start_commit(journal, transaction->t_tid);
	} else if (journal->j_committing_transaction)
		transaction = journal->j_committing_transaction;

	/* Wait for the log commit to complete... */
	if (transaction) {
		tid_t tid = transaction->t_tid;

		spin_unlock(&journal->j_state_lock);
		log_wait_commit(journal, tid);
	} else {
		spin_unlock(&journal->j_state_lock);
	}

	/* ...and flush everything in the log out to disk. */
	spin_lock(&journal->j_list_lock);
	while (!err && journal->j_checkpoint_transactions != NULL) {
		/* log_do_checkpoint() takes its own locks. */
		spin_unlock(&journal->j_list_lock);
		err = log_do_checkpoint(journal);
		spin_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	cleanup_journal_tail(journal);

	/* Finally, mark the journal as really needing no recovery.
	 * This sets s_start==0 in the underlying superblock, which is
	 * the magic code for a fully-recovered superblock.  Any future
	 * commits of data to the journal will restore the current
	 * s_start value. */
	spin_lock(&journal->j_state_lock);
	old_tail = journal->j_tail;
	/* j_tail = 0 is only for the on-disk write; restore it below. */
	journal->j_tail = 0;
	spin_unlock(&journal->j_state_lock);
	journal_update_superblock(journal, 1);
	spin_lock(&journal->j_state_lock);
	journal->j_tail = old_tail;

	J_ASSERT(!journal->j_running_transaction);
	J_ASSERT(!journal->j_committing_transaction);
	J_ASSERT(!journal->j_checkpoint_transactions);
	J_ASSERT(journal->j_head == journal->j_tail);
	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
	spin_unlock(&journal->j_state_lock);
	return err;
}
1391
1392/**
1393 * int journal_wipe() - Wipe journal contents
1394 * @journal: Journal to act on.
1395 * @write: flag (see below)
1396 *
1397 * Wipe out all of the contents of a journal, safely. This will produce
1398 * a warning if the journal contains any valid recovery information.
1399 * Must be called between journal_init_*() and journal_load().
1400 *
1401 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1402 * we merely suppress recovery.
1403 */
1404
1405int journal_wipe(journal_t *journal, int write)
1406{
1407 journal_superblock_t *sb;
1408 int err = 0;
1409
1410 J_ASSERT (!(journal->j_flags & JFS_LOADED));
1411
1412 err = load_superblock(journal);
1413 if (err)
1414 return err;
1415
1416 sb = journal->j_superblock;
1417
1418 if (!journal->j_tail)
1419 goto no_recovery;
1420
1421 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1422 write ? "Clearing" : "Ignoring");
1423
1424 err = journal_skip_recovery(journal);
1425 if (write)
1426 journal_update_superblock(journal, 1);
1427
1428 no_recovery:
1429 return err;
1430}
1431
1432/*
1433 * journal_dev_name: format a character string to describe on what
1434 * device this journal is present.
1435 */
1436
1437static const char *journal_dev_name(journal_t *journal, char *buffer)
1438{
1439 struct block_device *bdev;
1440
1441 if (journal->j_inode)
1442 bdev = journal->j_inode->i_sb->s_bdev;
1443 else
1444 bdev = journal->j_dev;
1445
1446 return bdevname(bdev, buffer);
1447}
1448
/*
 * Journal abort has very specific semantics, which are described in
 * detail below for journal_abort().
 *
 * Two internal functions, which provide abort to the jbd layer
 * itself, are here.
 */
1456
1457/*
1458 * Quick version for internal journal use (doesn't lock the journal).
1459 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1460 * and don't attempt to make any other journal updates.
1461 */
1462void __journal_abort_hard(journal_t *journal)
1463{
1464 transaction_t *transaction;
1465 char b[BDEVNAME_SIZE];
1466
1467 if (journal->j_flags & JFS_ABORT)
1468 return;
1469
1470 printk(KERN_ERR "Aborting journal on device %s.\n",
1471 journal_dev_name(journal, b));
1472
1473 spin_lock(&journal->j_state_lock);
1474 journal->j_flags |= JFS_ABORT;
1475 transaction = journal->j_running_transaction;
1476 if (transaction)
1477 __log_start_commit(journal, transaction->t_tid);
1478 spin_unlock(&journal->j_state_lock);
1479}
1480
1481/* Soft abort: record the abort error status in the journal superblock,
1482 * but don't do any other IO. */
1483static void __journal_abort_soft (journal_t *journal, int errno)
1484{
1485 if (journal->j_flags & JFS_ABORT)
1486 return;
1487
1488 if (!journal->j_errno)
1489 journal->j_errno = errno;
1490
1491 __journal_abort_hard(journal);
1492
1493 if (errno)
1494 journal_update_superblock(journal, 1);
1495}
1496
1497/**
1498 * void journal_abort () - Shutdown the journal immediately.
1499 * @journal: the journal to shutdown.
1500 * @errno: an error number to record in the journal indicating
1501 * the reason for the shutdown.
1502 *
1503 * Perform a complete, immediate shutdown of the ENTIRE
1504 * journal (not of a single transaction). This operation cannot be
1505 * undone without closing and reopening the journal.
1506 *
1507 * The journal_abort function is intended to support higher level error
1508 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1509 * mode.
1510 *
1511 * Journal abort has very specific semantics. Any existing dirty,
1512 * unjournaled buffers in the main filesystem will still be written to
1513 * disk by bdflush, but the journaling mechanism will be suspended
1514 * immediately and no further transaction commits will be honoured.
1515 *
1516 * Any dirty, journaled buffers will be written back to disk without
1517 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1518 * filesystem, but we _do_ attempt to leave as much data as possible
1519 * behind for fsck to use for cleanup.
1520 *
1521 * Any attempt to get a new transaction handle on a journal which is in
1522 * ABORT state will just result in an -EROFS error return. A
1523 * journal_stop on an existing handle will return -EIO if we have
1524 * entered abort state during the update.
1525 *
1526 * Recursive transactions are not disturbed by journal abort until the
1527 * final journal_stop, which will receive the -EIO error.
1528 *
1529 * Finally, the journal_abort call allows the caller to supply an errno
1530 * which will be recorded (if possible) in the journal superblock. This
1531 * allows a client to record failure conditions in the middle of a
1532 * transaction without having to complete the transaction to record the
1533 * failure to disk. ext3_error, for example, now uses this
1534 * functionality.
1535 *
1536 * Errors which originate from within the journaling layer will NOT
1537 * supply an errno; a null errno implies that absolutely no further
1538 * writes are done to the journal (unless there are any already in
1539 * progress).
1540 *
1541 */
1542
void journal_abort(journal_t *journal, int errno)
{
	/* Soft abort: records errno in the superblock (if non-zero)
	 * before performing the hard abort. */
	__journal_abort_soft(journal, errno);
}
1547
1548/**
1549 * int journal_errno () - returns the journal's error state.
1550 * @journal: journal to examine.
1551 *
 * This is the errno number set with journal_abort(), the last
1553 * time the journal was mounted - if the journal was stopped
1554 * without calling abort this will be 0.
1555 *
1556 * If the journal has been aborted on this mount time -EROFS will
1557 * be returned.
1558 */
1559int journal_errno(journal_t *journal)
1560{
1561 int err;
1562
1563 spin_lock(&journal->j_state_lock);
1564 if (journal->j_flags & JFS_ABORT)
1565 err = -EROFS;
1566 else
1567 err = journal->j_errno;
1568 spin_unlock(&journal->j_state_lock);
1569 return err;
1570}
1571
1572/**
1573 * int journal_clear_err () - clears the journal's error state
1574 * @journal: journal to act on.
1575 *
1576 * An error must be cleared or Acked to take a FS out of readonly
1577 * mode.
1578 */
1579int journal_clear_err(journal_t *journal)
1580{
1581 int err = 0;
1582
1583 spin_lock(&journal->j_state_lock);
1584 if (journal->j_flags & JFS_ABORT)
1585 err = -EROFS;
1586 else
1587 journal->j_errno = 0;
1588 spin_unlock(&journal->j_state_lock);
1589 return err;
1590}
1591
1592/**
1593 * void journal_ack_err() - Ack journal err.
1594 * @journal: journal to act on.
1595 *
1596 * An error must be cleared or Acked to take a FS out of readonly
1597 * mode.
1598 */
void journal_ack_err(journal_t *journal)
{
	/* Acknowledge a recorded error so the filesystem may leave
	 * readonly mode without clearing j_errno itself. */
	spin_lock(&journal->j_state_lock);
	if (journal->j_errno)
		journal->j_flags |= JFS_ACK_ERR;
	spin_unlock(&journal->j_state_lock);
}
1606
/* Number of filesystem blocks contained in one page-cache page. */
int journal_blocks_per_page(struct inode *inode)
{
	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
}
1611
1612/*
1613 * Simple support for retrying memory allocations. Introduced to help to
1614 * debug different VM deadlock avoidance strategies.
1615 */
1616void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1617{
1618 return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
1619}
1620
1621/*
1622 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
1623 * and allocate frozen and commit buffers from these slabs.
1624 *
1625 * Reason for doing this is to avoid, SLAB_DEBUG - since it could
1626 * cause bh to cross page boundary.
1627 */
1628
#define JBD_MAX_SLABS 5
/* Map a buffer size to its slab slot: 1k->0, 2k->1, 4k->2, 8k->4.
 * Slot 3 (6k) is never used, hence the NULL name below. */
#define JBD_SLAB_INDEX(size)  (size >> 11)

static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
static const char *jbd_slab_names[JBD_MAX_SLABS] = {
	"jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
};
1636
1637static void journal_destroy_jbd_slabs(void)
1638{
1639 int i;
1640
1641 for (i = 0; i < JBD_MAX_SLABS; i++) {
1642 if (jbd_slab[i])
1643 kmem_cache_destroy(jbd_slab[i]);
1644 jbd_slab[i] = NULL;
1645 }
1646}
1647
1648static int journal_create_jbd_slab(size_t slab_size)
1649{
1650 int i = JBD_SLAB_INDEX(slab_size);
1651
1652 BUG_ON(i >= JBD_MAX_SLABS);
1653
1654 /*
1655 * Check if we already have a slab created for this size
1656 */
1657 if (jbd_slab[i])
1658 return 0;
1659
1660 /*
1661 * Create a slab and force alignment to be same as slabsize -
1662 * this will make sure that allocations won't cross the page
1663 * boundary.
1664 */
1665 jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
1666 slab_size, slab_size, 0, NULL, NULL);
1667 if (!jbd_slab[i]) {
1668 printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
1669 return -ENOMEM;
1670 }
1671 return 0;
1672}
1673
1674void * jbd_slab_alloc(size_t size, gfp_t flags)
1675{
1676 int idx;
1677
1678 idx = JBD_SLAB_INDEX(size);
1679 BUG_ON(jbd_slab[idx] == NULL);
1680 return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
1681}
1682
1683void jbd_slab_free(void *ptr, size_t size)
1684{
1685 int idx;
1686
1687 idx = JBD_SLAB_INDEX(size);
1688 BUG_ON(jbd_slab[idx] == NULL);
1689 kmem_cache_free(jbd_slab[idx], ptr);
1690}
1691
1692/*
1693 * Journal_head storage management
1694 */
1695static kmem_cache_t *journal_head_cache;
1696#ifdef CONFIG_JBD_DEBUG
1697static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1698#endif
1699
1700static int journal_init_journal_head_cache(void)
1701{
1702 int retval;
1703
1704 J_ASSERT(journal_head_cache == 0);
1705 journal_head_cache = kmem_cache_create("journal_head",
1706 sizeof(struct journal_head),
1707 0, /* offset */
1708 0, /* flags */
1709 NULL, /* ctor */
1710 NULL); /* dtor */
1711 retval = 0;
1712 if (journal_head_cache == 0) {
1713 retval = -ENOMEM;
1714 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1715 }
1716 return retval;
1717}
1718
static void journal_destroy_journal_head_cache(void)
{
	/* The cache must exist by the time we are torn down. */
	J_ASSERT(journal_head_cache != NULL);
	kmem_cache_destroy(journal_head_cache);
	journal_head_cache = NULL;
}
1725
1726/*
1727 * journal_head splicing and dicing
1728 */
1729static struct journal_head *journal_alloc_journal_head(void)
1730{
1731 struct journal_head *ret;
1732 static unsigned long last_warning;
1733
1734#ifdef CONFIG_JBD_DEBUG
1735 atomic_inc(&nr_journal_heads);
1736#endif
1737 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1738 if (ret == 0) {
1739 jbd_debug(1, "out of memory for journal_head\n");
1740 if (time_after(jiffies, last_warning + 5*HZ)) {
1741 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1742 __FUNCTION__);
1743 last_warning = jiffies;
1744 }
1745 while (ret == 0) {
1746 yield();
1747 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1748 }
1749 }
1750 return ret;
1751}
1752
static void journal_free_journal_head(struct journal_head *jh)
{
#ifdef CONFIG_JBD_DEBUG
	atomic_dec(&nr_journal_heads);
	/* Poison freed memory so use-after-free shows up quickly. */
	memset(jh, JBD_POISON_FREE, sizeof(*jh));
#endif
	kmem_cache_free(journal_head_cache, jh);
}
1761
1762/*
1763 * A journal_head is attached to a buffer_head whenever JBD has an
1764 * interest in the buffer.
1765 *
1766 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1767 * is set. This bit is tested in core kernel code where we need to take
1768 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1769 * there.
1770 *
1771 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1772 *
1773 * When a buffer has its BH_JBD bit set it is immune from being released by
1774 * core kernel code, mainly via ->b_count.
1775 *
1776 * A journal_head may be detached from its buffer_head when the journal_head's
1777 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1778 * Various places in JBD call journal_remove_journal_head() to indicate that the
1779 * journal_head can be dropped if needed.
1780 *
1781 * Various places in the kernel want to attach a journal_head to a buffer_head
1782 * _before_ attaching the journal_head to a transaction. To protect the
1783 * journal_head in this situation, journal_add_journal_head elevates the
1784 * journal_head's b_jcount refcount by one. The caller must call
1785 * journal_put_journal_head() to undo this.
1786 *
1787 * So the typical usage would be:
1788 *
1789 * (Attach a journal_head if needed. Increments b_jcount)
1790 * struct journal_head *jh = journal_add_journal_head(bh);
1791 * ...
1792 * jh->b_transaction = xxx;
1793 * journal_put_journal_head(jh);
1794 *
1795 * Now, the journal_head's b_jcount is zero, but it is safe from being released
1796 * because it has a non-zero b_transaction.
1797 */
1798
1799/*
1800 * Give a buffer_head a journal_head.
1801 *
1802 * Doesn't need the journal lock.
1803 * May sleep.
1804 */
struct journal_head *journal_add_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh;
	struct journal_head *new_jh = NULL;

repeat:
	/* Allocate optimistically outside the bh journal_head lock,
	 * because journal_alloc_journal_head() may sleep. */
	if (!buffer_jbd(bh)) {
		new_jh = journal_alloc_journal_head();
		memset(new_jh, 0, sizeof(*new_jh));
	}

	jbd_lock_bh_journal_head(bh);
	if (buffer_jbd(bh)) {
		/* Someone attached a journal_head while we slept. */
		jh = bh2jh(bh);
	} else {
		J_ASSERT_BH(bh,
			(atomic_read(&bh->b_count) > 0) ||
			(bh->b_page && bh->b_page->mapping));

		/* The buffer had a journal_head when we first looked but
		 * it is gone now and we allocated nothing: retry. */
		if (!new_jh) {
			jbd_unlock_bh_journal_head(bh);
			goto repeat;
		}

		jh = new_jh;
		new_jh = NULL;		/* We consumed it */
		set_buffer_jbd(bh);
		bh->b_private = jh;
		jh->b_bh = bh;
		get_bh(bh);		/* BH_JBD elevates b_count by one */
		BUFFER_TRACE(bh, "added journal_head");
	}
	jh->b_jcount++;
	jbd_unlock_bh_journal_head(bh);
	/* Lost the race above: free the journal_head we did not use. */
	if (new_jh)
		journal_free_journal_head(new_jh);
	return bh->b_private;
}
1843
1844/*
1845 * Grab a ref against this buffer_head's journal_head. If it ended up not
1846 * having a journal_head, return NULL
1847 */
1848struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
1849{
1850 struct journal_head *jh = NULL;
1851
1852 jbd_lock_bh_journal_head(bh);
1853 if (buffer_jbd(bh)) {
1854 jh = bh2jh(bh);
1855 jh->b_jcount++;
1856 }
1857 jbd_unlock_bh_journal_head(bh);
1858 return jh;
1859}
1860
/*
 * Detach and free bh's journal_head if it is unreferenced (b_jcount==0)
 * and unattached to any transaction.  Caller holds the bh journal_head
 * lock.  Note: leaves b_count "logically" incremented for the caller
 * (see journal_remove_journal_head below).
 */
static void __journal_remove_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh = bh2jh(bh);

	J_ASSERT_JH(jh, jh->b_jcount >= 0);

	/* Extra ref so the buffer_head survives the teardown below. */
	get_bh(bh);
	if (jh->b_jcount == 0) {
		if (jh->b_transaction == NULL &&
				jh->b_next_transaction == NULL &&
				jh->b_cp_transaction == NULL) {
			J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
			J_ASSERT_BH(bh, buffer_jbd(bh));
			J_ASSERT_BH(bh, jh2bh(jh) == bh);
			BUFFER_TRACE(bh, "remove journal_head");
			/* Shadow buffers should have been freed by commit;
			 * warn if any are still attached. */
			if (jh->b_frozen_data) {
				printk(KERN_WARNING "%s: freeing "
						"b_frozen_data\n",
						__FUNCTION__);
				jbd_slab_free(jh->b_frozen_data, bh->b_size);
			}
			if (jh->b_committed_data) {
				printk(KERN_WARNING "%s: freeing "
						"b_committed_data\n",
						__FUNCTION__);
				jbd_slab_free(jh->b_committed_data, bh->b_size);
			}
			bh->b_private = NULL;
			jh->b_bh = NULL;	/* debug, really */
			clear_buffer_jbd(bh);
			/* Drop the b_count that BH_JBD was holding. */
			__brelse(bh);
			journal_free_journal_head(jh);
		} else {
			BUFFER_TRACE(bh, "journal_head was locked");
		}
	}
}
1898
1899/*
1900 * journal_remove_journal_head(): if the buffer isn't attached to a transaction
1901 * and has a zero b_jcount then remove and release its journal_head. If we did
1902 * see that the buffer is not used by any transaction we also "logically"
1903 * decrement ->b_count.
1904 *
1905 * We in fact take an additional increment on ->b_count as a convenience,
1906 * because the caller usually wants to do additional things with the bh
1907 * after calling here.
1908 * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
1909 * time. Once the caller has run __brelse(), the buffer is eligible for
1910 * reaping by try_to_free_buffers().
1911 */
void journal_remove_journal_head(struct buffer_head *bh)
{
	/* Locked wrapper around __journal_remove_journal_head(). */
	jbd_lock_bh_journal_head(bh);
	__journal_remove_journal_head(bh);
	jbd_unlock_bh_journal_head(bh);
}
1918
1919/*
1920 * Drop a reference on the passed journal_head. If it fell to zero then try to
1921 * release the journal_head from the buffer_head.
1922 */
void journal_put_journal_head(struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);

	jbd_lock_bh_journal_head(bh);
	J_ASSERT_JH(jh, jh->b_jcount > 0);
	--jh->b_jcount;
	/* Last temporary ref and no owning transaction: release the
	 * journal_head and the b_count elevation BH_JBD was holding. */
	if (!jh->b_jcount && !jh->b_transaction) {
		__journal_remove_journal_head(bh);
		__brelse(bh);
	}
	jbd_unlock_bh_journal_head(bh);
}
1936
1937/*
1938 * /proc tunables
1939 */
1940#if defined(CONFIG_JBD_DEBUG)
1941int journal_enable_debug;
1942EXPORT_SYMBOL(journal_enable_debug);
1943#endif
1944
1945#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
1946
1947static struct proc_dir_entry *proc_jbd_debug;
1948
/*
 * procfs read handler for /proc/sys/fs/jbd-debug: report the current
 * debug level as a decimal line and signal EOF immediately.
 */
static int read_jbd_debug(char *page, char **start, off_t off,
			  int count, int *eof, void *data)
{
	int ret;

	/* Single-shot read; *eof stops the procfs core re-calling us. */
	ret = sprintf(page + off, "%d\n", journal_enable_debug);
	*eof = 1;
	return ret;
}
1958
1959static int write_jbd_debug(struct file *file, const char __user *buffer,
1960 unsigned long count, void *data)
1961{
1962 char buf[32];
1963
1964 if (count > ARRAY_SIZE(buf) - 1)
1965 count = ARRAY_SIZE(buf) - 1;
1966 if (copy_from_user(buf, buffer, count))
1967 return -EFAULT;
1968 buf[ARRAY_SIZE(buf) - 1] = '\0';
1969 journal_enable_debug = simple_strtoul(buf, NULL, 10);
1970 return count;
1971}
1972
1973#define JBD_PROC_NAME "sys/fs/jbd-debug"
1974
/* Expose the debug tunable at /proc/sys/fs/jbd-debug. */
static void __init create_jbd_proc_entry(void)
{
	proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
	if (proc_jbd_debug) {
		/* Why is this so hard? */
		proc_jbd_debug->read_proc = read_jbd_debug;
		proc_jbd_debug->write_proc = write_jbd_debug;
	}
}
1984
static void __exit remove_jbd_proc_entry(void)
{
	/* Only remove what create_jbd_proc_entry() managed to create. */
	if (proc_jbd_debug)
		remove_proc_entry(JBD_PROC_NAME, NULL);
}
1990
1991#else
1992
1993#define create_jbd_proc_entry() do {} while (0)
1994#define remove_jbd_proc_entry() do {} while (0)
1995
1996#endif
1997
1998kmem_cache_t *jbd_handle_cache;
1999
2000static int __init journal_init_handle_cache(void)
2001{
2002 jbd_handle_cache = kmem_cache_create("journal_handle",
2003 sizeof(handle_t),
2004 0, /* offset */
2005 0, /* flags */
2006 NULL, /* ctor */
2007 NULL); /* dtor */
2008 if (jbd_handle_cache == NULL) {
2009 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2010 return -ENOMEM;
2011 }
2012 return 0;
2013}
2014
static void journal_destroy_handle_cache(void)
{
	/* The cache may never have been created if init failed early. */
	if (jbd_handle_cache)
		kmem_cache_destroy(jbd_handle_cache);
}
2020
2021/*
2022 * Module startup and shutdown
2023 */
2024
2025static int __init journal_init_caches(void)
2026{
2027 int ret;
2028
2029 ret = journal_init_revoke_caches();
2030 if (ret == 0)
2031 ret = journal_init_journal_head_cache();
2032 if (ret == 0)
2033 ret = journal_init_handle_cache();
2034 return ret;
2035}
2036
static void journal_destroy_caches(void)
{
	/* Tear down all journal slab caches at module unload. */
	journal_destroy_revoke_caches();
	journal_destroy_journal_head_cache();
	journal_destroy_handle_cache();
	journal_destroy_jbd_slabs();
}
2044
2045static int __init journal_init(void)
2046{
2047 int ret;
2048
2049 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2050
2051 ret = journal_init_caches();
2052 if (ret != 0)
2053 journal_destroy_caches();
2054 create_jbd_proc_entry();
2055 return ret;
2056}
2057
static void __exit journal_exit(void)
{
#ifdef CONFIG_JBD_DEBUG
	/* Every journal_head should be freed by now; a non-zero count
	 * means one leaked somewhere. */
	int n = atomic_read(&nr_journal_heads);
	if (n)
		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
#endif
	remove_jbd_proc_entry();
	journal_destroy_caches();
}
2068
2069MODULE_LICENSE("GPL");
2070module_init(journal_init);
2071module_exit(journal_exit);
2072
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..11563fe2a52b
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,592 @@
1/*
2 * linux/fs/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd.h>
22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
struct recovery_info
{
	tid_t		start_transaction;	/* first TID found in the log */
	tid_t		end_transaction;	/* TID at which the scan stopped */

	int		nr_replays;	/* blocks written back to the fs */
	int		nr_revokes;	/* revoke records recorded */
	int		nr_revoke_hits;	/* replays suppressed by a revoke */
};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
/* Release readahead buffers after use */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
	int i;

	for (i = n - 1; i >= 0; i--)
		brelse(b[i]);
}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
/*
 * Read ahead up to 128K of journal blocks starting at 'start',
 * submitting them to the block layer in batches of MAXBUF.
 * Returns 0 on success or a negative errno from journal_bmap/__getblk.
 */
static int do_readahead(journal_t *journal, unsigned int start)
{
	int err;
	unsigned int max, nbufs, next;
	unsigned long blocknr;
	struct buffer_head *bh;

	struct buffer_head * bufs[MAXBUF];

	/* Do up to 128K of readahead */
	max = start + (128 * 1024 / journal->j_blocksize);
	if (max > journal->j_maxlen)
		max = journal->j_maxlen;

	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
	 * a time to the block device IO layer. */

	nbufs = 0;

	for (next = start; next < max; next++) {
		err = journal_bmap(journal, next, &blocknr);

		if (err) {
			printk (KERN_ERR "JBD: bad block at offset %u\n",
				next);
			goto failed;
		}

		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
		if (!bh) {
			err = -ENOMEM;
			goto failed;
		}

		/* Queue only buffers that still need reading; a locked
		 * buffer already has IO in flight. */
		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
			bufs[nbufs++] = bh;
			if (nbufs == MAXBUF) {
				ll_rw_block(READ, nbufs, bufs);
				journal_brelse_array(bufs, nbufs);
				nbufs = 0;
			}
		} else
			brelse(bh);
	}

	/* Submit any leftover partial batch; the refs are dropped below. */
	if (nbufs)
		ll_rw_block(READ, nbufs, bufs);
	err = 0;

failed:
	if (nbufs)
		journal_brelse_array(bufs, nbufs);
	return err;
}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
/*
 * Read one block from the journal at logical offset 'offset'.
 * On success *bhp holds an uptodate buffer (caller must brelse it);
 * on failure *bhp is NULL and a negative errno is returned.
 */
static int jread(struct buffer_head **bhp, journal_t *journal,
		 unsigned int offset)
{
	int err;
	unsigned long blocknr;
	struct buffer_head *bh;

	*bhp = NULL;

	/* An offset past j_maxlen can only come from corrupt metadata. */
	if (offset >= journal->j_maxlen) {
		printk(KERN_ERR "JBD: corrupted journal superblock\n");
		return -EIO;
	}

	err = journal_bmap(journal, offset, &blocknr);

	if (err) {
		printk (KERN_ERR "JBD: bad block at offset %u\n",
			offset);
		return err;
	}

	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
	if (!bh)
		return -ENOMEM;

	if (!buffer_uptodate(bh)) {
		/* If this is a brand new buffer, start readahead.
                   Otherwise, we assume we are already reading it.  */
		if (!buffer_req(bh))
			do_readahead(journal, offset);
		wait_on_buffer(bh);
	}

	/* Still not uptodate after IO completed: a real read error. */
	if (!buffer_uptodate(bh)) {
		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
			offset);
		brelse(bh);
		return -EIO;
	}

	*bhp = bh;
	return 0;
}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(struct buffer_head *bh, int size)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0;
186
187 tagp = &bh->b_data[sizeof(journal_header_t)];
188
189 while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
190 tag = (journal_block_tag_t *) tagp;
191
192 nr++;
193 tagp += sizeof(journal_block_tag_t);
194 if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
195 tagp += 16;
196
197 if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
198 break;
199 }
200
201 return nr;
202}
203
204
205/* Make sure we wrap around the log correctly! */
206#define wrap(journal, var) \
207do { \
208 if (var >= (journal)->j_last) \
209 var -= ((journal)->j_last - (journal)->j_first); \
210} while (0)
211
212/**
 * journal_recover - recovers an on-disk journal
214 * @journal: the journal to recover
215 *
216 * The primary function for recovering the log contents when mounting a
217 * journaled device.
218 *
219 * Recovery is done in three passes. In the first pass, we look for the
220 * end of the log. In the second, we assemble the list of revoke
221 * blocks. In the third and final pass, we replay any un-revoked blocks
222 * in the log.
223 */
int journal_recover(journal_t *journal)
{
	int err;
	journal_superblock_t *	sb;

	struct recovery_info	info;

	memset(&info, 0, sizeof(info));
	sb = journal->j_superblock;

	/*
	 * The journal superblock's s_start field (the current log head)
	 * is always zero if, and only if, the journal was cleanly
	 * unmounted.
	 */

	if (!sb->s_start) {
		jbd_debug(1, "No recovery required, last transaction %d\n",
			  be32_to_cpu(sb->s_sequence));
		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
		return 0;
	}

	/* Pass 1: find the end of the log.  Pass 2: record revokes.
	 * Pass 3: replay all un-revoked blocks. */
	err = do_one_pass(journal, &info, PASS_SCAN);
	if (!err)
		err = do_one_pass(journal, &info, PASS_REVOKE);
	if (!err)
		err = do_one_pass(journal, &info, PASS_REPLAY);

	jbd_debug(0, "JBD: recovery, exit status %d, "
		  "recovered transactions %u to %u\n",
		  err, info.start_transaction, info.end_transaction);
	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);

	/* Restart the log at the next transaction ID, thus invalidating
	 * any existing commit records in the log. */
	journal->j_transaction_sequence = ++info.end_transaction;

	journal_clear_revoke(journal);
	/* Replayed data went straight to the filesystem device; make
	 * sure it is on disk before the journal is reused. */
	sync_blockdev(journal->j_fs_dev);
	return err;
}
267
268/**
 * journal_skip_recovery - Start journal and wipe existing records
270 * @journal: journal to startup
271 *
272 * Locate any valid recovery information from the journal and set up the
273 * journal structures in memory to ignore it (presumably because the
274 * caller has evidence that it is out of date).
 * This function doesn't appear to be exported.
276 *
277 * We perform one pass over the journal to allow us to tell the user how
278 * much recovery information is being erased, and to let us initialise
279 * the journal transaction sequence numbers to the next unused ID.
280 */
281int journal_skip_recovery(journal_t *journal)
282{
283 int err;
284 journal_superblock_t * sb;
285
286 struct recovery_info info;
287
288 memset (&info, 0, sizeof(info));
289 sb = journal->j_superblock;
290
291 err = do_one_pass(journal, &info, PASS_SCAN);
292
293 if (err) {
294 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
295 ++journal->j_transaction_sequence;
296 } else {
297#ifdef CONFIG_JBD_DEBUG
298 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
299#endif
300 jbd_debug(0,
301 "JBD: ignoring %d transaction%s from the journal.\n",
302 dropped, (dropped == 1) ? "" : "s");
303 journal->j_transaction_sequence = ++info.end_transaction;
304 }
305
306 journal->j_tail = 0;
307 return err;
308}
309
310static int do_one_pass(journal_t *journal,
311 struct recovery_info *info, enum passtype pass)
312{
313 unsigned int first_commit_ID, next_commit_ID;
314 unsigned long next_log_block;
315 int err, success = 0;
316 journal_superblock_t * sb;
317 journal_header_t * tmp;
318 struct buffer_head * bh;
319 unsigned int sequence;
320 int blocktype;
321
322 /* Precompute the maximum metadata descriptors in a descriptor block */
323 int MAX_BLOCKS_PER_DESC;
324 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
325 / sizeof(journal_block_tag_t));
326
327 /*
328 * First thing is to establish what we expect to find in the log
329 * (in terms of transaction IDs), and where (in terms of log
330 * block offsets): query the superblock.
331 */
332
333 sb = journal->j_superblock;
334 next_commit_ID = be32_to_cpu(sb->s_sequence);
335 next_log_block = be32_to_cpu(sb->s_start);
336
337 first_commit_ID = next_commit_ID;
338 if (pass == PASS_SCAN)
339 info->start_transaction = first_commit_ID;
340
341 jbd_debug(1, "Starting recovery pass %d\n", pass);
342
343 /*
344 * Now we walk through the log, transaction by transaction,
345 * making sure that each transaction has a commit block in the
346 * expected place. Each complete transaction gets replayed back
347 * into the main filesystem.
348 */
349
350 while (1) {
351 int flags;
352 char * tagp;
353 journal_block_tag_t * tag;
354 struct buffer_head * obh;
355 struct buffer_head * nbh;
356
357 cond_resched(); /* We're under lock_kernel() */
358
359 /* If we already know where to stop the log traversal,
360 * check right now that we haven't gone past the end of
361 * the log. */
362
363 if (pass != PASS_SCAN)
364 if (tid_geq(next_commit_ID, info->end_transaction))
365 break;
366
367 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
368 next_commit_ID, next_log_block, journal->j_last);
369
370 /* Skip over each chunk of the transaction looking
371 * either the next descriptor block or the final commit
372 * record. */
373
374 jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
375 err = jread(&bh, journal, next_log_block);
376 if (err)
377 goto failed;
378
379 next_log_block++;
380 wrap(journal, next_log_block);
381
382 /* What kind of buffer is it?
383 *
384 * If it is a descriptor block, check that it has the
385 * expected sequence number. Otherwise, we're all done
386 * here. */
387
388 tmp = (journal_header_t *)bh->b_data;
389
390 if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
391 brelse(bh);
392 break;
393 }
394
395 blocktype = be32_to_cpu(tmp->h_blocktype);
396 sequence = be32_to_cpu(tmp->h_sequence);
397 jbd_debug(3, "Found magic %d, sequence %d\n",
398 blocktype, sequence);
399
400 if (sequence != next_commit_ID) {
401 brelse(bh);
402 break;
403 }
404
405 /* OK, we have a valid descriptor block which matches
406 * all of the sequence number checks. What are we going
407 * to do with it? That depends on the pass... */
408
409 switch(blocktype) {
410 case JFS_DESCRIPTOR_BLOCK:
411 /* If it is a valid descriptor block, replay it
412 * in pass REPLAY; otherwise, just skip over the
413 * blocks it describes. */
414 if (pass != PASS_REPLAY) {
415 next_log_block +=
416 count_tags(bh, journal->j_blocksize);
417 wrap(journal, next_log_block);
418 brelse(bh);
419 continue;
420 }
421
422 /* A descriptor block: we can now write all of
423 * the data blocks. Yay, useful work is finally
424 * getting done here! */
425
426 tagp = &bh->b_data[sizeof(journal_header_t)];
427 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
428 <= journal->j_blocksize) {
429 unsigned long io_block;
430
431 tag = (journal_block_tag_t *) tagp;
432 flags = be32_to_cpu(tag->t_flags);
433
434 io_block = next_log_block++;
435 wrap(journal, next_log_block);
436 err = jread(&obh, journal, io_block);
437 if (err) {
438 /* Recover what we can, but
439 * report failure at the end. */
440 success = err;
441 printk (KERN_ERR
442 "JBD: IO error %d recovering "
443 "block %ld in log\n",
444 err, io_block);
445 } else {
446 unsigned long blocknr;
447
448 J_ASSERT(obh != NULL);
449 blocknr = be32_to_cpu(tag->t_blocknr);
450
451 /* If the block has been
452 * revoked, then we're all done
453 * here. */
454 if (journal_test_revoke
455 (journal, blocknr,
456 next_commit_ID)) {
457 brelse(obh);
458 ++info->nr_revoke_hits;
459 goto skip_write;
460 }
461
462 /* Find a buffer for the new
463 * data being restored */
464 nbh = __getblk(journal->j_fs_dev,
465 blocknr,
466 journal->j_blocksize);
467 if (nbh == NULL) {
468 printk(KERN_ERR
469 "JBD: Out of memory "
470 "during recovery.\n");
471 err = -ENOMEM;
472 brelse(bh);
473 brelse(obh);
474 goto failed;
475 }
476
477 lock_buffer(nbh);
478 memcpy(nbh->b_data, obh->b_data,
479 journal->j_blocksize);
480 if (flags & JFS_FLAG_ESCAPE) {
481 *((__be32 *)bh->b_data) =
482 cpu_to_be32(JFS_MAGIC_NUMBER);
483 }
484
485 BUFFER_TRACE(nbh, "marking dirty");
486 set_buffer_uptodate(nbh);
487 mark_buffer_dirty(nbh);
488 BUFFER_TRACE(nbh, "marking uptodate");
489 ++info->nr_replays;
490 /* ll_rw_block(WRITE, 1, &nbh); */
491 unlock_buffer(nbh);
492 brelse(obh);
493 brelse(nbh);
494 }
495
496 skip_write:
497 tagp += sizeof(journal_block_tag_t);
498 if (!(flags & JFS_FLAG_SAME_UUID))
499 tagp += 16;
500
501 if (flags & JFS_FLAG_LAST_TAG)
502 break;
503 }
504
505 brelse(bh);
506 continue;
507
508 case JFS_COMMIT_BLOCK:
509 /* Found an expected commit block: not much to
510 * do other than move on to the next sequence
511 * number. */
512 brelse(bh);
513 next_commit_ID++;
514 continue;
515
516 case JFS_REVOKE_BLOCK:
517 /* If we aren't in the REVOKE pass, then we can
518 * just skip over this block. */
519 if (pass != PASS_REVOKE) {
520 brelse(bh);
521 continue;
522 }
523
524 err = scan_revoke_records(journal, bh,
525 next_commit_ID, info);
526 brelse(bh);
527 if (err)
528 goto failed;
529 continue;
530
531 default:
532 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
533 blocktype);
534 brelse(bh);
535 goto done;
536 }
537 }
538
539 done:
540 /*
541 * We broke out of the log scan loop: either we came to the
542 * known end of the log or we found an unexpected block in the
543 * log. If the latter happened, then we know that the "current"
544 * transaction marks the end of the valid log.
545 */
546
547 if (pass == PASS_SCAN)
548 info->end_transaction = next_commit_ID;
549 else {
550 /* It's really bad news if different passes end up at
551 * different places (but possible due to IO errors). */
552 if (info->end_transaction != next_commit_ID) {
553 printk (KERN_ERR "JBD: recovery pass %d ended at "
554 "transaction %u, expected %u\n",
555 pass, next_commit_ID, info->end_transaction);
556 if (!success)
557 success = -EIO;
558 }
559 }
560
561 return success;
562
563 failed:
564 return err;
565}
566
567
568/* Scan a revoke record, marking all blocks mentioned as revoked. */
569
570static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
571 tid_t sequence, struct recovery_info *info)
572{
573 journal_revoke_header_t *header;
574 int offset, max;
575
576 header = (journal_revoke_header_t *) bh->b_data;
577 offset = sizeof(journal_revoke_header_t);
578 max = be32_to_cpu(header->r_count);
579
580 while (offset < max) {
581 unsigned long blocknr;
582 int err;
583
584 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
585 offset += 4;
586 err = journal_set_revoke(journal, blocknr, sequence);
587 if (err)
588 return err;
589 ++info->nr_revokes;
590 }
591 return 0;
592}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..c532429d8d9b
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,703 @@
1/*
2 * linux/fs/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * Revoke information on buffers is a tri-state value:
51 *
52 * RevokeValid clear: no cached revoke status, need to look it up
53 * RevokeValid set, Revoked clear:
54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing.
56 * RevokeValid set, Revoked set:
57 * buffer has been revoked.
58 */
59
60#ifndef __KERNEL__
61#include "jfs_user.h"
62#else
63#include <linux/time.h>
64#include <linux/fs.h>
65#include <linux/jbd.h>
66#include <linux/errno.h>
67#include <linux/slab.h>
68#include <linux/list.h>
69#include <linux/smp_lock.h>
70#include <linux/init.h>
71#endif
72
73static kmem_cache_t *revoke_record_cache;
74static kmem_cache_t *revoke_table_cache;
75
/* Each revoke record represents one single revoked block. During
   journal replay, this involves recording the transaction ID of the
   last transaction to revoke this block. */

struct jbd_revoke_record_s
{
	struct list_head hash;	/* link in a revoke table hash chain */
	tid_t sequence;		/* Used for recovery only */
	unsigned long blocknr;	/* the revoked block's number */
};
86
87
/* The revoke table is just a simple hash table of revoke records. */
struct jbd_revoke_table_s
{
	/* It is conceivable that we might want a larger hash table
	 * for recovery. Must be a power of two. */
	int hash_size;	/* number of buckets in hash_table */
	int hash_shift;	/* log2(hash_size), set by journal_init_revoke() */
	struct list_head *hash_table;	/* array of hash_size chain heads */
};
97
98
99#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *,
102 struct jbd_revoke_record_s *);
103static void flush_descriptor(journal_t *, struct journal_head *, int);
104#endif
105
106/* Utility functions to maintain the revoke table */
107
108/* Borrowed from buffer.c: this is a tried and tested block hash function */
109static inline int hash(journal_t *journal, unsigned long block)
110{
111 struct jbd_revoke_table_s *table = journal->j_revoke;
112 int hash_shift = table->hash_shift;
113
114 return ((block << (hash_shift - 6)) ^
115 (block >> 13) ^
116 (block << (hash_shift - 12))) & (table->hash_size - 1);
117}
118
119static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
120 tid_t seq)
121{
122 struct list_head *hash_list;
123 struct jbd_revoke_record_s *record;
124
125repeat:
126 record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
127 if (!record)
128 goto oom;
129
130 record->sequence = seq;
131 record->blocknr = blocknr;
132 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
133 spin_lock(&journal->j_revoke_lock);
134 list_add(&record->hash, hash_list);
135 spin_unlock(&journal->j_revoke_lock);
136 return 0;
137
138oom:
139 if (!journal_oom_retry)
140 return -ENOMEM;
141 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
142 yield();
143 goto repeat;
144}
145
146/* Find a revoke record in the journal's hash table. */
147
148static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
149 unsigned long blocknr)
150{
151 struct list_head *hash_list;
152 struct jbd_revoke_record_s *record;
153
154 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
155
156 spin_lock(&journal->j_revoke_lock);
157 record = (struct jbd_revoke_record_s *) hash_list->next;
158 while (&(record->hash) != hash_list) {
159 if (record->blocknr == blocknr) {
160 spin_unlock(&journal->j_revoke_lock);
161 return record;
162 }
163 record = (struct jbd_revoke_record_s *) record->hash.next;
164 }
165 spin_unlock(&journal->j_revoke_lock);
166 return NULL;
167}
168
169int __init journal_init_revoke_caches(void)
170{
171 revoke_record_cache = kmem_cache_create("revoke_record",
172 sizeof(struct jbd_revoke_record_s),
173 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
174 if (revoke_record_cache == 0)
175 return -ENOMEM;
176
177 revoke_table_cache = kmem_cache_create("revoke_table",
178 sizeof(struct jbd_revoke_table_s),
179 0, 0, NULL, NULL);
180 if (revoke_table_cache == 0) {
181 kmem_cache_destroy(revoke_record_cache);
182 revoke_record_cache = NULL;
183 return -ENOMEM;
184 }
185 return 0;
186}
187
/*
 * Tear down both revoke slab caches created by
 * journal_init_revoke_caches(), resetting the pointers so stale
 * references cannot be used afterwards.
 */
void journal_destroy_revoke_caches(void)
{
	kmem_cache_destroy(revoke_record_cache);
	revoke_record_cache = NULL;
	kmem_cache_destroy(revoke_table_cache);
	revoke_table_cache = NULL;
}
195
196/* Initialise the revoke table for a given journal to a given size. */
197
198int journal_init_revoke(journal_t *journal, int hash_size)
199{
200 int shift, tmp;
201
202 J_ASSERT (journal->j_revoke_table[0] == NULL);
203
204 shift = 0;
205 tmp = hash_size;
206 while((tmp >>= 1UL) != 0UL)
207 shift++;
208
209 journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
210 if (!journal->j_revoke_table[0])
211 return -ENOMEM;
212 journal->j_revoke = journal->j_revoke_table[0];
213
214 /* Check that the hash_size is a power of two */
215 J_ASSERT ((hash_size & (hash_size-1)) == 0);
216
217 journal->j_revoke->hash_size = hash_size;
218
219 journal->j_revoke->hash_shift = shift;
220
221 journal->j_revoke->hash_table =
222 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
223 if (!journal->j_revoke->hash_table) {
224 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
225 journal->j_revoke = NULL;
226 return -ENOMEM;
227 }
228
229 for (tmp = 0; tmp < hash_size; tmp++)
230 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
231
232 journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
233 if (!journal->j_revoke_table[1]) {
234 kfree(journal->j_revoke_table[0]->hash_table);
235 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
236 return -ENOMEM;
237 }
238
239 journal->j_revoke = journal->j_revoke_table[1];
240
241 /* Check that the hash_size is a power of two */
242 J_ASSERT ((hash_size & (hash_size-1)) == 0);
243
244 journal->j_revoke->hash_size = hash_size;
245
246 journal->j_revoke->hash_shift = shift;
247
248 journal->j_revoke->hash_table =
249 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
250 if (!journal->j_revoke->hash_table) {
251 kfree(journal->j_revoke_table[0]->hash_table);
252 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
253 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
254 journal->j_revoke = NULL;
255 return -ENOMEM;
256 }
257
258 for (tmp = 0; tmp < hash_size; tmp++)
259 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
260
261 spin_lock_init(&journal->j_revoke_lock);
262
263 return 0;
264}
265
266/* Destoy a journal's revoke table. The table must already be empty! */
267
268void journal_destroy_revoke(journal_t *journal)
269{
270 struct jbd_revoke_table_s *table;
271 struct list_head *hash_list;
272 int i;
273
274 table = journal->j_revoke_table[0];
275 if (!table)
276 return;
277
278 for (i=0; i<table->hash_size; i++) {
279 hash_list = &table->hash_table[i];
280 J_ASSERT (list_empty(hash_list));
281 }
282
283 kfree(table->hash_table);
284 kmem_cache_free(revoke_table_cache, table);
285 journal->j_revoke = NULL;
286
287 table = journal->j_revoke_table[1];
288 if (!table)
289 return;
290
291 for (i=0; i<table->hash_size; i++) {
292 hash_list = &table->hash_table[i];
293 J_ASSERT (list_empty(hash_list));
294 }
295
296 kfree(table->hash_table);
297 kmem_cache_free(revoke_table_cache, table);
298 journal->j_revoke = NULL;
299}
300
301
302#ifdef __KERNEL__
303
304/*
305 * journal_revoke: revoke a given buffer_head from the journal. This
306 * prevents the block from being replayed during recovery if we take a
307 * crash after this current transaction commits. Any subsequent
308 * metadata writes of the buffer in this transaction cancel the
309 * revoke.
310 *
311 * Note that this call may block --- it is up to the caller to make
312 * sure that there are no further calls to journal_write_metadata
313 * before the revoke is complete. In ext3, this implies calling the
314 * revoke before clearing the block bitmap when we are deleting
315 * metadata.
316 *
317 * Revoke performs a journal_forget on any buffer_head passed in as a
318 * parameter, but does _not_ forget the buffer_head if the bh was only
319 * found implicitly.
320 *
321 * bh_in may not be a journalled buffer - it may have come off
322 * the hash tables without an attached journal_head.
323 *
324 * If bh_in is non-zero, journal_revoke() will decrement its b_count
325 * by one.
326 */
327
328int journal_revoke(handle_t *handle, unsigned long blocknr,
329 struct buffer_head *bh_in)
330{
331 struct buffer_head *bh = NULL;
332 journal_t *journal;
333 struct block_device *bdev;
334 int err;
335
336 might_sleep();
337 if (bh_in)
338 BUFFER_TRACE(bh_in, "enter");
339
340 journal = handle->h_transaction->t_journal;
341 if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
342 J_ASSERT (!"Cannot set revoke feature!");
343 return -EINVAL;
344 }
345
346 bdev = journal->j_fs_dev;
347 bh = bh_in;
348
349 if (!bh) {
350 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
351 if (bh)
352 BUFFER_TRACE(bh, "found on hash");
353 }
354#ifdef JBD_EXPENSIVE_CHECKING
355 else {
356 struct buffer_head *bh2;
357
358 /* If there is a different buffer_head lying around in
359 * memory anywhere... */
360 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
361 if (bh2) {
362 /* ... and it has RevokeValid status... */
363 if (bh2 != bh && buffer_revokevalid(bh2))
364 /* ...then it better be revoked too,
365 * since it's illegal to create a revoke
366 * record against a buffer_head which is
367 * not marked revoked --- that would
368 * risk missing a subsequent revoke
369 * cancel. */
370 J_ASSERT_BH(bh2, buffer_revoked(bh2));
371 put_bh(bh2);
372 }
373 }
374#endif
375
376 /* We really ought not ever to revoke twice in a row without
377 first having the revoke cancelled: it's illegal to free a
378 block twice without allocating it in between! */
379 if (bh) {
380 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
381 "inconsistent data on disk")) {
382 if (!bh_in)
383 brelse(bh);
384 return -EIO;
385 }
386 set_buffer_revoked(bh);
387 set_buffer_revokevalid(bh);
388 if (bh_in) {
389 BUFFER_TRACE(bh_in, "call journal_forget");
390 journal_forget(handle, bh_in);
391 } else {
392 BUFFER_TRACE(bh, "call brelse");
393 __brelse(bh);
394 }
395 }
396
397 jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
398 err = insert_revoke_hash(journal, blocknr,
399 handle->h_transaction->t_tid);
400 BUFFER_TRACE(bh_in, "exit");
401 return err;
402}
403
404/*
405 * Cancel an outstanding revoke. For use only internally by the
406 * journaling code (called from journal_get_write_access).
407 *
408 * We trust buffer_revoked() on the buffer if the buffer is already
409 * being journaled: if there is no revoke pending on the buffer, then we
410 * don't do anything here.
411 *
412 * This would break if it were possible for a buffer to be revoked and
413 * discarded, and then reallocated within the same transaction. In such
414 * a case we would have lost the revoked bit, but when we arrived here
415 * the second time we would still have a pending revoke to cancel. So,
416 * do not trust the Revoked bit on buffers unless RevokeValid is also
417 * set.
418 *
419 * The caller must have the journal locked.
420 */
421int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
422{
423 struct jbd_revoke_record_s *record;
424 journal_t *journal = handle->h_transaction->t_journal;
425 int need_cancel;
426 int did_revoke = 0; /* akpm: debug */
427 struct buffer_head *bh = jh2bh(jh);
428
429 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
430
431 /* Is the existing Revoke bit valid? If so, we trust it, and
432 * only perform the full cancel if the revoke bit is set. If
433 * not, we can't trust the revoke bit, and we need to do the
434 * full search for a revoke record. */
435 if (test_set_buffer_revokevalid(bh)) {
436 need_cancel = test_clear_buffer_revoked(bh);
437 } else {
438 need_cancel = 1;
439 clear_buffer_revoked(bh);
440 }
441
442 if (need_cancel) {
443 record = find_revoke_record(journal, bh->b_blocknr);
444 if (record) {
445 jbd_debug(4, "cancelled existing revoke on "
446 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
447 spin_lock(&journal->j_revoke_lock);
448 list_del(&record->hash);
449 spin_unlock(&journal->j_revoke_lock);
450 kmem_cache_free(revoke_record_cache, record);
451 did_revoke = 1;
452 }
453 }
454
455#ifdef JBD_EXPENSIVE_CHECKING
456 /* There better not be one left behind by now! */
457 record = find_revoke_record(journal, bh->b_blocknr);
458 J_ASSERT_JH(jh, record == NULL);
459#endif
460
461 /* Finally, have we just cleared revoke on an unhashed
462 * buffer_head? If so, we'd better make sure we clear the
463 * revoked status on any hashed alias too, otherwise the revoke
464 * state machine will get very upset later on. */
465 if (need_cancel) {
466 struct buffer_head *bh2;
467 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
468 if (bh2) {
469 if (bh2 != bh)
470 clear_buffer_revoked(bh2);
471 __brelse(bh2);
472 }
473 }
474 return did_revoke;
475}
476
477/* journal_switch_revoke table select j_revoke for next transaction
478 * we do not want to suspend any processing until all revokes are
479 * written -bzzz
480 */
481void journal_switch_revoke_table(journal_t *journal)
482{
483 int i;
484
485 if (journal->j_revoke == journal->j_revoke_table[0])
486 journal->j_revoke = journal->j_revoke_table[1];
487 else
488 journal->j_revoke = journal->j_revoke_table[0];
489
490 for (i = 0; i < journal->j_revoke->hash_size; i++)
491 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
492}
493
494/*
495 * Write revoke records to the journal for all entries in the current
496 * revoke hash, deleting the entries as we go.
497 *
498 * Called with the journal lock held.
499 */
500
501void journal_write_revoke_records(journal_t *journal,
502 transaction_t *transaction)
503{
504 struct journal_head *descriptor;
505 struct jbd_revoke_record_s *record;
506 struct jbd_revoke_table_s *revoke;
507 struct list_head *hash_list;
508 int i, offset, count;
509
510 descriptor = NULL;
511 offset = 0;
512 count = 0;
513
514 /* select revoke table for committing transaction */
515 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
516 journal->j_revoke_table[1] : journal->j_revoke_table[0];
517
518 for (i = 0; i < revoke->hash_size; i++) {
519 hash_list = &revoke->hash_table[i];
520
521 while (!list_empty(hash_list)) {
522 record = (struct jbd_revoke_record_s *)
523 hash_list->next;
524 write_one_revoke_record(journal, transaction,
525 &descriptor, &offset,
526 record);
527 count++;
528 list_del(&record->hash);
529 kmem_cache_free(revoke_record_cache, record);
530 }
531 }
532 if (descriptor)
533 flush_descriptor(journal, descriptor, offset);
534 jbd_debug(1, "Wrote %d revoke records\n", count);
535}
536
537/*
538 * Write out one revoke record. We need to create a new descriptor
539 * block if the old one is full or if we have not already created one.
540 */
541
542static void write_one_revoke_record(journal_t *journal,
543 transaction_t *transaction,
544 struct journal_head **descriptorp,
545 int *offsetp,
546 struct jbd_revoke_record_s *record)
547{
548 struct journal_head *descriptor;
549 int offset;
550 journal_header_t *header;
551
552 /* If we are already aborting, this all becomes a noop. We
553 still need to go round the loop in
554 journal_write_revoke_records in order to free all of the
555 revoke records: only the IO to the journal is omitted. */
556 if (is_journal_aborted(journal))
557 return;
558
559 descriptor = *descriptorp;
560 offset = *offsetp;
561
562 /* Make sure we have a descriptor with space left for the record */
563 if (descriptor) {
564 if (offset == journal->j_blocksize) {
565 flush_descriptor(journal, descriptor, offset);
566 descriptor = NULL;
567 }
568 }
569
570 if (!descriptor) {
571 descriptor = journal_get_descriptor_buffer(journal);
572 if (!descriptor)
573 return;
574 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
575 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
576 header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
577 header->h_sequence = cpu_to_be32(transaction->t_tid);
578
579 /* Record it so that we can wait for IO completion later */
580 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
581 journal_file_buffer(descriptor, transaction, BJ_LogCtl);
582
583 offset = sizeof(journal_revoke_header_t);
584 *descriptorp = descriptor;
585 }
586
587 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
588 cpu_to_be32(record->blocknr);
589 offset += 4;
590 *offsetp = offset;
591}
592
593/*
594 * Flush a revoke descriptor out to the journal. If we are aborting,
595 * this is a noop; otherwise we are generating a buffer which needs to
596 * be waited for during commit, so it has to go onto the appropriate
597 * journal buffer list.
598 */
599
600static void flush_descriptor(journal_t *journal,
601 struct journal_head *descriptor,
602 int offset)
603{
604 journal_revoke_header_t *header;
605 struct buffer_head *bh = jh2bh(descriptor);
606
607 if (is_journal_aborted(journal)) {
608 put_bh(bh);
609 return;
610 }
611
612 header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
613 header->r_count = cpu_to_be32(offset);
614 set_buffer_jwrite(bh);
615 BUFFER_TRACE(bh, "write");
616 set_buffer_dirty(bh);
617 ll_rw_block(SWRITE, 1, &bh);
618}
619#endif
620
621/*
622 * Revoke support for recovery.
623 *
624 * Recovery needs to be able to:
625 *
626 * record all revoke records, including the tid of the latest instance
627 * of each revoke in the journal
628 *
629 * check whether a given block in a given transaction should be replayed
630 * (ie. has not been revoked by a revoke record in that or a subsequent
631 * transaction)
632 *
633 * empty the revoke table after recovery.
634 */
635
636/*
637 * First, setting revoke records. We create a new revoke record for
638 * every block ever revoked in the log as we scan it for recovery, and
639 * we update the existing records if we find multiple revokes for a
640 * single block.
641 */
642
643int journal_set_revoke(journal_t *journal,
644 unsigned long blocknr,
645 tid_t sequence)
646{
647 struct jbd_revoke_record_s *record;
648
649 record = find_revoke_record(journal, blocknr);
650 if (record) {
651 /* If we have multiple occurrences, only record the
652 * latest sequence number in the hashed record */
653 if (tid_gt(sequence, record->sequence))
654 record->sequence = sequence;
655 return 0;
656 }
657 return insert_revoke_hash(journal, blocknr, sequence);
658}
659
660/*
661 * Test revoke records. For a given block referenced in the log, has
662 * that block been revoked? A revoke record with a given transaction
663 * sequence number revokes all blocks in that transaction and earlier
664 * ones, but later transactions still need replayed.
665 */
666
667int journal_test_revoke(journal_t *journal,
668 unsigned long blocknr,
669 tid_t sequence)
670{
671 struct jbd_revoke_record_s *record;
672
673 record = find_revoke_record(journal, blocknr);
674 if (!record)
675 return 0;
676 if (tid_gt(sequence, record->sequence))
677 return 0;
678 return 1;
679}
680
681/*
682 * Finally, once recovery is over, we need to clear the revoke table so
683 * that it can be reused by the running filesystem.
684 */
685
686void journal_clear_revoke(journal_t *journal)
687{
688 int i;
689 struct list_head *hash_list;
690 struct jbd_revoke_record_s *record;
691 struct jbd_revoke_table_s *revoke;
692
693 revoke = journal->j_revoke;
694
695 for (i = 0; i < revoke->hash_size; i++) {
696 hash_list = &revoke->hash_table[i];
697 while (!list_empty(hash_list)) {
698 record = (struct jbd_revoke_record_s*) hash_list->next;
699 list_del(&record->hash);
700 kmem_cache_free(revoke_record_cache, record);
701 }
702 }
703}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..e1b3c8af4d17
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2080 @@
1/*
2 * linux/fs/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/smp_lock.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29
30/*
31 * get_transaction: obtain a new transaction_t object.
32 *
33 * Simply allocate and initialise a new transaction. Create it in
34 * RUNNING state and add it to the current journal (which should not
35 * have an existing running transaction: we only make a new transaction
36 * once we have started to commit the old one).
37 *
38 * Preconditions:
39 * The journal MUST be locked. We don't perform atomic mallocs on the
40 * new transaction and we can't block without protecting against other
41 * processes trying to touch the journal while it is in transition.
42 *
43 * Called under j_state_lock
44 */
45
46static transaction_t *
47get_transaction(journal_t *journal, transaction_t *transaction)
48{
49 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING;
51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock);
54
55 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer.expires = transaction->t_expires;
57 add_timer(&journal->j_commit_timer);
58
59 J_ASSERT(journal->j_running_transaction == NULL);
60 journal->j_running_transaction = transaction;
61
62 return transaction;
63}
64
65/*
66 * Handle management.
67 *
68 * A handle_t is an object which represents a single atomic update to a
69 * filesystem, and which tracks all of the modifications which form part
70 * of that one update.
71 */
72
73/*
74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits.
78 */
79
80static int start_this_handle(journal_t *journal, handle_t *handle)
81{
82 transaction_t *transaction;
83 int needed;
84 int nblocks = handle->h_buffer_credits;
85 transaction_t *new_transaction = NULL;
86 int ret = 0;
87
88 if (nblocks > journal->j_max_transaction_buffers) {
89 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
90 current->comm, nblocks,
91 journal->j_max_transaction_buffers);
92 ret = -ENOSPC;
93 goto out;
94 }
95
96alloc_transaction:
97 if (!journal->j_running_transaction) {
98 new_transaction = jbd_kmalloc(sizeof(*new_transaction),
99 GFP_NOFS);
100 if (!new_transaction) {
101 ret = -ENOMEM;
102 goto out;
103 }
104 memset(new_transaction, 0, sizeof(*new_transaction));
105 }
106
107 jbd_debug(3, "New handle %p going live.\n", handle);
108
109repeat:
110
111 /*
112 * We need to hold j_state_lock until t_updates has been incremented,
113 * for proper journal barrier handling
114 */
115 spin_lock(&journal->j_state_lock);
116repeat_locked:
117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS;
121 goto out;
122 }
123
124 /* Wait on the journal's transaction barrier if necessary */
125 if (journal->j_barrier_count) {
126 spin_unlock(&journal->j_state_lock);
127 wait_event(journal->j_wait_transaction_locked,
128 journal->j_barrier_count == 0);
129 goto repeat;
130 }
131
132 if (!journal->j_running_transaction) {
133 if (!new_transaction) {
134 spin_unlock(&journal->j_state_lock);
135 goto alloc_transaction;
136 }
137 get_transaction(journal, new_transaction);
138 new_transaction = NULL;
139 }
140
141 transaction = journal->j_running_transaction;
142
143 /*
144 * If the current transaction is locked down for commit, wait for the
145 * lock to be released.
146 */
147 if (transaction->t_state == T_LOCKED) {
148 DEFINE_WAIT(wait);
149
150 prepare_to_wait(&journal->j_wait_transaction_locked,
151 &wait, TASK_UNINTERRUPTIBLE);
152 spin_unlock(&journal->j_state_lock);
153 schedule();
154 finish_wait(&journal->j_wait_transaction_locked, &wait);
155 goto repeat;
156 }
157
158 /*
159 * If there is not enough space left in the log to write all potential
160 * buffers requested by this operation, we need to stall pending a log
161 * checkpoint to free some more log space.
162 */
163 spin_lock(&transaction->t_handle_lock);
164 needed = transaction->t_outstanding_credits + nblocks;
165
166 if (needed > journal->j_max_transaction_buffers) {
167 /*
168 * If the current transaction is already too large, then start
169 * to commit it: we can then go back and attach this handle to
170 * a new transaction.
171 */
172 DEFINE_WAIT(wait);
173
174 jbd_debug(2, "Handle %p starting new commit...\n", handle);
175 spin_unlock(&transaction->t_handle_lock);
176 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
177 TASK_UNINTERRUPTIBLE);
178 __log_start_commit(journal, transaction->t_tid);
179 spin_unlock(&journal->j_state_lock);
180 schedule();
181 finish_wait(&journal->j_wait_transaction_locked, &wait);
182 goto repeat;
183 }
184
185 /*
186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also
189 * associated with a committing transaction creates a deadlock,
190 * so commit simply cannot force through checkpoints.
191 *
192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction.
195 *
196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for
198 * those buffers when checkpointing.
199 */
200
201 /*
202 * @@@ AKPM: This seems rather over-defensive. We're giving commit
203 * a _lot_ of headroom: 1/4 of the journal plus the size of
204 * the committing transaction. Really, we only need to give it
205 * committing_transaction->t_outstanding_credits plus "enough" for
206 * the log control blocks.
207 * Also, this test is inconsitent with the matching one in
208 * journal_extend().
209 */
210 if (__log_space_left(journal) < jbd_space_needed(journal)) {
211 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
212 spin_unlock(&transaction->t_handle_lock);
213 __log_wait_for_space(journal);
214 goto repeat_locked;
215 }
216
217 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */
219
220 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++;
223 transaction->t_handle_count++;
224 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
225 handle, nblocks, transaction->t_outstanding_credits,
226 __log_space_left(journal));
227 spin_unlock(&transaction->t_handle_lock);
228 spin_unlock(&journal->j_state_lock);
229out:
230 if (unlikely(new_transaction)) /* It's usually NULL */
231 kfree(new_transaction);
232 return ret;
233}
234
235/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks)
237{
238 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
239 if (!handle)
240 return NULL;
241 memset(handle, 0, sizeof(*handle));
242 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1;
244
245 return handle;
246}
247
248/**
249 * handle_t *journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffer we might modify
252 *
253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee
255 * that much space.
256 *
257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked.
259 *
260 * Return a pointer to a newly allocated handle, or NULL on failure
261 */
262handle_t *journal_start(journal_t *journal, int nblocks)
263{
264 handle_t *handle = journal_current_handle();
265 int err;
266
267 if (!journal)
268 return ERR_PTR(-EROFS);
269
270 if (handle) {
271 J_ASSERT(handle->h_transaction->t_journal == journal);
272 handle->h_ref++;
273 return handle;
274 }
275
276 handle = new_handle(nblocks);
277 if (!handle)
278 return ERR_PTR(-ENOMEM);
279
280 current->journal_info = handle;
281
282 err = start_this_handle(journal, handle);
283 if (err < 0) {
284 jbd_free_handle(handle);
285 current->journal_info = NULL;
286 handle = ERR_PTR(err);
287 }
288 return handle;
289}
290
291/**
292 * int journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by.
295 *
296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modications in advance, but can
299 * extend its credit if it needs more.
300 *
301 * journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee that allocation - this is a best-effort only.
303 * The calling process MUST be able to deal cleanly with a failure to
304 * extend here.
305 *
306 * Return 0 on success, non-zero on failure.
307 *
308 * return code < 0 implies an error
309 * return code > 0 implies normal transaction-full status.
310 */
311int journal_extend(handle_t *handle, int nblocks)
312{
313 transaction_t *transaction = handle->h_transaction;
314 journal_t *journal = transaction->t_journal;
315 int result;
316 int wanted;
317
318 result = -EIO;
319 if (is_handle_aborted(handle))
320 goto out;
321
322 result = 1;
323
324 spin_lock(&journal->j_state_lock);
325
326 /* Don't extend a locked-down transaction! */
327 if (handle->h_transaction->t_state != T_RUNNING) {
328 jbd_debug(3, "denied handle %p %d blocks: "
329 "transaction not running\n", handle, nblocks);
330 goto error_out;
331 }
332
333 spin_lock(&transaction->t_handle_lock);
334 wanted = transaction->t_outstanding_credits + nblocks;
335
336 if (wanted > journal->j_max_transaction_buffers) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction too large\n", handle, nblocks);
339 goto unlock;
340 }
341
342 if (wanted > __log_space_left(journal)) {
343 jbd_debug(3, "denied handle %p %d blocks: "
344 "insufficient log space\n", handle, nblocks);
345 goto unlock;
346 }
347
348 handle->h_buffer_credits += nblocks;
349 transaction->t_outstanding_credits += nblocks;
350 result = 0;
351
352 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
353unlock:
354 spin_unlock(&transaction->t_handle_lock);
355error_out:
356 spin_unlock(&journal->j_state_lock);
357out:
358 return result;
359}
360
361
362/**
363 * int journal_restart() - restart a handle .
364 * @handle: handle to restart
365 * @nblocks: nr credits requested
366 *
367 * Restart a handle for a multi-transaction filesystem
368 * operation.
369 *
370 * If the journal_extend() call above fails to grant new buffer credits
371 * to a running handle, a call to journal_restart will commit the
372 * handle's transaction so far and reattach the handle to a new
373 * transaction capabable of guaranteeing the requested number of
374 * credits.
375 */
376
377int journal_restart(handle_t *handle, int nblocks)
378{
379 transaction_t *transaction = handle->h_transaction;
380 journal_t *journal = transaction->t_journal;
381 int ret;
382
383 /* If we've had an abort of any type, don't even think about
384 * actually doing the restart! */
385 if (is_handle_aborted(handle))
386 return 0;
387
388 /*
389 * First unlink the handle from its current transaction, and start the
390 * commit on that.
391 */
392 J_ASSERT(transaction->t_updates > 0);
393 J_ASSERT(journal_current_handle() == handle);
394
395 spin_lock(&journal->j_state_lock);
396 spin_lock(&transaction->t_handle_lock);
397 transaction->t_outstanding_credits -= handle->h_buffer_credits;
398 transaction->t_updates--;
399
400 if (!transaction->t_updates)
401 wake_up(&journal->j_wait_updates);
402 spin_unlock(&transaction->t_handle_lock);
403
404 jbd_debug(2, "restarting handle %p\n", handle);
405 __log_start_commit(journal, transaction->t_tid);
406 spin_unlock(&journal->j_state_lock);
407
408 handle->h_buffer_credits = nblocks;
409 ret = start_this_handle(journal, handle);
410 return ret;
411}
412
413
/**
 * void journal_lock_updates () - establish a transaction barrier.
 * @journal: Journal to establish a barrier on.
 *
 * This locks out any further updates from being started, and blocks
 * until all existing updates have completed, returning only once the
 * journal is in a quiescent state with no updates running.
 *
 * The journal lock should not be held on entry.
 */
void journal_lock_updates(journal_t *journal)
{
	DEFINE_WAIT(wait);

	spin_lock(&journal->j_state_lock);
	/* Raising the barrier count stops start_this_handle() from
	 * admitting new updates while we drain the existing ones. */
	++journal->j_barrier_count;

	/* Wait until there are no running updates */
	while (1) {
		transaction_t *transaction = journal->j_running_transaction;

		if (!transaction)
			break;

		spin_lock(&transaction->t_handle_lock);
		if (!transaction->t_updates) {
			spin_unlock(&transaction->t_handle_lock);
			break;
		}
		/* NOTE: prepare_to_wait() must happen BEFORE we drop the
		 * locks, so a wake_up from the last completing update
		 * cannot be lost between the test above and schedule(). */
		prepare_to_wait(&journal->j_wait_updates, &wait,
				TASK_UNINTERRUPTIBLE);
		spin_unlock(&transaction->t_handle_lock);
		spin_unlock(&journal->j_state_lock);
		schedule();
		finish_wait(&journal->j_wait_updates, &wait);
		spin_lock(&journal->j_state_lock);
	}
	spin_unlock(&journal->j_state_lock);

	/*
	 * We have now established a barrier against other normal updates, but
	 * we also need to barrier against other journal_lock_updates() calls
	 * to make sure that we serialise special journal-locked operations
	 * too.
	 */
	mutex_lock(&journal->j_barrier);
}
461
/**
 * void journal_unlock_updates (journal_t* journal) - release barrier
 * @journal: Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with journal_lock_updates().
 *
 * Should be called without the journal lock held.
 */
void journal_unlock_updates (journal_t *journal)
{
	J_ASSERT(journal->j_barrier_count != 0);

	/* Release the serialisation mutex first, then drop our barrier
	 * count under j_state_lock and wake anyone blocked in
	 * start_this_handle() waiting for the barrier to clear. */
	mutex_unlock(&journal->j_barrier);
	spin_lock(&journal->j_state_lock);
	--journal->j_barrier_count;
	spin_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_transaction_locked);
}
480
481/*
482 * Report any unexpected dirty buffers which turn up. Normally those
483 * indicate an error, but they can occur if the user is running (say)
484 * tune2fs to modify the live filesystem, so we need the option of
485 * continuing as gracefully as possible. #
486 *
487 * The caller should already hold the journal lock and
488 * j_list_lock spinlock: most callers will need those anyway
489 * in order to probe the buffer's journaling state safely.
490 */
491static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
492{
493 int jlist;
494
495 /* If this buffer is one which might reasonably be dirty
496 * --- ie. data, or not part of this journal --- then
497 * we're OK to leave it alone, but otherwise we need to
498 * move the dirty bit to the journal's own internal
499 * JBDDirty bit. */
500 jlist = jh->b_jlist;
501
502 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
503 jlist == BJ_Shadow || jlist == BJ_Forget) {
504 struct buffer_head *bh = jh2bh(jh);
505
506 if (test_clear_buffer_dirty(bh))
507 set_buffer_jbddirty(bh);
508 }
509}
510
/*
 * do_get_write_access: common worker for journal_get_write_access()
 * and journal_get_undo_access().
 *
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 * @handle: active handle whose transaction wants to modify the buffer
 * @jh: journal_head of the buffer (caller holds a reference on it)
 * @force_copy: if non-zero, always create a frozen_data copy-out even
 *	when the committing transaction has the buffer on BJ_Forget;
 *	needed by journal_get_undo_access().
 *
 * Returns 0 on success, -EROFS if the handle is aborted, or -ENOMEM if
 * a needed copy-out buffer could not be allocated.
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
			int force_copy)
{
	struct buffer_head *bh;
	transaction_t *transaction;
	journal_t *journal;
	int error;
	char *frozen_buffer = NULL;
	int need_copy = 0;

	if (is_handle_aborted(handle))
		return -EROFS;

	transaction = handle->h_transaction;
	journal = transaction->t_journal;

	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);

	JBUFFER_TRACE(jh, "entry");
repeat:
	/* We restart here whenever we dropped the bh state lock (to
	 * sleep on a shadow buffer or to allocate), since the buffer's
	 * journaling state may have changed meanwhile. */
	bh = jh2bh(jh);

	/* @@@ Need to check for errors here at some point. */

	lock_buffer(bh);
	jbd_lock_bh_state(bh);

	/* We now hold the buffer lock so it is safe to query the buffer
	 * state.  Is the buffer dirty?
	 *
	 * If so, there are two possibilities.  The buffer may be
	 * non-journaled, and undergoing a quite legitimate writeback.
	 * Otherwise, it is journaled, and we don't expect dirty buffers
	 * in that state (the buffers should be marked JBD_Dirty
	 * instead.)  So either the IO is being done under our own
	 * control and this is a bug, or it's a third party IO such as
	 * dump(8) (which may leave the buffer scheduled for read ---
	 * ie. locked but not dirty) or tune2fs (which may actually have
	 * the buffer dirtied, ugh.)  */

	if (buffer_dirty(bh)) {
		/*
		 * First question: is this buffer already part of the current
		 * transaction or the existing committing transaction?
		 */
		if (jh->b_transaction) {
			J_ASSERT_JH(jh,
				jh->b_transaction == transaction ||
				jh->b_transaction ==
					journal->j_committing_transaction);
			if (jh->b_next_transaction)
				J_ASSERT_JH(jh, jh->b_next_transaction ==
							transaction);
		}
		/*
		 * In any case we need to clean the dirty flag and we must
		 * do it under the buffer lock to be sure we don't race
		 * with running write-out.
		 */
		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
		jbd_unexpected_dirty_buffer(jh);
	}

	unlock_buffer(bh);

	error = -EROFS;
	if (is_handle_aborted(handle)) {
		jbd_unlock_bh_state(bh);
		goto out;
	}
	error = 0;

	/*
	 * The buffer is already part of this transaction if b_transaction or
	 * b_next_transaction points to it
	 */
	if (jh->b_transaction == transaction ||
	    jh->b_next_transaction == transaction)
		goto done;

	/*
	 * If there is already a copy-out version of this buffer, then we don't
	 * need to make another one
	 */
	if (jh->b_frozen_data) {
		JBUFFER_TRACE(jh, "has frozen data");
		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
		jh->b_next_transaction = transaction;
		goto done;
	}

	/* Is there data here we need to preserve? */

	if (jh->b_transaction && jh->b_transaction != transaction) {
		JBUFFER_TRACE(jh, "owned by older transaction");
		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
		J_ASSERT_JH(jh, jh->b_transaction ==
					journal->j_committing_transaction);

		/* There is one case we have to be very careful about.
		 * If the committing transaction is currently writing
		 * this buffer out to disk and has NOT made a copy-out,
		 * then we cannot modify the buffer contents at all
		 * right now.  The essence of copy-out is that it is the
		 * extra copy, not the primary copy, which gets
		 * journaled.  If the primary copy is already going to
		 * disk then we cannot do copy-out here. */

		if (jh->b_jlist == BJ_Shadow) {
			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
			wait_queue_head_t *wqh;

			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);

			JBUFFER_TRACE(jh, "on shadow: sleep");
			jbd_unlock_bh_state(bh);
			/* commit wakes up all shadow buffers after IO */
			for ( ; ; ) {
				prepare_to_wait(wqh, &wait.wait,
						TASK_UNINTERRUPTIBLE);
				if (jh->b_jlist != BJ_Shadow)
					break;
				schedule();
			}
			finish_wait(wqh, &wait.wait);
			/* State lock was dropped: re-evaluate from scratch */
			goto repeat;
		}

		/* Only do the copy if the currently-owning transaction
		 * still needs it.  If it is on the Forget list, the
		 * committing transaction is past that stage.  The
		 * buffer had better remain locked during the kmalloc,
		 * but that should be true --- we hold the journal lock
		 * still and the buffer is already on the BUF_JOURNAL
		 * list so won't be flushed.
		 *
		 * Subtle point, though: if this is a get_undo_access,
		 * then we will be relying on the frozen_data to contain
		 * the new value of the committed_data record after the
		 * transaction, so we HAVE to force the frozen_data copy
		 * in that case. */

		if (jh->b_jlist != BJ_Forget || force_copy) {
			JBUFFER_TRACE(jh, "generate frozen data");
			if (!frozen_buffer) {
				JBUFFER_TRACE(jh, "allocate memory for buffer");
				/* Must drop the state lock to allocate;
				 * jump back to repeat afterwards since
				 * anything may have changed. */
				jbd_unlock_bh_state(bh);
				frozen_buffer =
					jbd_slab_alloc(jh2bh(jh)->b_size,
							      GFP_NOFS);
				if (!frozen_buffer) {
					printk(KERN_EMERG
					       "%s: OOM for frozen_buffer\n",
					       __FUNCTION__);
					JBUFFER_TRACE(jh, "oom!");
					error = -ENOMEM;
					jbd_lock_bh_state(bh);
					goto done;
				}
				goto repeat;
			}
			jh->b_frozen_data = frozen_buffer;
			frozen_buffer = NULL;
			need_copy = 1;
		}
		jh->b_next_transaction = transaction;
	}


	/*
	 * Finally, if the buffer is not journaled right now, we need to make
	 * sure it doesn't get written to disk before the caller actually
	 * commits the new data
	 */
	if (!jh->b_transaction) {
		JBUFFER_TRACE(jh, "no transaction");
		J_ASSERT_JH(jh, !jh->b_next_transaction);
		jh->b_transaction = transaction;
		JBUFFER_TRACE(jh, "file as BJ_Reserved");
		spin_lock(&journal->j_list_lock);
		__journal_file_buffer(jh, transaction, BJ_Reserved);
		spin_unlock(&journal->j_list_lock);
	}

done:
	if (need_copy) {
		struct page *page;
		int offset;
		char *source;

		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
			    "Possible IO failure.\n");
		page = jh2bh(jh)->b_page;
		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
		source = kmap_atomic(page, KM_USER0);
		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
		kunmap_atomic(source, KM_USER0);
	}
	jbd_unlock_bh_state(bh);

	/*
	 * If we are about to journal a buffer, then any revoke pending on it is
	 * no longer valid
	 */
	journal_cancel_revoke(handle, jh);

out:
	/* A frozen buffer allocated but not consumed (racing allocator
	 * lost to another path) must be released here. */
	if (unlikely(frozen_buffer))	/* It's usually NULL */
		jbd_slab_free(frozen_buffer, bh->b_size);

	JBUFFER_TRACE(jh, "exit");
	return error;
}
735
/**
 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh: bh to be used for metadata writes
 *
 * Returns an error code or 0 on success.
 *
 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 * because we're write()ing a buffer which is also part of a shared mapping.
 */

int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
	/* Take a temporary journal_head ref so jh cannot vanish while
	 * do_get_write_access() sleeps. */
	struct journal_head *jh = journal_add_journal_head(bh);
	int rc;

	/* We do not want to get caught playing with fields which the
	 * log thread also manipulates.  Make sure that the buffer
	 * completes any outstanding IO before proceeding. */
	rc = do_get_write_access(handle, jh, 0);
	journal_put_journal_head(jh);
	return rc;
}
760
761
/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand. */

/**
 * int journal_get_create_access () - notify intent to use newly created bh
 * @handle: transaction to new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 *
 * Returns 0 on success, or -EROFS if the handle has been aborted.
 */
int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;
	struct journal_head *jh = journal_add_journal_head(bh);
	int err;

	jbd_debug(5, "journal_head %p\n", jh);
	err = -EROFS;
	if (is_handle_aborted(handle))
		goto out;
	err = 0;

	JBUFFER_TRACE(jh, "entry");
	/*
	 * The buffer may already belong to this transaction due to pre-zeroing
	 * in the filesystem's new_block code.  It may also be on the previous,
	 * committing transaction's lists, but it HAS to be in Forget state in
	 * that case: the transaction must have deleted the buffer for it to be
	 * reused here.
	 */
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);
	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
		jh->b_transaction == NULL ||
		(jh->b_transaction == journal->j_committing_transaction &&
			  jh->b_jlist == BJ_Forget)));

	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));

	if (jh->b_transaction == NULL) {
		jh->b_transaction = transaction;
		JBUFFER_TRACE(jh, "file as BJ_Reserved");
		__journal_file_buffer(jh, transaction, BJ_Reserved);
	} else if (jh->b_transaction == journal->j_committing_transaction) {
		/* Leave it on the committing transaction's list; commit
		 * will refile it to us when it finishes. */
		JBUFFER_TRACE(jh, "set next transaction");
		jh->b_next_transaction = transaction;
	}
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);

	/*
	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
	 * blocks which contain freed but then revoked metadata.  We need
	 * to cancel the revoke in case we end up freeing it yet again
	 * and the reallocating as data - this would cause a second revoke,
	 * which hits an assertion error.
	 */
	JBUFFER_TRACE(jh, "cancelling revoke");
	journal_cancel_revoke(handle, jh);
	journal_put_journal_head(jh);
out:
	return err;
}
836
/**
 * int journal_get_undo_access() - Notify intent to modify metadata with
 * non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
 * this for freeing and allocating space, we have to make sure that we
 * do not reuse freed space until the deallocation has been committed,
 * since if we overwrote that space we would make the delete
 * un-rewindable in case of a crash.
 *
 * To deal with that, journal_get_undo_access requests write access to a
 * buffer for parts of non-rewindable operations such as delete
 * operations on the bitmaps.  The journaling code must keep a copy of
 * the buffer's contents prior to the undo_access call until such time
 * as we know that the buffer has definitely been committed to disk.
 *
 * We never need to know which transaction the committed data is part
 * of, buffers touched here are guaranteed to be dirtied later and so
 * will be committed to a new transaction in due course, at which point
 * we can discard the old committed data pointer.
 *
 * Returns error number or 0 on success.
 */
int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
	int err;
	struct journal_head *jh = journal_add_journal_head(bh);
	char *committed_data = NULL;

	JBUFFER_TRACE(jh, "entry");

	/*
	 * Do this first --- it can drop the journal lock, so we want to
	 * make sure that obtaining the committed_data is done
	 * atomically wrt. completion of any outstanding commits.
	 * force_copy=1: the frozen_data must hold the post-transaction
	 * value of the committed_data record (see do_get_write_access).
	 */
	err = do_get_write_access(handle, jh, 1);
	if (err)
		goto out;

repeat:
	/* Allocate outside the state lock; re-check under the lock
	 * since another task may have installed b_committed_data while
	 * we slept in the allocator. */
	if (!jh->b_committed_data) {
		committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
		if (!committed_data) {
			printk(KERN_EMERG "%s: No memory for committed data\n",
				__FUNCTION__);
			err = -ENOMEM;
			goto out;
		}
	}

	jbd_lock_bh_state(bh);
	if (!jh->b_committed_data) {
		/* Copy out the current buffer contents into the
		 * preserved, committed copy. */
		JBUFFER_TRACE(jh, "generate b_committed data");
		if (!committed_data) {
			jbd_unlock_bh_state(bh);
			goto repeat;
		}

		jh->b_committed_data = committed_data;
		committed_data = NULL;
		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
	}
	jbd_unlock_bh_state(bh);
out:
	journal_put_journal_head(jh);
	/* Free an allocation that lost the race to install itself */
	if (unlikely(committed_data))
		jbd_slab_free(committed_data, bh->b_size);
	return err;
}
913
/**
 * int journal_dirty_data() - mark a buffer as containing dirty data which
 *                            needs to be flushed before we can commit the
 *                            current transaction.
 * @handle: transaction
 * @bh: bufferhead to mark
 *
 * The buffer is placed on the transaction's data list and is marked as
 * belonging to the transaction.
 *
 * Returns error number or 0 on success.
 *
 * journal_dirty_data() can be called via page_launder->ext3_writepage
 * by kswapd.
 */
int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
	journal_t *journal = handle->h_transaction->t_journal;
	int need_brelse = 0;
	struct journal_head *jh;

	/* Best-effort: a caller cleaning up after an abort gets a
	 * quiet success rather than an error cascade. */
	if (is_handle_aborted(handle))
		return 0;

	jh = journal_add_journal_head(bh);
	JBUFFER_TRACE(jh, "entry");

	/*
	 * The buffer could *already* be dirty.  Writeout can start
	 * at any time.
	 */
	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);

	/*
	 * What if the buffer is already part of a running transaction?
	 *
	 * There are two cases:
	 * 1) It is part of the current running transaction.  Refile it,
	 *    just in case we have allocated it as metadata, deallocated
	 *    it, then reallocated it as data.
	 * 2) It is part of the previous, still-committing transaction.
	 *    If all we want to do is to guarantee that the buffer will be
	 *    written to disk before this new transaction commits, then
	 *    being sure that the *previous* transaction has this same
	 *    property is sufficient for us!  Just leave it on its old
	 *    transaction.
	 *
	 * In case (2), the buffer must not already exist as metadata
	 * --- that would violate write ordering (a transaction is free
	 * to write its data at any point, even before the previous
	 * committing transaction has committed).  The caller must
	 * never, ever allow this to happen: there's nothing we can do
	 * about it in this layer.
	 */
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);
	if (jh->b_transaction) {
		JBUFFER_TRACE(jh, "has transaction");
		if (jh->b_transaction != handle->h_transaction) {
			JBUFFER_TRACE(jh, "belongs to older transaction");
			J_ASSERT_JH(jh, jh->b_transaction ==
					journal->j_committing_transaction);

			/* @@@ IS THIS TRUE  ? */
			/*
			 * Not any more.  Scenario: someone does a write()
			 * in data=journal mode.  The buffer's transaction has
			 * moved into commit.  Then someone does another
			 * write() to the file.  We do the frozen data copyout
			 * and set b_next_transaction to point to j_running_t.
			 * And while we're in that state, someone does a
			 * writepage() in an attempt to pageout the same area
			 * of the file via a shared mapping.  At present that
			 * calls journal_dirty_data(), and we get right here.
			 * It may be too late to journal the data.  Simply
			 * falling through to the next test will suffice: the
			 * data will be dirty and wil be checkpointed.  The
			 * ordering comments in the next comment block still
			 * apply.
			 */
			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);

			/*
			 * If we're journalling data, and this buffer was
			 * subject to a write(), it could be metadata, forget
			 * or shadow against the committing transaction.  Now,
			 * someone has dirtied the same darn page via a mapping
			 * and it is being writepage()'d.
			 * We *could* just steal the page from commit, with some
			 * fancy locking there.  Instead, we just skip it -
			 * don't tie the page's buffers to the new transaction
			 * at all.
			 * Implication: if we crash before the writepage() data
			 * is written into the filesystem, recovery will replay
			 * the write() data.
			 */
			if (jh->b_jlist != BJ_None &&
					jh->b_jlist != BJ_SyncData &&
					jh->b_jlist != BJ_Locked) {
				JBUFFER_TRACE(jh, "Not stealing");
				goto no_journal;
			}

			/*
			 * This buffer may be undergoing writeout in commit.  We
			 * can't return from here and let the caller dirty it
			 * again because that can cause the write-out loop in
			 * commit to never terminate.
			 */
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(&journal->j_list_lock);
				jbd_unlock_bh_state(bh);
				need_brelse = 1;
				/* Blocks with both locks dropped: state
				 * must be re-checked after retaking them. */
				sync_dirty_buffer(bh);
				jbd_lock_bh_state(bh);
				spin_lock(&journal->j_list_lock);
				/* The buffer may become locked again at any
				   time if it is redirtied */
			}

			/* journal_clean_data_list() may have got there first */
			if (jh->b_transaction != NULL) {
				JBUFFER_TRACE(jh, "unfile from commit");
				__journal_temp_unlink_buffer(jh);
				/* It still points to the committing
				 * transaction; move it to this one so
				 * that the refile assert checks are
				 * happy. */
				jh->b_transaction = handle->h_transaction;
			}
			/* The buffer will be refiled below */

		}
		/*
		 * Special case --- the buffer might actually have been
		 * allocated and then immediately deallocated in the previous,
		 * committing transaction, so might still be left on that
		 * transaction's metadata lists.
		 */
		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
			JBUFFER_TRACE(jh, "not on correct data list: unfile");
			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
			__journal_temp_unlink_buffer(jh);
			jh->b_transaction = handle->h_transaction;
			JBUFFER_TRACE(jh, "file as data");
			__journal_file_buffer(jh, handle->h_transaction,
						BJ_SyncData);
		}
	} else {
		JBUFFER_TRACE(jh, "not on a transaction");
		__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
	}
no_journal:
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
	if (need_brelse) {
		BUFFER_TRACE(bh, "brelse");
		__brelse(bh);
	}
	JBUFFER_TRACE(jh, "exit");
	journal_put_journal_head(jh);
	return 0;
}
1078
1079/**
1080 * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
1081 * @handle: transaction to add buffer to.
1082 * @bh: buffer to mark
1083 *
1084 * mark dirty metadata which needs to be journaled as part of the current
1085 * transaction.
1086 *
1087 * The buffer is placed on the transaction's metadata list and is marked
1088 * as belonging to the transaction.
1089 *
1090 * Returns error number or 0 on success.
1091 *
1092 * Special care needs to be taken if the buffer already belongs to the
1093 * current committing transaction (in which case we should have frozen
1094 * data present for that commit). In that case, we don't relink the
1095 * buffer: that only gets done when the old transaction finally
1096 * completes its commit.
1097 */
1098int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1099{
1100 transaction_t *transaction = handle->h_transaction;
1101 journal_t *journal = transaction->t_journal;
1102 struct journal_head *jh = bh2jh(bh);
1103
1104 jbd_debug(5, "journal_head %p\n", jh);
1105 JBUFFER_TRACE(jh, "entry");
1106 if (is_handle_aborted(handle))
1107 goto out;
1108
1109 jbd_lock_bh_state(bh);
1110
1111 if (jh->b_modified == 0) {
1112 /*
1113 * This buffer's got modified and becoming part
1114 * of the transaction. This needs to be done
1115 * once a transaction -bzzz
1116 */
1117 jh->b_modified = 1;
1118 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1119 handle->h_buffer_credits--;
1120 }
1121
1122 /*
1123 * fastpath, to avoid expensive locking. If this buffer is already
1124 * on the running transaction's metadata list there is nothing to do.
1125 * Nobody can take it off again because there is a handle open.
1126 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1127 * result in this test being false, so we go in and take the locks.
1128 */
1129 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1130 JBUFFER_TRACE(jh, "fastpath");
1131 J_ASSERT_JH(jh, jh->b_transaction ==
1132 journal->j_running_transaction);
1133 goto out_unlock_bh;
1134 }
1135
1136 set_buffer_jbddirty(bh);
1137
1138 /*
1139 * Metadata already on the current transaction list doesn't
1140 * need to be filed. Metadata on another transaction's list must
1141 * be committing, and will be refiled once the commit completes:
1142 * leave it alone for now.
1143 */
1144 if (jh->b_transaction != transaction) {
1145 JBUFFER_TRACE(jh, "already on other transaction");
1146 J_ASSERT_JH(jh, jh->b_transaction ==
1147 journal->j_committing_transaction);
1148 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1149 /* And this case is illegal: we can't reuse another
1150 * transaction's data buffer, ever. */
1151 goto out_unlock_bh;
1152 }
1153
1154 /* That test should have eliminated the following case: */
1155 J_ASSERT_JH(jh, jh->b_frozen_data == 0);
1156
1157 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1158 spin_lock(&journal->j_list_lock);
1159 __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1160 spin_unlock(&journal->j_list_lock);
1161out_unlock_bh:
1162 jbd_unlock_bh_state(bh);
1163out:
1164 JBUFFER_TRACE(jh, "exit");
1165 return 0;
1166}
1167
/*
 * journal_release_buffer: undo a get_write_access without any buffer
 * updates, if the update decided in the end that it didn't need access.
 *
 * Note: this is deliberately a no-op apart from tracing.  The credit
 * taken by get_write_access is simply returned to the transaction when
 * the handle stops; nothing needs unwinding here.
 */
void
journal_release_buffer(handle_t *handle, struct buffer_head *bh)
{
	BUFFER_TRACE(bh, "entry");
}
1178
/**
 * void journal_forget() - bforget() for potentially-journaled buffers.
 * @handle: transaction handle
 * @bh:     bh to 'forget'
 *
 * We can only do the bforget if there are no commits pending against the
 * buffer.  If the buffer is dirty in the current running transaction we
 * can safely unlink it.
 *
 * bh may not be a journalled buffer at all - it may be a non-JBD
 * buffer which came off the hashtable.  Check for this.
 *
 * Decrements bh->b_count by one.
 *
 * Allow this call even if the handle has aborted --- it may be part of
 * the caller's cleanup after an abort.
 *
 * Returns 0 on success, or -EIO if the buffer unexpectedly carries
 * committed data (which would indicate on-disk corruption).
 */
int journal_forget (handle_t *handle, struct buffer_head *bh)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;
	struct journal_head *jh;
	int drop_reserve = 0;
	int err = 0;

	BUFFER_TRACE(bh, "entry");

	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);

	if (!buffer_jbd(bh))
		goto not_jbd;
	jh = bh2jh(bh);

	/* Critical error: attempting to delete a bitmap buffer, maybe?
	 * Don't do any jbd operations, and return an error. */
	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
			 "inconsistent data on disk")) {
		err = -EIO;
		goto not_jbd;
	}

	/*
	 * The buffer's going from the transaction, we must drop
	 * all references -bzzz
	 */
	jh->b_modified = 0;

	if (jh->b_transaction == handle->h_transaction) {
		J_ASSERT_JH(jh, !jh->b_frozen_data);

		/* If we are forgetting a buffer which is already part
		 * of this transaction, then we can just drop it from
		 * the transaction immediately. */
		clear_buffer_dirty(bh);
		clear_buffer_jbddirty(bh);

		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");

		drop_reserve = 1;

		/*
		 * We are no longer going to journal this buffer.
		 * However, the commit of this transaction is still
		 * important to the buffer: the delete that we are now
		 * processing might obsolete an old log entry, so by
		 * committing, we can satisfy the buffer's checkpoint.
		 *
		 * So, if we have a checkpoint on the buffer, we should
		 * now refile the buffer on our BJ_Forget list so that
		 * we know to remove the checkpoint after we commit.
		 */

		if (jh->b_cp_transaction) {
			__journal_temp_unlink_buffer(jh);
			__journal_file_buffer(jh, transaction, BJ_Forget);
		} else {
			__journal_unfile_buffer(jh);
			journal_remove_journal_head(bh);
			__brelse(bh);
			/* If removing the journal_head dropped the last
			 * jbd reference we can complete the bforget now;
			 * locks must go first since __bforget sleeps. */
			if (!buffer_jbd(bh)) {
				spin_unlock(&journal->j_list_lock);
				jbd_unlock_bh_state(bh);
				__bforget(bh);
				goto drop;
			}
		}
	} else if (jh->b_transaction) {
		J_ASSERT_JH(jh, (jh->b_transaction ==
				 journal->j_committing_transaction));
		/* However, if the buffer is still owned by a prior
		 * (committing) transaction, we can't drop it yet... */
		JBUFFER_TRACE(jh, "belongs to older transaction");
		/* ... but we CAN drop it from the new transaction if we
		 * have also modified it since the original commit. */

		if (jh->b_next_transaction) {
			J_ASSERT(jh->b_next_transaction == transaction);
			jh->b_next_transaction = NULL;
			drop_reserve = 1;
		}
	}

not_jbd:
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
	__brelse(bh);
drop:
	if (drop_reserve) {
		/* no need to reserve log space for this block -bzzz */
		handle->h_buffer_credits++;
	}
	return err;
}
1293
1294/**
1295 * int journal_stop() - complete a transaction
 * @handle: transaction to complete.
1297 *
1298 * All done for a particular handle.
1299 *
1300 * There is not much action needed here. We just return any remaining
1301 * buffer credits to the transaction and remove the handle. The only
1302 * complication is that we need to start a commit operation if the
1303 * filesystem is marked for synchronous update.
1304 *
1305 * journal_stop itself will not usually return an error, but it may
1306 * do so in unusual circumstances. In particular, expect it to
1307 * return -EIO if a journal_abort has been executed since the
1308 * transaction began.
1309 */
int journal_stop(handle_t *handle)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;
	int old_handle_count, err;
	pid_t pid;

	J_ASSERT(transaction->t_updates > 0);
	J_ASSERT(journal_current_handle() == handle);

	/* Report any prior abort, but still tear the handle down below. */
	if (is_handle_aborted(handle))
		err = -EIO;
	else
		err = 0;

	/* Nested journal_start() calls just take extra references on the
	 * same handle; drop one and keep the handle alive if any remain. */
	if (--handle->h_ref > 0) {
		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
			  handle->h_ref);
		return err;
	}

	jbd_debug(4, "Handle %p going down\n", handle);

	/*
	 * Implement synchronous transaction batching.  If the handle
	 * was synchronous, don't force a commit immediately.  Let's
	 * yield and let another thread piggyback onto this transaction.
	 * Keep doing that while new threads continue to arrive.
	 * It doesn't cost much - we're about to run a commit and sleep
	 * on IO anyway.  Speeds up many-threaded, many-dir operations
	 * by 30x or more...
	 *
	 * But don't do this if this process was the most recent one to
	 * perform a synchronous write.  We do this to detect the case where a
	 * single process is doing a stream of sync writes.  No point in
	 * waiting for joiners in that case.
	 */
	pid = current->pid;
	if (handle->h_sync && journal->j_last_sync_writer != pid) {
		journal->j_last_sync_writer = pid;
		do {
			/* Sleep one tick at a time until a full tick passes
			 * with no new handles joining the transaction. */
			old_handle_count = transaction->t_handle_count;
			schedule_timeout_uninterruptible(1);
		} while (old_handle_count != transaction->t_handle_count);
	}

	current->journal_info = NULL;
	spin_lock(&journal->j_state_lock);
	spin_lock(&transaction->t_handle_lock);
	/* Return the credits this handle never used to the transaction. */
	transaction->t_outstanding_credits -= handle->h_buffer_credits;
	transaction->t_updates--;
	if (!transaction->t_updates) {
		/* Last updater out: wake the commit thread, and anyone
		 * holding a journal_lock_updates() barrier. */
		wake_up(&journal->j_wait_updates);
		if (journal->j_barrier_count)
			wake_up(&journal->j_wait_transaction_locked);
	}

	/*
	 * If the handle is marked SYNC, we need to set another commit
	 * going!  We also want to force a commit if the current
	 * transaction is occupying too much of the log, or if the
	 * transaction is too old now.
	 */
	if (handle->h_sync ||
			transaction->t_outstanding_credits >
				journal->j_max_transaction_buffers ||
			time_after_eq(jiffies, transaction->t_expires)) {
		/* Do this even for aborted journals: an abort still
		 * completes the commit thread, it just doesn't write
		 * anything to disk. */
		/* Sample the tid now: once j_state_lock is dropped the
		 * transaction may commit and be freed, so log_wait_commit()
		 * below must use this saved copy. */
		tid_t tid = transaction->t_tid;

		spin_unlock(&transaction->t_handle_lock);
		jbd_debug(2, "transaction too old, requesting commit for "
					"handle %p\n", handle);
		/* This is non-blocking */
		__log_start_commit(journal, transaction->t_tid);
		spin_unlock(&journal->j_state_lock);

		/*
		 * Special case: JFS_SYNC synchronous updates require us
		 * to wait for the commit to complete.
		 */
		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
			err = log_wait_commit(journal, tid);
	} else {
		spin_unlock(&transaction->t_handle_lock);
		spin_unlock(&journal->j_state_lock);
	}

	jbd_free_handle(handle);
	return err;
}
1403
/**
 * int journal_force_commit() - force any uncommitted transactions
1405 * @journal: journal to force
1406 *
1407 * For synchronous operations: force any uncommitted transactions
1408 * to disk. May seem kludgy, but it reuses all the handle batching
1409 * code in a very simple manner.
1410 */
1411int journal_force_commit(journal_t *journal)
1412{
1413 handle_t *handle;
1414 int ret;
1415
1416 handle = journal_start(journal, 1);
1417 if (IS_ERR(handle)) {
1418 ret = PTR_ERR(handle);
1419 } else {
1420 handle->h_sync = 1;
1421 ret = journal_stop(handle);
1422 }
1423 return ret;
1424}
1425
1426/*
1427 *
1428 * List management code snippets: various functions for manipulating the
1429 * transaction buffer lists.
1430 *
1431 */
1432
1433/*
1434 * Append a buffer to a transaction list, given the transaction's list head
1435 * pointer.
1436 *
1437 * j_list_lock is held.
1438 *
1439 * jbd_lock_bh_state(jh2bh(jh)) is held.
1440 */
1441
1442static inline void
1443__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1444{
1445 if (!*list) {
1446 jh->b_tnext = jh->b_tprev = jh;
1447 *list = jh;
1448 } else {
1449 /* Insert at the tail of the list to preserve order */
1450 struct journal_head *first = *list, *last = first->b_tprev;
1451 jh->b_tprev = last;
1452 jh->b_tnext = first;
1453 last->b_tnext = first->b_tprev = jh;
1454 }
1455}
1456
1457/*
1458 * Remove a buffer from a transaction list, given the transaction's list
1459 * head pointer.
1460 *
1461 * Called with j_list_lock held, and the journal may not be locked.
1462 *
1463 * jbd_lock_bh_state(jh2bh(jh)) is held.
1464 */
1465
1466static inline void
1467__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1468{
1469 if (*list == jh) {
1470 *list = jh->b_tnext;
1471 if (*list == jh)
1472 *list = NULL;
1473 }
1474 jh->b_tprev->b_tnext = jh->b_tnext;
1475 jh->b_tnext->b_tprev = jh->b_tprev;
1476}
1477
1478/*
1479 * Remove a buffer from the appropriate transaction list.
1480 *
1481 * Note that this function can *change* the value of
1482 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1483 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
 * is holding onto a copy of one of these pointers, it could go bad.
1485 * Generally the caller needs to re-read the pointer from the transaction_t.
1486 *
1487 * Called under j_list_lock. The journal may not be locked.
1488 */
void __journal_temp_unlink_buffer(struct journal_head *jh)
{
	struct journal_head **list = NULL;
	transaction_t *transaction;
	struct buffer_head *bh = jh2bh(jh);

	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
	transaction = jh->b_transaction;
	if (transaction)
		assert_spin_locked(&transaction->t_journal->j_list_lock);

	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
	if (jh->b_jlist != BJ_None)
		J_ASSERT_JH(jh, transaction != 0);

	/* Map the buffer's current jlist type to the transaction list head
	 * it lives on, so __blist_del_buffer() can unlink it below.
	 * BJ_None means the buffer is on no list: nothing to do. */
	switch (jh->b_jlist) {
	case BJ_None:
		return;
	case BJ_SyncData:
		list = &transaction->t_sync_datalist;
		break;
	case BJ_Metadata:
		/* t_nr_buffers counts only BJ_Metadata buffers. */
		transaction->t_nr_buffers--;
		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
		list = &transaction->t_buffers;
		break;
	case BJ_Forget:
		list = &transaction->t_forget;
		break;
	case BJ_IO:
		list = &transaction->t_iobuf_list;
		break;
	case BJ_Shadow:
		list = &transaction->t_shadow_list;
		break;
	case BJ_LogCtl:
		list = &transaction->t_log_list;
		break;
	case BJ_Reserved:
		list = &transaction->t_reserved_list;
		break;
	case BJ_Locked:
		list = &transaction->t_locked_list;
		break;
	}

	__blist_del_buffer(list, jh);
	jh->b_jlist = BJ_None;
	/* The journal is no longer tracking this buffer's dirt: hand any
	 * journal-private dirty state back to the VM so the block still
	 * gets written out through the normal writeback path. */
	if (test_clear_buffer_jbddirty(bh))
		mark_buffer_dirty(bh);	/* Expose it to the VM */
}
1540
/*
 * Remove a buffer from whichever transaction list it is on and sever its
 * link to that transaction entirely.
 *
 * Caller holds j_list_lock and the buffer's bh_state lock (same locking
 * contract as __journal_temp_unlink_buffer()).
 */
void __journal_unfile_buffer(struct journal_head *jh)
{
	/* Order matters: the unlink helper reads jh->b_transaction, so it
	 * must run before the pointer is cleared. */
	__journal_temp_unlink_buffer(jh);
	jh->b_transaction = NULL;
}
1546
1547void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1548{
1549 jbd_lock_bh_state(jh2bh(jh));
1550 spin_lock(&journal->j_list_lock);
1551 __journal_unfile_buffer(jh);
1552 spin_unlock(&journal->j_list_lock);
1553 jbd_unlock_bh_state(jh2bh(jh));
1554}
1555
1556/*
1557 * Called from journal_try_to_free_buffers().
1558 *
1559 * Called under jbd_lock_bh_state(bh)
1560 */
1561static void
1562__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1563{
1564 struct journal_head *jh;
1565
1566 jh = bh2jh(bh);
1567
1568 if (buffer_locked(bh) || buffer_dirty(bh))
1569 goto out;
1570
1571 if (jh->b_next_transaction != 0)
1572 goto out;
1573
1574 spin_lock(&journal->j_list_lock);
1575 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
1576 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1577 /* A written-back ordered data buffer */
1578 JBUFFER_TRACE(jh, "release data");
1579 __journal_unfile_buffer(jh);
1580 journal_remove_journal_head(bh);
1581 __brelse(bh);
1582 }
1583 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
1584 /* written-back checkpointed metadata buffer */
1585 if (jh->b_jlist == BJ_None) {
1586 JBUFFER_TRACE(jh, "remove from checkpoint list");
1587 __journal_remove_checkpoint(jh);
1588 journal_remove_journal_head(bh);
1589 __brelse(bh);
1590 }
1591 }
1592 spin_unlock(&journal->j_list_lock);
1593out:
1594 return;
1595}
1596
1597
1598/**
1599 * int journal_try_to_free_buffers() - try to free page buffers.
1600 * @journal: journal for operation
1601 * @page: to try and free
1602 * @unused_gfp_mask: unused
1603 *
1604 *
1605 * For all the buffers on this page,
1606 * if they are fully written out ordered data, move them onto BUF_CLEAN
1607 * so try_to_free_buffers() can reap them.
1608 *
1609 * This function returns non-zero if we wish try_to_free_buffers()
1610 * to be called. We do this if the page is releasable by try_to_free_buffers().
1611 * We also do it if the page has locked or dirty buffers and the caller wants
1612 * us to perform sync or async writeout.
1613 *
1614 * This complicates JBD locking somewhat. We aren't protected by the
1615 * BKL here. We wish to remove the buffer from its committing or
1616 * running transaction's ->t_datalist via __journal_unfile_buffer.
1617 *
1618 * This may *change* the value of transaction_t->t_datalist, so anyone
1619 * who looks at t_datalist needs to lock against this function.
1620 *
1621 * Even worse, someone may be doing a journal_dirty_data on this
1622 * buffer. So we need to lock against that. journal_dirty_data()
1623 * will come out of the lock with the buffer dirty, which makes it
1624 * ineligible for release here.
1625 *
1626 * Who else is affected by this? hmm... Really the only contender
1627 * is do_get_write_access() - it could be looking at the buffer while
1628 * journal_try_to_free_buffer() is changing its state. But that
1629 * cannot happen because we never reallocate freed data as metadata
1630 * while the data is part of a transaction. Yes?
1631 */
1632int journal_try_to_free_buffers(journal_t *journal,
1633 struct page *page, gfp_t unused_gfp_mask)
1634{
1635 struct buffer_head *head;
1636 struct buffer_head *bh;
1637 int ret = 0;
1638
1639 J_ASSERT(PageLocked(page));
1640
1641 head = page_buffers(page);
1642 bh = head;
1643 do {
1644 struct journal_head *jh;
1645
1646 /*
1647 * We take our own ref against the journal_head here to avoid
1648 * having to add tons of locking around each instance of
1649 * journal_remove_journal_head() and journal_put_journal_head().
1650 */
1651 jh = journal_grab_journal_head(bh);
1652 if (!jh)
1653 continue;
1654
1655 jbd_lock_bh_state(bh);
1656 __journal_try_to_free_buffer(journal, bh);
1657 journal_put_journal_head(jh);
1658 jbd_unlock_bh_state(bh);
1659 if (buffer_jbd(bh))
1660 goto busy;
1661 } while ((bh = bh->b_this_page) != head);
1662 ret = try_to_free_buffers(page);
1663busy:
1664 return ret;
1665}
1666
1667/*
1668 * This buffer is no longer needed. If it is on an older transaction's
1669 * checkpoint list we need to record it on this transaction's forget list
1670 * to pin this buffer (and hence its checkpointing transaction) down until
1671 * this transaction commits. If the buffer isn't on a checkpoint list, we
1672 * release it.
1673 * Returns non-zero if JBD no longer has an interest in the buffer.
1674 *
1675 * Called under j_list_lock.
1676 *
1677 * Called under jbd_lock_bh_state(bh).
1678 */
1679static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1680{
1681 int may_free = 1;
1682 struct buffer_head *bh = jh2bh(jh);
1683
1684 __journal_unfile_buffer(jh);
1685
1686 if (jh->b_cp_transaction) {
1687 JBUFFER_TRACE(jh, "on running+cp transaction");
1688 __journal_file_buffer(jh, transaction, BJ_Forget);
1689 clear_buffer_jbddirty(bh);
1690 may_free = 0;
1691 } else {
1692 JBUFFER_TRACE(jh, "on running transaction");
1693 journal_remove_journal_head(bh);
1694 __brelse(bh);
1695 }
1696 return may_free;
1697}
1698
1699/*
1700 * journal_invalidatepage
1701 *
1702 * This code is tricky. It has a number of cases to deal with.
1703 *
1704 * There are two invariants which this code relies on:
1705 *
1706 * i_size must be updated on disk before we start calling invalidatepage on the
1707 * data.
1708 *
1709 * This is done in ext3 by defining an ext3_setattr method which
1710 * updates i_size before truncate gets going. By maintaining this
1711 * invariant, we can be sure that it is safe to throw away any buffers
1712 * attached to the current transaction: once the transaction commits,
1713 * we know that the data will not be needed.
1714 *
1715 * Note however that we can *not* throw away data belonging to the
1716 * previous, committing transaction!
1717 *
1718 * Any disk blocks which *are* part of the previous, committing
1719 * transaction (and which therefore cannot be discarded immediately) are
1720 * not going to be reused in the new running transaction
1721 *
1722 * The bitmap committed_data images guarantee this: any block which is
1723 * allocated in one transaction and removed in the next will be marked
1724 * as in-use in the committed_data bitmap, so cannot be reused until
1725 * the next transaction to delete the block commits. This means that
1726 * leaving committing buffers dirty is quite safe: the disk blocks
1727 * cannot be reallocated to a different file and so buffer aliasing is
1728 * not possible.
1729 *
1730 *
1731 * The above applies mainly to ordered data mode. In writeback mode we
1732 * don't make guarantees about the order in which data hits disk --- in
1733 * particular we don't guarantee that new dirty data is flushed before
1734 * transaction commit --- so it is always safe just to discard data
1735 * immediately in that mode. --sct
1736 */
1737
1738/*
1739 * The journal_unmap_buffer helper function returns zero if the buffer
1740 * concerned remains pinned as an anonymous buffer belonging to an older
1741 * transaction.
1742 *
1743 * We're outside-transaction here. Either or both of j_running_transaction
1744 * and j_committing_transaction may be NULL.
1745 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
{
	transaction_t *transaction;
	struct journal_head *jh;
	int may_free = 1;
	int ret;

	BUFFER_TRACE(bh, "entry");

	/*
	 * It is safe to proceed here without the j_list_lock because the
	 * buffers cannot be stolen by try_to_free_buffers as long as we are
	 * holding the page lock. --sct
	 */

	/* Not journalled at all: just scrub the buffer state and free it. */
	if (!buffer_jbd(bh))
		goto zap_buffer_unlocked;

	spin_lock(&journal->j_state_lock);
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);

	jh = journal_grab_journal_head(bh);
	if (!jh)
		goto zap_buffer_no_jh;

	transaction = jh->b_transaction;
	if (transaction == NULL) {
		/* First case: not on any transaction.  If it
		 * has no checkpoint link, then we can zap it:
		 * it's a writeback-mode buffer so we don't care
		 * if it hits disk safely. */
		if (!jh->b_cp_transaction) {
			JBUFFER_TRACE(jh, "not on any transaction: zap");
			goto zap_buffer;
		}

		if (!buffer_dirty(bh)) {
			/* bdflush has written it.  We can drop it now */
			goto zap_buffer;
		}

		/* OK, it must be in the journal but still not
		 * written fully to disk: it's metadata or
		 * journaled data... */

		if (journal->j_running_transaction) {
			/* ... and once the current transaction has
			 * committed, the buffer won't be needed any
			 * longer. */
			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
			ret = __dispose_buffer(jh,
					journal->j_running_transaction);
			journal_put_journal_head(jh);
			spin_unlock(&journal->j_list_lock);
			jbd_unlock_bh_state(bh);
			spin_unlock(&journal->j_state_lock);
			return ret;
		} else {
			/* There is no currently-running transaction. So the
			 * orphan record which we wrote for this file must have
			 * passed into commit.  We must attach this buffer to
			 * the committing transaction, if it exists. */
			if (journal->j_committing_transaction) {
				JBUFFER_TRACE(jh, "give to committing trans");
				ret = __dispose_buffer(jh,
					journal->j_committing_transaction);
				journal_put_journal_head(jh);
				spin_unlock(&journal->j_list_lock);
				jbd_unlock_bh_state(bh);
				spin_unlock(&journal->j_state_lock);
				return ret;
			} else {
				/* The orphan record's transaction has
				 * committed.  We can cleanse this buffer */
				clear_buffer_jbddirty(bh);
				goto zap_buffer;
			}
		}
	} else if (transaction == journal->j_committing_transaction) {
		if (jh->b_jlist == BJ_Locked) {
			/*
			 * The buffer is on the committing transaction's locked
			 * list.  We have the buffer locked, so I/O has
			 * completed.  So we can nail the buffer now.
			 */
			may_free = __dispose_buffer(jh, transaction);
			goto zap_buffer;
		}
		/*
		 * If it is committing, we simply cannot touch it.  We
		 * can remove its next_transaction pointer from the
		 * running transaction if that is set, but nothing
		 * else. */
		JBUFFER_TRACE(jh, "on committing transaction");
		set_buffer_freed(bh);
		if (jh->b_next_transaction) {
			J_ASSERT(jh->b_next_transaction ==
					journal->j_running_transaction);
			jh->b_next_transaction = NULL;
		}
		journal_put_journal_head(jh);
		spin_unlock(&journal->j_list_lock);
		jbd_unlock_bh_state(bh);
		spin_unlock(&journal->j_state_lock);
		/* Buffer stays pinned by the committing transaction. */
		return 0;
	} else {
		/* Good, the buffer belongs to the running transaction.
		 * We are writing our own transaction's data, not any
		 * previous one's, so it is safe to throw it away
		 * (remember that we expect the filesystem to have set
		 * i_size already for this truncate so recovery will not
		 * expose the disk blocks we are discarding here.) */
		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
		may_free = __dispose_buffer(jh, transaction);
	}

zap_buffer:
	journal_put_journal_head(jh);
zap_buffer_no_jh:
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
	spin_unlock(&journal->j_state_lock);
zap_buffer_unlocked:
	/* Scrub all state from the buffer so the block can be reused. */
	clear_buffer_dirty(bh);
	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	bh->b_bdev = NULL;
	return may_free;
}
1878
1879/**
1880 * void journal_invalidatepage()
1881 * @journal: journal to use for flush...
1882 * @page: page to flush
 * @offset: start offset within the page from which to invalidate.
1884 *
1885 * Reap page buffers containing data after offset in page.
1886 *
1887 */
1888void journal_invalidatepage(journal_t *journal,
1889 struct page *page,
1890 unsigned long offset)
1891{
1892 struct buffer_head *head, *bh, *next;
1893 unsigned int curr_off = 0;
1894 int may_free = 1;
1895
1896 if (!PageLocked(page))
1897 BUG();
1898 if (!page_has_buffers(page))
1899 return;
1900
1901 /* We will potentially be playing with lists other than just the
1902 * data lists (especially for journaled data mode), so be
1903 * cautious in our locking. */
1904
1905 head = bh = page_buffers(page);
1906 do {
1907 unsigned int next_off = curr_off + bh->b_size;
1908 next = bh->b_this_page;
1909
1910 if (offset <= curr_off) {
1911 /* This block is wholly outside the truncation point */
1912 lock_buffer(bh);
1913 may_free &= journal_unmap_buffer(journal, bh);
1914 unlock_buffer(bh);
1915 }
1916 curr_off = next_off;
1917 bh = next;
1918
1919 } while (bh != head);
1920
1921 if (!offset) {
1922 if (may_free && try_to_free_buffers(page))
1923 J_ASSERT(!page_has_buffers(page));
1924 }
1925}
1926
1927/*
1928 * File a buffer on the given transaction list.
1929 */
void __journal_file_buffer(struct journal_head *jh,
			transaction_t *transaction, int jlist)
{
	struct journal_head **list = NULL;
	int was_dirty = 0;
	struct buffer_head *bh = jh2bh(jh);

	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
	assert_spin_locked(&transaction->t_journal->j_list_lock);

	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
				jh->b_transaction == 0);

	/* Already filed on the requested list of this transaction. */
	if (jh->b_transaction && jh->b_jlist == jlist)
		return;

	/* The following list of buffer states needs to be consistent
	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
	 * state. */

	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
	    jlist == BJ_Shadow || jlist == BJ_Forget) {
		/* Absorb both VM-dirty and journal-dirty state here; the
		 * jbddirty bit is re-applied at the bottom once the buffer
		 * is safely on its new list. */
		if (test_clear_buffer_dirty(bh) ||
		    test_clear_buffer_jbddirty(bh))
			was_dirty = 1;
	}

	/* Detach from any current list before refiling. */
	if (jh->b_transaction)
		__journal_temp_unlink_buffer(jh);
	jh->b_transaction = transaction;

	switch (jlist) {
	case BJ_None:
		/* Filing on no list: the buffer must carry no journal
		 * copies of its data. */
		J_ASSERT_JH(jh, !jh->b_committed_data);
		J_ASSERT_JH(jh, !jh->b_frozen_data);
		return;
	case BJ_SyncData:
		list = &transaction->t_sync_datalist;
		break;
	case BJ_Metadata:
		/* t_nr_buffers counts only BJ_Metadata buffers. */
		transaction->t_nr_buffers++;
		list = &transaction->t_buffers;
		break;
	case BJ_Forget:
		list = &transaction->t_forget;
		break;
	case BJ_IO:
		list = &transaction->t_iobuf_list;
		break;
	case BJ_Shadow:
		list = &transaction->t_shadow_list;
		break;
	case BJ_LogCtl:
		list = &transaction->t_log_list;
		break;
	case BJ_Reserved:
		list = &transaction->t_reserved_list;
		break;
	case BJ_Locked:
		list = &transaction->t_locked_list;
		break;
	}

	__blist_add_buffer(list, jh);
	jh->b_jlist = jlist;

	/* Restore the dirty state absorbed above, now as journal dirt. */
	if (was_dirty)
		set_buffer_jbddirty(bh);
}
2000
2001void journal_file_buffer(struct journal_head *jh,
2002 transaction_t *transaction, int jlist)
2003{
2004 jbd_lock_bh_state(jh2bh(jh));
2005 spin_lock(&transaction->t_journal->j_list_lock);
2006 __journal_file_buffer(jh, transaction, jlist);
2007 spin_unlock(&transaction->t_journal->j_list_lock);
2008 jbd_unlock_bh_state(jh2bh(jh));
2009}
2010
2011/*
2012 * Remove a buffer from its current buffer list in preparation for
2013 * dropping it from its current transaction entirely. If the buffer has
2014 * already started to be used by a subsequent transaction, refile the
2015 * buffer on that transaction's metadata list.
2016 *
2017 * Called under journal->j_list_lock
2018 *
2019 * Called under jbd_lock_bh_state(jh2bh(jh))
2020 */
void __journal_refile_buffer(struct journal_head *jh)
{
	int was_dirty;
	struct buffer_head *bh = jh2bh(jh);

	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
	if (jh->b_transaction)
		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);

	/* If the buffer is now unused, just drop it. */
	if (jh->b_next_transaction == NULL) {
		__journal_unfile_buffer(jh);
		return;
	}

	/*
	 * It has been modified by a later transaction: add it to the new
	 * transaction's metadata list.
	 */

	/* Clear jbddirty across the move so the dirty state is re-attached
	 * only after the buffer sits on its new transaction's list. */
	was_dirty = test_clear_buffer_jbddirty(bh);
	__journal_temp_unlink_buffer(jh);
	jh->b_transaction = jh->b_next_transaction;
	jh->b_next_transaction = NULL;
	/* A dirty buffer was really modified: file it as metadata.  A clean
	 * one only had write access reserved: file it as reserved. */
	__journal_file_buffer(jh, jh->b_transaction,
				was_dirty ? BJ_Metadata : BJ_Reserved);
	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);

	if (was_dirty)
		set_buffer_jbddirty(bh);
}
2052
2053/*
2054 * For the unlocked version of this call, also make sure that any
2055 * hanging journal_head is cleaned up if necessary.
2056 *
2057 * __journal_refile_buffer is usually called as part of a single locked
2058 * operation on a buffer_head, in which the caller is probably going to
2059 * be hooking the journal_head onto other lists. In that case it is up
2060 * to the caller to remove the journal_head if necessary. For the
2061 * unlocked journal_refile_buffer call, the caller isn't going to be
2062 * doing anything else to the buffer so we need to do the cleanup
2063 * ourselves to avoid a jh leak.
2064 *
2065 * *** The journal_head may be freed by this call! ***
2066 */
void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);

	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);

	__journal_refile_buffer(jh);
	jbd_unlock_bh_state(bh);
	/* Drop the journal_head if nothing references it any more.  This
	 * may free jh, so jh must not be touched after this call; note the
	 * deliberate ordering: bh_state is already released, j_list_lock
	 * is still held. */
	journal_remove_journal_head(bh);

	spin_unlock(&journal->j_list_lock);
	__brelse(bh);
}