path: root/fs/jbd
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 18:20:36 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 18:20:36 -0400
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jbd
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/jbd')
-rw-r--r--  fs/jbd/Makefile          7
-rw-r--r--  fs/jbd/checkpoint.c    636
-rw-r--r--  fs/jbd/commit.c        844
-rw-r--r--  fs/jbd/journal.c      2003
-rw-r--r--  fs/jbd/recovery.c      591
-rw-r--r--  fs/jbd/revoke.c        702
-rw-r--r--  fs/jbd/transaction.c  2062
7 files changed, 6845 insertions, 0 deletions
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile
new file mode 100644
index 000000000000..54aca4868a36
--- /dev/null
+++ b/fs/jbd/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux journaling routines.
3#
4
5obj-$(CONFIG_JBD) += jbd.o
6
7jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
new file mode 100644
index 000000000000..98d830401c56
--- /dev/null
+++ b/fs/jbd/checkpoint.c
@@ -0,0 +1,636 @@
1/*
2 * linux/fs/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25
26/*
27 * Unlink a buffer from a transaction.
28 *
29 * Called with j_list_lock held.
30 */
31
32static inline void __buffer_unlink(struct journal_head *jh)
33{
34 transaction_t *transaction;
35
36 transaction = jh->b_cp_transaction;
37 jh->b_cp_transaction = NULL;
38
39 jh->b_cpnext->b_cpprev = jh->b_cpprev;
40 jh->b_cpprev->b_cpnext = jh->b_cpnext;
41 if (transaction->t_checkpoint_list == jh)
42 transaction->t_checkpoint_list = jh->b_cpnext;
43 if (transaction->t_checkpoint_list == jh)
44 transaction->t_checkpoint_list = NULL;
45}
46
47/*
48 * Try to release a checkpointed buffer from its transaction.
49 * Returns 1 if we released it.
50 * Requires j_list_lock
51 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
52 */
53static int __try_to_free_cp_buf(struct journal_head *jh)
54{
55 int ret = 0;
56 struct buffer_head *bh = jh2bh(jh);
57
58 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
59 JBUFFER_TRACE(jh, "remove from checkpoint list");
60 __journal_remove_checkpoint(jh);
61 jbd_unlock_bh_state(bh);
62 journal_remove_journal_head(bh);
63 BUFFER_TRACE(bh, "release");
64 __brelse(bh);
65 ret = 1;
66 } else {
67 jbd_unlock_bh_state(bh);
68 }
69 return ret;
70}
71
72/*
73 * __log_wait_for_space: wait until there is space in the journal.
74 *
75 * Called under j_state_lock *only*. It will be unlocked if we have to wait
76 * for a checkpoint to free up some space in the log.
77 */
78void __log_wait_for_space(journal_t *journal)
79{
80 int nblocks;
81 assert_spin_locked(&journal->j_state_lock);
82
83 nblocks = jbd_space_needed(journal);
84 while (__log_space_left(journal) < nblocks) {
85 if (journal->j_flags & JFS_ABORT)
86 return;
87 spin_unlock(&journal->j_state_lock);
88 down(&journal->j_checkpoint_sem);
89
90 /*
91 * Test again, another process may have checkpointed while we
92 * were waiting for the checkpoint lock
93 */
94 spin_lock(&journal->j_state_lock);
95 nblocks = jbd_space_needed(journal);
96 if (__log_space_left(journal) < nblocks) {
97 spin_unlock(&journal->j_state_lock);
98 log_do_checkpoint(journal);
99 spin_lock(&journal->j_state_lock);
100 }
101 up(&journal->j_checkpoint_sem);
102 }
103}
104
105/*
106 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
107 * The caller must restart a list walk. Wait for someone else to run
108 * jbd_unlock_bh_state().
109 */
110static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
111{
112 get_bh(bh);
113 spin_unlock(&journal->j_list_lock);
114 jbd_lock_bh_state(bh);
115 jbd_unlock_bh_state(bh);
116 put_bh(bh);
117}
118
119/*
120 * Clean up a transaction's checkpoint list.
121 *
122 * We wait for any pending IO to complete and make sure any clean
123 * buffers are removed from the transaction.
124 *
125 * Return 1 if we performed any actions which might have destroyed the
126 * checkpoint. (journal_remove_checkpoint() deletes the transaction when
127 * the last checkpoint buffer is cleansed)
128 *
129 * Called with j_list_lock held.
130 */
131static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
132{
133 struct journal_head *jh, *next_jh, *last_jh;
134 struct buffer_head *bh;
135 int ret = 0;
136
137 assert_spin_locked(&journal->j_list_lock);
138 jh = transaction->t_checkpoint_list;
139 if (!jh)
140 return 0;
141
142 last_jh = jh->b_cpprev;
143 next_jh = jh;
144 do {
145 jh = next_jh;
146 bh = jh2bh(jh);
147 if (buffer_locked(bh)) {
148 atomic_inc(&bh->b_count);
149 spin_unlock(&journal->j_list_lock);
150 wait_on_buffer(bh);
151 /* the journal_head may have gone by now */
152 BUFFER_TRACE(bh, "brelse");
153 __brelse(bh);
154 goto out_return_1;
155 }
156
157 /*
158 * This is foul
159 */
160 if (!jbd_trylock_bh_state(bh)) {
161 jbd_sync_bh(journal, bh);
162 goto out_return_1;
163 }
164
165 if (jh->b_transaction != NULL) {
166 transaction_t *t = jh->b_transaction;
167 tid_t tid = t->t_tid;
168
169 spin_unlock(&journal->j_list_lock);
170 jbd_unlock_bh_state(bh);
171 log_start_commit(journal, tid);
172 log_wait_commit(journal, tid);
173 goto out_return_1;
174 }
175
176 /*
177 * AKPM: I think the buffer_jbddirty test is redundant - it
178 * shouldn't have NULL b_transaction?
179 */
180 next_jh = jh->b_cpnext;
181 if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
182 BUFFER_TRACE(bh, "remove from checkpoint");
183 __journal_remove_checkpoint(jh);
184 jbd_unlock_bh_state(bh);
185 journal_remove_journal_head(bh);
186 __brelse(bh);
187 ret = 1;
188 } else {
189 jbd_unlock_bh_state(bh);
190 }
191 jh = next_jh;
192 } while (jh != last_jh);
193
194 return ret;
195out_return_1:
196 spin_lock(&journal->j_list_lock);
197 return 1;
198}
199
200#define NR_BATCH 64
201
202static void
203__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
204{
205 int i;
206
207 spin_unlock(&journal->j_list_lock);
208 ll_rw_block(WRITE, *batch_count, bhs);
209 spin_lock(&journal->j_list_lock);
210 for (i = 0; i < *batch_count; i++) {
211 struct buffer_head *bh = bhs[i];
212 clear_buffer_jwrite(bh);
213 BUFFER_TRACE(bh, "brelse");
214 __brelse(bh);
215 }
216 *batch_count = 0;
217}
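
The helper above shows the batching pattern used throughout this file: buffers are accumulated into a fixed-size array, the spinlock is dropped, the whole batch is submitted with a single ll_rw_block() call, and the lock is retaken. A minimal user-space sketch of the same accumulate-then-flush shape follows; submit_one(), flush_all() and the int items are made-up stand-ins, not JBD code.

#include <stdio.h>

#define NR_BATCH 64

/* stand-in for the queued buffers; in JBD this is an array of buffer_head * */
static int batch[NR_BATCH];
static int batch_count;

/* stand-in for ll_rw_block(WRITE, n, bhs): submit everything queued so far */
static void flush_all(void)
{
	for (int i = 0; i < batch_count; i++)
		printf("submitting item %d\n", batch[i]);
	batch_count = 0;
}

/* queue one item, flushing when the batch fills, as __flush_buffer() does */
static void submit_one(int item)
{
	batch[batch_count++] = item;
	if (batch_count == NR_BATCH)
		flush_all();
}

int main(void)
{
	for (int i = 0; i < 150; i++)
		submit_one(i);
	flush_all();	/* drain the partial batch, like log_do_checkpoint() */
	return 0;
}
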
218
219/*
220 * Try to flush one buffer from the checkpoint list to disk.
221 *
222 * Return 1 if something happened which requires us to abort the current
223 * scan of the checkpoint list.
224 *
225 * Called with j_list_lock held.
226 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
227 */
228static int __flush_buffer(journal_t *journal, struct journal_head *jh,
229 struct buffer_head **bhs, int *batch_count,
230 int *drop_count)
231{
232 struct buffer_head *bh = jh2bh(jh);
233 int ret = 0;
234
235 if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
236 J_ASSERT_JH(jh, jh->b_transaction == NULL);
237
238 /*
239 * Important: we are about to write the buffer, and
240 * possibly block, while still holding the journal lock.
241 * We cannot afford to let the transaction logic start
242 * messing around with this buffer before we write it to
243 * disk, as that would break recoverability.
244 */
245 BUFFER_TRACE(bh, "queue");
246 get_bh(bh);
247 J_ASSERT_BH(bh, !buffer_jwrite(bh));
248 set_buffer_jwrite(bh);
249 bhs[*batch_count] = bh;
250 jbd_unlock_bh_state(bh);
251 (*batch_count)++;
252 if (*batch_count == NR_BATCH) {
253 __flush_batch(journal, bhs, batch_count);
254 ret = 1;
255 }
256 } else {
257 int last_buffer = 0;
258 if (jh->b_cpnext == jh) {
259 /* We may be about to drop the transaction. Tell the
260 * caller that the lists have changed.
261 */
262 last_buffer = 1;
263 }
264 if (__try_to_free_cp_buf(jh)) {
265 (*drop_count)++;
266 ret = last_buffer;
267 }
268 }
269 return ret;
270}
271
272/*
273 * Perform an actual checkpoint. We don't write out only enough to
274 * satisfy the current blocked requests: rather we submit a reasonably
275 * sized chunk of the outstanding data to disk at once for
276 * efficiency. __log_wait_for_space() will retry if we didn't free enough.
277 *
278 * However, we _do_ take into account the amount requested so that once
279 * the IO has been queued, we can return as soon as enough of it has
280 * completed to disk.
281 *
282 * The journal should be locked before calling this function.
283 */
284int log_do_checkpoint(journal_t *journal)
285{
286 int result;
287 int batch_count = 0;
288 struct buffer_head *bhs[NR_BATCH];
289
290 jbd_debug(1, "Start checkpoint\n");
291
292 /*
293 * First thing: if there are any transactions in the log which
294 * don't need checkpointing, just eliminate them from the
295 * journal straight away.
296 */
297 result = cleanup_journal_tail(journal);
298 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
299 if (result <= 0)
300 return result;
301
302 /*
303 * OK, we need to start writing disk blocks. Try to free up a
304 * quarter of the log in a single checkpoint if we can.
305 */
306 /*
307 * AKPM: check this code. I had a feeling a while back that it
308 * degenerates into a busy loop at unmount time.
309 */
310 spin_lock(&journal->j_list_lock);
311 while (journal->j_checkpoint_transactions) {
312 transaction_t *transaction;
313 struct journal_head *jh, *last_jh, *next_jh;
314 int drop_count = 0;
315 int cleanup_ret, retry = 0;
316 tid_t this_tid;
317
318 transaction = journal->j_checkpoint_transactions;
319 this_tid = transaction->t_tid;
320 jh = transaction->t_checkpoint_list;
321 last_jh = jh->b_cpprev;
322 next_jh = jh;
323 do {
324 struct buffer_head *bh;
325
326 jh = next_jh;
327 next_jh = jh->b_cpnext;
328 bh = jh2bh(jh);
329 if (!jbd_trylock_bh_state(bh)) {
330 jbd_sync_bh(journal, bh);
331 spin_lock(&journal->j_list_lock);
332 retry = 1;
333 break;
334 }
335 retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
336 if (cond_resched_lock(&journal->j_list_lock)) {
337 retry = 1;
338 break;
339 }
340 } while (jh != last_jh && !retry);
341
342 if (batch_count)
343 __flush_batch(journal, bhs, &batch_count);
344
345 /*
346 * If someone cleaned up this transaction while we slept, we're
347 * done
348 */
349 if (journal->j_checkpoint_transactions != transaction)
350 break;
351 if (retry)
352 continue;
353 /*
354 * Maybe it's a new transaction, but it fell at the same
355 * address
356 */
357 if (transaction->t_tid != this_tid)
358 continue;
359 /*
360 * We have walked the whole transaction list without
361 * finding anything to write to disk. We had better be
362 * able to make some progress or we are in trouble.
363 */
364 cleanup_ret = __cleanup_transaction(journal, transaction);
365 J_ASSERT(drop_count != 0 || cleanup_ret != 0);
366 if (journal->j_checkpoint_transactions != transaction)
367 break;
368 }
369 spin_unlock(&journal->j_list_lock);
370 result = cleanup_journal_tail(journal);
371 if (result < 0)
372 return result;
373
374 return 0;
375}
376
377/*
378 * Check the list of checkpoint transactions for the journal to see if
379 * we have already got rid of any since the last update of the log tail
380 * in the journal superblock. If so, we can instantly roll the
381 * superblock forward to remove those transactions from the log.
382 *
383 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
384 *
385 * Called with the journal lock held.
386 *
387 * This is the only part of the journaling code which really needs to be
388 * aware of transaction aborts. Checkpointing involves writing to the
389 * main filesystem area rather than to the journal, so it can proceed
390 * even in abort state, but we must not update the journal superblock if
391 * we have an abort error outstanding.
392 */
393
394int cleanup_journal_tail(journal_t *journal)
395{
396 transaction_t * transaction;
397 tid_t first_tid;
398 unsigned long blocknr, freed;
399
400 /* OK, work out the oldest transaction remaining in the log, and
401 * the log block it starts at.
402 *
403 * If the log is now empty, we need to work out which is the
404 * next transaction ID we will write, and where it will
405 * start. */
406
407 spin_lock(&journal->j_state_lock);
408 spin_lock(&journal->j_list_lock);
409 transaction = journal->j_checkpoint_transactions;
410 if (transaction) {
411 first_tid = transaction->t_tid;
412 blocknr = transaction->t_log_start;
413 } else if ((transaction = journal->j_committing_transaction) != NULL) {
414 first_tid = transaction->t_tid;
415 blocknr = transaction->t_log_start;
416 } else if ((transaction = journal->j_running_transaction) != NULL) {
417 first_tid = transaction->t_tid;
418 blocknr = journal->j_head;
419 } else {
420 first_tid = journal->j_transaction_sequence;
421 blocknr = journal->j_head;
422 }
423 spin_unlock(&journal->j_list_lock);
424 J_ASSERT(blocknr != 0);
425
426 /* If the oldest pinned transaction is at the tail of the log
427 already then there's not much we can do right now. */
428 if (journal->j_tail_sequence == first_tid) {
429 spin_unlock(&journal->j_state_lock);
430 return 1;
431 }
432
433 /* OK, update the superblock to recover the freed space.
434 * Physical blocks come first: have we wrapped beyond the end of
435 * the log? */
436 freed = blocknr - journal->j_tail;
437 if (blocknr < journal->j_tail)
438 freed = freed + journal->j_last - journal->j_first;
439
440 jbd_debug(1,
441 "Cleaning journal tail from %d to %d (offset %lu), "
442 "freeing %lu\n",
443 journal->j_tail_sequence, first_tid, blocknr, freed);
444
445 journal->j_free += freed;
446 journal->j_tail_sequence = first_tid;
447 journal->j_tail = blocknr;
448 spin_unlock(&journal->j_state_lock);
449 if (!(journal->j_flags & JFS_ABORT))
450 journal_update_superblock(journal, 1);
451 return 0;
452}
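
cleanup_journal_tail() advances the tail by "freed" blocks, and the unsigned arithmetic above quietly handles the case where the new tail has wrapped past j_last back towards j_first. A standalone sketch of that calculation with made-up journal geometry; blocks_freed() and the sample numbers are illustrative, not JBD code.

#include <stdio.h>

/* Mirror of the tail-advance arithmetic in cleanup_journal_tail(): the log
 * occupies roughly j_last - j_first blocks and the tail may wrap. */
static unsigned long blocks_freed(unsigned long tail, unsigned long new_tail,
				  unsigned long first, unsigned long last)
{
	unsigned long freed = new_tail - tail;	/* modular if it wrapped */

	if (new_tail < tail)			/* new tail wrapped around */
		freed += last - first;
	return freed;
}

int main(void)
{
	/* hypothetical journal geometry: j_first = 1, j_last = 1024 */
	printf("%lu\n", blocks_freed(1000, 1020, 1, 1024));	/* 20, no wrap  */
	printf("%lu\n", blocks_freed(1000, 10, 1, 1024));	/* 33, wrapped  */
	return 0;
}
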
453
454
455/* Checkpoint list management */
456
457/*
458 * journal_clean_checkpoint_list
459 *
460 * Find all the written-back checkpoint buffers in the journal and release them.
461 *
462 * Called with the journal locked.
463 * Called with j_list_lock held.
465 * Returns number of buffers reaped (for debug)
465 */
466
467int __journal_clean_checkpoint_list(journal_t *journal)
468{
469 transaction_t *transaction, *last_transaction, *next_transaction;
470 int ret = 0;
471
472 transaction = journal->j_checkpoint_transactions;
473 if (transaction == 0)
474 goto out;
475
476 last_transaction = transaction->t_cpprev;
477 next_transaction = transaction;
478 do {
479 struct journal_head *jh;
480
481 transaction = next_transaction;
482 next_transaction = transaction->t_cpnext;
483 jh = transaction->t_checkpoint_list;
484 if (jh) {
485 struct journal_head *last_jh = jh->b_cpprev;
486 struct journal_head *next_jh = jh;
487
488 do {
489 jh = next_jh;
490 next_jh = jh->b_cpnext;
491 /* Use trylock because of the ranking */
492 if (jbd_trylock_bh_state(jh2bh(jh)))
493 ret += __try_to_free_cp_buf(jh);
494 /*
495 * This function only frees up some memory
496 * if possible so we don't have an obligation
497 * to finish processing. Bail out if preemption
498 * requested:
499 */
500 if (need_resched())
501 goto out;
502 } while (jh != last_jh);
503 }
504 } while (transaction != last_transaction);
505out:
506 return ret;
507}
508
509/*
510 * journal_remove_checkpoint: called after a buffer has been committed
511 * to disk (either by being write-back flushed to disk, or being
512 * committed to the log).
513 *
514 * We cannot safely clean a transaction out of the log until all of the
515 * buffer updates committed in that transaction have safely been stored
516 * elsewhere on disk. To achieve this, all of the buffers in a
517 * transaction need to be maintained on the transaction's checkpoint
518 * list until they have been rewritten, at which point this function is
519 * called to remove the buffer from the existing transaction's
520 * checkpoint list.
521 *
522 * This function is called with the journal locked.
523 * This function is called with j_list_lock held.
524 */
525
526void __journal_remove_checkpoint(struct journal_head *jh)
527{
528 transaction_t *transaction;
529 journal_t *journal;
530
531 JBUFFER_TRACE(jh, "entry");
532
533 if ((transaction = jh->b_cp_transaction) == NULL) {
534 JBUFFER_TRACE(jh, "not on transaction");
535 goto out;
536 }
537 journal = transaction->t_journal;
538
539 __buffer_unlink(jh);
540
541 if (transaction->t_checkpoint_list != NULL)
542 goto out;
543 JBUFFER_TRACE(jh, "transaction has no more buffers");
544
545 /*
546 * There is one special case to worry about: if we have just pulled the
547 * buffer off a committing transaction's forget list, then even if the
548 * checkpoint list is empty, the transaction obviously cannot be
549 * dropped!
550 *
551 * The locking here around j_committing_transaction is a bit sleazy.
552 * See the comment at the end of journal_commit_transaction().
553 */
554 if (transaction == journal->j_committing_transaction) {
555 JBUFFER_TRACE(jh, "belongs to committing transaction");
556 goto out;
557 }
558
559 /* OK, that was the last buffer for the transaction: we can now
560 safely remove this transaction from the log */
561
562 __journal_drop_transaction(journal, transaction);
563
564 /* Just in case anybody was waiting for more transactions to be
565 checkpointed... */
566 wake_up(&journal->j_wait_logspace);
567out:
568 JBUFFER_TRACE(jh, "exit");
569}
570
571/*
572 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
573 * list so that we know when it is safe to clean the transaction out of
574 * the log.
575 *
576 * Called with the journal locked.
577 * Called with j_list_lock held.
578 */
579void __journal_insert_checkpoint(struct journal_head *jh,
580 transaction_t *transaction)
581{
582 JBUFFER_TRACE(jh, "entry");
583 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
584 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
585
586 jh->b_cp_transaction = transaction;
587
588 if (!transaction->t_checkpoint_list) {
589 jh->b_cpnext = jh->b_cpprev = jh;
590 } else {
591 jh->b_cpnext = transaction->t_checkpoint_list;
592 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
593 jh->b_cpprev->b_cpnext = jh;
594 jh->b_cpnext->b_cpprev = jh;
595 }
596 transaction->t_checkpoint_list = jh;
597}
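
__journal_insert_checkpoint() is a textbook circular doubly-linked list insertion: the new element is spliced in just before the current head and then becomes the new head itself. A user-space sketch of the same pointer manipulation; struct node and cp_insert() are hypothetical stand-ins for journal_head and its b_cpnext/b_cpprev links.

#include <stdio.h>

/* hypothetical stand-in for struct journal_head's checkpoint links */
struct node {
	int id;
	struct node *next, *prev;
};

/* Same pointer dance as __journal_insert_checkpoint(): link the new node
 * into the circular list and return it as the new list head. */
static struct node *cp_insert(struct node *head, struct node *n)
{
	if (!head) {
		n->next = n->prev = n;
	} else {
		n->next = head;
		n->prev = head->prev;
		n->prev->next = n;
		n->next->prev = n;
	}
	return n;	/* caller stores this in t_checkpoint_list */
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct node *head = NULL, *p;

	head = cp_insert(head, &a);
	head = cp_insert(head, &b);
	head = cp_insert(head, &c);

	/* walk once around the ring: prints 3 2 1 */
	p = head;
	do {
		printf("%d ", p->id);
		p = p->next;
	} while (p != head);
	printf("\n");
	return 0;
}
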
598
599/*
600 * We've finished with this transaction structure: adios...
601 *
602 * The transaction must have no links except for the checkpoint by this
603 * point.
604 *
605 * Called with the journal locked.
606 * Called with j_list_lock held.
607 */
608
609void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
610{
611 assert_spin_locked(&journal->j_list_lock);
612 if (transaction->t_cpnext) {
613 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
614 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
615 if (journal->j_checkpoint_transactions == transaction)
616 journal->j_checkpoint_transactions =
617 transaction->t_cpnext;
618 if (journal->j_checkpoint_transactions == transaction)
619 journal->j_checkpoint_transactions = NULL;
620 }
621
622 J_ASSERT(transaction->t_state == T_FINISHED);
623 J_ASSERT(transaction->t_buffers == NULL);
624 J_ASSERT(transaction->t_sync_datalist == NULL);
625 J_ASSERT(transaction->t_forget == NULL);
626 J_ASSERT(transaction->t_iobuf_list == NULL);
627 J_ASSERT(transaction->t_shadow_list == NULL);
628 J_ASSERT(transaction->t_log_list == NULL);
629 J_ASSERT(transaction->t_checkpoint_list == NULL);
630 J_ASSERT(transaction->t_updates == 0);
631 J_ASSERT(journal->j_committing_transaction != transaction);
632 J_ASSERT(journal->j_running_transaction != transaction);
633
634 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
635 kfree(transaction);
636}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
new file mode 100644
index 000000000000..dac720c837ab
--- /dev/null
+++ b/fs/jbd/commit.c
@@ -0,0 +1,844 @@
1/*
2 * linux/fs/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
25/*
26 * Default IO end handler for temporary BJ_IO buffer_heads.
27 */
28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
29{
30 BUFFER_TRACE(bh, "");
31 if (uptodate)
32 set_buffer_uptodate(bh);
33 else
34 clear_buffer_uptodate(bh);
35 unlock_buffer(bh);
36}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not successfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
51 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
114 descriptor = journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120 /* AKPM: buglet - add `i' to tmp! */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t*)bh->b_data;
123 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JFS_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JFS_BARRIER;
149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
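
The "AKPM: buglet" note above points out that tmp is derived from bh->b_data on every pass, so each 512-byte step of the loop rewrites the header at offset 0 rather than stamping one header per sector. The standalone sketch below shows what adding `i' to the pointer would look like; it is an editorial illustration with simplified host-endian fields (the real journal_header_t uses big-endian values), not the committed code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for journal_header_t (real fields are big-endian) */
struct fake_header {
	uint32_t h_magic;
	uint32_t h_blocktype;
	uint32_t h_sequence;
};

#define FAKE_MAGIC	0xc03b3998u	/* JFS_MAGIC_NUMBER */
#define FAKE_COMMIT	2u		/* JFS_COMMIT_BLOCK */

int main(void)
{
	char block[4096];
	unsigned int i;

	memset(block, 0, sizeof(block));

	/* stamp a header into *each* 512-byte sector: note the "+ i" that
	 * the buglet comment says is missing from the loop above */
	for (i = 0; i < sizeof(block); i += 512) {
		struct fake_header *tmp = (struct fake_header *)(block + i);

		tmp->h_magic = FAKE_MAGIC;
		tmp->h_blocktype = FAKE_COMMIT;
		tmp->h_sequence = 42;	/* would be the transaction tid */
	}

	printf("stamped %zu sectors\n", sizeof(block) / 512);
	return 0;
}
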
162
163/*
164 * journal_commit_transaction
165 *
166 * The primary function for committing a transaction to the log. This
167 * function is called by the journal thread to begin a complete commit.
168 */
169void journal_commit_transaction(journal_t *journal)
170{
171 transaction_t *commit_transaction;
172 struct journal_head *jh, *new_jh, *descriptor;
173 struct buffer_head **wbuf = journal->j_wbuf;
174 int bufs;
175 int flags;
176 int err;
177 unsigned long blocknr;
178 char *tagp = NULL;
179 journal_header_t *header;
180 journal_block_tag_t *tag = NULL;
181 int space_left = 0;
182 int first_tag = 0;
183 int tag_flag;
184 int i;
185
186 /*
187 * First job: lock down the current transaction and wait for
188 * all outstanding updates to complete.
189 */
190
191#ifdef COMMIT_STATS
192 spin_lock(&journal->j_list_lock);
193 summarise_journal_usage(journal);
194 spin_unlock(&journal->j_list_lock);
195#endif
196
197 /* Do we need to erase the effects of a prior journal_flush? */
198 if (journal->j_flags & JFS_FLUSHED) {
199 jbd_debug(3, "super block updated\n");
200 journal_update_superblock(journal, 1);
201 } else {
202 jbd_debug(3, "superblock not updated\n");
203 }
204
205 J_ASSERT(journal->j_running_transaction != NULL);
206 J_ASSERT(journal->j_committing_transaction == NULL);
207
208 commit_transaction = journal->j_running_transaction;
209 J_ASSERT(commit_transaction->t_state == T_RUNNING);
210
211 jbd_debug(1, "JBD: starting commit of transaction %d\n",
212 commit_transaction->t_tid);
213
214 spin_lock(&journal->j_state_lock);
215 commit_transaction->t_state = T_LOCKED;
216
217 spin_lock(&commit_transaction->t_handle_lock);
218 while (commit_transaction->t_updates) {
219 DEFINE_WAIT(wait);
220
221 prepare_to_wait(&journal->j_wait_updates, &wait,
222 TASK_UNINTERRUPTIBLE);
223 if (commit_transaction->t_updates) {
224 spin_unlock(&commit_transaction->t_handle_lock);
225 spin_unlock(&journal->j_state_lock);
226 schedule();
227 spin_lock(&journal->j_state_lock);
228 spin_lock(&commit_transaction->t_handle_lock);
229 }
230 finish_wait(&journal->j_wait_updates, &wait);
231 }
232 spin_unlock(&commit_transaction->t_handle_lock);
233
234 J_ASSERT (commit_transaction->t_outstanding_credits <=
235 journal->j_max_transaction_buffers);
236
237 /*
238 * First thing we are allowed to do is to discard any remaining
239 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
240 * that there are no such buffers: if a large filesystem
241 * operation like a truncate needs to split itself over multiple
242 * transactions, then it may try to do a journal_restart() while
243 * there are still BJ_Reserved buffers outstanding. These must
244 * be released cleanly from the current transaction.
245 *
246 * In this case, the filesystem must still reserve write access
247 * again before modifying the buffer in the new transaction, but
248 * we do not require it to remember exactly which old buffers it
249 * has reserved. This is consistent with the existing behaviour
250 * that multiple journal_get_write_access() calls to the same
251 * buffer are perfectly permissible.
252 */
253 while (commit_transaction->t_reserved_list) {
254 jh = commit_transaction->t_reserved_list;
255 JBUFFER_TRACE(jh, "reserved, unused: refile");
256 /*
257 * A journal_get_undo_access()+journal_release_buffer() may
258 * leave undo-committed data.
259 */
260 if (jh->b_committed_data) {
261 struct buffer_head *bh = jh2bh(jh);
262
263 jbd_lock_bh_state(bh);
264 if (jh->b_committed_data) {
265 kfree(jh->b_committed_data);
266 jh->b_committed_data = NULL;
267 }
268 jbd_unlock_bh_state(bh);
269 }
270 journal_refile_buffer(journal, jh);
271 }
272
273 /*
274 * Now try to drop any written-back buffers from the journal's
275 * checkpoint lists. We do this *before* commit because it potentially
276 * frees some memory
277 */
278 spin_lock(&journal->j_list_lock);
279 __journal_clean_checkpoint_list(journal);
280 spin_unlock(&journal->j_list_lock);
281
282 jbd_debug (3, "JBD: commit phase 1\n");
283
284 /*
285 * Switch to a new revoke table.
286 */
287 journal_switch_revoke_table(journal);
288
289 commit_transaction->t_state = T_FLUSH;
290 journal->j_committing_transaction = commit_transaction;
291 journal->j_running_transaction = NULL;
292 commit_transaction->t_log_start = journal->j_head;
293 wake_up(&journal->j_wait_transaction_locked);
294 spin_unlock(&journal->j_state_lock);
295
296 jbd_debug (3, "JBD: commit phase 2\n");
297
298 /*
299 * First, drop modified flag: all accesses to the buffers
300 * will be tracked for a new transaction only -bzzz
301 */
302 spin_lock(&journal->j_list_lock);
303 if (commit_transaction->t_buffers) {
304 new_jh = jh = commit_transaction->t_buffers->b_tnext;
305 do {
306 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
307 new_jh->b_modified == 0);
308 new_jh->b_modified = 0;
309 new_jh = new_jh->b_tnext;
310 } while (new_jh != jh);
311 }
312 spin_unlock(&journal->j_list_lock);
313
314 /*
315 * Now start flushing things to disk, in the order they appear
316 * on the transaction lists. Data blocks go first.
317 */
318
319 err = 0;
320 /*
321 * Whenever we unlock the journal and sleep, things can get added
322 * onto ->t_sync_datalist, so we have to keep looping back to
323 * write_out_data until we *know* that the list is empty.
324 */
325 bufs = 0;
326 /*
327 * Cleanup any flushed data buffers from the data list. Even in
328 * abort mode, we want to flush this out as soon as possible.
329 */
330write_out_data:
331 cond_resched();
332 spin_lock(&journal->j_list_lock);
333
334 while (commit_transaction->t_sync_datalist) {
335 struct buffer_head *bh;
336
337 jh = commit_transaction->t_sync_datalist;
338 commit_transaction->t_sync_datalist = jh->b_tnext;
339 bh = jh2bh(jh);
340 if (buffer_locked(bh)) {
341 BUFFER_TRACE(bh, "locked");
342 if (!inverted_lock(journal, bh))
343 goto write_out_data;
344 __journal_temp_unlink_buffer(jh);
345 __journal_file_buffer(jh, commit_transaction,
346 BJ_Locked);
347 jbd_unlock_bh_state(bh);
348 if (lock_need_resched(&journal->j_list_lock)) {
349 spin_unlock(&journal->j_list_lock);
350 goto write_out_data;
351 }
352 } else {
353 if (buffer_dirty(bh)) {
354 BUFFER_TRACE(bh, "start journal writeout");
355 get_bh(bh);
356 wbuf[bufs++] = bh;
357 if (bufs == journal->j_wbufsize) {
358 jbd_debug(2, "submit %d writes\n",
359 bufs);
360 spin_unlock(&journal->j_list_lock);
361 ll_rw_block(WRITE, bufs, wbuf);
362 journal_brelse_array(wbuf, bufs);
363 bufs = 0;
364 goto write_out_data;
365 }
366 } else {
367 BUFFER_TRACE(bh, "writeout complete: unfile");
368 if (!inverted_lock(journal, bh))
369 goto write_out_data;
370 __journal_unfile_buffer(jh);
371 jbd_unlock_bh_state(bh);
372 journal_remove_journal_head(bh);
373 put_bh(bh);
374 if (lock_need_resched(&journal->j_list_lock)) {
375 spin_unlock(&journal->j_list_lock);
376 goto write_out_data;
377 }
378 }
379 }
380 }
381
382 if (bufs) {
383 spin_unlock(&journal->j_list_lock);
384 ll_rw_block(WRITE, bufs, wbuf);
385 journal_brelse_array(wbuf, bufs);
386 spin_lock(&journal->j_list_lock);
387 }
388
389 /*
390 * Wait for all previously submitted IO to complete.
391 */
392 while (commit_transaction->t_locked_list) {
393 struct buffer_head *bh;
394
395 jh = commit_transaction->t_locked_list->b_tprev;
396 bh = jh2bh(jh);
397 get_bh(bh);
398 if (buffer_locked(bh)) {
399 spin_unlock(&journal->j_list_lock);
400 wait_on_buffer(bh);
401 if (unlikely(!buffer_uptodate(bh)))
402 err = -EIO;
403 spin_lock(&journal->j_list_lock);
404 }
405 if (!inverted_lock(journal, bh)) {
406 put_bh(bh);
407 spin_lock(&journal->j_list_lock);
408 continue;
409 }
410 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
411 __journal_unfile_buffer(jh);
412 jbd_unlock_bh_state(bh);
413 journal_remove_journal_head(bh);
414 put_bh(bh);
415 } else {
416 jbd_unlock_bh_state(bh);
417 }
418 put_bh(bh);
419 cond_resched_lock(&journal->j_list_lock);
420 }
421 spin_unlock(&journal->j_list_lock);
422
423 if (err)
424 __journal_abort_hard(journal);
425
426 journal_write_revoke_records(journal, commit_transaction);
427
428 jbd_debug(3, "JBD: commit phase 2\n");
429
430 /*
431 * If we found any dirty or locked buffers, then we should have
432 * looped back up to the write_out_data label. If there weren't
433 * any then journal_clean_data_list should have wiped the list
434 * clean by now, so check that it is in fact empty.
435 */
436 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
437
438 jbd_debug (3, "JBD: commit phase 3\n");
439
440 /*
441 * Way to go: we have now written out all of the data for a
442 * transaction! Now comes the tricky part: we need to write out
443 * metadata. Loop over the transaction's entire buffer list:
444 */
445 commit_transaction->t_state = T_COMMIT;
446
447 descriptor = NULL;
448 bufs = 0;
449 while (commit_transaction->t_buffers) {
450
451 /* Find the next buffer to be journaled... */
452
453 jh = commit_transaction->t_buffers;
454
455 /* If we're in abort mode, we just un-journal the buffer and
456 release it for background writing. */
457
458 if (is_journal_aborted(journal)) {
459 JBUFFER_TRACE(jh, "journal is aborting: refile");
460 journal_refile_buffer(journal, jh);
461 /* If that was the last one, we need to clean up
462 * any descriptor buffers which may have been
463 * already allocated, even if we are now
464 * aborting. */
465 if (!commit_transaction->t_buffers)
466 goto start_journal_io;
467 continue;
468 }
469
470 /* Make sure we have a descriptor block in which to
471 record the metadata buffer. */
472
473 if (!descriptor) {
474 struct buffer_head *bh;
475
476 J_ASSERT (bufs == 0);
477
478 jbd_debug(4, "JBD: get descriptor\n");
479
480 descriptor = journal_get_descriptor_buffer(journal);
481 if (!descriptor) {
482 __journal_abort_hard(journal);
483 continue;
484 }
485
486 bh = jh2bh(descriptor);
487 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
488 (unsigned long long)bh->b_blocknr, bh->b_data);
489 header = (journal_header_t *)&bh->b_data[0];
490 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
491 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
492 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
493
494 tagp = &bh->b_data[sizeof(journal_header_t)];
495 space_left = bh->b_size - sizeof(journal_header_t);
496 first_tag = 1;
497 set_buffer_jwrite(bh);
498 set_buffer_dirty(bh);
499 wbuf[bufs++] = bh;
500
501 /* Record it so that we can wait for IO
502 completion later */
503 BUFFER_TRACE(bh, "ph3: file as descriptor");
504 journal_file_buffer(descriptor, commit_transaction,
505 BJ_LogCtl);
506 }
507
508 /* Where is the buffer to be written? */
509
510 err = journal_next_log_block(journal, &blocknr);
511 /* If the block mapping failed, just abandon the buffer
512 and repeat this loop: we'll fall into the
513 refile-on-abort condition above. */
514 if (err) {
515 __journal_abort_hard(journal);
516 continue;
517 }
518
519 /*
520 * start_this_handle() uses t_outstanding_credits to determine
521 * the free space in the log, but this counter is changed
522 * by journal_next_log_block() also.
523 */
524 commit_transaction->t_outstanding_credits--;
525
526 /* Bump b_count to prevent truncate from stumbling over
527 the shadowed buffer! @@@ This can go if we ever get
528 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
529 atomic_inc(&jh2bh(jh)->b_count);
530
531 /* Make a temporary IO buffer with which to write it out
532 (this will requeue both the metadata buffer and the
533 temporary IO buffer). new_bh goes on BJ_IO */
534
535 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
536 /*
537 * akpm: journal_write_metadata_buffer() sets
538 * new_bh->b_transaction to commit_transaction.
539 * We need to clean this up before we release new_bh
540 * (which is of type BJ_IO)
541 */
542 JBUFFER_TRACE(jh, "ph3: write metadata");
543 flags = journal_write_metadata_buffer(commit_transaction,
544 jh, &new_jh, blocknr);
545 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
546 wbuf[bufs++] = jh2bh(new_jh);
547
548 /* Record the new block's tag in the current descriptor
549 buffer */
550
551 tag_flag = 0;
552 if (flags & 1)
553 tag_flag |= JFS_FLAG_ESCAPE;
554 if (!first_tag)
555 tag_flag |= JFS_FLAG_SAME_UUID;
556
557 tag = (journal_block_tag_t *) tagp;
558 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
559 tag->t_flags = cpu_to_be32(tag_flag);
560 tagp += sizeof(journal_block_tag_t);
561 space_left -= sizeof(journal_block_tag_t);
562
563 if (first_tag) {
564 memcpy (tagp, journal->j_uuid, 16);
565 tagp += 16;
566 space_left -= 16;
567 first_tag = 0;
568 }
569
570 /* If there's no more to do, or if the descriptor is full,
571 let the IO rip! */
572
573 if (bufs == journal->j_wbufsize ||
574 commit_transaction->t_buffers == NULL ||
575 space_left < sizeof(journal_block_tag_t) + 16) {
576
577 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
578
579 /* Write an end-of-descriptor marker before
580 submitting the IOs. "tag" still points to
581 the last tag we set up. */
582
583 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
584
585start_journal_io:
586 for (i = 0; i < bufs; i++) {
587 struct buffer_head *bh = wbuf[i];
588 lock_buffer(bh);
589 clear_buffer_dirty(bh);
590 set_buffer_uptodate(bh);
591 bh->b_end_io = journal_end_buffer_io_sync;
592 submit_bh(WRITE, bh);
593 }
594 cond_resched();
595
596 /* Force a new descriptor to be generated next
597 time round the loop. */
598 descriptor = NULL;
599 bufs = 0;
600 }
601 }
602
603 /* Lo and behold: we have just managed to send a transaction to
604 the log. Before we can commit it, wait for the IO so far to
605 complete. Control buffers being written are on the
606 transaction's t_log_list queue, and metadata buffers are on
607 the t_iobuf_list queue.
608
609 Wait for the buffers in reverse order. That way we are
610 less likely to be woken up until all IOs have completed, and
611 so we incur less scheduling load.
612 */
613
614 jbd_debug(3, "JBD: commit phase 4\n");
615
616 /*
617 * akpm: these are BJ_IO, and j_list_lock is not needed.
618 * See __journal_try_to_free_buffer.
619 */
620wait_for_iobuf:
621 while (commit_transaction->t_iobuf_list != NULL) {
622 struct buffer_head *bh;
623
624 jh = commit_transaction->t_iobuf_list->b_tprev;
625 bh = jh2bh(jh);
626 if (buffer_locked(bh)) {
627 wait_on_buffer(bh);
628 goto wait_for_iobuf;
629 }
630 if (cond_resched())
631 goto wait_for_iobuf;
632
633 if (unlikely(!buffer_uptodate(bh)))
634 err = -EIO;
635
636 clear_buffer_jwrite(bh);
637
638 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
639 journal_unfile_buffer(journal, jh);
640
641 /*
642 * ->t_iobuf_list should contain only dummy buffer_heads
643 * which were created by journal_write_metadata_buffer().
644 */
645 BUFFER_TRACE(bh, "dumping temporary bh");
646 journal_put_journal_head(jh);
647 __brelse(bh);
648 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
649 free_buffer_head(bh);
650
651 /* We also have to unlock and free the corresponding
652 shadowed buffer */
653 jh = commit_transaction->t_shadow_list->b_tprev;
654 bh = jh2bh(jh);
655 clear_bit(BH_JWrite, &bh->b_state);
656 J_ASSERT_BH(bh, buffer_jbddirty(bh));
657
658 /* The metadata is now released for reuse, but we need
659 to remember it against this transaction so that when
660 we finally commit, we can do any checkpointing
661 required. */
662 JBUFFER_TRACE(jh, "file as BJ_Forget");
663 journal_file_buffer(jh, commit_transaction, BJ_Forget);
664 /* Wake up any transactions which were waiting for this
665 IO to complete */
666 wake_up_bit(&bh->b_state, BH_Unshadow);
667 JBUFFER_TRACE(jh, "brelse shadowed buffer");
668 __brelse(bh);
669 }
670
671 J_ASSERT (commit_transaction->t_shadow_list == NULL);
672
673 jbd_debug(3, "JBD: commit phase 5\n");
674
675 /* Here we wait for the revoke record and descriptor record buffers */
676 wait_for_ctlbuf:
677 while (commit_transaction->t_log_list != NULL) {
678 struct buffer_head *bh;
679
680 jh = commit_transaction->t_log_list->b_tprev;
681 bh = jh2bh(jh);
682 if (buffer_locked(bh)) {
683 wait_on_buffer(bh);
684 goto wait_for_ctlbuf;
685 }
686 if (cond_resched())
687 goto wait_for_ctlbuf;
688
689 if (unlikely(!buffer_uptodate(bh)))
690 err = -EIO;
691
692 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
693 clear_buffer_jwrite(bh);
694 journal_unfile_buffer(journal, jh);
695 journal_put_journal_head(jh);
696 __brelse(bh); /* One for getblk */
697 /* AKPM: bforget here */
698 }
699
700 jbd_debug(3, "JBD: commit phase 6\n");
701
702 if (journal_write_commit_record(journal, commit_transaction))
703 err = -EIO;
704
705 if (err)
706 __journal_abort_hard(journal);
707
708 /* End of a transaction! Finally, we can do checkpoint
709 processing: any buffers committed as a result of this
710 transaction can be removed from any checkpoint list it was on
711 before. */
712
713 jbd_debug(3, "JBD: commit phase 7\n");
714
715 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
716 J_ASSERT(commit_transaction->t_buffers == NULL);
717 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
718 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
719 J_ASSERT(commit_transaction->t_shadow_list == NULL);
720 J_ASSERT(commit_transaction->t_log_list == NULL);
721
722restart_loop:
723 while (commit_transaction->t_forget) {
724 transaction_t *cp_transaction;
725 struct buffer_head *bh;
726
727 jh = commit_transaction->t_forget;
728 bh = jh2bh(jh);
729 jbd_lock_bh_state(bh);
730 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
731 jh->b_transaction == journal->j_running_transaction);
732
733 /*
734 * If there is undo-protected committed data against
735 * this buffer, then we can remove it now. If it is a
736 * buffer needing such protection, the old frozen_data
737 * field now points to a committed version of the
738 * buffer, so rotate that field to the new committed
739 * data.
740 *
741 * Otherwise, we can just throw away the frozen data now.
742 */
743 if (jh->b_committed_data) {
744 kfree(jh->b_committed_data);
745 jh->b_committed_data = NULL;
746 if (jh->b_frozen_data) {
747 jh->b_committed_data = jh->b_frozen_data;
748 jh->b_frozen_data = NULL;
749 }
750 } else if (jh->b_frozen_data) {
751 kfree(jh->b_frozen_data);
752 jh->b_frozen_data = NULL;
753 }
754
755 spin_lock(&journal->j_list_lock);
756 cp_transaction = jh->b_cp_transaction;
757 if (cp_transaction) {
758 JBUFFER_TRACE(jh, "remove from old cp transaction");
759 __journal_remove_checkpoint(jh);
760 }
761
762 /* Only re-checkpoint the buffer_head if it is marked
763 * dirty. If the buffer was added to the BJ_Forget list
764 * by journal_forget, it may no longer be dirty and
765 * there's no point in keeping a checkpoint record for
766 * it. */
767
768 /* A buffer which has been freed while still being
769 * journaled by a previous transaction may end up still
770 * being dirty here, but we want to avoid writing back
771 * that buffer in the future now that the last use has
772 * been committed. That's not only a performance gain,
773 * it also stops aliasing problems if the buffer is left
774 * behind for writeback and gets reallocated for another
775 * use in a different page. */
776 if (buffer_freed(bh)) {
777 clear_buffer_freed(bh);
778 clear_buffer_jbddirty(bh);
779 }
780
781 if (buffer_jbddirty(bh)) {
782 JBUFFER_TRACE(jh, "add to new checkpointing trans");
783 __journal_insert_checkpoint(jh, commit_transaction);
784 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
785 __journal_refile_buffer(jh);
786 jbd_unlock_bh_state(bh);
787 } else {
788 J_ASSERT_BH(bh, !buffer_dirty(bh));
789 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
790 __journal_unfile_buffer(jh);
791 jbd_unlock_bh_state(bh);
792 journal_remove_journal_head(bh); /* needs a brelse */
793 release_buffer_page(bh);
794 }
795 spin_unlock(&journal->j_list_lock);
796 if (cond_resched())
797 goto restart_loop;
798 }
799
800 /* Done with this transaction! */
801
802 jbd_debug(3, "JBD: commit phase 8\n");
803
804 J_ASSERT(commit_transaction->t_state == T_COMMIT);
805
806 /*
807 * This is a bit sleazy. We borrow j_list_lock to protect
808 * journal->j_committing_transaction in __journal_remove_checkpoint.
809 * Really, __journal_remove_checkpoint should be using j_state_lock but
810 * it's a bit hassle to hold that across __journal_remove_checkpoint
811 */
812 spin_lock(&journal->j_state_lock);
813 spin_lock(&journal->j_list_lock);
814 commit_transaction->t_state = T_FINISHED;
815 J_ASSERT(commit_transaction == journal->j_committing_transaction);
816 journal->j_commit_sequence = commit_transaction->t_tid;
817 journal->j_committing_transaction = NULL;
818 spin_unlock(&journal->j_state_lock);
819
820 if (commit_transaction->t_checkpoint_list == NULL) {
821 __journal_drop_transaction(journal, commit_transaction);
822 } else {
823 if (journal->j_checkpoint_transactions == NULL) {
824 journal->j_checkpoint_transactions = commit_transaction;
825 commit_transaction->t_cpnext = commit_transaction;
826 commit_transaction->t_cpprev = commit_transaction;
827 } else {
828 commit_transaction->t_cpnext =
829 journal->j_checkpoint_transactions;
830 commit_transaction->t_cpprev =
831 commit_transaction->t_cpnext->t_cpprev;
832 commit_transaction->t_cpnext->t_cpprev =
833 commit_transaction;
834 commit_transaction->t_cpprev->t_cpnext =
835 commit_transaction;
836 }
837 }
838 spin_unlock(&journal->j_list_lock);
839
840 jbd_debug(1, "JBD: commit %d complete, head %d\n",
841 journal->j_commit_sequence, journal->j_tail_sequence);
842
843 wake_up(&journal->j_wait_done_commit);
844}
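
The descriptor-block handling in journal_commit_transaction() implies a simple on-disk layout: a journal_header_t, then packed journal_block_tag_t entries, with the 16-byte journal UUID spliced in after the first tag and JFS_FLAG_LAST_TAG set on the final tag once the block is full. The standalone sketch below models only that space accounting (not the j_wbufsize or empty-list exits) using simplified host-endian structs; hdr, btag and the flag values mirror the JBD names but are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* simplified, host-endian stand-ins for the JBD on-disk structures */
struct hdr  { uint32_t h_magic, h_blocktype, h_sequence; };
struct btag { uint32_t t_blocknr, t_flags; };

#define FLAG_SAME_UUID	2u	/* JFS_FLAG_SAME_UUID */
#define FLAG_LAST_TAG	8u	/* JFS_FLAG_LAST_TAG  */

int main(void)
{
	char desc[1024];
	char *tagp = desc + sizeof(struct hdr);
	int space_left = sizeof(desc) - sizeof(struct hdr);
	int first_tag = 1, ntags = 0;
	uint32_t blocknr;
	struct btag *tag = NULL;

	memset(desc, 0, sizeof(desc));

	/* keep adding tags the way the commit loop does, until the block is
	 * too full to hold another tag plus a trailing UUID */
	for (blocknr = 100; space_left >= (int)(sizeof(struct btag) + 16); blocknr++) {
		tag = (struct btag *)tagp;
		tag->t_blocknr = blocknr;
		tag->t_flags = first_tag ? 0 : FLAG_SAME_UUID;
		tagp += sizeof(struct btag);
		space_left -= sizeof(struct btag);

		if (first_tag) {
			memset(tagp, 0xab, 16);	/* copy of the journal UUID */
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
		ntags++;
	}
	tag->t_flags |= FLAG_LAST_TAG;	/* end-of-descriptor marker */

	printf("%d tags fit in a 1024-byte descriptor block\n", ntags);
	return 0;
}
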
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
new file mode 100644
index 000000000000..1e6f2e2ad4a3
--- /dev/null
+++ b/fs/jbd/journal.c
@@ -0,0 +1,2003 @@
1/*
2 * linux/fs/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/mm.h>
34#include <linux/suspend.h>
35#include <linux/pagemap.h>
36#include <asm/uaccess.h>
37#include <asm/page.h>
38#include <linux/proc_fs.h>
39
40EXPORT_SYMBOL(journal_start);
41EXPORT_SYMBOL(journal_restart);
42EXPORT_SYMBOL(journal_extend);
43EXPORT_SYMBOL(journal_stop);
44EXPORT_SYMBOL(journal_lock_updates);
45EXPORT_SYMBOL(journal_unlock_updates);
46EXPORT_SYMBOL(journal_get_write_access);
47EXPORT_SYMBOL(journal_get_create_access);
48EXPORT_SYMBOL(journal_get_undo_access);
49EXPORT_SYMBOL(journal_dirty_data);
50EXPORT_SYMBOL(journal_dirty_metadata);
51EXPORT_SYMBOL(journal_release_buffer);
52EXPORT_SYMBOL(journal_forget);
53#if 0
54EXPORT_SYMBOL(journal_sync_buffer);
55#endif
56EXPORT_SYMBOL(journal_flush);
57EXPORT_SYMBOL(journal_revoke);
58
59EXPORT_SYMBOL(journal_init_dev);
60EXPORT_SYMBOL(journal_init_inode);
61EXPORT_SYMBOL(journal_update_format);
62EXPORT_SYMBOL(journal_check_used_features);
63EXPORT_SYMBOL(journal_check_available_features);
64EXPORT_SYMBOL(journal_set_features);
65EXPORT_SYMBOL(journal_create);
66EXPORT_SYMBOL(journal_load);
67EXPORT_SYMBOL(journal_destroy);
68EXPORT_SYMBOL(journal_recover);
69EXPORT_SYMBOL(journal_update_superblock);
70EXPORT_SYMBOL(journal_abort);
71EXPORT_SYMBOL(journal_errno);
72EXPORT_SYMBOL(journal_ack_err);
73EXPORT_SYMBOL(journal_clear_err);
74EXPORT_SYMBOL(log_wait_commit);
75EXPORT_SYMBOL(journal_start_commit);
76EXPORT_SYMBOL(journal_force_commit_nested);
77EXPORT_SYMBOL(journal_wipe);
78EXPORT_SYMBOL(journal_blocks_per_page);
79EXPORT_SYMBOL(journal_invalidatepage);
80EXPORT_SYMBOL(journal_try_to_free_buffers);
81EXPORT_SYMBOL(journal_force_commit);
82
83static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
84
85/*
86 * Helper function used to manage commit timeouts
87 */
88
89static void commit_timeout(unsigned long __data)
90{
91 struct task_struct * p = (struct task_struct *) __data;
92
93 wake_up_process(p);
94}
95
96/* Static check for data structure consistency. There's no code
97 * invoked --- we'll just get a linker failure if things aren't right.
98 */
99void __journal_internal_check(void)
100{
101 extern void journal_bad_superblock_size(void);
102 if (sizeof(struct journal_superblock_s) != 1024)
103 journal_bad_superblock_size();
104}
105
106/*
107 * kjournald: The main thread function used to manage a logging device
108 * journal.
109 *
110 * This kernel thread is responsible for two things:
111 *
112 * 1) COMMIT: Every so often we need to commit the current state of the
113 * filesystem to disk. The journal thread is responsible for writing
114 * all of the metadata buffers to disk.
115 *
116 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
117 * of the data in that part of the log has been rewritten elsewhere on
118 * the disk. Flushing these old buffers to reclaim space in the log is
119 * known as checkpointing, and this thread is responsible for that job.
120 */
121
122journal_t *current_journal; // AKPM: debug
123
124int kjournald(void *arg)
125{
126 journal_t *journal = (journal_t *) arg;
127 transaction_t *transaction;
128 struct timer_list timer;
129
130 current_journal = journal;
131
132 daemonize("kjournald");
133
134 /* Set up an interval timer which can be used to trigger a
135 commit wakeup after the commit interval expires */
136 init_timer(&timer);
137 timer.data = (unsigned long) current;
138 timer.function = commit_timeout;
139 journal->j_commit_timer = &timer;
140
141 /* Record that the journal thread is running */
142 journal->j_task = current;
143 wake_up(&journal->j_wait_done_commit);
144
145 printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
146 journal->j_commit_interval / HZ);
147
148 /*
149 * And now, wait forever for commit wakeup events.
150 */
151 spin_lock(&journal->j_state_lock);
152
153loop:
154 if (journal->j_flags & JFS_UNMOUNT)
155 goto end_loop;
156
157 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
158 journal->j_commit_sequence, journal->j_commit_request);
159
160 if (journal->j_commit_sequence != journal->j_commit_request) {
161 jbd_debug(1, "OK, requests differ\n");
162 spin_unlock(&journal->j_state_lock);
163 del_timer_sync(journal->j_commit_timer);
164 journal_commit_transaction(journal);
165 spin_lock(&journal->j_state_lock);
166 goto loop;
167 }
168
169 wake_up(&journal->j_wait_done_commit);
170 if (current->flags & PF_FREEZE) {
171 /*
172 * The simpler the better. Flushing journal isn't a
173 * good idea, because that depends on threads that may
174 * be already stopped.
175 */
176 jbd_debug(1, "Now suspending kjournald\n");
177 spin_unlock(&journal->j_state_lock);
178 refrigerator(PF_FREEZE);
179 spin_lock(&journal->j_state_lock);
180 } else {
181 /*
182 * We assume on resume that commits are already there,
183 * so we don't sleep
184 */
185 DEFINE_WAIT(wait);
186 int should_sleep = 1;
187
188 prepare_to_wait(&journal->j_wait_commit, &wait,
189 TASK_INTERRUPTIBLE);
190 if (journal->j_commit_sequence != journal->j_commit_request)
191 should_sleep = 0;
192 transaction = journal->j_running_transaction;
193 if (transaction && time_after_eq(jiffies,
194 transaction->t_expires))
195 should_sleep = 0;
196 if (should_sleep) {
197 spin_unlock(&journal->j_state_lock);
198 schedule();
199 spin_lock(&journal->j_state_lock);
200 }
201 finish_wait(&journal->j_wait_commit, &wait);
202 }
203
204 jbd_debug(1, "kjournald wakes\n");
205
206 /*
207 * Were we woken up by a commit wakeup event?
208 */
209 transaction = journal->j_running_transaction;
210 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
211 journal->j_commit_request = transaction->t_tid;
212 jbd_debug(1, "woke because of timeout\n");
213 }
214 goto loop;
215
216end_loop:
217 spin_unlock(&journal->j_state_lock);
218 del_timer_sync(journal->j_commit_timer);
219 journal->j_task = NULL;
220 wake_up(&journal->j_wait_done_commit);
221 jbd_debug(1, "Journal thread exiting.\n");
222 return 0;
223}
224
225static void journal_start_thread(journal_t *journal)
226{
227 kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES);
228 wait_event(journal->j_wait_done_commit, journal->j_task != 0);
229}
230
231static void journal_kill_thread(journal_t *journal)
232{
233 spin_lock(&journal->j_state_lock);
234 journal->j_flags |= JFS_UNMOUNT;
235
236 while (journal->j_task) {
237 wake_up(&journal->j_wait_commit);
238 spin_unlock(&journal->j_state_lock);
239 wait_event(journal->j_wait_done_commit, journal->j_task == 0);
240 spin_lock(&journal->j_state_lock);
241 }
242 spin_unlock(&journal->j_state_lock);
243}
244
245/*
246 * journal_write_metadata_buffer: write a metadata buffer to the journal.
247 *
248 * Writes a metadata buffer to a given disk block. The actual IO is not
249 * performed but a new buffer_head is constructed which labels the data
250 * to be written with the correct destination disk block.
251 *
252 * Any magic-number escaping which needs to be done will cause a
253 * copy-out here. If the buffer happens to start with the
254 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
255 * magic number is only written to the log for descriptor blocks. In
256 * this case, we copy the data and replace the first word with 0, and we
257 * return a result code which indicates that this buffer needs to be
258 * marked as an escaped buffer in the corresponding log descriptor
259 * block. The missing word can then be restored when the block is read
260 * during recovery.
261 *
262 * If the source buffer has already been modified by a new transaction
263 * since we took the last commit snapshot, we use the frozen copy of
264 * that data for IO. If we end up using the existing buffer_head's data
265 * for the write, then we *have* to lock the buffer to prevent anyone
266 * else from using and possibly modifying it while the IO is in
267 * progress.
268 *
269 * The function returns a pointer to the buffer_heads to be used for IO.
270 *
271 * We assume that the journal has already been locked in this function.
272 *
273 * Return value:
274 * <0: Error
275 * >=0: Finished OK
276 *
277 * On success:
278 * Bit 0 set == escape performed on the data
279 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
280 */
281
282int journal_write_metadata_buffer(transaction_t *transaction,
283 struct journal_head *jh_in,
284 struct journal_head **jh_out,
285 int blocknr)
286{
287 int need_copy_out = 0;
288 int done_copy_out = 0;
289 int do_escape = 0;
290 char *mapped_data;
291 struct buffer_head *new_bh;
292 struct journal_head *new_jh;
293 struct page *new_page;
294 unsigned int new_offset;
295 struct buffer_head *bh_in = jh2bh(jh_in);
296
297 /*
298 * The buffer really shouldn't be locked: only the current committing
299 * transaction is allowed to write it, so nobody else is allowed
300 * to do any IO.
301 *
302 * akpm: except if we're journalling data, and write() output is
303 * also part of a shared mapping, and another thread has
304 * decided to launch a writepage() against this buffer.
305 */
306 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
307
308 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
309
310 /*
311 * If a new transaction has already done a buffer copy-out, then
312 * we use that version of the data for the commit.
313 */
314 jbd_lock_bh_state(bh_in);
315repeat:
316 if (jh_in->b_frozen_data) {
317 done_copy_out = 1;
318 new_page = virt_to_page(jh_in->b_frozen_data);
319 new_offset = offset_in_page(jh_in->b_frozen_data);
320 } else {
321 new_page = jh2bh(jh_in)->b_page;
322 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
323 }
324
325 mapped_data = kmap_atomic(new_page, KM_USER0);
326 /*
327 * Check for escaping
328 */
329 if (*((__be32 *)(mapped_data + new_offset)) ==
330 cpu_to_be32(JFS_MAGIC_NUMBER)) {
331 need_copy_out = 1;
332 do_escape = 1;
333 }
334 kunmap_atomic(mapped_data, KM_USER0);
335
336 /*
337 * Do we need to do a data copy?
338 */
339 if (need_copy_out && !done_copy_out) {
340 char *tmp;
341
342 jbd_unlock_bh_state(bh_in);
343 tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
344 jbd_lock_bh_state(bh_in);
345 if (jh_in->b_frozen_data) {
346 kfree(tmp);
347 goto repeat;
348 }
349
350 jh_in->b_frozen_data = tmp;
351 mapped_data = kmap_atomic(new_page, KM_USER0);
352 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
353 kunmap_atomic(mapped_data, KM_USER0);
354
355 new_page = virt_to_page(tmp);
356 new_offset = offset_in_page(tmp);
357 done_copy_out = 1;
358 }
359
360 /*
361 * Did we need to do any escaping?  Now we've done all the
362 * copying, we can finally do so.
363 */
364 if (do_escape) {
365 mapped_data = kmap_atomic(new_page, KM_USER0);
366 *((unsigned int *)(mapped_data + new_offset)) = 0;
367 kunmap_atomic(mapped_data, KM_USER0);
368 }
369
370 /* keep subsequent assertions sane */
371 new_bh->b_state = 0;
372 init_buffer(new_bh, NULL, NULL);
373 atomic_set(&new_bh->b_count, 1);
374 jbd_unlock_bh_state(bh_in);
375
376 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
377
378 set_bh_page(new_bh, new_page, new_offset);
379 new_jh->b_transaction = NULL;
380 new_bh->b_size = jh2bh(jh_in)->b_size;
381 new_bh->b_bdev = transaction->t_journal->j_dev;
382 new_bh->b_blocknr = blocknr;
383 set_buffer_mapped(new_bh);
384 set_buffer_dirty(new_bh);
385
386 *jh_out = new_jh;
387
388 /*
389 * The to-be-written buffer needs to get moved to the io queue,
390 * and the original buffer whose contents we are shadowing or
391 * copying is moved to the transaction's shadow queue.
392 */
393 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
394 journal_file_buffer(jh_in, transaction, BJ_Shadow);
395 JBUFFER_TRACE(new_jh, "file as BJ_IO");
396 journal_file_buffer(new_jh, transaction, BJ_IO);
397
398 return do_escape | (done_copy_out << 1);
399}
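
/*
 * Illustrative sketch only (no such caller appears in this file): commit
 * code writing out a metadata block could decode the return value roughly
 * as follows.  "tag_flag" is a placeholder name, not a variable defined here.
 *
 *	flags = journal_write_metadata_buffer(transaction, jh_in,
 *					      &new_jh, blocknr);
 *	if (flags & 1)
 *		tag_flag |= JFS_FLAG_ESCAPE;	(bit 0: block was escaped)
 *	if (flags & 2)
 *		remember that b_frozen_data was allocated and must be
 *		freed once the IO completes	(bit 1: copy-out performed)
 */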
400
401/*
402 * Allocation code for the journal file. Manage the space left in the
403 * journal, so that we can begin checkpointing when appropriate.
404 */
405
406/*
407 * __log_space_left: Return the number of free blocks left in the journal.
408 *
409 * Called with the journal already locked.
410 *
411 * Called under j_state_lock
412 */
413
414int __log_space_left(journal_t *journal)
415{
416 int left = journal->j_free;
417
418 assert_spin_locked(&journal->j_state_lock);
419
420 /*
421 * Be pessimistic here about the number of those free blocks which
422 * might be required for log descriptor control blocks.
423 */
424
425#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
426
427 left -= MIN_LOG_RESERVED_BLOCKS;
428
429 if (left <= 0)
430 return 0;
431 left -= (left >> 3);
432 return left;
433}
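
/*
 * Worked example (illustrative numbers only): with j_free == 1000 free
 * blocks, the descriptor reserve leaves 1000 - 32 = 968, and the further
 * one-eighth safety margin (968 >> 3 == 121) gives a reported 847 blocks.
 */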
434
435/*
436 * Called under j_state_lock. Returns true if a transaction was started.
437 */
438int __log_start_commit(journal_t *journal, tid_t target)
439{
440 /*
441 * Are we already doing a recent enough commit?
442 */
443 if (!tid_geq(journal->j_commit_request, target)) {
444 /*
445 * We want a new commit: OK, mark the request and wake up the
446 * commit thread. We do _not_ do the commit ourselves.
447 */
448
449 journal->j_commit_request = target;
450 jbd_debug(1, "JBD: requesting commit %d/%d\n",
451 journal->j_commit_request,
452 journal->j_commit_sequence);
453 wake_up(&journal->j_wait_commit);
454 return 1;
455 }
456 return 0;
457}
458
459int log_start_commit(journal_t *journal, tid_t tid)
460{
461 int ret;
462
463 spin_lock(&journal->j_state_lock);
464 ret = __log_start_commit(journal, tid);
465 spin_unlock(&journal->j_state_lock);
466 return ret;
467}
468
469/*
470 * Force and wait upon a commit if the calling process is not within
471 * a transaction. This is used for forcing out undo-protected data which contains
472 * bitmaps, when the fs is running out of space.
473 *
474 * We can only force the running transaction if we don't have an active handle;
475 * otherwise, we will deadlock.
476 *
477 * Returns true if a transaction was started.
478 */
479int journal_force_commit_nested(journal_t *journal)
480{
481 transaction_t *transaction = NULL;
482 tid_t tid;
483
484 spin_lock(&journal->j_state_lock);
485 if (journal->j_running_transaction && !current->journal_info) {
486 transaction = journal->j_running_transaction;
487 __log_start_commit(journal, transaction->t_tid);
488 } else if (journal->j_committing_transaction)
489 transaction = journal->j_committing_transaction;
490
491 if (!transaction) {
492 spin_unlock(&journal->j_state_lock);
493 return 0; /* Nothing to retry */
494 }
495
496 tid = transaction->t_tid;
497 spin_unlock(&journal->j_state_lock);
498 log_wait_commit(journal, tid);
499 return 1;
500}
501
502/*
503 * Start a commit of the current running transaction (if any). Returns true
504 * if a transaction was started, and fills its tid in at *ptid
505 */
506int journal_start_commit(journal_t *journal, tid_t *ptid)
507{
508 int ret = 0;
509
510 spin_lock(&journal->j_state_lock);
511 if (journal->j_running_transaction) {
512 tid_t tid = journal->j_running_transaction->t_tid;
513
514 ret = __log_start_commit(journal, tid);
515 if (ret && ptid)
516 *ptid = tid;
517 } else if (journal->j_committing_transaction && ptid) {
518 /*
519 * If ext3_write_super() recently started a commit, then we
520 * have to wait for completion of that transaction
521 */
522 *ptid = journal->j_committing_transaction->t_tid;
523 ret = 1;
524 }
525 spin_unlock(&journal->j_state_lock);
526 return ret;
527}
528
529/*
530 * Wait for a specified commit to complete.
531 * The caller may not hold the journal lock.
532 */
533int log_wait_commit(journal_t *journal, tid_t tid)
534{
535 int err = 0;
536
537#ifdef CONFIG_JBD_DEBUG
538 spin_lock(&journal->j_state_lock);
539 if (!tid_geq(journal->j_commit_request, tid)) {
540 printk(KERN_EMERG
541 "%s: error: j_commit_request=%d, tid=%d\n",
542 __FUNCTION__, journal->j_commit_request, tid);
543 }
544 spin_unlock(&journal->j_state_lock);
545#endif
546 spin_lock(&journal->j_state_lock);
547 while (tid_gt(tid, journal->j_commit_sequence)) {
548 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
549 tid, journal->j_commit_sequence);
550 wake_up(&journal->j_wait_commit);
551 spin_unlock(&journal->j_state_lock);
552 wait_event(journal->j_wait_done_commit,
553 !tid_gt(tid, journal->j_commit_sequence));
554 spin_lock(&journal->j_state_lock);
555 }
556 spin_unlock(&journal->j_state_lock);
557
558 if (unlikely(is_journal_aborted(journal))) {
559 printk(KERN_EMERG "journal commit I/O error\n");
560 err = -EIO;
561 }
562 return err;
563}
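
/*
 * Typical usage (sketch only; the caller shown is an assumption, not code
 * in this file): a filesystem that wants to push everything out and wait
 * for it might do
 *
 *	tid_t tid;
 *
 *	if (journal_start_commit(journal, &tid))
 *		log_wait_commit(journal, tid);
 *
 * which kicks kjournald for the running transaction (if any) and then
 * sleeps until j_commit_sequence has caught up with that tid.
 */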
564
565/*
566 * Log buffer allocation routines:
567 */
568
569int journal_next_log_block(journal_t *journal, unsigned long *retp)
570{
571 unsigned long blocknr;
572
573 spin_lock(&journal->j_state_lock);
574 J_ASSERT(journal->j_free > 1);
575
576 blocknr = journal->j_head;
577 journal->j_head++;
578 journal->j_free--;
579 if (journal->j_head == journal->j_last)
580 journal->j_head = journal->j_first;
581 spin_unlock(&journal->j_state_lock);
582 return journal_bmap(journal, blocknr, retp);
583}
584
585/*
586 * Conversion of logical to physical block numbers for the journal
587 *
588 * On external journals the journal blocks are identity-mapped, so
589 * this is a no-op. If needed, we can use j_blk_offset - everything is
590 * ready.
591 */
592int journal_bmap(journal_t *journal, unsigned long blocknr,
593 unsigned long *retp)
594{
595 int err = 0;
596 unsigned long ret;
597
598 if (journal->j_inode) {
599 ret = bmap(journal->j_inode, blocknr);
600 if (ret)
601 *retp = ret;
602 else {
603 char b[BDEVNAME_SIZE];
604
605 printk(KERN_ALERT "%s: journal block not found "
606 "at offset %lu on %s\n",
607 __FUNCTION__,
608 blocknr,
609 bdevname(journal->j_dev, b));
610 err = -EIO;
611 __journal_abort_soft(journal, err);
612 }
613 } else {
614 *retp = blocknr; /* +journal->j_blk_offset */
615 }
616 return err;
617}
618
619/*
620 * We play buffer_head aliasing tricks to write data/metadata blocks to
621 * the journal without copying their contents, but for journal
622 * descriptor blocks we do need to generate bona fide buffers.
623 *
624 * After the caller of journal_get_descriptor_buffer() has finished modifying
625 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
626 * But we don't bother doing that, so there will be coherency problems with
627 * mmaps of blockdevs which hold live JBD-controlled filesystems.
628 */
629struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
630{
631 struct buffer_head *bh;
632 unsigned long blocknr;
633 int err;
634
635 err = journal_next_log_block(journal, &blocknr);
636
637 if (err)
638 return NULL;
639
640 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
641 lock_buffer(bh);
642 memset(bh->b_data, 0, journal->j_blocksize);
643 set_buffer_uptodate(bh);
644 unlock_buffer(bh);
645 BUFFER_TRACE(bh, "return this buffer");
646 return journal_add_journal_head(bh);
647}
648
649/*
650 * Management for journal control blocks: functions to create and
651 * destroy journal_t structures, and to initialise and read existing
652 * journal blocks from disk. */
653
654/* First: create and set up a journal_t object in memory. We initialise
655 * very few fields yet: that has to wait until we have created the
656 * journal structures from scratch, or loaded them from disk. */
657
658static journal_t * journal_init_common (void)
659{
660 journal_t *journal;
661 int err;
662
663 journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
664 if (!journal)
665 goto fail;
666 memset(journal, 0, sizeof(*journal));
667
668 init_waitqueue_head(&journal->j_wait_transaction_locked);
669 init_waitqueue_head(&journal->j_wait_logspace);
670 init_waitqueue_head(&journal->j_wait_done_commit);
671 init_waitqueue_head(&journal->j_wait_checkpoint);
672 init_waitqueue_head(&journal->j_wait_commit);
673 init_waitqueue_head(&journal->j_wait_updates);
674 init_MUTEX(&journal->j_barrier);
675 init_MUTEX(&journal->j_checkpoint_sem);
676 spin_lock_init(&journal->j_revoke_lock);
677 spin_lock_init(&journal->j_list_lock);
678 spin_lock_init(&journal->j_state_lock);
679
680 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
681
682 /* The journal is marked for error until we succeed with recovery! */
683 journal->j_flags = JFS_ABORT;
684
685 /* Set up a default-sized revoke table for the new mount. */
686 err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
687 if (err) {
688 kfree(journal);
689 goto fail;
690 }
691 return journal;
692fail:
693 return NULL;
694}
695
696/* journal_init_dev and journal_init_inode:
697 *
698 * Create a journal structure assigned some fixed set of disk blocks to
699 * the journal. We don't actually touch those disk blocks yet, but we
700 * need to set up all of the mapping information to tell the journaling
701 * system where the journal blocks are.
702 *
703 */
704
705/**
706 * journal_t * journal_init_dev() - creates and initialises a journal structure
707 * @bdev: Block device on which to create the journal
708 * @fs_dev: Device which holds the journalled filesystem for this journal.
709 * @start: Block nr of the start of the journal.
710 * @len: Length of the journal in blocks.
711 * @blocksize: blocksize of journalling device
712 * @returns: a newly created journal_t *
713 *
714 * journal_init_dev creates a journal which maps a fixed contiguous
715 * range of blocks on an arbitrary block device.
716 *
717 */
718journal_t * journal_init_dev(struct block_device *bdev,
719 struct block_device *fs_dev,
720 int start, int len, int blocksize)
721{
722 journal_t *journal = journal_init_common();
723 struct buffer_head *bh;
724 int n;
725
726 if (!journal)
727 return NULL;
728
729 journal->j_dev = bdev;
730 journal->j_fs_dev = fs_dev;
731 journal->j_blk_offset = start;
732 journal->j_maxlen = len;
733 journal->j_blocksize = blocksize;
734
735 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
736 J_ASSERT(bh != NULL);
737 journal->j_sb_buffer = bh;
738 journal->j_superblock = (journal_superblock_t *)bh->b_data;
739
740 /* journal descriptor can store up to n blocks -bzzz */
741 n = journal->j_blocksize / sizeof(journal_block_tag_t);
742 journal->j_wbufsize = n;
743 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
744 if (!journal->j_wbuf) {
745		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
746 __FUNCTION__);
747 kfree(journal);
748 journal = NULL;
749 }
750
751 return journal;
752}
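
/*
 * Sketch of intended use (illustrative only; the block numbers and the
 * follow-up journal_load() call are assumptions, not code in this file):
 *
 *	journal = journal_init_dev(journal_bdev, fs_bdev,
 *				   1, nr_journal_blocks, blocksize);
 *	if (journal && journal_load(journal) != 0) {
 *		journal_destroy(journal);
 *		journal = NULL;
 *	}
 */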
753
754/**
755 * journal_t * journal_init_inode () - creates a journal which maps to an inode.
756 * @inode: An inode to create the journal in
757 *
758 * journal_init_inode creates a journal which maps an on-disk inode as
759 * the journal. The inode must exist already, must support bmap() and
760 * must have all data blocks preallocated.
761 */
762journal_t * journal_init_inode (struct inode *inode)
763{
764 struct buffer_head *bh;
765 journal_t *journal = journal_init_common();
766 int err;
767 int n;
768 unsigned long blocknr;
769
770 if (!journal)
771 return NULL;
772
773 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
774 journal->j_inode = inode;
775 jbd_debug(1,
776 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
777 journal, inode->i_sb->s_id, inode->i_ino,
778 (long long) inode->i_size,
779 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
780
781 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
782 journal->j_blocksize = inode->i_sb->s_blocksize;
783
784 /* journal descriptor can store up to n blocks -bzzz */
785 n = journal->j_blocksize / sizeof(journal_block_tag_t);
786 journal->j_wbufsize = n;
787 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
788 if (!journal->j_wbuf) {
789		printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
790 __FUNCTION__);
791 kfree(journal);
792 return NULL;
793 }
794
795 err = journal_bmap(journal, 0, &blocknr);
796 /* If that failed, give up */
797 if (err) {
798		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
799 __FUNCTION__);
800 kfree(journal);
801 return NULL;
802 }
803
804 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
805 J_ASSERT(bh != NULL);
806 journal->j_sb_buffer = bh;
807 journal->j_superblock = (journal_superblock_t *)bh->b_data;
808
809 return journal;
810}
811
812/*
813 * If the journal init or create aborts, we need to mark the journal
814 * superblock as being NULL to prevent the journal destroy from writing
815 * back a bogus superblock.
816 */
817static void journal_fail_superblock (journal_t *journal)
818{
819 struct buffer_head *bh = journal->j_sb_buffer;
820 brelse(bh);
821 journal->j_sb_buffer = NULL;
822}
823
824/*
825 * Given a journal_t structure, initialise the various fields for
826 * startup of a new journaling session. We use this both when creating
827 * a journal, and after recovering an old journal to reset it for
828 * subsequent use.
829 */
830
831static int journal_reset(journal_t *journal)
832{
833 journal_superblock_t *sb = journal->j_superblock;
834 unsigned int first, last;
835
836 first = be32_to_cpu(sb->s_first);
837 last = be32_to_cpu(sb->s_maxlen);
838
839 journal->j_first = first;
840 journal->j_last = last;
841
842 journal->j_head = first;
843 journal->j_tail = first;
844 journal->j_free = last - first;
845
846 journal->j_tail_sequence = journal->j_transaction_sequence;
847 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
848 journal->j_commit_request = journal->j_commit_sequence;
849
850 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
851
852 /* Add the dynamic fields and write it to disk. */
853 journal_update_superblock(journal, 1);
854 journal_start_thread(journal);
855 return 0;
856}
857
858/**
859 * int journal_create() - Initialise the new journal file
860 * @journal: Journal to create. This structure must have been initialised
861 *
862 * Given a journal_t structure which tells us which disk blocks we can
863 * use, create a new journal superblock and initialise all of the
864 * journal fields from scratch.
865 **/
866int journal_create(journal_t *journal)
867{
868 unsigned long blocknr;
869 struct buffer_head *bh;
870 journal_superblock_t *sb;
871 int i, err;
872
873 if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
874 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
875 journal->j_maxlen);
876 journal_fail_superblock(journal);
877 return -EINVAL;
878 }
879
880 if (journal->j_inode == NULL) {
881 /*
882 * We don't know what block to start at!
883 */
884 printk(KERN_EMERG
885 "%s: creation of journal on external device!\n",
886 __FUNCTION__);
887 BUG();
888 }
889
890 /* Zero out the entire journal on disk. We cannot afford to
891 have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
892 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
893 for (i = 0; i < journal->j_maxlen; i++) {
894 err = journal_bmap(journal, i, &blocknr);
895 if (err)
896 return err;
897 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
898 lock_buffer(bh);
899 memset (bh->b_data, 0, journal->j_blocksize);
900 BUFFER_TRACE(bh, "marking dirty");
901 mark_buffer_dirty(bh);
902 BUFFER_TRACE(bh, "marking uptodate");
903 set_buffer_uptodate(bh);
904 unlock_buffer(bh);
905 __brelse(bh);
906 }
907
908 sync_blockdev(journal->j_dev);
909 jbd_debug(1, "JBD: journal cleared.\n");
910
911 /* OK, fill in the initial static fields in the new superblock */
912 sb = journal->j_superblock;
913
914 sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
915 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
916
917 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
918 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
919 sb->s_first = cpu_to_be32(1);
920
921 journal->j_transaction_sequence = 1;
922
923 journal->j_flags &= ~JFS_ABORT;
924 journal->j_format_version = 2;
925
926 return journal_reset(journal);
927}
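
/*
 * Illustrative only: a filesystem creating a brand new inode-backed
 * journal would typically pair this with journal_init_inode() rather
 * than journal_load():
 *
 *	journal = journal_init_inode(journal_inode);
 *	if (journal)
 *		err = journal_create(journal);
 */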
928
929/**
930 * void journal_update_superblock() - Update journal sb on disk.
931 * @journal: The journal to update.
932 * @wait: Set to '0' if you don't want to wait for IO completion.
933 *
934 * Update a journal's dynamic superblock fields and write it to disk,
935 * optionally waiting for the IO to complete.
936 */
937void journal_update_superblock(journal_t *journal, int wait)
938{
939 journal_superblock_t *sb = journal->j_superblock;
940 struct buffer_head *bh = journal->j_sb_buffer;
941
942 /*
943 * As a special case, if the on-disk copy is already marked as needing
944 * no recovery (s_start == 0) and there are no outstanding transactions
945 * in the filesystem, then we can safely defer the superblock update
946 * until the next commit by setting JFS_FLUSHED. This avoids
947 * attempting a write to a potentially read-only device.
948 */
949 if (sb->s_start == 0 && journal->j_tail_sequence ==
950 journal->j_transaction_sequence) {
951 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
952 "(start %ld, seq %d, errno %d)\n",
953 journal->j_tail, journal->j_tail_sequence,
954 journal->j_errno);
955 goto out;
956 }
957
958 spin_lock(&journal->j_state_lock);
959 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
960 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
961
962 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
963 sb->s_start = cpu_to_be32(journal->j_tail);
964 sb->s_errno = cpu_to_be32(journal->j_errno);
965 spin_unlock(&journal->j_state_lock);
966
967 BUFFER_TRACE(bh, "marking dirty");
968 mark_buffer_dirty(bh);
969 if (wait)
970 sync_dirty_buffer(bh);
971 else
972 ll_rw_block(WRITE, 1, &bh);
973
974out:
975 /* If we have just flushed the log (by marking s_start==0), then
976 * any future commit will have to be careful to update the
977 * superblock again to re-record the true start of the log. */
978
979 spin_lock(&journal->j_state_lock);
980 if (sb->s_start)
981 journal->j_flags &= ~JFS_FLUSHED;
982 else
983 journal->j_flags |= JFS_FLUSHED;
984 spin_unlock(&journal->j_state_lock);
985}
986
987/*
988 * Read the superblock for a given journal, performing initial
989 * validation of the format.
990 */
991
992static int journal_get_superblock(journal_t *journal)
993{
994 struct buffer_head *bh;
995 journal_superblock_t *sb;
996 int err = -EIO;
997
998 bh = journal->j_sb_buffer;
999
1000 J_ASSERT(bh != NULL);
1001 if (!buffer_uptodate(bh)) {
1002 ll_rw_block(READ, 1, &bh);
1003 wait_on_buffer(bh);
1004 if (!buffer_uptodate(bh)) {
1005 printk (KERN_ERR
1006 "JBD: IO error reading journal superblock\n");
1007 goto out;
1008 }
1009 }
1010
1011 sb = journal->j_superblock;
1012
1013 err = -EINVAL;
1014
1015 if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
1016 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1017 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1018 goto out;
1019 }
1020
1021 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1022 case JFS_SUPERBLOCK_V1:
1023 journal->j_format_version = 1;
1024 break;
1025 case JFS_SUPERBLOCK_V2:
1026 journal->j_format_version = 2;
1027 break;
1028 default:
1029 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1030 goto out;
1031 }
1032
1033 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1034 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1035 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1036 printk (KERN_WARNING "JBD: journal file too short\n");
1037 goto out;
1038 }
1039
1040 return 0;
1041
1042out:
1043 journal_fail_superblock(journal);
1044 return err;
1045}
1046
1047/*
1048 * Load the on-disk journal superblock and read the key fields into the
1049 * journal_t.
1050 */
1051
1052static int load_superblock(journal_t *journal)
1053{
1054 int err;
1055 journal_superblock_t *sb;
1056
1057 err = journal_get_superblock(journal);
1058 if (err)
1059 return err;
1060
1061 sb = journal->j_superblock;
1062
1063 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1064 journal->j_tail = be32_to_cpu(sb->s_start);
1065 journal->j_first = be32_to_cpu(sb->s_first);
1066 journal->j_last = be32_to_cpu(sb->s_maxlen);
1067 journal->j_errno = be32_to_cpu(sb->s_errno);
1068
1069 return 0;
1070}
1071
1072
1073/**
1074 * int journal_load() - Read journal from disk.
1075 * @journal: Journal to act on.
1076 *
1077 * Given a journal_t structure which tells us which disk blocks contain
1078 * a journal, read the journal from disk to initialise the in-memory
1079 * structures.
1080 */
1081int journal_load(journal_t *journal)
1082{
1083 int err;
1084
1085 err = load_superblock(journal);
1086 if (err)
1087 return err;
1088
1089 /* If this is a V2 superblock, then we have to check the
1090 * features flags on it. */
1091
1092 if (journal->j_format_version >= 2) {
1093 journal_superblock_t *sb = journal->j_superblock;
1094
1095 if ((sb->s_feature_ro_compat &
1096 ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
1097 (sb->s_feature_incompat &
1098 ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
1099 printk (KERN_WARNING
1100 "JBD: Unrecognised features on journal\n");
1101 return -EINVAL;
1102 }
1103 }
1104
1105 /* Let the recovery code check whether it needs to recover any
1106 * data from the journal. */
1107 if (journal_recover(journal))
1108 goto recovery_error;
1109
1110 /* OK, we've finished with the dynamic journal bits:
1111 * reinitialise the dynamic contents of the superblock in memory
1112 * and reset them on disk. */
1113 if (journal_reset(journal))
1114 goto recovery_error;
1115
1116 journal->j_flags &= ~JFS_ABORT;
1117 journal->j_flags |= JFS_LOADED;
1118 return 0;
1119
1120recovery_error:
1121 printk (KERN_WARNING "JBD: recovery failed\n");
1122 return -EIO;
1123}
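
/*
 * Typical mount-time sequence (sketch only, error handling elided; the
 * variable names are placeholders):
 *
 *	journal = journal_init_inode(journal_inode);
 *	if (!journal)
 *		fail the mount;
 *	err = journal_load(journal);	recovers and starts kjournald
 *	if (err)
 *		journal_destroy(journal);
 */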
1124
1125/**
1126 * void journal_destroy() - Release a journal_t structure.
1127 * @journal: Journal to act on.
1128 *
1129 * Release a journal_t structure once it is no longer in use by the
1130 * journaled object.
1131 */
1132void journal_destroy(journal_t *journal)
1133{
1134 /* Wait for the commit thread to wake up and die. */
1135 journal_kill_thread(journal);
1136
1137 /* Force a final log commit */
1138 if (journal->j_running_transaction)
1139 journal_commit_transaction(journal);
1140
1141 /* Force any old transactions to disk */
1142
1143 /* Totally anal locking here... */
1144 spin_lock(&journal->j_list_lock);
1145 while (journal->j_checkpoint_transactions != NULL) {
1146 spin_unlock(&journal->j_list_lock);
1147 log_do_checkpoint(journal);
1148 spin_lock(&journal->j_list_lock);
1149 }
1150
1151 J_ASSERT(journal->j_running_transaction == NULL);
1152 J_ASSERT(journal->j_committing_transaction == NULL);
1153 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1154 spin_unlock(&journal->j_list_lock);
1155
1156 /* We can now mark the journal as empty. */
1157 journal->j_tail = 0;
1158 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1159 if (journal->j_sb_buffer) {
1160 journal_update_superblock(journal, 1);
1161 brelse(journal->j_sb_buffer);
1162 }
1163
1164 if (journal->j_inode)
1165 iput(journal->j_inode);
1166 if (journal->j_revoke)
1167 journal_destroy_revoke(journal);
1168 kfree(journal->j_wbuf);
1169 kfree(journal);
1170}
1171
1172
1173/**
1174 * int journal_check_used_features() - Check if features specified are used.
1175 * @journal: Journal to check.
1176 * @compat: bitmask of compatible features
1177 * @ro: bitmask of features that force read-only mount
1178 * @incompat: bitmask of incompatible features
1179 *
1180 * Check whether the journal uses all of a given set of
1181 * features. Return true (non-zero) if it does.
1182 **/
1183
1184int journal_check_used_features (journal_t *journal, unsigned long compat,
1185 unsigned long ro, unsigned long incompat)
1186{
1187 journal_superblock_t *sb;
1188
1189 if (!compat && !ro && !incompat)
1190 return 1;
1191 if (journal->j_format_version == 1)
1192 return 0;
1193
1194 sb = journal->j_superblock;
1195
1196 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1197 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1198 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1199 return 1;
1200
1201 return 0;
1202}
1203
1204/**
1205 * int journal_check_available_features() - Check feature set in journalling layer
1206 * @journal: Journal to check.
1207 * @compat: bitmask of compatible features
1208 * @ro: bitmask of features that force read-only mount
1209 * @incompat: bitmask of incompatible features
1210 *
1211 * Check whether the journaling code supports the use of
1212 * all of a given set of features on this journal. Return true
1213 * (non-zero) if it can. */
1214
1215int journal_check_available_features (journal_t *journal, unsigned long compat,
1216 unsigned long ro, unsigned long incompat)
1217{
1218 journal_superblock_t *sb;
1219
1220 if (!compat && !ro && !incompat)
1221 return 1;
1222
1223 sb = journal->j_superblock;
1224
1225 /* We can support any known requested features iff the
1226 * superblock is in version 2. Otherwise we fail to support any
1227 * extended sb features. */
1228
1229 if (journal->j_format_version != 2)
1230 return 0;
1231
1232 if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
1233 (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
1234 (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
1235 return 1;
1236
1237 return 0;
1238}
1239
1240/**
1241 * int journal_set_features () - Mark a given journal feature in the superblock
1242 * @journal: Journal to act on.
1243 * @compat: bitmask of compatible features
1244 * @ro: bitmask of features that force read-only mount
1245 * @incompat: bitmask of incompatible features
1246 *
1247 * Mark a given journal feature as present on the
1248 * superblock. Returns true if the requested features could be set.
1249 *
1250 */
1251
1252int journal_set_features (journal_t *journal, unsigned long compat,
1253 unsigned long ro, unsigned long incompat)
1254{
1255 journal_superblock_t *sb;
1256
1257 if (journal_check_used_features(journal, compat, ro, incompat))
1258 return 1;
1259
1260 if (!journal_check_available_features(journal, compat, ro, incompat))
1261 return 0;
1262
1263 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1264 compat, ro, incompat);
1265
1266 sb = journal->j_superblock;
1267
1268 sb->s_feature_compat |= cpu_to_be32(compat);
1269 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1270 sb->s_feature_incompat |= cpu_to_be32(incompat);
1271
1272 return 1;
1273}
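
/*
 * Illustrative only: before relying on an on-disk feature, a client
 * filesystem would typically do something like
 *
 *	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
 *		refuse to use the feature (v1 superblock, or unsupported);
 *
 * JFS_FEATURE_INCOMPAT_REVOKE is used here purely as an example bit.
 */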
1274
1275
1276/**
1277 * int journal_update_format () - Update on-disk journal structure.
1278 * @journal: Journal to act on.
1279 *
1280 * Given an initialised but unloaded journal struct, poke about in the
1281 * on-disk structure to update it to the most recent supported version.
1282 */
1283int journal_update_format (journal_t *journal)
1284{
1285 journal_superblock_t *sb;
1286 int err;
1287
1288 err = journal_get_superblock(journal);
1289 if (err)
1290 return err;
1291
1292 sb = journal->j_superblock;
1293
1294 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1295 case JFS_SUPERBLOCK_V2:
1296 return 0;
1297 case JFS_SUPERBLOCK_V1:
1298 return journal_convert_superblock_v1(journal, sb);
1299 default:
1300 break;
1301 }
1302 return -EINVAL;
1303}
1304
1305static int journal_convert_superblock_v1(journal_t *journal,
1306 journal_superblock_t *sb)
1307{
1308 int offset, blocksize;
1309 struct buffer_head *bh;
1310
1311 printk(KERN_WARNING
1312 "JBD: Converting superblock from version 1 to 2.\n");
1313
1314 /* Pre-initialise new fields to zero */
1315 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1316 blocksize = be32_to_cpu(sb->s_blocksize);
1317 memset(&sb->s_feature_compat, 0, blocksize-offset);
1318
1319 sb->s_nr_users = cpu_to_be32(1);
1320 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1321 journal->j_format_version = 2;
1322
1323 bh = journal->j_sb_buffer;
1324 BUFFER_TRACE(bh, "marking dirty");
1325 mark_buffer_dirty(bh);
1326 sync_dirty_buffer(bh);
1327 return 0;
1328}
1329
1330
1331/**
1332 * int journal_flush () - Flush journal
1333 * @journal: Journal to act on.
1334 *
1335 * Flush all data for a given journal to disk and empty the journal.
1336 * Filesystems can use this when remounting readonly to ensure that
1337 * recovery does not need to happen on remount.
1338 */
1339
1340int journal_flush(journal_t *journal)
1341{
1342 int err = 0;
1343 transaction_t *transaction = NULL;
1344 unsigned long old_tail;
1345
1346 spin_lock(&journal->j_state_lock);
1347
1348 /* Force everything buffered to the log... */
1349 if (journal->j_running_transaction) {
1350 transaction = journal->j_running_transaction;
1351 __log_start_commit(journal, transaction->t_tid);
1352 } else if (journal->j_committing_transaction)
1353 transaction = journal->j_committing_transaction;
1354
1355 /* Wait for the log commit to complete... */
1356 if (transaction) {
1357 tid_t tid = transaction->t_tid;
1358
1359 spin_unlock(&journal->j_state_lock);
1360 log_wait_commit(journal, tid);
1361 } else {
1362 spin_unlock(&journal->j_state_lock);
1363 }
1364
1365 /* ...and flush everything in the log out to disk. */
1366 spin_lock(&journal->j_list_lock);
1367 while (!err && journal->j_checkpoint_transactions != NULL) {
1368 spin_unlock(&journal->j_list_lock);
1369 err = log_do_checkpoint(journal);
1370 spin_lock(&journal->j_list_lock);
1371 }
1372 spin_unlock(&journal->j_list_lock);
1373 cleanup_journal_tail(journal);
1374
1375 /* Finally, mark the journal as really needing no recovery.
1376 * This sets s_start==0 in the underlying superblock, which is
1377 * the magic code for a fully-recovered superblock. Any future
1378 * commits of data to the journal will restore the current
1379 * s_start value. */
1380 spin_lock(&journal->j_state_lock);
1381 old_tail = journal->j_tail;
1382 journal->j_tail = 0;
1383 spin_unlock(&journal->j_state_lock);
1384 journal_update_superblock(journal, 1);
1385 spin_lock(&journal->j_state_lock);
1386 journal->j_tail = old_tail;
1387
1388 J_ASSERT(!journal->j_running_transaction);
1389 J_ASSERT(!journal->j_committing_transaction);
1390 J_ASSERT(!journal->j_checkpoint_transactions);
1391 J_ASSERT(journal->j_head == journal->j_tail);
1392 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1393 spin_unlock(&journal->j_state_lock);
1394 return err;
1395}
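
/*
 * Sketch only: a client filesystem's remount-readonly path is expected
 * to look roughly like
 *
 *	journal_lock_updates(journal);	quiesce new handles
 *	journal_flush(journal);		commit, checkpoint, mark clean
 *	journal_unlock_updates(journal);
 *
 * journal_lock_updates()/journal_unlock_updates() live in transaction.c.
 */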
1396
1397/**
1398 * int journal_wipe() - Wipe journal contents
1399 * @journal: Journal to act on.
1400 * @write: flag (see below)
1401 *
1402 * Wipe out all of the contents of a journal, safely. This will produce
1403 * a warning if the journal contains any valid recovery information.
1404 * Must be called between journal_init_*() and journal_load().
1405 *
1406 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1407 * we merely suppress recovery.
1408 */
1409
1410int journal_wipe(journal_t *journal, int write)
1411{
1412 journal_superblock_t *sb;
1413 int err = 0;
1414
1415 J_ASSERT (!(journal->j_flags & JFS_LOADED));
1416
1417 err = load_superblock(journal);
1418 if (err)
1419 return err;
1420
1421 sb = journal->j_superblock;
1422
1423 if (!journal->j_tail)
1424 goto no_recovery;
1425
1426 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1427 write ? "Clearing" : "Ignoring");
1428
1429 err = journal_skip_recovery(journal);
1430 if (write)
1431 journal_update_superblock(journal, 1);
1432
1433 no_recovery:
1434 return err;
1435}
1436
1437/*
1438 * journal_dev_name: format a character string to describe on what
1439 * device this journal is present.
1440 */
1441
1442const char *journal_dev_name(journal_t *journal, char *buffer)
1443{
1444 struct block_device *bdev;
1445
1446 if (journal->j_inode)
1447 bdev = journal->j_inode->i_sb->s_bdev;
1448 else
1449 bdev = journal->j_dev;
1450
1451 return bdevname(bdev, buffer);
1452}
1453
1454/*
1455 * Journal abort has very specific semantics, which we describe
1456 * below.
1457 *
1458 * Two internal functions, which provide abort to the jbd layer
1459 * itself, are here.
1460 */
1461
1462/*
1463 * Quick version for internal journal use (doesn't lock the journal).
1464 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1465 * and don't attempt to make any other journal updates.
1466 */
1467void __journal_abort_hard(journal_t *journal)
1468{
1469 transaction_t *transaction;
1470 char b[BDEVNAME_SIZE];
1471
1472 if (journal->j_flags & JFS_ABORT)
1473 return;
1474
1475 printk(KERN_ERR "Aborting journal on device %s.\n",
1476 journal_dev_name(journal, b));
1477
1478 spin_lock(&journal->j_state_lock);
1479 journal->j_flags |= JFS_ABORT;
1480 transaction = journal->j_running_transaction;
1481 if (transaction)
1482 __log_start_commit(journal, transaction->t_tid);
1483 spin_unlock(&journal->j_state_lock);
1484}
1485
1486/* Soft abort: record the abort error status in the journal superblock,
1487 * but don't do any other IO. */
1488void __journal_abort_soft (journal_t *journal, int errno)
1489{
1490 if (journal->j_flags & JFS_ABORT)
1491 return;
1492
1493 if (!journal->j_errno)
1494 journal->j_errno = errno;
1495
1496 __journal_abort_hard(journal);
1497
1498 if (errno)
1499 journal_update_superblock(journal, 1);
1500}
1501
1502/**
1503 * void journal_abort () - Shutdown the journal immediately.
1504 * @journal: the journal to shutdown.
1505 * @errno: an error number to record in the journal indicating
1506 * the reason for the shutdown.
1507 *
1508 * Perform a complete, immediate shutdown of the ENTIRE
1509 * journal (not of a single transaction). This operation cannot be
1510 * undone without closing and reopening the journal.
1511 *
1512 * The journal_abort function is intended to support higher level error
1513 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1514 * mode.
1515 *
1516 * Journal abort has very specific semantics. Any existing dirty,
1517 * unjournaled buffers in the main filesystem will still be written to
1518 * disk by bdflush, but the journaling mechanism will be suspended
1519 * immediately and no further transaction commits will be honoured.
1520 *
1521 * Any dirty, journaled buffers will be written back to disk without
1522 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1523 * filesystem, but we _do_ attempt to leave as much data as possible
1524 * behind for fsck to use for cleanup.
1525 *
1526 * Any attempt to get a new transaction handle on a journal which is in
1527 * ABORT state will just result in an -EROFS error return. A
1528 * journal_stop on an existing handle will return -EIO if we have
1529 * entered abort state during the update.
1530 *
1531 * Recursive transactions are not disturbed by journal abort until the
1532 * final journal_stop, which will receive the -EIO error.
1533 *
1534 * Finally, the journal_abort call allows the caller to supply an errno
1535 * which will be recorded (if possible) in the journal superblock. This
1536 * allows a client to record failure conditions in the middle of a
1537 * transaction without having to complete the transaction to record the
1538 * failure to disk. ext3_error, for example, now uses this
1539 * functionality.
1540 *
1541 * Errors which originate from within the journaling layer will NOT
1542 * supply an errno; a null errno implies that absolutely no further
1543 * writes are done to the journal (unless there are any already in
1544 * progress).
1545 *
1546 */
1547
1548void journal_abort(journal_t *journal, int errno)
1549{
1550 __journal_abort_soft(journal, errno);
1551}
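
/*
 * Illustrative only: a client filesystem detecting a fatal error would
 * typically call
 *
 *	journal_abort(journal, -EIO);
 *
 * after which any new journal_start() fails with -EROFS and the errno
 * can later be queried with journal_errno().
 */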
1552
1553/**
1554 * int journal_errno () - returns the journal's error state.
1555 * @journal: journal to examine.
1556 *
1557 * This is the errno number set with journal_abort(), the last
1558 * time the journal was mounted - if the journal was stopped
1559 * without calling abort this will be 0.
1560 *
1561 * If the journal has been aborted during this mount, -EROFS will
1562 * be returned.
1563 */
1564int journal_errno(journal_t *journal)
1565{
1566 int err;
1567
1568 spin_lock(&journal->j_state_lock);
1569 if (journal->j_flags & JFS_ABORT)
1570 err = -EROFS;
1571 else
1572 err = journal->j_errno;
1573 spin_unlock(&journal->j_state_lock);
1574 return err;
1575}
1576
1577/**
1578 * int journal_clear_err () - clears the journal's error state
1579 * @journal: journal to act on.
1580 *
1581 * An error must be cleared or Acked to take a FS out of readonly
1582 * mode.
1583 */
1584int journal_clear_err(journal_t *journal)
1585{
1586 int err = 0;
1587
1588 spin_lock(&journal->j_state_lock);
1589 if (journal->j_flags & JFS_ABORT)
1590 err = -EROFS;
1591 else
1592 journal->j_errno = 0;
1593 spin_unlock(&journal->j_state_lock);
1594 return err;
1595}
1596
1597/**
1598 * void journal_ack_err() - Ack journal err.
1599 * @journal: journal to act on.
1600 *
1601 * An error must be cleared or Acked to take a FS out of readonly
1602 * mode.
1603 */
1604void journal_ack_err(journal_t *journal)
1605{
1606 spin_lock(&journal->j_state_lock);
1607 if (journal->j_errno)
1608 journal->j_flags |= JFS_ACK_ERR;
1609 spin_unlock(&journal->j_state_lock);
1610}
1611
1612int journal_blocks_per_page(struct inode *inode)
1613{
1614 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1615}
1616
1617/*
1618 * Simple support for retrying memory allocations. Introduced to help
1619 * debug different VM deadlock avoidance strategies.
1620 */
1621void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry)
1622{
1623 return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
1624}
1625
1626/*
1627 * Journal_head storage management
1628 */
1629static kmem_cache_t *journal_head_cache;
1630#ifdef CONFIG_JBD_DEBUG
1631static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1632#endif
1633
1634static int journal_init_journal_head_cache(void)
1635{
1636 int retval;
1637
1638 J_ASSERT(journal_head_cache == 0);
1639 journal_head_cache = kmem_cache_create("journal_head",
1640 sizeof(struct journal_head),
1641 0, /* offset */
1642 0, /* flags */
1643 NULL, /* ctor */
1644 NULL); /* dtor */
1645 retval = 0;
1646 if (journal_head_cache == 0) {
1647 retval = -ENOMEM;
1648 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1649 }
1650 return retval;
1651}
1652
1653static void journal_destroy_journal_head_cache(void)
1654{
1655 J_ASSERT(journal_head_cache != NULL);
1656 kmem_cache_destroy(journal_head_cache);
1657 journal_head_cache = NULL;
1658}
1659
1660/*
1661 * journal_head splicing and dicing
1662 */
1663static struct journal_head *journal_alloc_journal_head(void)
1664{
1665 struct journal_head *ret;
1666 static unsigned long last_warning;
1667
1668#ifdef CONFIG_JBD_DEBUG
1669 atomic_inc(&nr_journal_heads);
1670#endif
1671 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1672 if (ret == 0) {
1673 jbd_debug(1, "out of memory for journal_head\n");
1674 if (time_after(jiffies, last_warning + 5*HZ)) {
1675 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1676 __FUNCTION__);
1677 last_warning = jiffies;
1678 }
1679 while (ret == 0) {
1680 yield();
1681 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1682 }
1683 }
1684 return ret;
1685}
1686
1687static void journal_free_journal_head(struct journal_head *jh)
1688{
1689#ifdef CONFIG_JBD_DEBUG
1690 atomic_dec(&nr_journal_heads);
1691 memset(jh, 0x5b, sizeof(*jh));
1692#endif
1693 kmem_cache_free(journal_head_cache, jh);
1694}
1695
1696/*
1697 * A journal_head is attached to a buffer_head whenever JBD has an
1698 * interest in the buffer.
1699 *
1700 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1701 * is set. This bit is tested in core kernel code where we need to take
1702 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1703 * there.
1704 *
1705 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1706 *
1707 * When a buffer has its BH_JBD bit set it is immune from being released by
1708 * core kernel code, mainly via ->b_count.
1709 *
1710 * A journal_head may be detached from its buffer_head when the journal_head's
1711 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1712 * Various places in JBD call journal_remove_journal_head() to indicate that the
1713 * journal_head can be dropped if needed.
1714 *
1715 * Various places in the kernel want to attach a journal_head to a buffer_head
1716 * _before_ attaching the journal_head to a transaction. To protect the
1717 * journal_head in this situation, journal_add_journal_head elevates the
1718 * journal_head's b_jcount refcount by one. The caller must call
1719 * journal_put_journal_head() to undo this.
1720 *
1721 * So the typical usage would be:
1722 *
1723 * (Attach a journal_head if needed. Increments b_jcount)
1724 * struct journal_head *jh = journal_add_journal_head(bh);
1725 * ...
1726 * jh->b_transaction = xxx;
1727 * journal_put_journal_head(jh);
1728 *
1729 * Now, the journal_head's b_jcount is zero, but it is safe from being released
1730 * because it has a non-zero b_transaction.
1731 */
1732
1733/*
1734 * Give a buffer_head a journal_head.
1735 *
1736 * Doesn't need the journal lock.
1737 * May sleep.
1738 */
1739struct journal_head *journal_add_journal_head(struct buffer_head *bh)
1740{
1741 struct journal_head *jh;
1742 struct journal_head *new_jh = NULL;
1743
1744repeat:
1745 if (!buffer_jbd(bh)) {
1746 new_jh = journal_alloc_journal_head();
1747 memset(new_jh, 0, sizeof(*new_jh));
1748 }
1749
1750 jbd_lock_bh_journal_head(bh);
1751 if (buffer_jbd(bh)) {
1752 jh = bh2jh(bh);
1753 } else {
1754 J_ASSERT_BH(bh,
1755 (atomic_read(&bh->b_count) > 0) ||
1756 (bh->b_page && bh->b_page->mapping));
1757
1758 if (!new_jh) {
1759 jbd_unlock_bh_journal_head(bh);
1760 goto repeat;
1761 }
1762
1763 jh = new_jh;
1764 new_jh = NULL; /* We consumed it */
1765 set_buffer_jbd(bh);
1766 bh->b_private = jh;
1767 jh->b_bh = bh;
1768 get_bh(bh);
1769 BUFFER_TRACE(bh, "added journal_head");
1770 }
1771 jh->b_jcount++;
1772 jbd_unlock_bh_journal_head(bh);
1773 if (new_jh)
1774 journal_free_journal_head(new_jh);
1775 return bh->b_private;
1776}
1777
1778/*
1779 * Grab a ref against this buffer_head's journal_head. If it ended up not
1780 * having a journal_head, return NULL
1781 */
1782struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
1783{
1784 struct journal_head *jh = NULL;
1785
1786 jbd_lock_bh_journal_head(bh);
1787 if (buffer_jbd(bh)) {
1788 jh = bh2jh(bh);
1789 jh->b_jcount++;
1790 }
1791 jbd_unlock_bh_journal_head(bh);
1792 return jh;
1793}
1794
1795static void __journal_remove_journal_head(struct buffer_head *bh)
1796{
1797 struct journal_head *jh = bh2jh(bh);
1798
1799 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1800
1801 get_bh(bh);
1802 if (jh->b_jcount == 0) {
1803 if (jh->b_transaction == NULL &&
1804 jh->b_next_transaction == NULL &&
1805 jh->b_cp_transaction == NULL) {
1806 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1807 J_ASSERT_BH(bh, buffer_jbd(bh));
1808 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1809 BUFFER_TRACE(bh, "remove journal_head");
1810 if (jh->b_frozen_data) {
1811 printk(KERN_WARNING "%s: freeing "
1812 "b_frozen_data\n",
1813 __FUNCTION__);
1814 kfree(jh->b_frozen_data);
1815 }
1816 if (jh->b_committed_data) {
1817 printk(KERN_WARNING "%s: freeing "
1818 "b_committed_data\n",
1819 __FUNCTION__);
1820 kfree(jh->b_committed_data);
1821 }
1822 bh->b_private = NULL;
1823 jh->b_bh = NULL; /* debug, really */
1824 clear_buffer_jbd(bh);
1825 __brelse(bh);
1826 journal_free_journal_head(jh);
1827 } else {
1828 BUFFER_TRACE(bh, "journal_head was locked");
1829 }
1830 }
1831}
1832
1833/*
1834 * journal_remove_journal_head(): if the buffer isn't attached to a transaction
1835 * and has a zero b_jcount then remove and release its journal_head. If we did
1836 * see that the buffer is not used by any transaction we also "logically"
1837 * decrement ->b_count.
1838 *
1839 * We in fact take an additional increment on ->b_count as a convenience,
1840 * because the caller usually wants to do additional things with the bh
1841 * after calling here.
1842 * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
1843 * time. Once the caller has run __brelse(), the buffer is eligible for
1844 * reaping by try_to_free_buffers().
1845 */
1846void journal_remove_journal_head(struct buffer_head *bh)
1847{
1848 jbd_lock_bh_journal_head(bh);
1849 __journal_remove_journal_head(bh);
1850 jbd_unlock_bh_journal_head(bh);
1851}
1852
1853/*
1854 * Drop a reference on the passed journal_head. If it fell to zero then try to
1855 * release the journal_head from the buffer_head.
1856 */
1857void journal_put_journal_head(struct journal_head *jh)
1858{
1859 struct buffer_head *bh = jh2bh(jh);
1860
1861 jbd_lock_bh_journal_head(bh);
1862 J_ASSERT_JH(jh, jh->b_jcount > 0);
1863 --jh->b_jcount;
1864 if (!jh->b_jcount && !jh->b_transaction) {
1865 __journal_remove_journal_head(bh);
1866 __brelse(bh);
1867 }
1868 jbd_unlock_bh_journal_head(bh);
1869}
1870
1871/*
1872 * /proc tunables
1873 */
1874#if defined(CONFIG_JBD_DEBUG)
1875int journal_enable_debug;
1876EXPORT_SYMBOL(journal_enable_debug);
1877#endif
1878
1879#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
1880
1881static struct proc_dir_entry *proc_jbd_debug;
1882
1883int read_jbd_debug(char *page, char **start, off_t off,
1884 int count, int *eof, void *data)
1885{
1886 int ret;
1887
1888 ret = sprintf(page + off, "%d\n", journal_enable_debug);
1889 *eof = 1;
1890 return ret;
1891}
1892
1893int write_jbd_debug(struct file *file, const char __user *buffer,
1894 unsigned long count, void *data)
1895{
1896 char buf[32];
1897
1898 if (count > ARRAY_SIZE(buf) - 1)
1899 count = ARRAY_SIZE(buf) - 1;
1900 if (copy_from_user(buf, buffer, count))
1901 return -EFAULT;
1902 buf[ARRAY_SIZE(buf) - 1] = '\0';
1903 journal_enable_debug = simple_strtoul(buf, NULL, 10);
1904 return count;
1905}
1906
1907#define JBD_PROC_NAME "sys/fs/jbd-debug"
1908
1909static void __init create_jbd_proc_entry(void)
1910{
1911 proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
1912 if (proc_jbd_debug) {
1913 /* Why is this so hard? */
1914 proc_jbd_debug->read_proc = read_jbd_debug;
1915 proc_jbd_debug->write_proc = write_jbd_debug;
1916 }
1917}
1918
1919static void __exit remove_jbd_proc_entry(void)
1920{
1921 if (proc_jbd_debug)
1922 remove_proc_entry(JBD_PROC_NAME, NULL);
1923}
1924
1925#else
1926
1927#define create_jbd_proc_entry() do {} while (0)
1928#define remove_jbd_proc_entry() do {} while (0)
1929
1930#endif
1931
1932kmem_cache_t *jbd_handle_cache;
1933
1934static int __init journal_init_handle_cache(void)
1935{
1936 jbd_handle_cache = kmem_cache_create("journal_handle",
1937 sizeof(handle_t),
1938 0, /* offset */
1939 0, /* flags */
1940 NULL, /* ctor */
1941 NULL); /* dtor */
1942 if (jbd_handle_cache == NULL) {
1943 printk(KERN_EMERG "JBD: failed to create handle cache\n");
1944 return -ENOMEM;
1945 }
1946 return 0;
1947}
1948
1949static void journal_destroy_handle_cache(void)
1950{
1951 if (jbd_handle_cache)
1952 kmem_cache_destroy(jbd_handle_cache);
1953}
1954
1955/*
1956 * Module startup and shutdown
1957 */
1958
1959static int __init journal_init_caches(void)
1960{
1961 int ret;
1962
1963 ret = journal_init_revoke_caches();
1964 if (ret == 0)
1965 ret = journal_init_journal_head_cache();
1966 if (ret == 0)
1967 ret = journal_init_handle_cache();
1968 return ret;
1969}
1970
1971static void journal_destroy_caches(void)
1972{
1973 journal_destroy_revoke_caches();
1974 journal_destroy_journal_head_cache();
1975 journal_destroy_handle_cache();
1976}
1977
1978static int __init journal_init(void)
1979{
1980 int ret;
1981
1982 ret = journal_init_caches();
1983 if (ret != 0)
1984 journal_destroy_caches();
1985 create_jbd_proc_entry();
1986 return ret;
1987}
1988
1989static void __exit journal_exit(void)
1990{
1991#ifdef CONFIG_JBD_DEBUG
1992 int n = atomic_read(&nr_journal_heads);
1993 if (n)
1994 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
1995#endif
1996 remove_jbd_proc_entry();
1997 journal_destroy_caches();
1998}
1999
2000MODULE_LICENSE("GPL");
2001module_init(journal_init);
2002module_exit(journal_exit);
2003
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
new file mode 100644
index 000000000000..103c34e4fb28
--- /dev/null
+++ b/fs/jbd/recovery.c
@@ -0,0 +1,591 @@
1/*
2 * linux/fs/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd.h>
22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned long blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned long blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(struct buffer_head *bh, int size)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0;
186
187 tagp = &bh->b_data[sizeof(journal_header_t)];
188
189 while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
190 tag = (journal_block_tag_t *) tagp;
191
192 nr++;
193 tagp += sizeof(journal_block_tag_t);
194 if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
195 tagp += 16;
196
197 if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
198 break;
199 }
200
201 return nr;
202}
203
204
205/* Make sure we wrap around the log correctly! */
206#define wrap(journal, var) \
207do { \
208 if (var >= (journal)->j_last) \
209 var -= ((journal)->j_last - (journal)->j_first); \
210} while (0)
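
/*
 * Worked example (illustrative numbers): with j_first == 1 and
 * j_last == 1024, a cursor of 1025 wraps to 1025 - (1024 - 1) == 2,
 * i.e. the second usable block of the log.
 */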
211
212/**
213 * int journal_recover(journal_t *journal) - recovers an on-disk journal
214 * @journal: the journal to recover
215 *
216 * The primary function for recovering the log contents when mounting a
217 * journaled device.
218 *
219 * Recovery is done in three passes. In the first pass, we look for the
220 * end of the log. In the second, we assemble the list of revoke
221 * blocks. In the third and final pass, we replay any un-revoked blocks
222 * in the log.
223 */
224int journal_recover(journal_t *journal)
225{
226 int err;
227 journal_superblock_t * sb;
228
229 struct recovery_info info;
230
231 memset(&info, 0, sizeof(info));
232 sb = journal->j_superblock;
233
234 /*
235 * The journal superblock's s_start field (the current log head)
236 * is always zero if, and only if, the journal was cleanly
237 * unmounted.
238 */
239
240 if (!sb->s_start) {
241 jbd_debug(1, "No recovery required, last transaction %d\n",
242 be32_to_cpu(sb->s_sequence));
243 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
244 return 0;
245 }
246
247 err = do_one_pass(journal, &info, PASS_SCAN);
248 if (!err)
249 err = do_one_pass(journal, &info, PASS_REVOKE);
250 if (!err)
251 err = do_one_pass(journal, &info, PASS_REPLAY);
252
253 jbd_debug(0, "JBD: recovery, exit status %d, "
254 "recovered transactions %u to %u\n",
255 err, info.start_transaction, info.end_transaction);
256 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
257 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
258
259 /* Restart the log at the next transaction ID, thus invalidating
260 * any existing commit records in the log. */
261 journal->j_transaction_sequence = ++info.end_transaction;
262
263 journal_clear_revoke(journal);
264 sync_blockdev(journal->j_fs_dev);
265 return err;
266}
267
268/**
269 * int journal_skip_recovery() - Start journal and wipe existing records
270 * @journal: journal to startup
271 *
272 * Locate any valid recovery information from the journal and set up the
273 * journal structures in memory to ignore it (presumably because the
274 * caller has evidence that it is out of date).
275 * This function doesn't appear to be exported.
276 *
277 * We perform one pass over the journal to allow us to tell the user how
278 * much recovery information is being erased, and to let us initialise
279 * the journal transaction sequence numbers to the next unused ID.
280 */
281int journal_skip_recovery(journal_t *journal)
282{
283 int err;
284 journal_superblock_t * sb;
285
286 struct recovery_info info;
287
288 memset (&info, 0, sizeof(info));
289 sb = journal->j_superblock;
290
291 err = do_one_pass(journal, &info, PASS_SCAN);
292
293 if (err) {
294 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
295 ++journal->j_transaction_sequence;
296 } else {
297#ifdef CONFIG_JBD_DEBUG
298 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
299#endif
300 jbd_debug(0,
301 "JBD: ignoring %d transaction%s from the journal.\n",
302 dropped, (dropped == 1) ? "" : "s");
303 journal->j_transaction_sequence = ++info.end_transaction;
304 }
305
306 journal->j_tail = 0;
307 return err;
308}
309
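/*
 * A minimal sketch (an assumed caller, not taken from journal.c) of how a
 * journal_load()-style path could use the two entry points above: replay
 * the log on a normal mount, or wipe it when the caller knows the log
 * contents are stale.  Error handling is reduced to passing err back.
 */
static int example_load_or_wipe(journal_t *journal, int wipe)
{
	int err;

	if (wipe)
		err = journal_skip_recovery(journal);	/* one PASS_SCAN only */
	else
		err = journal_recover(journal);		/* full three-pass replay */

	/* Either way, j_transaction_sequence now points at the next unused
	 * transaction ID, so stale commit records can never be replayed. */
	return err;
}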
310static int do_one_pass(journal_t *journal,
311 struct recovery_info *info, enum passtype pass)
312{
313 unsigned int first_commit_ID, next_commit_ID;
314 unsigned long next_log_block;
315 int err, success = 0;
316 journal_superblock_t * sb;
317 journal_header_t * tmp;
318 struct buffer_head * bh;
319 unsigned int sequence;
320 int blocktype;
321
322 /* Precompute the maximum metadata descriptors in a descriptor block */
323 int MAX_BLOCKS_PER_DESC;
324 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
325 / sizeof(journal_block_tag_t));
326
327 /*
328 * First thing is to establish what we expect to find in the log
329 * (in terms of transaction IDs), and where (in terms of log
330 * block offsets): query the superblock.
331 */
332
333 sb = journal->j_superblock;
334 next_commit_ID = be32_to_cpu(sb->s_sequence);
335 next_log_block = be32_to_cpu(sb->s_start);
336
337 first_commit_ID = next_commit_ID;
338 if (pass == PASS_SCAN)
339 info->start_transaction = first_commit_ID;
340
341 jbd_debug(1, "Starting recovery pass %d\n", pass);
342
343 /*
344 * Now we walk through the log, transaction by transaction,
345 * making sure that each transaction has a commit block in the
346 * expected place. Each complete transaction gets replayed back
347 * into the main filesystem.
348 */
349
350 while (1) {
351 int flags;
352 char * tagp;
353 journal_block_tag_t * tag;
354 struct buffer_head * obh;
355 struct buffer_head * nbh;
356
357 cond_resched(); /* We're under lock_kernel() */
358
359 /* If we already know where to stop the log traversal,
360 * check right now that we haven't gone past the end of
361 * the log. */
362
363 if (pass != PASS_SCAN)
364 if (tid_geq(next_commit_ID, info->end_transaction))
365 break;
366
367 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
368 next_commit_ID, next_log_block, journal->j_last);
369
370	/* Skip over each chunk of the transaction looking for
371 * either the next descriptor block or the final commit
372 * record. */
373
374 jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
375 err = jread(&bh, journal, next_log_block);
376 if (err)
377 goto failed;
378
379 next_log_block++;
380 wrap(journal, next_log_block);
381
382 /* What kind of buffer is it?
383 *
384 * If it is a descriptor block, check that it has the
385 * expected sequence number. Otherwise, we're all done
386 * here. */
387
388 tmp = (journal_header_t *)bh->b_data;
389
390 if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
391 brelse(bh);
392 break;
393 }
394
395 blocktype = be32_to_cpu(tmp->h_blocktype);
396 sequence = be32_to_cpu(tmp->h_sequence);
397 jbd_debug(3, "Found magic %d, sequence %d\n",
398 blocktype, sequence);
399
400 if (sequence != next_commit_ID) {
401 brelse(bh);
402 break;
403 }
404
405 /* OK, we have a valid descriptor block which matches
406 * all of the sequence number checks. What are we going
407 * to do with it? That depends on the pass... */
408
409 switch(blocktype) {
410 case JFS_DESCRIPTOR_BLOCK:
411 /* If it is a valid descriptor block, replay it
412 * in pass REPLAY; otherwise, just skip over the
413 * blocks it describes. */
414 if (pass != PASS_REPLAY) {
415 next_log_block +=
416 count_tags(bh, journal->j_blocksize);
417 wrap(journal, next_log_block);
418 brelse(bh);
419 continue;
420 }
421
422 /* A descriptor block: we can now write all of
423 * the data blocks. Yay, useful work is finally
424 * getting done here! */
425
426 tagp = &bh->b_data[sizeof(journal_header_t)];
427 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
428 <= journal->j_blocksize) {
429 unsigned long io_block;
430
431 tag = (journal_block_tag_t *) tagp;
432 flags = be32_to_cpu(tag->t_flags);
433
434 io_block = next_log_block++;
435 wrap(journal, next_log_block);
436 err = jread(&obh, journal, io_block);
437 if (err) {
438 /* Recover what we can, but
439 * report failure at the end. */
440 success = err;
441 printk (KERN_ERR
442 "JBD: IO error %d recovering "
443 "block %ld in log\n",
444 err, io_block);
445 } else {
446 unsigned long blocknr;
447
448 J_ASSERT(obh != NULL);
449 blocknr = be32_to_cpu(tag->t_blocknr);
450
451 /* If the block has been
452 * revoked, then we're all done
453 * here. */
454 if (journal_test_revoke
455 (journal, blocknr,
456 next_commit_ID)) {
457 brelse(obh);
458 ++info->nr_revoke_hits;
459 goto skip_write;
460 }
461
462 /* Find a buffer for the new
463 * data being restored */
464 nbh = __getblk(journal->j_fs_dev,
465 blocknr,
466 journal->j_blocksize);
467 if (nbh == NULL) {
468 printk(KERN_ERR
469 "JBD: Out of memory "
470 "during recovery.\n");
471 err = -ENOMEM;
472 brelse(bh);
473 brelse(obh);
474 goto failed;
475 }
476
477 lock_buffer(nbh);
478 memcpy(nbh->b_data, obh->b_data,
479 journal->j_blocksize);
480 if (flags & JFS_FLAG_ESCAPE) {
481					*((__be32 *)nbh->b_data) =
482 cpu_to_be32(JFS_MAGIC_NUMBER);
483 }
484
485 BUFFER_TRACE(nbh, "marking dirty");
486 set_buffer_uptodate(nbh);
487 mark_buffer_dirty(nbh);
488 BUFFER_TRACE(nbh, "marking uptodate");
489 ++info->nr_replays;
490 /* ll_rw_block(WRITE, 1, &nbh); */
491 unlock_buffer(nbh);
492 brelse(obh);
493 brelse(nbh);
494 }
495
496 skip_write:
497 tagp += sizeof(journal_block_tag_t);
498 if (!(flags & JFS_FLAG_SAME_UUID))
499 tagp += 16;
500
501 if (flags & JFS_FLAG_LAST_TAG)
502 break;
503 }
504
505 brelse(bh);
506 continue;
507
508 case JFS_COMMIT_BLOCK:
509 /* Found an expected commit block: not much to
510 * do other than move on to the next sequence
511 * number. */
512 brelse(bh);
513 next_commit_ID++;
514 continue;
515
516 case JFS_REVOKE_BLOCK:
517 /* If we aren't in the REVOKE pass, then we can
518 * just skip over this block. */
519 if (pass != PASS_REVOKE) {
520 brelse(bh);
521 continue;
522 }
523
524 err = scan_revoke_records(journal, bh,
525 next_commit_ID, info);
526 brelse(bh);
527 if (err)
528 goto failed;
529 continue;
530
531 default:
532 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
533 blocktype);
534 goto done;
535 }
536 }
537
538 done:
539 /*
540 * We broke out of the log scan loop: either we came to the
541 * known end of the log or we found an unexpected block in the
542 * log. If the latter happened, then we know that the "current"
543 * transaction marks the end of the valid log.
544 */
545
546 if (pass == PASS_SCAN)
547 info->end_transaction = next_commit_ID;
548 else {
549 /* It's really bad news if different passes end up at
550 * different places (but possible due to IO errors). */
551 if (info->end_transaction != next_commit_ID) {
552 printk (KERN_ERR "JBD: recovery pass %d ended at "
553 "transaction %u, expected %u\n",
554 pass, next_commit_ID, info->end_transaction);
555 if (!success)
556 success = -EIO;
557 }
558 }
559
560 return success;
561
562 failed:
563 return err;
564}
565
566
567/* Scan a revoke record, marking all blocks mentioned as revoked. */
568
569static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
570 tid_t sequence, struct recovery_info *info)
571{
572 journal_revoke_header_t *header;
573 int offset, max;
574
575 header = (journal_revoke_header_t *) bh->b_data;
576 offset = sizeof(journal_revoke_header_t);
577 max = be32_to_cpu(header->r_count);
578
579 while (offset < max) {
580 unsigned long blocknr;
581 int err;
582
583 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
584 offset += 4;
585 err = journal_set_revoke(journal, blocknr, sequence);
586 if (err)
587 return err;
588 ++info->nr_revokes;
589 }
590 return 0;
591}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
new file mode 100644
index 000000000000..d327a598f861
--- /dev/null
+++ b/fs/jbd/revoke.c
@@ -0,0 +1,702 @@
1/*
2 * linux/fs/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * Revoke information on buffers is a tri-state value:
51 *
52 * RevokeValid clear: no cached revoke status, need to look it up
53 * RevokeValid set, Revoked clear:
54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing.
56 * RevokeValid set, Revoked set:
57 * buffer has been revoked.
58 */
59
60#ifndef __KERNEL__
61#include "jfs_user.h"
62#else
63#include <linux/time.h>
64#include <linux/fs.h>
65#include <linux/jbd.h>
66#include <linux/errno.h>
67#include <linux/slab.h>
68#include <linux/list.h>
69#include <linux/smp_lock.h>
70#include <linux/init.h>
71#endif
72
73static kmem_cache_t *revoke_record_cache;
74static kmem_cache_t *revoke_table_cache;
75
76/* Each revoke record represents one single revoked block. During
77 journal replay, this involves recording the transaction ID of the
78 last transaction to revoke this block. */
79
80struct jbd_revoke_record_s
81{
82 struct list_head hash;
83 tid_t sequence; /* Used for recovery only */
84 unsigned long blocknr;
85};
86
87
88/* The revoke table is just a simple hash table of revoke records. */
89struct jbd_revoke_table_s
90{
91 /* It is conceivable that we might want a larger hash table
92 * for recovery. Must be a power of two. */
93 int hash_size;
94 int hash_shift;
95 struct list_head *hash_table;
96};
97
98
99#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *,
102 struct jbd_revoke_record_s *);
103static void flush_descriptor(journal_t *, struct journal_head *, int);
104#endif
105
106/* Utility functions to maintain the revoke table */
107
108/* Borrowed from buffer.c: this is a tried and tested block hash function */
109static inline int hash(journal_t *journal, unsigned long block)
110{
111 struct jbd_revoke_table_s *table = journal->j_revoke;
112 int hash_shift = table->hash_shift;
113
114 return ((block << (hash_shift - 6)) ^
115 (block >> 13) ^
116 (block << (hash_shift - 12))) & (table->hash_size - 1);
117}
118
119int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
120{
121 struct list_head *hash_list;
122 struct jbd_revoke_record_s *record;
123
124repeat:
125 record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
126 if (!record)
127 goto oom;
128
129 record->sequence = seq;
130 record->blocknr = blocknr;
131 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
132 spin_lock(&journal->j_revoke_lock);
133 list_add(&record->hash, hash_list);
134 spin_unlock(&journal->j_revoke_lock);
135 return 0;
136
137oom:
138 if (!journal_oom_retry)
139 return -ENOMEM;
140 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
141 yield();
142 goto repeat;
143}
144
145/* Find a revoke record in the journal's hash table. */
146
147static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
148 unsigned long blocknr)
149{
150 struct list_head *hash_list;
151 struct jbd_revoke_record_s *record;
152
153 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
154
155 spin_lock(&journal->j_revoke_lock);
156 record = (struct jbd_revoke_record_s *) hash_list->next;
157 while (&(record->hash) != hash_list) {
158 if (record->blocknr == blocknr) {
159 spin_unlock(&journal->j_revoke_lock);
160 return record;
161 }
162 record = (struct jbd_revoke_record_s *) record->hash.next;
163 }
164 spin_unlock(&journal->j_revoke_lock);
165 return NULL;
166}
167
168int __init journal_init_revoke_caches(void)
169{
170 revoke_record_cache = kmem_cache_create("revoke_record",
171 sizeof(struct jbd_revoke_record_s),
172 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
173 if (revoke_record_cache == 0)
174 return -ENOMEM;
175
176 revoke_table_cache = kmem_cache_create("revoke_table",
177 sizeof(struct jbd_revoke_table_s),
178 0, 0, NULL, NULL);
179 if (revoke_table_cache == 0) {
180 kmem_cache_destroy(revoke_record_cache);
181 revoke_record_cache = NULL;
182 return -ENOMEM;
183 }
184 return 0;
185}
186
187void journal_destroy_revoke_caches(void)
188{
189 kmem_cache_destroy(revoke_record_cache);
190 revoke_record_cache = NULL;
191 kmem_cache_destroy(revoke_table_cache);
192 revoke_table_cache = NULL;
193}
194
195/* Initialise the revoke table for a given journal to a given size. */
196
197int journal_init_revoke(journal_t *journal, int hash_size)
198{
199 int shift, tmp;
200
201 J_ASSERT (journal->j_revoke_table[0] == NULL);
202
203 shift = 0;
204 tmp = hash_size;
205 while((tmp >>= 1UL) != 0UL)
206 shift++;
207
208 journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
209 if (!journal->j_revoke_table[0])
210 return -ENOMEM;
211 journal->j_revoke = journal->j_revoke_table[0];
212
213 /* Check that the hash_size is a power of two */
214 J_ASSERT ((hash_size & (hash_size-1)) == 0);
215
216 journal->j_revoke->hash_size = hash_size;
217
218 journal->j_revoke->hash_shift = shift;
219
220 journal->j_revoke->hash_table =
221 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
222 if (!journal->j_revoke->hash_table) {
223 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
224 journal->j_revoke = NULL;
225 return -ENOMEM;
226 }
227
228 for (tmp = 0; tmp < hash_size; tmp++)
229 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
230
231 journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
232 if (!journal->j_revoke_table[1]) {
233 kfree(journal->j_revoke_table[0]->hash_table);
234 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
235 return -ENOMEM;
236 }
237
238 journal->j_revoke = journal->j_revoke_table[1];
239
240 /* Check that the hash_size is a power of two */
241 J_ASSERT ((hash_size & (hash_size-1)) == 0);
242
243 journal->j_revoke->hash_size = hash_size;
244
245 journal->j_revoke->hash_shift = shift;
246
247 journal->j_revoke->hash_table =
248 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
249 if (!journal->j_revoke->hash_table) {
250 kfree(journal->j_revoke_table[0]->hash_table);
251 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
252 kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
253 journal->j_revoke = NULL;
254 return -ENOMEM;
255 }
256
257 for (tmp = 0; tmp < hash_size; tmp++)
258 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
259
260 spin_lock_init(&journal->j_revoke_lock);
261
262 return 0;
263}
264
265/* Destroy a journal's revoke table. The table must already be empty! */
266
267void journal_destroy_revoke(journal_t *journal)
268{
269 struct jbd_revoke_table_s *table;
270 struct list_head *hash_list;
271 int i;
272
273 table = journal->j_revoke_table[0];
274 if (!table)
275 return;
276
277 for (i=0; i<table->hash_size; i++) {
278 hash_list = &table->hash_table[i];
279 J_ASSERT (list_empty(hash_list));
280 }
281
282 kfree(table->hash_table);
283 kmem_cache_free(revoke_table_cache, table);
284 journal->j_revoke = NULL;
285
286 table = journal->j_revoke_table[1];
287 if (!table)
288 return;
289
290 for (i=0; i<table->hash_size; i++) {
291 hash_list = &table->hash_table[i];
292 J_ASSERT (list_empty(hash_list));
293 }
294
295 kfree(table->hash_table);
296 kmem_cache_free(revoke_table_cache, table);
297 journal->j_revoke = NULL;
298}
299
300
301#ifdef __KERNEL__
302
303/*
304 * journal_revoke: revoke a given buffer_head from the journal. This
305 * prevents the block from being replayed during recovery if we take a
306 * crash after this current transaction commits. Any subsequent
307 * metadata writes of the buffer in this transaction cancel the
308 * revoke.
309 *
310 * Note that this call may block --- it is up to the caller to make
311 * sure that there are no further calls to journal_write_metadata
312 * before the revoke is complete. In ext3, this implies calling the
313 * revoke before clearing the block bitmap when we are deleting
314 * metadata.
315 *
316 * Revoke performs a journal_forget on any buffer_head passed in as a
317 * parameter, but does _not_ forget the buffer_head if the bh was only
318 * found implicitly.
319 *
320 * bh_in may not be a journalled buffer - it may have come off
321 * the hash tables without an attached journal_head.
322 *
323 * If bh_in is non-zero, journal_revoke() will decrement its b_count
324 * by one.
325 */
326
327int journal_revoke(handle_t *handle, unsigned long blocknr,
328 struct buffer_head *bh_in)
329{
330 struct buffer_head *bh = NULL;
331 journal_t *journal;
332 struct block_device *bdev;
333 int err;
334
335 might_sleep();
336 if (bh_in)
337 BUFFER_TRACE(bh_in, "enter");
338
339 journal = handle->h_transaction->t_journal;
340 if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
341 J_ASSERT (!"Cannot set revoke feature!");
342 return -EINVAL;
343 }
344
345 bdev = journal->j_fs_dev;
346 bh = bh_in;
347
348 if (!bh) {
349 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
350 if (bh)
351 BUFFER_TRACE(bh, "found on hash");
352 }
353#ifdef JBD_EXPENSIVE_CHECKING
354 else {
355 struct buffer_head *bh2;
356
357 /* If there is a different buffer_head lying around in
358 * memory anywhere... */
359 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
360 if (bh2) {
361 /* ... and it has RevokeValid status... */
362 if (bh2 != bh && buffer_revokevalid(bh2))
363 /* ...then it better be revoked too,
364 * since it's illegal to create a revoke
365 * record against a buffer_head which is
366 * not marked revoked --- that would
367 * risk missing a subsequent revoke
368 * cancel. */
369 J_ASSERT_BH(bh2, buffer_revoked(bh2));
370 put_bh(bh2);
371 }
372 }
373#endif
374
375 /* We really ought not ever to revoke twice in a row without
376 first having the revoke cancelled: it's illegal to free a
377 block twice without allocating it in between! */
378 if (bh) {
379 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
380 "inconsistent data on disk")) {
381 if (!bh_in)
382 brelse(bh);
383 return -EIO;
384 }
385 set_buffer_revoked(bh);
386 set_buffer_revokevalid(bh);
387 if (bh_in) {
388 BUFFER_TRACE(bh_in, "call journal_forget");
389 journal_forget(handle, bh_in);
390 } else {
391 BUFFER_TRACE(bh, "call brelse");
392 __brelse(bh);
393 }
394 }
395
396 jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
397 err = insert_revoke_hash(journal, blocknr,
398 handle->h_transaction->t_tid);
399 BUFFER_TRACE(bh_in, "exit");
400 return err;
401}
402
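/*
 * A sketch of the expected calling pattern when a filesystem frees a
 * metadata block (the helper name is illustrative; error handling is left
 * to the caller):
 */
static int example_free_metadata_block(handle_t *handle,
				       struct buffer_head *bh,
				       unsigned long blocknr)
{
	/* Revoke before the block bitmap is updated, so that a stale copy of
	 * this block in the log can never be replayed over whatever reuses
	 * the block later.  When bh is non-NULL, journal_revoke() also does
	 * the journal_forget() for us. */
	return journal_revoke(handle, blocknr, bh);
}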
403/*
404 * Cancel an outstanding revoke. For use only internally by the
405 * journaling code (called from journal_get_write_access).
406 *
407 * We trust buffer_revoked() on the buffer if the buffer is already
408 * being journaled: if there is no revoke pending on the buffer, then we
409 * don't do anything here.
410 *
411 * This would break if it were possible for a buffer to be revoked and
412 * discarded, and then reallocated within the same transaction. In such
413 * a case we would have lost the revoked bit, but when we arrived here
414 * the second time we would still have a pending revoke to cancel. So,
415 * do not trust the Revoked bit on buffers unless RevokeValid is also
416 * set.
417 *
418 * The caller must have the journal locked.
419 */
420int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
421{
422 struct jbd_revoke_record_s *record;
423 journal_t *journal = handle->h_transaction->t_journal;
424 int need_cancel;
425 int did_revoke = 0; /* akpm: debug */
426 struct buffer_head *bh = jh2bh(jh);
427
428 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
429
430 /* Is the existing Revoke bit valid? If so, we trust it, and
431 * only perform the full cancel if the revoke bit is set. If
432 * not, we can't trust the revoke bit, and we need to do the
433 * full search for a revoke record. */
434 if (test_set_buffer_revokevalid(bh)) {
435 need_cancel = test_clear_buffer_revoked(bh);
436 } else {
437 need_cancel = 1;
438 clear_buffer_revoked(bh);
439 }
440
441 if (need_cancel) {
442 record = find_revoke_record(journal, bh->b_blocknr);
443 if (record) {
444 jbd_debug(4, "cancelled existing revoke on "
445 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
446 spin_lock(&journal->j_revoke_lock);
447 list_del(&record->hash);
448 spin_unlock(&journal->j_revoke_lock);
449 kmem_cache_free(revoke_record_cache, record);
450 did_revoke = 1;
451 }
452 }
453
454#ifdef JBD_EXPENSIVE_CHECKING
455 /* There better not be one left behind by now! */
456 record = find_revoke_record(journal, bh->b_blocknr);
457 J_ASSERT_JH(jh, record == NULL);
458#endif
459
460 /* Finally, have we just cleared revoke on an unhashed
461 * buffer_head? If so, we'd better make sure we clear the
462 * revoked status on any hashed alias too, otherwise the revoke
463 * state machine will get very upset later on. */
464 if (need_cancel) {
465 struct buffer_head *bh2;
466 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
467 if (bh2) {
468 if (bh2 != bh)
469 clear_buffer_revoked(bh2);
470 __brelse(bh2);
471 }
472 }
473 return did_revoke;
474}
475
476/* journal_switch_revoke_table: select j_revoke for the next transaction;
477 * we do not want to suspend any processing until all revokes are
478 * written. -bzzz
479 */
480void journal_switch_revoke_table(journal_t *journal)
481{
482 int i;
483
484 if (journal->j_revoke == journal->j_revoke_table[0])
485 journal->j_revoke = journal->j_revoke_table[1];
486 else
487 journal->j_revoke = journal->j_revoke_table[0];
488
489 for (i = 0; i < journal->j_revoke->hash_size; i++)
490 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
491}
492
493/*
494 * Write revoke records to the journal for all entries in the current
495 * revoke hash, deleting the entries as we go.
496 *
497 * Called with the journal lock held.
498 */
499
500void journal_write_revoke_records(journal_t *journal,
501 transaction_t *transaction)
502{
503 struct journal_head *descriptor;
504 struct jbd_revoke_record_s *record;
505 struct jbd_revoke_table_s *revoke;
506 struct list_head *hash_list;
507 int i, offset, count;
508
509 descriptor = NULL;
510 offset = 0;
511 count = 0;
512
513 /* select revoke table for committing transaction */
514 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
515 journal->j_revoke_table[1] : journal->j_revoke_table[0];
516
517 for (i = 0; i < revoke->hash_size; i++) {
518 hash_list = &revoke->hash_table[i];
519
520 while (!list_empty(hash_list)) {
521 record = (struct jbd_revoke_record_s *)
522 hash_list->next;
523 write_one_revoke_record(journal, transaction,
524 &descriptor, &offset,
525 record);
526 count++;
527 list_del(&record->hash);
528 kmem_cache_free(revoke_record_cache, record);
529 }
530 }
531 if (descriptor)
532 flush_descriptor(journal, descriptor, offset);
533 jbd_debug(1, "Wrote %d revoke records\n", count);
534}
535
536/*
537 * Write out one revoke record. We need to create a new descriptor
538 * block if the old one is full or if we have not already created one.
539 */
540
541static void write_one_revoke_record(journal_t *journal,
542 transaction_t *transaction,
543 struct journal_head **descriptorp,
544 int *offsetp,
545 struct jbd_revoke_record_s *record)
546{
547 struct journal_head *descriptor;
548 int offset;
549 journal_header_t *header;
550
551 /* If we are already aborting, this all becomes a noop. We
552 still need to go round the loop in
553 journal_write_revoke_records in order to free all of the
554 revoke records: only the IO to the journal is omitted. */
555 if (is_journal_aborted(journal))
556 return;
557
558 descriptor = *descriptorp;
559 offset = *offsetp;
560
561 /* Make sure we have a descriptor with space left for the record */
562 if (descriptor) {
563 if (offset == journal->j_blocksize) {
564 flush_descriptor(journal, descriptor, offset);
565 descriptor = NULL;
566 }
567 }
568
569 if (!descriptor) {
570 descriptor = journal_get_descriptor_buffer(journal);
571 if (!descriptor)
572 return;
573 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
574 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
575 header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
576 header->h_sequence = cpu_to_be32(transaction->t_tid);
577
578 /* Record it so that we can wait for IO completion later */
579 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
580 journal_file_buffer(descriptor, transaction, BJ_LogCtl);
581
582 offset = sizeof(journal_revoke_header_t);
583 *descriptorp = descriptor;
584 }
585
586 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
587 cpu_to_be32(record->blocknr);
588 offset += 4;
589 *offsetp = offset;
590}
591
592/*
593 * Flush a revoke descriptor out to the journal. If we are aborting,
594 * this is a noop; otherwise we are generating a buffer which needs to
595 * be waited for during commit, so it has to go onto the appropriate
596 * journal buffer list.
597 */
598
599static void flush_descriptor(journal_t *journal,
600 struct journal_head *descriptor,
601 int offset)
602{
603 journal_revoke_header_t *header;
604 struct buffer_head *bh = jh2bh(descriptor);
605
606 if (is_journal_aborted(journal)) {
607 put_bh(bh);
608 return;
609 }
610
611 header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
612 header->r_count = cpu_to_be32(offset);
613 set_buffer_jwrite(bh);
614 BUFFER_TRACE(bh, "write");
615 set_buffer_dirty(bh);
616 ll_rw_block(WRITE, 1, &bh);
617}
618#endif
619
620/*
621 * Revoke support for recovery.
622 *
623 * Recovery needs to be able to:
624 *
625 * record all revoke records, including the tid of the latest instance
626 * of each revoke in the journal
627 *
628 * check whether a given block in a given transaction should be replayed
629 * (ie. has not been revoked by a revoke record in that or a subsequent
630 * transaction)
631 *
632 * empty the revoke table after recovery.
633 */
634
635/*
636 * First, setting revoke records. We create a new revoke record for
637 * every block ever revoked in the log as we scan it for recovery, and
638 * we update the existing records if we find multiple revokes for a
639 * single block.
640 */
641
642int journal_set_revoke(journal_t *journal,
643 unsigned long blocknr,
644 tid_t sequence)
645{
646 struct jbd_revoke_record_s *record;
647
648 record = find_revoke_record(journal, blocknr);
649 if (record) {
650 /* If we have multiple occurrences, only record the
651 * latest sequence number in the hashed record */
652 if (tid_gt(sequence, record->sequence))
653 record->sequence = sequence;
654 return 0;
655 }
656 return insert_revoke_hash(journal, blocknr, sequence);
657}
658
659/*
660 * Test revoke records. For a given block referenced in the log, has
661 * that block been revoked? A revoke record with a given transaction
662 * sequence number revokes all blocks in that transaction and earlier
663 * ones, but later transactions still need to be replayed.
664 */
665
666int journal_test_revoke(journal_t *journal,
667 unsigned long blocknr,
668 tid_t sequence)
669{
670 struct jbd_revoke_record_s *record;
671
672 record = find_revoke_record(journal, blocknr);
673 if (!record)
674 return 0;
675 if (tid_gt(sequence, record->sequence))
676 return 0;
677 return 1;
678}
679
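/*
 * Worked example: if block 100 is logged in transaction 7, revoked in
 * transaction 8 and logged again in transaction 9, recovery records
 * (100, tid 8) during PASS_REVOKE.  journal_test_revoke() then returns 1
 * for the copy in transaction 7 (7 <= 8, so it is skipped) and 0 for the
 * copy in transaction 9 (9 > 8, so it is replayed), matching the rule
 * described above.
 */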
680/*
681 * Finally, once recovery is over, we need to clear the revoke table so
682 * that it can be reused by the running filesystem.
683 */
684
685void journal_clear_revoke(journal_t *journal)
686{
687 int i;
688 struct list_head *hash_list;
689 struct jbd_revoke_record_s *record;
690 struct jbd_revoke_table_s *revoke;
691
692 revoke = journal->j_revoke;
693
694 for (i = 0; i < revoke->hash_size; i++) {
695 hash_list = &revoke->hash_table[i];
696 while (!list_empty(hash_list)) {
697 record = (struct jbd_revoke_record_s*) hash_list->next;
698 list_del(&record->hash);
699 kmem_cache_free(revoke_record_cache, record);
700 }
701 }
702}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
new file mode 100644
index 000000000000..932e7c1ef4a1
--- /dev/null
+++ b/fs/jbd/transaction.c
@@ -0,0 +1,2062 @@
1/*
2 * linux/fs/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/smp_lock.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29
30/*
31 * get_transaction: obtain a new transaction_t object.
32 *
33 * Simply allocate and initialise a new transaction. Create it in
34 * RUNNING state and add it to the current journal (which should not
35 * have an existing running transaction: we only make a new transaction
36 * once we have started to commit the old one).
37 *
38 * Preconditions:
39 * The journal MUST be locked. We don't perform atomic mallocs on the
40 * new transaction and we can't block without protecting against other
41 * processes trying to touch the journal while it is in transition.
42 *
43 * Called under j_state_lock
44 */
45
46static transaction_t *
47get_transaction(journal_t *journal, transaction_t *transaction)
48{
49 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING;
51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock);
54
55 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer->expires = transaction->t_expires;
57 add_timer(journal->j_commit_timer);
58
59 J_ASSERT(journal->j_running_transaction == NULL);
60 journal->j_running_transaction = transaction;
61
62 return transaction;
63}
64
65/*
66 * Handle management.
67 *
68 * A handle_t is an object which represents a single atomic update to a
69 * filesystem, and which tracks all of the modifications which form part
70 * of that one update.
71 */
72
73/*
74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits.
78 */
79
80static int start_this_handle(journal_t *journal, handle_t *handle)
81{
82 transaction_t *transaction;
83 int needed;
84 int nblocks = handle->h_buffer_credits;
85 transaction_t *new_transaction = NULL;
86 int ret = 0;
87
88 if (nblocks > journal->j_max_transaction_buffers) {
89 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
90 current->comm, nblocks,
91 journal->j_max_transaction_buffers);
92 ret = -ENOSPC;
93 goto out;
94 }
95
96alloc_transaction:
97 if (!journal->j_running_transaction) {
98 new_transaction = jbd_kmalloc(sizeof(*new_transaction),
99 GFP_NOFS);
100 if (!new_transaction) {
101 ret = -ENOMEM;
102 goto out;
103 }
104 memset(new_transaction, 0, sizeof(*new_transaction));
105 }
106
107 jbd_debug(3, "New handle %p going live.\n", handle);
108
109repeat:
110
111 /*
112 * We need to hold j_state_lock until t_updates has been incremented,
113 * for proper journal barrier handling
114 */
115 spin_lock(&journal->j_state_lock);
116repeat_locked:
117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS;
121 goto out;
122 }
123
124 /* Wait on the journal's transaction barrier if necessary */
125 if (journal->j_barrier_count) {
126 spin_unlock(&journal->j_state_lock);
127 wait_event(journal->j_wait_transaction_locked,
128 journal->j_barrier_count == 0);
129 goto repeat;
130 }
131
132 if (!journal->j_running_transaction) {
133 if (!new_transaction) {
134 spin_unlock(&journal->j_state_lock);
135 goto alloc_transaction;
136 }
137 get_transaction(journal, new_transaction);
138 new_transaction = NULL;
139 }
140
141 transaction = journal->j_running_transaction;
142
143 /*
144 * If the current transaction is locked down for commit, wait for the
145 * lock to be released.
146 */
147 if (transaction->t_state == T_LOCKED) {
148 DEFINE_WAIT(wait);
149
150 prepare_to_wait(&journal->j_wait_transaction_locked,
151 &wait, TASK_UNINTERRUPTIBLE);
152 spin_unlock(&journal->j_state_lock);
153 schedule();
154 finish_wait(&journal->j_wait_transaction_locked, &wait);
155 goto repeat;
156 }
157
158 /*
159 * If there is not enough space left in the log to write all potential
160 * buffers requested by this operation, we need to stall pending a log
161 * checkpoint to free some more log space.
162 */
163 spin_lock(&transaction->t_handle_lock);
164 needed = transaction->t_outstanding_credits + nblocks;
165
166 if (needed > journal->j_max_transaction_buffers) {
167 /*
168 * If the current transaction is already too large, then start
169 * to commit it: we can then go back and attach this handle to
170 * a new transaction.
171 */
172 DEFINE_WAIT(wait);
173
174 jbd_debug(2, "Handle %p starting new commit...\n", handle);
175 spin_unlock(&transaction->t_handle_lock);
176 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
177 TASK_UNINTERRUPTIBLE);
178 __log_start_commit(journal, transaction->t_tid);
179 spin_unlock(&journal->j_state_lock);
180 schedule();
181 finish_wait(&journal->j_wait_transaction_locked, &wait);
182 goto repeat;
183 }
184
185 /*
186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also
189 * associated with a committing transaction creates a deadlock,
190 * so commit simply cannot force through checkpoints.
191 *
192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction.
195 *
196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for
198 * those buffers when checkpointing.
199 */
200
201 /*
202 * @@@ AKPM: This seems rather over-defensive. We're giving commit
203 * a _lot_ of headroom: 1/4 of the journal plus the size of
204 * the committing transaction. Really, we only need to give it
205 * committing_transaction->t_outstanding_credits plus "enough" for
206 * the log control blocks.
207 * Also, this test is inconsistent with the matching one in
208 * journal_extend().
209 */
210 if (__log_space_left(journal) < jbd_space_needed(journal)) {
211 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
212 spin_unlock(&transaction->t_handle_lock);
213 __log_wait_for_space(journal);
214 goto repeat_locked;
215 }
216
217 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */
219
220 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++;
223 transaction->t_handle_count++;
224 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
225 handle, nblocks, transaction->t_outstanding_credits,
226 __log_space_left(journal));
227 spin_unlock(&transaction->t_handle_lock);
228 spin_unlock(&journal->j_state_lock);
229out:
230 if (new_transaction)
231 kfree(new_transaction);
232 return ret;
233}
234
235/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks)
237{
238 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
239 if (!handle)
240 return NULL;
241 memset(handle, 0, sizeof(*handle));
242 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1;
244
245 return handle;
246}
247
248/**
249 * handle_t *journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffers we might modify
252 *
253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee
255 * that much space.
256 *
257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked.
259 *
260 * Return a pointer to a newly allocated handle, or an ERR_PTR() value on failure
261 */
262handle_t *journal_start(journal_t *journal, int nblocks)
263{
264 handle_t *handle = journal_current_handle();
265 int err;
266
267 if (!journal)
268 return ERR_PTR(-EROFS);
269
270 if (handle) {
271 J_ASSERT(handle->h_transaction->t_journal == journal);
272 handle->h_ref++;
273 return handle;
274 }
275
276 handle = new_handle(nblocks);
277 if (!handle)
278 return ERR_PTR(-ENOMEM);
279
280 current->journal_info = handle;
281
282 err = start_this_handle(journal, handle);
283 if (err < 0) {
284 jbd_free_handle(handle);
285 current->journal_info = NULL;
286 handle = ERR_PTR(err);
287 }
288 return handle;
289}
290
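/*
 * A minimal sketch of the normal handle lifecycle, assuming the usual jbd
 * entry points (journal_get_write_access(), journal_dirty_metadata() and
 * journal_stop() appear later in this file); error handling is reduced to
 * early returns and the helper name is illustrative.
 */
static int example_update_one_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	handle = journal_start(journal, 1);	/* reserve one buffer credit */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = journal_dirty_metadata(handle, bh);
	}

	journal_stop(handle);			/* drop our reference to the handle */
	return err;
}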
291/**
292 * int journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by.
295 *
296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modifications in advance, but can
299 * extend its credit if it needs more.
300 *
301 * journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee the allocation; this is best-effort only.
303 * The calling process MUST be able to deal cleanly with a failure to
304 * extend here.
305 *
306 * Return 0 on success, non-zero on failure.
307 *
308 * return code < 0 implies an error
309 * return code > 0 implies normal transaction-full status.
310 */
311int journal_extend(handle_t *handle, int nblocks)
312{
313 transaction_t *transaction = handle->h_transaction;
314 journal_t *journal = transaction->t_journal;
315 int result;
316 int wanted;
317
318 result = -EIO;
319 if (is_handle_aborted(handle))
320 goto out;
321
322 result = 1;
323
324 spin_lock(&journal->j_state_lock);
325
326 /* Don't extend a locked-down transaction! */
327 if (handle->h_transaction->t_state != T_RUNNING) {
328 jbd_debug(3, "denied handle %p %d blocks: "
329 "transaction not running\n", handle, nblocks);
330 goto error_out;
331 }
332
333 spin_lock(&transaction->t_handle_lock);
334 wanted = transaction->t_outstanding_credits + nblocks;
335
336 if (wanted > journal->j_max_transaction_buffers) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction too large\n", handle, nblocks);
339 goto unlock;
340 }
341
342 if (wanted > __log_space_left(journal)) {
343 jbd_debug(3, "denied handle %p %d blocks: "
344 "insufficient log space\n", handle, nblocks);
345 goto unlock;
346 }
347
348 handle->h_buffer_credits += nblocks;
349 transaction->t_outstanding_credits += nblocks;
350 result = 0;
351
352 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
353unlock:
354 spin_unlock(&transaction->t_handle_lock);
355error_out:
356 spin_unlock(&journal->j_state_lock);
357out:
358 return result;
359}
360
361
362/**
363 * int journal_restart() - restart a handle .
364 * @handle: handle to restart
365 * @nblocks: nr credits requested
366 *
367 * Restart a handle for a multi-transaction filesystem
368 * operation.
369 *
370 * If the journal_extend() call above fails to grant new buffer credits
371 * to a running handle, a call to journal_restart will commit the
372 * handle's transaction so far and reattach the handle to a new
373 * transaction capable of guaranteeing the requested number of
374 * credits.
375 */
376
377int journal_restart(handle_t *handle, int nblocks)
378{
379 transaction_t *transaction = handle->h_transaction;
380 journal_t *journal = transaction->t_journal;
381 int ret;
382
383 /* If we've had an abort of any type, don't even think about
384 * actually doing the restart! */
385 if (is_handle_aborted(handle))
386 return 0;
387
388 /*
389 * First unlink the handle from its current transaction, and start the
390 * commit on that.
391 */
392 J_ASSERT(transaction->t_updates > 0);
393 J_ASSERT(journal_current_handle() == handle);
394
395 spin_lock(&journal->j_state_lock);
396 spin_lock(&transaction->t_handle_lock);
397 transaction->t_outstanding_credits -= handle->h_buffer_credits;
398 transaction->t_updates--;
399
400 if (!transaction->t_updates)
401 wake_up(&journal->j_wait_updates);
402 spin_unlock(&transaction->t_handle_lock);
403
404 jbd_debug(2, "restarting handle %p\n", handle);
405 __log_start_commit(journal, transaction->t_tid);
406 spin_unlock(&journal->j_state_lock);
407
408 handle->h_buffer_credits = nblocks;
409 ret = start_this_handle(journal, handle);
410 return ret;
411}
412
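/*
 * A sketch of the extend-or-restart idiom the two calls above are meant
 * for: try to stretch the current handle, and if the transaction is too
 * full, commit the work done so far and carry on in a fresh transaction.
 * The helper name is illustrative.
 */
static int example_get_more_credits(handle_t *handle, int nblocks)
{
	int err = journal_extend(handle, nblocks);

	if (err > 0)				/* transaction full: restart */
		err = journal_restart(handle, nblocks);
	return err;				/* 0 on success, < 0 on error */
}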
413
414/**
415 * void journal_lock_updates () - establish a transaction barrier.
416 * @journal: Journal to establish a barrier on.
417 *
418 * This locks out any further updates from being started, and blocks
419 * until all existing updates have completed, returning only once the
420 * journal is in a quiescent state with no updates running.
421 *
422 * The journal lock should not be held on entry.
423 */
424void journal_lock_updates(journal_t *journal)
425{
426 DEFINE_WAIT(wait);
427
428 spin_lock(&journal->j_state_lock);
429 ++journal->j_barrier_count;
430
431 /* Wait until there are no running updates */
432 while (1) {
433 transaction_t *transaction = journal->j_running_transaction;
434
435 if (!transaction)
436 break;
437
438 spin_lock(&transaction->t_handle_lock);
439 if (!transaction->t_updates) {
440 spin_unlock(&transaction->t_handle_lock);
441 break;
442 }
443 prepare_to_wait(&journal->j_wait_updates, &wait,
444 TASK_UNINTERRUPTIBLE);
445 spin_unlock(&transaction->t_handle_lock);
446 spin_unlock(&journal->j_state_lock);
447 schedule();
448 finish_wait(&journal->j_wait_updates, &wait);
449 spin_lock(&journal->j_state_lock);
450 }
451 spin_unlock(&journal->j_state_lock);
452
453 /*
454 * We have now established a barrier against other normal updates, but
455 * we also need to barrier against other journal_lock_updates() calls
456 * to make sure that we serialise special journal-locked operations
457 * too.
458 */
459 down(&journal->j_barrier);
460}
461
462/**
463 * void journal_unlock_updates (journal_t* journal) - release barrier
464 * @journal: Journal to release the barrier on.
465 *
466 * Release a transaction barrier obtained with journal_lock_updates().
467 *
468 * Should be called without the journal lock held.
469 */
470void journal_unlock_updates (journal_t *journal)
471{
472 J_ASSERT(journal->j_barrier_count != 0);
473
474 up(&journal->j_barrier);
475 spin_lock(&journal->j_state_lock);
476 --journal->j_barrier_count;
477 spin_unlock(&journal->j_state_lock);
478 wake_up(&journal->j_wait_transaction_locked);
479}
480
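/*
 * A sketch of how the barrier pair above is used: quiesce all running
 * handles, perform an operation that must not race with updates, then let
 * blocked journal_start() callers proceed.  The operation itself is
 * illustrative.
 */
static void example_journal_quiesced_op(journal_t *journal)
{
	journal_lock_updates(journal);		/* waits for t_updates to drain */

	/* ... special journal-locked operation goes here ... */

	journal_unlock_updates(journal);	/* wakes j_wait_transaction_locked */
}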
481/*
482 * Report any unexpected dirty buffers which turn up. Normally those
483 * indicate an error, but they can occur if the user is running (say)
484 * tune2fs to modify the live filesystem, so we need the option of
485 * continuing as gracefully as possible.
486 *
487 * The caller should already hold the journal lock and
488 * j_list_lock spinlock: most callers will need those anyway
489 * in order to probe the buffer's journaling state safely.
490 */
491static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
492{
493 struct buffer_head *bh = jh2bh(jh);
494 int jlist;
495
496 if (buffer_dirty(bh)) {
497 /* If this buffer is one which might reasonably be dirty
498 * --- ie. data, or not part of this journal --- then
499 * we're OK to leave it alone, but otherwise we need to
500 * move the dirty bit to the journal's own internal
501 * JBDDirty bit. */
502 jlist = jh->b_jlist;
503
504 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
505 jlist == BJ_Shadow || jlist == BJ_Forget) {
506 if (test_clear_buffer_dirty(jh2bh(jh))) {
507 set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
508 }
509 }
510 }
511}
512
513/*
514 * If the buffer is already part of the current transaction, then there
515 * is nothing we need to do. If it is already part of a prior
516 * transaction which we are still committing to disk, then we need to
517 * make sure that we do not overwrite the old copy: we do copy-out to
518 * preserve the copy going to disk. We also account the buffer against
519 * the handle's metadata buffer credits (unless the buffer is already
520 * part of the transaction, that is).
521 *
522 */
523static int
524do_get_write_access(handle_t *handle, struct journal_head *jh,
525 int force_copy)
526{
527 struct buffer_head *bh;
528 transaction_t *transaction;
529 journal_t *journal;
530 int error;
531 char *frozen_buffer = NULL;
532 int need_copy = 0;
533
534 if (is_handle_aborted(handle))
535 return -EROFS;
536
537 transaction = handle->h_transaction;
538 journal = transaction->t_journal;
539
540 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
541
542 JBUFFER_TRACE(jh, "entry");
543repeat:
544 bh = jh2bh(jh);
545
546 /* @@@ Need to check for errors here at some point. */
547
548 lock_buffer(bh);
549 jbd_lock_bh_state(bh);
550
551 /* We now hold the buffer lock so it is safe to query the buffer
552 * state. Is the buffer dirty?
553 *
554 * If so, there are two possibilities. The buffer may be
555 * non-journaled, and undergoing a quite legitimate writeback.
556 * Otherwise, it is journaled, and we don't expect dirty buffers
557 * in that state (the buffers should be marked JBDDirty
558 * instead.) So either the IO is being done under our own
559 * control and this is a bug, or it's a third party IO such as
560 * dump(8) (which may leave the buffer scheduled for read ---
561 * ie. locked but not dirty) or tune2fs (which may actually have
562 * the buffer dirtied, ugh.) */
563
564 if (buffer_dirty(bh)) {
565 /*
566 * First question: is this buffer already part of the current
567 * transaction or the existing committing transaction?
568 */
569 if (jh->b_transaction) {
570 J_ASSERT_JH(jh,
571 jh->b_transaction == transaction ||
572 jh->b_transaction ==
573 journal->j_committing_transaction);
574 if (jh->b_next_transaction)
575 J_ASSERT_JH(jh, jh->b_next_transaction ==
576 transaction);
577 JBUFFER_TRACE(jh, "Unexpected dirty buffer");
578 jbd_unexpected_dirty_buffer(jh);
579 }
580 }
581
582 unlock_buffer(bh);
583
584 error = -EROFS;
585 if (is_handle_aborted(handle)) {
586 jbd_unlock_bh_state(bh);
587 goto out;
588 }
589 error = 0;
590
591 /*
592 * The buffer is already part of this transaction if b_transaction or
593 * b_next_transaction points to it
594 */
595 if (jh->b_transaction == transaction ||
596 jh->b_next_transaction == transaction)
597 goto done;
598
599 /*
600 * If there is already a copy-out version of this buffer, then we don't
601 * need to make another one
602 */
603 if (jh->b_frozen_data) {
604 JBUFFER_TRACE(jh, "has frozen data");
605 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
606 jh->b_next_transaction = transaction;
607 goto done;
608 }
609
610 /* Is there data here we need to preserve? */
611
612 if (jh->b_transaction && jh->b_transaction != transaction) {
613 JBUFFER_TRACE(jh, "owned by older transaction");
614 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
615 J_ASSERT_JH(jh, jh->b_transaction ==
616 journal->j_committing_transaction);
617
618 /* There is one case we have to be very careful about.
619 * If the committing transaction is currently writing
620 * this buffer out to disk and has NOT made a copy-out,
621 * then we cannot modify the buffer contents at all
622 * right now. The essence of copy-out is that it is the
623 * extra copy, not the primary copy, which gets
624 * journaled. If the primary copy is already going to
625 * disk then we cannot do copy-out here. */
626
627 if (jh->b_jlist == BJ_Shadow) {
628 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
629 wait_queue_head_t *wqh;
630
631 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
632
633 JBUFFER_TRACE(jh, "on shadow: sleep");
634 jbd_unlock_bh_state(bh);
635 /* commit wakes up all shadow buffers after IO */
636 for ( ; ; ) {
637 prepare_to_wait(wqh, &wait.wait,
638 TASK_UNINTERRUPTIBLE);
639 if (jh->b_jlist != BJ_Shadow)
640 break;
641 schedule();
642 }
643 finish_wait(wqh, &wait.wait);
644 goto repeat;
645 }
646
647 /* Only do the copy if the currently-owning transaction
648 * still needs it. If it is on the Forget list, the
649 * committing transaction is past that stage. The
650 * buffer had better remain locked during the kmalloc,
651 * but that should be true --- we hold the journal lock
652 * still and the buffer is already on the BUF_JOURNAL
653 * list so won't be flushed.
654 *
655 * Subtle point, though: if this is a get_undo_access,
656 * then we will be relying on the frozen_data to contain
657 * the new value of the committed_data record after the
658 * transaction, so we HAVE to force the frozen_data copy
659 * in that case. */
660
661 if (jh->b_jlist != BJ_Forget || force_copy) {
662 JBUFFER_TRACE(jh, "generate frozen data");
663 if (!frozen_buffer) {
664 JBUFFER_TRACE(jh, "allocate memory for buffer");
665 jbd_unlock_bh_state(bh);
666 frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
667 GFP_NOFS);
668 if (!frozen_buffer) {
669 printk(KERN_EMERG
670 "%s: OOM for frozen_buffer\n",
671 __FUNCTION__);
672 JBUFFER_TRACE(jh, "oom!");
673 error = -ENOMEM;
674 jbd_lock_bh_state(bh);
675 goto done;
676 }
677 goto repeat;
678 }
679 jh->b_frozen_data = frozen_buffer;
680 frozen_buffer = NULL;
681 need_copy = 1;
682 }
683 jh->b_next_transaction = transaction;
684 }
685
686
687 /*
688 * Finally, if the buffer is not journaled right now, we need to make
689 * sure it doesn't get written to disk before the caller actually
690 * commits the new data
691 */
692 if (!jh->b_transaction) {
693 JBUFFER_TRACE(jh, "no transaction");
694 J_ASSERT_JH(jh, !jh->b_next_transaction);
695 jh->b_transaction = transaction;
696 JBUFFER_TRACE(jh, "file as BJ_Reserved");
697 spin_lock(&journal->j_list_lock);
698 __journal_file_buffer(jh, transaction, BJ_Reserved);
699 spin_unlock(&journal->j_list_lock);
700 }
701
702done:
703 if (need_copy) {
704 struct page *page;
705 int offset;
706 char *source;
707
708 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
709 "Possible IO failure.\n");
710 page = jh2bh(jh)->b_page;
711 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
712 source = kmap_atomic(page, KM_USER0);
713 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
714 kunmap_atomic(source, KM_USER0);
715 }
716 jbd_unlock_bh_state(bh);
717
718 /*
719 * If we are about to journal a buffer, then any revoke pending on it is
720 * no longer valid
721 */
722 journal_cancel_revoke(handle, jh);
723
724out:
725 if (frozen_buffer)
726 kfree(frozen_buffer);
727
728 JBUFFER_TRACE(jh, "exit");
729 return error;
730}
731
732/**
733 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
734 * @handle: transaction to add buffer modifications to
735 * @bh: bh to be used for metadata writes
736 *
737 *
738 * Returns an error code or 0 on success.
739 *
740 * In full data journalling mode the buffer may be of type BJ_AsyncData,
741 * because we're write()ing a buffer which is also part of a shared mapping.
742 */
743
744int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
745{
746 struct journal_head *jh = journal_add_journal_head(bh);
747 int rc;
748
749 /* We do not want to get caught playing with fields which the
750 * log thread also manipulates. Make sure that the buffer
751 * completes any outstanding IO before proceeding. */
752 rc = do_get_write_access(handle, jh, 0);
753 journal_put_journal_head(jh);
754 return rc;
755}
756
757
758/*
759 * When the user wants to journal a newly created buffer_head
760 * (ie. getblk() returned a new buffer and we are going to populate it
761 * manually rather than reading off disk), then we need to keep the
762 * buffer_head locked until it has been completely filled with new
763 * data. In this case, we should be able to make the assertion that
764 * the bh is not already part of an existing transaction.
765 *
766 * The buffer should already be locked by the caller by this point.
767 * There is no lock ranking violation: it was a newly created,
768 * unlocked buffer beforehand. */
769
770/**
771 * int journal_get_create_access() - notify intent to use newly created bh
772 * @handle: transaction to add the new buffer to
773 * @bh: new buffer.
774 *
775 * Call this if you create a new bh.
776 */
777int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
778{
779 transaction_t *transaction = handle->h_transaction;
780 journal_t *journal = transaction->t_journal;
781 struct journal_head *jh = journal_add_journal_head(bh);
782 int err;
783
784 jbd_debug(5, "journal_head %p\n", jh);
785 err = -EROFS;
786 if (is_handle_aborted(handle))
787 goto out;
788 err = 0;
789
790 JBUFFER_TRACE(jh, "entry");
791 /*
792 * The buffer may already belong to this transaction due to pre-zeroing
793 * in the filesystem's new_block code. It may also be on the previous,
794 * committing transaction's lists, but it HAS to be in Forget state in
795 * that case: the transaction must have deleted the buffer for it to be
796 * reused here.
797 */
798 jbd_lock_bh_state(bh);
799 spin_lock(&journal->j_list_lock);
800 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
801 jh->b_transaction == NULL ||
802 (jh->b_transaction == journal->j_committing_transaction &&
803 jh->b_jlist == BJ_Forget)));
804
805 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
806 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
807
808 if (jh->b_transaction == NULL) {
809 jh->b_transaction = transaction;
810 JBUFFER_TRACE(jh, "file as BJ_Reserved");
811 __journal_file_buffer(jh, transaction, BJ_Reserved);
812 } else if (jh->b_transaction == journal->j_committing_transaction) {
813 JBUFFER_TRACE(jh, "set next transaction");
814 jh->b_next_transaction = transaction;
815 }
816 spin_unlock(&journal->j_list_lock);
817 jbd_unlock_bh_state(bh);
818
819 /*
820 * akpm: I added this. ext3_alloc_branch can pick up new indirect
821 * blocks which contain freed but then revoked metadata. We need
822 * to cancel the revoke in case we end up freeing it yet again
823 * and then reallocating it as data - this would cause a second revoke,
824 * which hits an assertion error.
825 */
826 JBUFFER_TRACE(jh, "cancelling revoke");
827 journal_cancel_revoke(handle, jh);
828 journal_put_journal_head(jh);
829out:
830 return err;
831}
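/*
 * Illustrative sketch (editor's addition): how a freshly allocated,
 * never-read block is typically initialised under a handle, honouring
 * the locking rule described above.  The caller is assumed to have
 * obtained @bh from getblk() for a block it has just allocated;
 * example_init_new_block() is a hypothetical helper.
 */
static int example_init_new_block(handle_t *handle, struct buffer_head *bh)
{
	int err;

	lock_buffer(bh);
	err = journal_get_create_access(handle, bh);
	if (!err) {
		/* Fill the block in memory; it is never read from disk. */
		memset(bh->b_data, 0, bh->b_size);
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	if (!err)
		err = journal_dirty_metadata(handle, bh);
	return err;
}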
832
833/**
834 * int journal_get_undo_access() - Notify intent to modify metadata with
835 * non-rewindable consequences
836 * @handle: transaction
837 * @bh: buffer to undo
839 *
840 * Sometimes there is a need to distinguish between metadata which has
841 * been committed to disk and that which has not. The ext3fs code uses
842 * this for freeing and allocating space, we have to make sure that we
843 * do not reuse freed space until the deallocation has been committed,
844 * since if we overwrote that space we would make the delete
845 * un-rewindable in case of a crash.
846 *
847 * To deal with that, journal_get_undo_access requests write access to a
848 * buffer for parts of non-rewindable operations such as delete
849 * operations on the bitmaps. The journaling code must keep a copy of
850 * the buffer's contents prior to the undo_access call until such time
851 * as we know that the buffer has definitely been committed to disk.
852 *
853 * We never need to know which transaction the committed data is part
854 * of, buffers touched here are guaranteed to be dirtied later and so
855 * will be committed to a new transaction in due course, at which point
856 * we can discard the old committed data pointer.
857 *
858 * Returns error number or 0 on success.
859 */
860int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
861{
862 int err;
863 struct journal_head *jh = journal_add_journal_head(bh);
864 char *committed_data = NULL;
865
866 JBUFFER_TRACE(jh, "entry");
867
868 /*
869 * Do this first --- it can drop the journal lock, so we want to
870 * make sure that obtaining the committed_data is done
871 * atomically wrt. completion of any outstanding commits.
872 */
873 err = do_get_write_access(handle, jh, 1);
874 if (err)
875 goto out;
876
877repeat:
878 if (!jh->b_committed_data) {
879 committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS);
880 if (!committed_data) {
881 printk(KERN_EMERG "%s: No memory for committed data\n",
882 __FUNCTION__);
883 err = -ENOMEM;
884 goto out;
885 }
886 }
887
888 jbd_lock_bh_state(bh);
889 if (!jh->b_committed_data) {
890 /* Copy out the current buffer contents into the
891 * preserved, committed copy. */
892 JBUFFER_TRACE(jh, "generate b_committed data");
893 if (!committed_data) {
894 jbd_unlock_bh_state(bh);
895 goto repeat;
896 }
897
898 jh->b_committed_data = committed_data;
899 committed_data = NULL;
900 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
901 }
902 jbd_unlock_bh_state(bh);
903out:
904 journal_put_journal_head(jh);
905 if (committed_data)
906 kfree(committed_data);
907 return err;
908}
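/*
 * Illustrative sketch (editor's addition): the undo-access pattern used
 * when clearing bits in an allocation bitmap, so that the committed copy
 * of the bitmap is preserved until the deallocation itself commits.
 * example_free_bit_in_bitmap() and its arguments are hypothetical.
 */
static int example_free_bit_in_bitmap(handle_t *handle,
				      struct buffer_head *bitmap_bh, int bit)
{
	int err;

	/* Snapshot the committed bitmap image before modifying it. */
	err = journal_get_undo_access(handle, bitmap_bh);
	if (err)
		return err;

	clear_bit(bit, (unsigned long *)bitmap_bh->b_data);
	return journal_dirty_metadata(handle, bitmap_bh);
}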
909
910/**
911 * int journal_dirty_data() - mark a buffer as containing dirty data which
912 * needs to be flushed before we can commit the
913 * current transaction.
914 * @handle: transaction
915 * @bh: bufferhead to mark
916 *
917 * The buffer is placed on the transaction's data list and is marked as
918 * belonging to the transaction.
919 *
920 * Returns error number or 0 on success.
921 *
922 * journal_dirty_data() can be called via page_launder->ext3_writepage
923 * by kswapd.
924 */
925int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
926{
927 journal_t *journal = handle->h_transaction->t_journal;
928 int need_brelse = 0;
929 struct journal_head *jh;
930
931 if (is_handle_aborted(handle))
932 return 0;
933
934 jh = journal_add_journal_head(bh);
935 JBUFFER_TRACE(jh, "entry");
936
937 /*
938 * The buffer could *already* be dirty. Writeout can start
939 * at any time.
940 */
941 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
942
943 /*
944 * What if the buffer is already part of a running transaction?
945 *
946 * There are two cases:
947 * 1) It is part of the current running transaction. Refile it,
948 * just in case we have allocated it as metadata, deallocated
949 * it, then reallocated it as data.
950 * 2) It is part of the previous, still-committing transaction.
951 * If all we want to do is to guarantee that the buffer will be
952 * written to disk before this new transaction commits, then
953 * being sure that the *previous* transaction has this same
954 * property is sufficient for us! Just leave it on its old
955 * transaction.
956 *
957 * In case (2), the buffer must not already exist as metadata
958 * --- that would violate write ordering (a transaction is free
959 * to write its data at any point, even before the previous
960 * committing transaction has committed). The caller must
961 * never, ever allow this to happen: there's nothing we can do
962 * about it in this layer.
963 */
964 jbd_lock_bh_state(bh);
965 spin_lock(&journal->j_list_lock);
966 if (jh->b_transaction) {
967 JBUFFER_TRACE(jh, "has transaction");
968 if (jh->b_transaction != handle->h_transaction) {
969 JBUFFER_TRACE(jh, "belongs to older transaction");
970 J_ASSERT_JH(jh, jh->b_transaction ==
971 journal->j_committing_transaction);
972
973 /* @@@ IS THIS TRUE ? */
974 /*
975 * Not any more. Scenario: someone does a write()
976 * in data=journal mode. The buffer's transaction has
977 * moved into commit. Then someone does another
978 * write() to the file. We do the frozen data copyout
979 * and set b_next_transaction to point to j_running_t.
980 * And while we're in that state, someone does a
981 * writepage() in an attempt to pageout the same area
982 * of the file via a shared mapping. At present that
983 * calls journal_dirty_data(), and we get right here.
984 * It may be too late to journal the data. Simply
985 * falling through to the next test will suffice: the
986 * data will be dirty and will be checkpointed. The
987 * ordering comments in the next comment block still
988 * apply.
989 */
990 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
991
992 /*
993 * If we're journalling data, and this buffer was
994 * subject to a write(), it could be on the metadata, forget
995 * or shadow list of the committing transaction. Now,
996 * someone has dirtied the same darn page via a mapping
997 * and it is being writepage()'d.
998 * We *could* just steal the page from commit, with some
999 * fancy locking there. Instead, we just skip it -
1000 * don't tie the page's buffers to the new transaction
1001 * at all.
1002 * Implication: if we crash before the writepage() data
1003 * is written into the filesystem, recovery will replay
1004 * the write() data.
1005 */
1006 if (jh->b_jlist != BJ_None &&
1007 jh->b_jlist != BJ_SyncData &&
1008 jh->b_jlist != BJ_Locked) {
1009 JBUFFER_TRACE(jh, "Not stealing");
1010 goto no_journal;
1011 }
1012
1013 /*
1014 * This buffer may be undergoing writeout in commit. We
1015 * can't return from here and let the caller dirty it
1016 * again because that can cause the write-out loop in
1017 * commit to never terminate.
1018 */
1019 if (buffer_dirty(bh)) {
1020 get_bh(bh);
1021 spin_unlock(&journal->j_list_lock);
1022 jbd_unlock_bh_state(bh);
1023 need_brelse = 1;
1024 sync_dirty_buffer(bh);
1025 jbd_lock_bh_state(bh);
1026 spin_lock(&journal->j_list_lock);
1027 /* The buffer may become locked again at any
1028 time if it is redirtied */
1029 }
1030
1031 /* journal_clean_data_list() may have got there first */
1032 if (jh->b_transaction != NULL) {
1033 JBUFFER_TRACE(jh, "unfile from commit");
1034 __journal_temp_unlink_buffer(jh);
1035 /* It still points to the committing
1036 * transaction; move it to this one so
1037 * that the refile assert checks are
1038 * happy. */
1039 jh->b_transaction = handle->h_transaction;
1040 }
1041 /* The buffer will be refiled below */
1042
1043 }
1044 /*
1045 * Special case --- the buffer might actually have been
1046 * allocated and then immediately deallocated in the previous,
1047 * committing transaction, so might still be left on that
1048 * transaction's metadata lists.
1049 */
1050 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1051 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1052 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1053 __journal_temp_unlink_buffer(jh);
1054 jh->b_transaction = handle->h_transaction;
1055 JBUFFER_TRACE(jh, "file as data");
1056 __journal_file_buffer(jh, handle->h_transaction,
1057 BJ_SyncData);
1058 }
1059 } else {
1060 JBUFFER_TRACE(jh, "not on a transaction");
1061 __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1062 }
1063no_journal:
1064 spin_unlock(&journal->j_list_lock);
1065 jbd_unlock_bh_state(bh);
1066 if (need_brelse) {
1067 BUFFER_TRACE(bh, "brelse");
1068 __brelse(bh);
1069 }
1070 JBUFFER_TRACE(jh, "exit");
1071 journal_put_journal_head(jh);
1072 return 0;
1073}
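/*
 * Illustrative sketch (editor's addition): in ordered data mode a client
 * filesystem ties newly written data buffers to the running transaction
 * with journal_dirty_data() and then dirties them as usual, so they are
 * flushed before the transaction commits.  example_order_data_buffer()
 * is a hypothetical helper.
 */
static int example_order_data_buffer(handle_t *handle, struct buffer_head *bh)
{
	/* File the buffer on the transaction's sync-data list ... */
	int err = journal_dirty_data(handle, bh);

	/* ... then mark it dirty so normal writeback can also find it. */
	if (!err)
		mark_buffer_dirty(bh);
	return err;
}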
1074
1075/**
1076 * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
1077 * @handle: transaction to add buffer to.
1078 * @bh: buffer to mark
1079 *
1080 * mark dirty metadata which needs to be journaled as part of the current
1081 * transaction.
1082 *
1083 * The buffer is placed on the transaction's metadata list and is marked
1084 * as belonging to the transaction.
1085 *
1086 * Returns error number or 0 on success.
1087 *
1088 * Special care needs to be taken if the buffer already belongs to the
1089 * current committing transaction (in which case we should have frozen
1090 * data present for that commit). In that case, we don't relink the
1091 * buffer: that only gets done when the old transaction finally
1092 * completes its commit.
1093 */
1094int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1095{
1096 transaction_t *transaction = handle->h_transaction;
1097 journal_t *journal = transaction->t_journal;
1098 struct journal_head *jh = bh2jh(bh);
1099
1100 jbd_debug(5, "journal_head %p\n", jh);
1101 JBUFFER_TRACE(jh, "entry");
1102 if (is_handle_aborted(handle))
1103 goto out;
1104
1105 jbd_lock_bh_state(bh);
1106
1107 if (jh->b_modified == 0) {
1108 /*
1109 * This buffer has been modified and is becoming part
1110 * of the transaction. This needs to be done
1111 * once per transaction -bzzz
1112 */
1113 jh->b_modified = 1;
1114 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1115 handle->h_buffer_credits--;
1116 }
1117
1118 /*
1119 * fastpath, to avoid expensive locking. If this buffer is already
1120 * on the running transaction's metadata list there is nothing to do.
1121 * Nobody can take it off again because there is a handle open.
1122 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1123 * result in this test being false, so we go in and take the locks.
1124 */
1125 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1126 JBUFFER_TRACE(jh, "fastpath");
1127 J_ASSERT_JH(jh, jh->b_transaction ==
1128 journal->j_running_transaction);
1129 goto out_unlock_bh;
1130 }
1131
1132 set_buffer_jbddirty(bh);
1133
1134 /*
1135 * Metadata already on the current transaction list doesn't
1136 * need to be filed. Metadata on another transaction's list must
1137 * be committing, and will be refiled once the commit completes:
1138 * leave it alone for now.
1139 */
1140 if (jh->b_transaction != transaction) {
1141 JBUFFER_TRACE(jh, "already on other transaction");
1142 J_ASSERT_JH(jh, jh->b_transaction ==
1143 journal->j_committing_transaction);
1144 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1145 /* And this case is illegal: we can't reuse another
1146 * transaction's data buffer, ever. */
1147 goto out_unlock_bh;
1148 }
1149
1150 /* That test should have eliminated the following case: */
1151 J_ASSERT_JH(jh, jh->b_frozen_data == 0);
1152
1153 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1154 spin_lock(&journal->j_list_lock);
1155 __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1156 spin_unlock(&journal->j_list_lock);
1157out_unlock_bh:
1158 jbd_unlock_bh_state(bh);
1159out:
1160 JBUFFER_TRACE(jh, "exit");
1161 return 0;
1162}
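/*
 * Illustrative sketch (editor's addition): each buffer newly dirtied with
 * journal_dirty_metadata() consumes one of the credits reserved at
 * journal_start() time, so a caller touching many buffers watches
 * h_buffer_credits and tries journal_extend() before running out.
 * example_modify_many() and its error handling are assumptions; a real
 * caller would typically fall back to journal_restart() if the extend
 * fails.
 */
static int example_modify_many(handle_t *handle, struct buffer_head **bhs,
			       int nr)
{
	int i, err = 0;

	for (i = 0; i < nr && !err; i++) {
		/* Make sure at least one credit remains for this buffer. */
		if (handle->h_buffer_credits < 1 &&
		    journal_extend(handle, nr - i) != 0)
			return -ENOSPC;	/* hypothetical: restart instead */

		err = journal_get_write_access(handle, bhs[i]);
		if (!err) {
			/* ... modify bhs[i]->b_data ... */
			err = journal_dirty_metadata(handle, bhs[i]);
		}
	}
	return err;
}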
1163
1164/*
1165 * journal_release_buffer: undo a get_write_access without any buffer
1166 * updates, if the update decided in the end that it didn't need access.
1167 *
1168 */
1169void
1170journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1171{
1172 BUFFER_TRACE(bh, "entry");
1173}
1174
1175/**
1176 * int journal_forget() - bforget() for potentially-journaled buffers.
1177 * @handle: transaction handle
1178 * @bh: bh to 'forget'
1179 *
1180 * We can only do the bforget if there are no commits pending against the
1181 * buffer. If the buffer is dirty in the current running transaction we
1182 * can safely unlink it.
1183 *
1184 * bh may not be a journalled buffer at all - it may be a non-JBD
1185 * buffer which came off the hashtable. Check for this.
1186 *
1187 * Decrements bh->b_count by one.
1188 *
1189 * Allow this call even if the handle has aborted --- it may be part of
1190 * the caller's cleanup after an abort.
1191 */
1192int journal_forget (handle_t *handle, struct buffer_head *bh)
1193{
1194 transaction_t *transaction = handle->h_transaction;
1195 journal_t *journal = transaction->t_journal;
1196 struct journal_head *jh;
1197 int drop_reserve = 0;
1198 int err = 0;
1199
1200 BUFFER_TRACE(bh, "entry");
1201
1202 jbd_lock_bh_state(bh);
1203 spin_lock(&journal->j_list_lock);
1204
1205 if (!buffer_jbd(bh))
1206 goto not_jbd;
1207 jh = bh2jh(bh);
1208
1209 /* Critical error: attempting to delete a bitmap buffer, maybe?
1210 * Don't do any jbd operations, and return an error. */
1211 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1212 "inconsistent data on disk")) {
1213 err = -EIO;
1214 goto not_jbd;
1215 }
1216
1217 /*
1218 * The buffer's going from the transaction, we must drop
1219 * all references -bzzz
1220 */
1221 jh->b_modified = 0;
1222
1223 if (jh->b_transaction == handle->h_transaction) {
1224 J_ASSERT_JH(jh, !jh->b_frozen_data);
1225
1226 /* If we are forgetting a buffer which is already part
1227 * of this transaction, then we can just drop it from
1228 * the transaction immediately. */
1229 clear_buffer_dirty(bh);
1230 clear_buffer_jbddirty(bh);
1231
1232 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1233
1234 drop_reserve = 1;
1235
1236 /*
1237 * We are no longer going to journal this buffer.
1238 * However, the commit of this transaction is still
1239 * important to the buffer: the delete that we are now
1240 * processing might obsolete an old log entry, so by
1241 * committing, we can satisfy the buffer's checkpoint.
1242 *
1243 * So, if we have a checkpoint on the buffer, we should
1244 * now refile the buffer on our BJ_Forget list so that
1245 * we know to remove the checkpoint after we commit.
1246 */
1247
1248 if (jh->b_cp_transaction) {
1249 __journal_temp_unlink_buffer(jh);
1250 __journal_file_buffer(jh, transaction, BJ_Forget);
1251 } else {
1252 __journal_unfile_buffer(jh);
1253 journal_remove_journal_head(bh);
1254 __brelse(bh);
1255 if (!buffer_jbd(bh)) {
1256 spin_unlock(&journal->j_list_lock);
1257 jbd_unlock_bh_state(bh);
1258 __bforget(bh);
1259 goto drop;
1260 }
1261 }
1262 } else if (jh->b_transaction) {
1263 J_ASSERT_JH(jh, (jh->b_transaction ==
1264 journal->j_committing_transaction));
1265 /* However, if the buffer is still owned by a prior
1266 * (committing) transaction, we can't drop it yet... */
1267 JBUFFER_TRACE(jh, "belongs to older transaction");
1268 /* ... but we CAN drop it from the new transaction if we
1269 * have also modified it since the original commit. */
1270
1271 if (jh->b_next_transaction) {
1272 J_ASSERT(jh->b_next_transaction == transaction);
1273 jh->b_next_transaction = NULL;
1274 drop_reserve = 1;
1275 }
1276 }
1277
1278not_jbd:
1279 spin_unlock(&journal->j_list_lock);
1280 jbd_unlock_bh_state(bh);
1281 __brelse(bh);
1282drop:
1283 if (drop_reserve) {
1284 /* no need to reserve log space for this block -bzzz */
1285 handle->h_buffer_credits++;
1286 }
1287 return err;
1288}
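/*
 * Illustrative sketch (editor's addition): a client filesystem that frees
 * a metadata block inside a transaction forgets the buffer rather than
 * writing it back; a real filesystem would usually also revoke the block
 * (see revoke.c) so stale journal records are not replayed after a crash.
 * example_discard_metadata() is a hypothetical helper.
 */
static int example_discard_metadata(handle_t *handle, struct buffer_head *bh)
{
	/*
	 * journal_forget() consumes the reference the caller holds on @bh
	 * (it decrements b_count), so no brelse() follows here.
	 */
	return journal_forget(handle, bh);
}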
1289
1290/**
1291 * int journal_stop() - complete a transaction
1292 * @handle: transaction to complete.
1293 *
1294 * All done for a particular handle.
1295 *
1296 * There is not much action needed here. We just return any remaining
1297 * buffer credits to the transaction and remove the handle. The only
1298 * complication is that we need to start a commit operation if the
1299 * filesystem is marked for synchronous update.
1300 *
1301 * journal_stop itself will not usually return an error, but it may
1302 * do so in unusual circumstances. In particular, expect it to
1303 * return -EIO if a journal_abort has been executed since the
1304 * transaction began.
1305 */
1306int journal_stop(handle_t *handle)
1307{
1308 transaction_t *transaction = handle->h_transaction;
1309 journal_t *journal = transaction->t_journal;
1310 int old_handle_count, err;
1311
1312 J_ASSERT(transaction->t_updates > 0);
1313 J_ASSERT(journal_current_handle() == handle);
1314
1315 if (is_handle_aborted(handle))
1316 err = -EIO;
1317 else
1318 err = 0;
1319
1320 if (--handle->h_ref > 0) {
1321 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1322 handle->h_ref);
1323 return err;
1324 }
1325
1326 jbd_debug(4, "Handle %p going down\n", handle);
1327
1328 /*
1329 * Implement synchronous transaction batching. If the handle
1330 * was synchronous, don't force a commit immediately. Let's
1331 * yield and let another thread piggyback onto this transaction.
1332 * Keep doing that while new threads continue to arrive.
1333 * It doesn't cost much - we're about to run a commit and sleep
1334 * on IO anyway. Speeds up many-threaded, many-dir operations
1335 * by 30x or more...
1336 */
1337 if (handle->h_sync) {
1338 do {
1339 old_handle_count = transaction->t_handle_count;
1340 set_current_state(TASK_UNINTERRUPTIBLE);
1341 schedule_timeout(1);
1342 } while (old_handle_count != transaction->t_handle_count);
1343 }
1344
1345 current->journal_info = NULL;
1346 spin_lock(&journal->j_state_lock);
1347 spin_lock(&transaction->t_handle_lock);
1348 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1349 transaction->t_updates--;
1350 if (!transaction->t_updates) {
1351 wake_up(&journal->j_wait_updates);
1352 if (journal->j_barrier_count)
1353 wake_up(&journal->j_wait_transaction_locked);
1354 }
1355
1356 /*
1357 * If the handle is marked SYNC, we need to set another commit
1358 * going! We also want to force a commit if the current
1359 * transaction is occupying too much of the log, or if the
1360 * transaction is too old now.
1361 */
1362 if (handle->h_sync ||
1363 transaction->t_outstanding_credits >
1364 journal->j_max_transaction_buffers ||
1365 time_after_eq(jiffies, transaction->t_expires)) {
1366 /* Do this even for aborted journals: an abort still
1367 * completes the commit thread, it just doesn't write
1368 * anything to disk. */
1369 tid_t tid = transaction->t_tid;
1370
1371 spin_unlock(&transaction->t_handle_lock);
1372 jbd_debug(2, "transaction too old, requesting commit for "
1373 "handle %p\n", handle);
1374 /* This is non-blocking */
1375 __log_start_commit(journal, transaction->t_tid);
1376 spin_unlock(&journal->j_state_lock);
1377
1378 /*
1379 * Special case: JFS_SYNC synchronous updates require us
1380 * to wait for the commit to complete.
1381 */
1382 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1383 err = log_wait_commit(journal, tid);
1384 } else {
1385 spin_unlock(&transaction->t_handle_lock);
1386 spin_unlock(&journal->j_state_lock);
1387 }
1388
1389 jbd_free_handle(handle);
1390 return err;
1391}
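/*
 * Illustrative sketch (editor's addition): a caller needing synchronous
 * semantics marks the handle before stopping it, so journal_stop() starts
 * a commit and, outside memory reclaim, waits for it to complete.
 * example_stop_synchronously() is a hypothetical helper.
 */
static int example_stop_synchronously(handle_t *handle)
{
	handle->h_sync = 1;	/* request commit-and-wait in journal_stop() */
	return journal_stop(handle);
}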
1392
1393/**
 * int journal_force_commit() - force any uncommitted transactions
1394 * @journal: journal to force
1395 *
1396 * For synchronous operations: force any uncommitted transactions
1397 * to disk. May seem kludgy, but it reuses all the handle batching
1398 * code in a very simple manner.
1399 */
1400int journal_force_commit(journal_t *journal)
1401{
1402 handle_t *handle;
1403 int ret;
1404
1405 handle = journal_start(journal, 1);
1406 if (IS_ERR(handle)) {
1407 ret = PTR_ERR(handle);
1408 } else {
1409 handle->h_sync = 1;
1410 ret = journal_stop(handle);
1411 }
1412 return ret;
1413}
1414
1415/*
1416 *
1417 * List management code snippets: various functions for manipulating the
1418 * transaction buffer lists.
1419 *
1420 */
1421
1422/*
1423 * Append a buffer to a transaction list, given the transaction's list head
1424 * pointer.
1425 *
1426 * j_list_lock is held.
1427 *
1428 * jbd_lock_bh_state(jh2bh(jh)) is held.
1429 */
1430
1431static inline void
1432__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1433{
1434 if (!*list) {
1435 jh->b_tnext = jh->b_tprev = jh;
1436 *list = jh;
1437 } else {
1438 /* Insert at the tail of the list to preserve order */
1439 struct journal_head *first = *list, *last = first->b_tprev;
1440 jh->b_tprev = last;
1441 jh->b_tnext = first;
1442 last->b_tnext = first->b_tprev = jh;
1443 }
1444}
1445
1446/*
1447 * Remove a buffer from a transaction list, given the transaction's list
1448 * head pointer.
1449 *
1450 * Called with j_list_lock held, and the journal may not be locked.
1451 *
1452 * jbd_lock_bh_state(jh2bh(jh)) is held.
1453 */
1454
1455static inline void
1456__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1457{
1458 if (*list == jh) {
1459 *list = jh->b_tnext;
1460 if (*list == jh)
1461 *list = NULL;
1462 }
1463 jh->b_tprev->b_tnext = jh->b_tnext;
1464 jh->b_tnext->b_tprev = jh->b_tprev;
1465}
1466
1467/*
1468 * Remove a buffer from the appropriate transaction list.
1469 *
1470 * Note that this function can *change* the value of
1471 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1472 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1473 * is holding onto a copy of one of these pointers, it could go bad.
1474 * Generally the caller needs to re-read the pointer from the transaction_t.
1475 *
1476 * Called under j_list_lock. The journal may not be locked.
1477 */
1478void __journal_temp_unlink_buffer(struct journal_head *jh)
1479{
1480 struct journal_head **list = NULL;
1481 transaction_t *transaction;
1482 struct buffer_head *bh = jh2bh(jh);
1483
1484 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1485 transaction = jh->b_transaction;
1486 if (transaction)
1487 assert_spin_locked(&transaction->t_journal->j_list_lock);
1488
1489 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1490 if (jh->b_jlist != BJ_None)
1491 J_ASSERT_JH(jh, transaction != 0);
1492
1493 switch (jh->b_jlist) {
1494 case BJ_None:
1495 return;
1496 case BJ_SyncData:
1497 list = &transaction->t_sync_datalist;
1498 break;
1499 case BJ_Metadata:
1500 transaction->t_nr_buffers--;
1501 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1502 list = &transaction->t_buffers;
1503 break;
1504 case BJ_Forget:
1505 list = &transaction->t_forget;
1506 break;
1507 case BJ_IO:
1508 list = &transaction->t_iobuf_list;
1509 break;
1510 case BJ_Shadow:
1511 list = &transaction->t_shadow_list;
1512 break;
1513 case BJ_LogCtl:
1514 list = &transaction->t_log_list;
1515 break;
1516 case BJ_Reserved:
1517 list = &transaction->t_reserved_list;
1518 break;
1519 case BJ_Locked:
1520 list = &transaction->t_locked_list;
1521 break;
1522 }
1523
1524 __blist_del_buffer(list, jh);
1525 jh->b_jlist = BJ_None;
1526 if (test_clear_buffer_jbddirty(bh))
1527 mark_buffer_dirty(bh); /* Expose it to the VM */
1528}
1529
1530void __journal_unfile_buffer(struct journal_head *jh)
1531{
1532 __journal_temp_unlink_buffer(jh);
1533 jh->b_transaction = NULL;
1534}
1535
1536void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1537{
1538 jbd_lock_bh_state(jh2bh(jh));
1539 spin_lock(&journal->j_list_lock);
1540 __journal_unfile_buffer(jh);
1541 spin_unlock(&journal->j_list_lock);
1542 jbd_unlock_bh_state(jh2bh(jh));
1543}
1544
1545/*
1546 * Called from journal_try_to_free_buffers().
1547 *
1548 * Called under jbd_lock_bh_state(bh)
1549 */
1550static void
1551__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1552{
1553 struct journal_head *jh;
1554
1555 jh = bh2jh(bh);
1556
1557 if (buffer_locked(bh) || buffer_dirty(bh))
1558 goto out;
1559
1560 if (jh->b_next_transaction != 0)
1561 goto out;
1562
1563 spin_lock(&journal->j_list_lock);
1564 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
1565 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1566 /* A written-back ordered data buffer */
1567 JBUFFER_TRACE(jh, "release data");
1568 __journal_unfile_buffer(jh);
1569 journal_remove_journal_head(bh);
1570 __brelse(bh);
1571 }
1572 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
1573 /* written-back checkpointed metadata buffer */
1574 if (jh->b_jlist == BJ_None) {
1575 JBUFFER_TRACE(jh, "remove from checkpoint list");
1576 __journal_remove_checkpoint(jh);
1577 journal_remove_journal_head(bh);
1578 __brelse(bh);
1579 }
1580 }
1581 spin_unlock(&journal->j_list_lock);
1582out:
1583 return;
1584}
1585
1586
1587/**
1588 * int journal_try_to_free_buffers() - try to free page buffers.
1589 * @journal: journal for operation
1590 * @page: to try and free
1591 * @unused_gfp_mask: unused
1592 *
1593 *
1594 * For all the buffers on this page,
1595 * if they are fully written out ordered data, move them onto BUF_CLEAN
1596 * so try_to_free_buffers() can reap them.
1597 *
1598 * This function returns non-zero if we wish try_to_free_buffers()
1599 * to be called. We do this if the page is releasable by try_to_free_buffers().
1600 * We also do it if the page has locked or dirty buffers and the caller wants
1601 * us to perform sync or async writeout.
1602 *
1603 * This complicates JBD locking somewhat. We aren't protected by the
1604 * BKL here. We wish to remove the buffer from its committing or
1605 * running transaction's ->t_datalist via __journal_unfile_buffer.
1606 *
1607 * This may *change* the value of transaction_t->t_datalist, so anyone
1608 * who looks at t_datalist needs to lock against this function.
1609 *
1610 * Even worse, someone may be doing a journal_dirty_data on this
1611 * buffer. So we need to lock against that. journal_dirty_data()
1612 * will come out of the lock with the buffer dirty, which makes it
1613 * ineligible for release here.
1614 *
1615 * Who else is affected by this? hmm... Really the only contender
1616 * is do_get_write_access() - it could be looking at the buffer while
1617 * journal_try_to_free_buffer() is changing its state. But that
1618 * cannot happen because we never reallocate freed data as metadata
1619 * while the data is part of a transaction. Yes?
1620 */
1621int journal_try_to_free_buffers(journal_t *journal,
1622 struct page *page, int unused_gfp_mask)
1623{
1624 struct buffer_head *head;
1625 struct buffer_head *bh;
1626 int ret = 0;
1627
1628 J_ASSERT(PageLocked(page));
1629
1630 head = page_buffers(page);
1631 bh = head;
1632 do {
1633 struct journal_head *jh;
1634
1635 /*
1636 * We take our own ref against the journal_head here to avoid
1637 * having to add tons of locking around each instance of
1638 * journal_remove_journal_head() and journal_put_journal_head().
1639 */
1640 jh = journal_grab_journal_head(bh);
1641 if (!jh)
1642 continue;
1643
1644 jbd_lock_bh_state(bh);
1645 __journal_try_to_free_buffer(journal, bh);
1646 journal_put_journal_head(jh);
1647 jbd_unlock_bh_state(bh);
1648 if (buffer_jbd(bh))
1649 goto busy;
1650 } while ((bh = bh->b_this_page) != head);
1651 ret = try_to_free_buffers(page);
1652busy:
1653 return ret;
1654}
1655
1656/*
1657 * This buffer is no longer needed. If it is on an older transaction's
1658 * checkpoint list we need to record it on this transaction's forget list
1659 * to pin this buffer (and hence its checkpointing transaction) down until
1660 * this transaction commits. If the buffer isn't on a checkpoint list, we
1661 * release it.
1662 * Returns non-zero if JBD no longer has an interest in the buffer.
1663 *
1664 * Called under j_list_lock.
1665 *
1666 * Called under jbd_lock_bh_state(bh).
1667 */
1668static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1669{
1670 int may_free = 1;
1671 struct buffer_head *bh = jh2bh(jh);
1672
1673 __journal_unfile_buffer(jh);
1674
1675 if (jh->b_cp_transaction) {
1676 JBUFFER_TRACE(jh, "on running+cp transaction");
1677 __journal_file_buffer(jh, transaction, BJ_Forget);
1678 clear_buffer_jbddirty(bh);
1679 may_free = 0;
1680 } else {
1681 JBUFFER_TRACE(jh, "on running transaction");
1682 journal_remove_journal_head(bh);
1683 __brelse(bh);
1684 }
1685 return may_free;
1686}
1687
1688/*
1689 * journal_invalidatepage
1690 *
1691 * This code is tricky. It has a number of cases to deal with.
1692 *
1693 * There are two invariants which this code relies on:
1694 *
1695 * i_size must be updated on disk before we start calling invalidatepage on the
1696 * data.
1697 *
1698 * This is done in ext3 by defining an ext3_setattr method which
1699 * updates i_size before truncate gets going. By maintaining this
1700 * invariant, we can be sure that it is safe to throw away any buffers
1701 * attached to the current transaction: once the transaction commits,
1702 * we know that the data will not be needed.
1703 *
1704 * Note however that we can *not* throw away data belonging to the
1705 * previous, committing transaction!
1706 *
1707 * Any disk blocks which *are* part of the previous, committing
1708 * transaction (and which therefore cannot be discarded immediately) are
1709 * not going to be reused in the new running transaction.
1710 *
1711 * The bitmap committed_data images guarantee this: any block which is
1712 * allocated in one transaction and removed in the next will be marked
1713 * as in-use in the committed_data bitmap, so cannot be reused until
1714 * the next transaction to delete the block commits. This means that
1715 * leaving committing buffers dirty is quite safe: the disk blocks
1716 * cannot be reallocated to a different file and so buffer aliasing is
1717 * not possible.
1718 *
1719 *
1720 * The above applies mainly to ordered data mode. In writeback mode we
1721 * don't make guarantees about the order in which data hits disk --- in
1722 * particular we don't guarantee that new dirty data is flushed before
1723 * transaction commit --- so it is always safe just to discard data
1724 * immediately in that mode. --sct
1725 */
1726
1727/*
1728 * The journal_unmap_buffer helper function returns zero if the buffer
1729 * concerned remains pinned as an anonymous buffer belonging to an older
1730 * transaction.
1731 *
1732 * We're outside-transaction here. Either or both of j_running_transaction
1733 * and j_committing_transaction may be NULL.
1734 */
1735static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1736{
1737 transaction_t *transaction;
1738 struct journal_head *jh;
1739 int may_free = 1;
1740 int ret;
1741
1742 BUFFER_TRACE(bh, "entry");
1743
1744 /*
1745 * It is safe to proceed here without the j_list_lock because the
1746 * buffers cannot be stolen by try_to_free_buffers as long as we are
1747 * holding the page lock. --sct
1748 */
1749
1750 if (!buffer_jbd(bh))
1751 goto zap_buffer_unlocked;
1752
1753 spin_lock(&journal->j_state_lock);
1754 jbd_lock_bh_state(bh);
1755 spin_lock(&journal->j_list_lock);
1756
1757 jh = journal_grab_journal_head(bh);
1758 if (!jh)
1759 goto zap_buffer_no_jh;
1760
1761 transaction = jh->b_transaction;
1762 if (transaction == NULL) {
1763 /* First case: not on any transaction. If it
1764 * has no checkpoint link, then we can zap it:
1765 * it's a writeback-mode buffer so we don't care
1766 * if it hits disk safely. */
1767 if (!jh->b_cp_transaction) {
1768 JBUFFER_TRACE(jh, "not on any transaction: zap");
1769 goto zap_buffer;
1770 }
1771
1772 if (!buffer_dirty(bh)) {
1773 /* bdflush has written it. We can drop it now */
1774 goto zap_buffer;
1775 }
1776
1777 /* OK, it must be in the journal but still not
1778 * written fully to disk: it's metadata or
1779 * journaled data... */
1780
1781 if (journal->j_running_transaction) {
1782 /* ... and once the current transaction has
1783 * committed, the buffer won't be needed any
1784 * longer. */
1785 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1786 ret = __dispose_buffer(jh,
1787 journal->j_running_transaction);
1788 journal_put_journal_head(jh);
1789 spin_unlock(&journal->j_list_lock);
1790 jbd_unlock_bh_state(bh);
1791 spin_unlock(&journal->j_state_lock);
1792 return ret;
1793 } else {
1794 /* There is no currently-running transaction. So the
1795 * orphan record which we wrote for this file must have
1796 * passed into commit. We must attach this buffer to
1797 * the committing transaction, if it exists. */
1798 if (journal->j_committing_transaction) {
1799 JBUFFER_TRACE(jh, "give to committing trans");
1800 ret = __dispose_buffer(jh,
1801 journal->j_committing_transaction);
1802 journal_put_journal_head(jh);
1803 spin_unlock(&journal->j_list_lock);
1804 jbd_unlock_bh_state(bh);
1805 spin_unlock(&journal->j_state_lock);
1806 return ret;
1807 } else {
1808 /* The orphan record's transaction has
1809 * committed. We can cleanse this buffer */
1810 clear_buffer_jbddirty(bh);
1811 goto zap_buffer;
1812 }
1813 }
1814 } else if (transaction == journal->j_committing_transaction) {
1815 /* If it is committing, we simply cannot touch it. We
1816 * can remove its next_transaction pointer from the
1817 * running transaction if that is set, but nothing
1818 * else. */
1819 JBUFFER_TRACE(jh, "on committing transaction");
1820 set_buffer_freed(bh);
1821 if (jh->b_next_transaction) {
1822 J_ASSERT(jh->b_next_transaction ==
1823 journal->j_running_transaction);
1824 jh->b_next_transaction = NULL;
1825 }
1826 journal_put_journal_head(jh);
1827 spin_unlock(&journal->j_list_lock);
1828 jbd_unlock_bh_state(bh);
1829 spin_unlock(&journal->j_state_lock);
1830 return 0;
1831 } else {
1832 /* Good, the buffer belongs to the running transaction.
1833 * We are writing our own transaction's data, not any
1834 * previous one's, so it is safe to throw it away
1835 * (remember that we expect the filesystem to have set
1836 * i_size already for this truncate so recovery will not
1837 * expose the disk blocks we are discarding here.) */
1838 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1839 may_free = __dispose_buffer(jh, transaction);
1840 }
1841
1842zap_buffer:
1843 journal_put_journal_head(jh);
1844zap_buffer_no_jh:
1845 spin_unlock(&journal->j_list_lock);
1846 jbd_unlock_bh_state(bh);
1847 spin_unlock(&journal->j_state_lock);
1848zap_buffer_unlocked:
1849 clear_buffer_dirty(bh);
1850 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
1851 clear_buffer_mapped(bh);
1852 clear_buffer_req(bh);
1853 clear_buffer_new(bh);
1854 bh->b_bdev = NULL;
1855 return may_free;
1856}
1857
1858/**
1859 * int journal_invalidatepage()
1860 * @journal: journal to use for flush...
1861 * @page: page to flush
1862 * @offset: offset within the page from which to invalidate.
1863 *
1864 * Reap page buffers containing data after offset in page.
1865 *
1866 * Return non-zero if the page's buffers were successfully reaped.
1867 */
1868int journal_invalidatepage(journal_t *journal,
1869 struct page *page,
1870 unsigned long offset)
1871{
1872 struct buffer_head *head, *bh, *next;
1873 unsigned int curr_off = 0;
1874 int may_free = 1;
1875
1876 if (!PageLocked(page))
1877 BUG();
1878 if (!page_has_buffers(page))
1879 return 1;
1880
1881 /* We will potentially be playing with lists other than just the
1882 * data lists (especially for journaled data mode), so be
1883 * cautious in our locking. */
1884
1885 head = bh = page_buffers(page);
1886 do {
1887 unsigned int next_off = curr_off + bh->b_size;
1888 next = bh->b_this_page;
1889
1890 /* AKPM: doing lock_buffer here may be overly paranoid */
1891 if (offset <= curr_off) {
1892 /* This block is wholly outside the truncation point */
1893 lock_buffer(bh);
1894 may_free &= journal_unmap_buffer(journal, bh);
1895 unlock_buffer(bh);
1896 }
1897 curr_off = next_off;
1898 bh = next;
1899
1900 } while (bh != head);
1901
1902 if (!offset) {
1903 if (!may_free || !try_to_free_buffers(page))
1904 return 0;
1905 J_ASSERT(!page_has_buffers(page));
1906 }
1907 return 1;
1908}
1909
1910/*
1911 * File a buffer on the given transaction list.
1912 */
1913void __journal_file_buffer(struct journal_head *jh,
1914 transaction_t *transaction, int jlist)
1915{
1916 struct journal_head **list = NULL;
1917 int was_dirty = 0;
1918 struct buffer_head *bh = jh2bh(jh);
1919
1920 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1921 assert_spin_locked(&transaction->t_journal->j_list_lock);
1922
1923 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1924 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1925 jh->b_transaction == 0);
1926
1927 if (jh->b_transaction && jh->b_jlist == jlist)
1928 return;
1929
1930 /* The following list of buffer states needs to be consistent
1931 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1932 * state. */
1933
1934 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1935 jlist == BJ_Shadow || jlist == BJ_Forget) {
1936 if (test_clear_buffer_dirty(bh) ||
1937 test_clear_buffer_jbddirty(bh))
1938 was_dirty = 1;
1939 }
1940
1941 if (jh->b_transaction)
1942 __journal_temp_unlink_buffer(jh);
1943 jh->b_transaction = transaction;
1944
1945 switch (jlist) {
1946 case BJ_None:
1947 J_ASSERT_JH(jh, !jh->b_committed_data);
1948 J_ASSERT_JH(jh, !jh->b_frozen_data);
1949 return;
1950 case BJ_SyncData:
1951 list = &transaction->t_sync_datalist;
1952 break;
1953 case BJ_Metadata:
1954 transaction->t_nr_buffers++;
1955 list = &transaction->t_buffers;
1956 break;
1957 case BJ_Forget:
1958 list = &transaction->t_forget;
1959 break;
1960 case BJ_IO:
1961 list = &transaction->t_iobuf_list;
1962 break;
1963 case BJ_Shadow:
1964 list = &transaction->t_shadow_list;
1965 break;
1966 case BJ_LogCtl:
1967 list = &transaction->t_log_list;
1968 break;
1969 case BJ_Reserved:
1970 list = &transaction->t_reserved_list;
1971 break;
1972 case BJ_Locked:
1973 list = &transaction->t_locked_list;
1974 break;
1975 }
1976
1977 __blist_add_buffer(list, jh);
1978 jh->b_jlist = jlist;
1979
1980 if (was_dirty)
1981 set_buffer_jbddirty(bh);
1982}
1983
1984void journal_file_buffer(struct journal_head *jh,
1985 transaction_t *transaction, int jlist)
1986{
1987 jbd_lock_bh_state(jh2bh(jh));
1988 spin_lock(&transaction->t_journal->j_list_lock);
1989 __journal_file_buffer(jh, transaction, jlist);
1990 spin_unlock(&transaction->t_journal->j_list_lock);
1991 jbd_unlock_bh_state(jh2bh(jh));
1992}
1993
1994/*
1995 * Remove a buffer from its current buffer list in preparation for
1996 * dropping it from its current transaction entirely. If the buffer has
1997 * already started to be used by a subsequent transaction, refile the
1998 * buffer on that transaction's metadata list.
1999 *
2000 * Called under journal->j_list_lock
2001 *
2002 * Called under jbd_lock_bh_state(jh2bh(jh))
2003 */
2004void __journal_refile_buffer(struct journal_head *jh)
2005{
2006 int was_dirty;
2007 struct buffer_head *bh = jh2bh(jh);
2008
2009 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2010 if (jh->b_transaction)
2011 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2012
2013 /* If the buffer is now unused, just drop it. */
2014 if (jh->b_next_transaction == NULL) {
2015 __journal_unfile_buffer(jh);
2016 return;
2017 }
2018
2019 /*
2020 * It has been modified by a later transaction: add it to the new
2021 * transaction's metadata list.
2022 */
2023
2024 was_dirty = test_clear_buffer_jbddirty(bh);
2025 __journal_temp_unlink_buffer(jh);
2026 jh->b_transaction = jh->b_next_transaction;
2027 jh->b_next_transaction = NULL;
2028 __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
2029 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2030
2031 if (was_dirty)
2032 set_buffer_jbddirty(bh);
2033}
2034
2035/*
2036 * For the unlocked version of this call, also make sure that any
2037 * hanging journal_head is cleaned up if necessary.
2038 *
2039 * __journal_refile_buffer is usually called as part of a single locked
2040 * operation on a buffer_head, in which the caller is probably going to
2041 * be hooking the journal_head onto other lists. In that case it is up
2042 * to the caller to remove the journal_head if necessary. For the
2043 * unlocked journal_refile_buffer call, the caller isn't going to be
2044 * doing anything else to the buffer so we need to do the cleanup
2045 * ourselves to avoid a jh leak.
2046 *
2047 * *** The journal_head may be freed by this call! ***
2048 */
2049void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2050{
2051 struct buffer_head *bh = jh2bh(jh);
2052
2053 jbd_lock_bh_state(bh);
2054 spin_lock(&journal->j_list_lock);
2055
2056 __journal_refile_buffer(jh);
2057 jbd_unlock_bh_state(bh);
2058 journal_remove_journal_head(bh);
2059
2060 spin_unlock(&journal->j_list_lock);
2061 __brelse(bh);
2062}