diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jbd |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/jbd')
-rw-r--r-- | fs/jbd/Makefile | 7 | ||||
-rw-r--r-- | fs/jbd/checkpoint.c | 636 | ||||
-rw-r--r-- | fs/jbd/commit.c | 844 | ||||
-rw-r--r-- | fs/jbd/journal.c | 2003 | ||||
-rw-r--r-- | fs/jbd/recovery.c | 591 | ||||
-rw-r--r-- | fs/jbd/revoke.c | 702 | ||||
-rw-r--r-- | fs/jbd/transaction.c | 2062 |
7 files changed, 6845 insertions, 0 deletions
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile new file mode 100644 index 000000000000..54aca4868a36 --- /dev/null +++ b/fs/jbd/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | # | ||
2 | # Makefile for the linux journaling routines. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_JBD) += jbd.o | ||
6 | |||
7 | jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o | ||
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c new file mode 100644 index 000000000000..98d830401c56 --- /dev/null +++ b/fs/jbd/checkpoint.c | |||
@@ -0,0 +1,636 @@ | |||
1 | /* | ||
2 | * linux/fs/checkpoint.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | ||
5 | * | ||
6 | * Copyright 1999 Red Hat Software --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Checkpoint routines for the generic filesystem journaling code. | ||
13 | * Part of the ext2fs journaling system. | ||
14 | * | ||
15 | * Checkpointing is the process of ensuring that a section of the log is | ||
16 | * committed fully to disk, so that that portion of the log can be | ||
17 | * reused. | ||
18 | */ | ||
19 | |||
20 | #include <linux/time.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/jbd.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/slab.h> | ||
25 | |||
26 | /* | ||
27 | * Unlink a buffer from a transaction. | ||
28 | * | ||
29 | * Called with j_list_lock held. | ||
30 | */ | ||
31 | |||
32 | static inline void __buffer_unlink(struct journal_head *jh) | ||
33 | { | ||
34 | transaction_t *transaction; | ||
35 | |||
36 | transaction = jh->b_cp_transaction; | ||
37 | jh->b_cp_transaction = NULL; | ||
38 | |||
39 | jh->b_cpnext->b_cpprev = jh->b_cpprev; | ||
40 | jh->b_cpprev->b_cpnext = jh->b_cpnext; | ||
41 | if (transaction->t_checkpoint_list == jh) | ||
42 | transaction->t_checkpoint_list = jh->b_cpnext; | ||
43 | if (transaction->t_checkpoint_list == jh) | ||
44 | transaction->t_checkpoint_list = NULL; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Try to release a checkpointed buffer from its transaction. | ||
49 | * Returns 1 if we released it. | ||
50 | * Requires j_list_lock | ||
51 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
52 | */ | ||
53 | static int __try_to_free_cp_buf(struct journal_head *jh) | ||
54 | { | ||
55 | int ret = 0; | ||
56 | struct buffer_head *bh = jh2bh(jh); | ||
57 | |||
58 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { | ||
59 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | ||
60 | __journal_remove_checkpoint(jh); | ||
61 | jbd_unlock_bh_state(bh); | ||
62 | journal_remove_journal_head(bh); | ||
63 | BUFFER_TRACE(bh, "release"); | ||
64 | __brelse(bh); | ||
65 | ret = 1; | ||
66 | } else { | ||
67 | jbd_unlock_bh_state(bh); | ||
68 | } | ||
69 | return ret; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * __log_wait_for_space: wait until there is space in the journal. | ||
74 | * | ||
75 | * Called under j-state_lock *only*. It will be unlocked if we have to wait | ||
76 | * for a checkpoint to free up some space in the log. | ||
77 | */ | ||
78 | void __log_wait_for_space(journal_t *journal) | ||
79 | { | ||
80 | int nblocks; | ||
81 | assert_spin_locked(&journal->j_state_lock); | ||
82 | |||
83 | nblocks = jbd_space_needed(journal); | ||
84 | while (__log_space_left(journal) < nblocks) { | ||
85 | if (journal->j_flags & JFS_ABORT) | ||
86 | return; | ||
87 | spin_unlock(&journal->j_state_lock); | ||
88 | down(&journal->j_checkpoint_sem); | ||
89 | |||
90 | /* | ||
91 | * Test again, another process may have checkpointed while we | ||
92 | * were waiting for the checkpoint lock | ||
93 | */ | ||
94 | spin_lock(&journal->j_state_lock); | ||
95 | nblocks = jbd_space_needed(journal); | ||
96 | if (__log_space_left(journal) < nblocks) { | ||
97 | spin_unlock(&journal->j_state_lock); | ||
98 | log_do_checkpoint(journal); | ||
99 | spin_lock(&journal->j_state_lock); | ||
100 | } | ||
101 | up(&journal->j_checkpoint_sem); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. | ||
107 | * The caller must restart a list walk. Wait for someone else to run | ||
108 | * jbd_unlock_bh_state(). | ||
109 | */ | ||
110 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) | ||
111 | { | ||
112 | get_bh(bh); | ||
113 | spin_unlock(&journal->j_list_lock); | ||
114 | jbd_lock_bh_state(bh); | ||
115 | jbd_unlock_bh_state(bh); | ||
116 | put_bh(bh); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Clean up a transaction's checkpoint list. | ||
121 | * | ||
122 | * We wait for any pending IO to complete and make sure any clean | ||
123 | * buffers are removed from the transaction. | ||
124 | * | ||
125 | * Return 1 if we performed any actions which might have destroyed the | ||
126 | * checkpoint. (journal_remove_checkpoint() deletes the transaction when | ||
127 | * the last checkpoint buffer is cleansed) | ||
128 | * | ||
129 | * Called with j_list_lock held. | ||
130 | */ | ||
131 | static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) | ||
132 | { | ||
133 | struct journal_head *jh, *next_jh, *last_jh; | ||
134 | struct buffer_head *bh; | ||
135 | int ret = 0; | ||
136 | |||
137 | assert_spin_locked(&journal->j_list_lock); | ||
138 | jh = transaction->t_checkpoint_list; | ||
139 | if (!jh) | ||
140 | return 0; | ||
141 | |||
142 | last_jh = jh->b_cpprev; | ||
143 | next_jh = jh; | ||
144 | do { | ||
145 | jh = next_jh; | ||
146 | bh = jh2bh(jh); | ||
147 | if (buffer_locked(bh)) { | ||
148 | atomic_inc(&bh->b_count); | ||
149 | spin_unlock(&journal->j_list_lock); | ||
150 | wait_on_buffer(bh); | ||
151 | /* the journal_head may have gone by now */ | ||
152 | BUFFER_TRACE(bh, "brelse"); | ||
153 | __brelse(bh); | ||
154 | goto out_return_1; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * This is foul | ||
159 | */ | ||
160 | if (!jbd_trylock_bh_state(bh)) { | ||
161 | jbd_sync_bh(journal, bh); | ||
162 | goto out_return_1; | ||
163 | } | ||
164 | |||
165 | if (jh->b_transaction != NULL) { | ||
166 | transaction_t *t = jh->b_transaction; | ||
167 | tid_t tid = t->t_tid; | ||
168 | |||
169 | spin_unlock(&journal->j_list_lock); | ||
170 | jbd_unlock_bh_state(bh); | ||
171 | log_start_commit(journal, tid); | ||
172 | log_wait_commit(journal, tid); | ||
173 | goto out_return_1; | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * AKPM: I think the buffer_jbddirty test is redundant - it | ||
178 | * shouldn't have NULL b_transaction? | ||
179 | */ | ||
180 | next_jh = jh->b_cpnext; | ||
181 | if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { | ||
182 | BUFFER_TRACE(bh, "remove from checkpoint"); | ||
183 | __journal_remove_checkpoint(jh); | ||
184 | jbd_unlock_bh_state(bh); | ||
185 | journal_remove_journal_head(bh); | ||
186 | __brelse(bh); | ||
187 | ret = 1; | ||
188 | } else { | ||
189 | jbd_unlock_bh_state(bh); | ||
190 | } | ||
191 | jh = next_jh; | ||
192 | } while (jh != last_jh); | ||
193 | |||
194 | return ret; | ||
195 | out_return_1: | ||
196 | spin_lock(&journal->j_list_lock); | ||
197 | return 1; | ||
198 | } | ||
199 | |||
200 | #define NR_BATCH 64 | ||
201 | |||
202 | static void | ||
203 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | ||
204 | { | ||
205 | int i; | ||
206 | |||
207 | spin_unlock(&journal->j_list_lock); | ||
208 | ll_rw_block(WRITE, *batch_count, bhs); | ||
209 | spin_lock(&journal->j_list_lock); | ||
210 | for (i = 0; i < *batch_count; i++) { | ||
211 | struct buffer_head *bh = bhs[i]; | ||
212 | clear_buffer_jwrite(bh); | ||
213 | BUFFER_TRACE(bh, "brelse"); | ||
214 | __brelse(bh); | ||
215 | } | ||
216 | *batch_count = 0; | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Try to flush one buffer from the checkpoint list to disk. | ||
221 | * | ||
222 | * Return 1 if something happened which requires us to abort the current | ||
223 | * scan of the checkpoint list. | ||
224 | * | ||
225 | * Called with j_list_lock held. | ||
226 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
227 | */ | ||
228 | static int __flush_buffer(journal_t *journal, struct journal_head *jh, | ||
229 | struct buffer_head **bhs, int *batch_count, | ||
230 | int *drop_count) | ||
231 | { | ||
232 | struct buffer_head *bh = jh2bh(jh); | ||
233 | int ret = 0; | ||
234 | |||
235 | if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { | ||
236 | J_ASSERT_JH(jh, jh->b_transaction == NULL); | ||
237 | |||
238 | /* | ||
239 | * Important: we are about to write the buffer, and | ||
240 | * possibly block, while still holding the journal lock. | ||
241 | * We cannot afford to let the transaction logic start | ||
242 | * messing around with this buffer before we write it to | ||
243 | * disk, as that would break recoverability. | ||
244 | */ | ||
245 | BUFFER_TRACE(bh, "queue"); | ||
246 | get_bh(bh); | ||
247 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | ||
248 | set_buffer_jwrite(bh); | ||
249 | bhs[*batch_count] = bh; | ||
250 | jbd_unlock_bh_state(bh); | ||
251 | (*batch_count)++; | ||
252 | if (*batch_count == NR_BATCH) { | ||
253 | __flush_batch(journal, bhs, batch_count); | ||
254 | ret = 1; | ||
255 | } | ||
256 | } else { | ||
257 | int last_buffer = 0; | ||
258 | if (jh->b_cpnext == jh) { | ||
259 | /* We may be about to drop the transaction. Tell the | ||
260 | * caller that the lists have changed. | ||
261 | */ | ||
262 | last_buffer = 1; | ||
263 | } | ||
264 | if (__try_to_free_cp_buf(jh)) { | ||
265 | (*drop_count)++; | ||
266 | ret = last_buffer; | ||
267 | } | ||
268 | } | ||
269 | return ret; | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * Perform an actual checkpoint. We don't write out only enough to | ||
274 | * satisfy the current blocked requests: rather we submit a reasonably | ||
275 | * sized chunk of the outstanding data to disk at once for | ||
276 | * efficiency. __log_wait_for_space() will retry if we didn't free enough. | ||
277 | * | ||
278 | * However, we _do_ take into account the amount requested so that once | ||
279 | * the IO has been queued, we can return as soon as enough of it has | ||
280 | * completed to disk. | ||
281 | * | ||
282 | * The journal should be locked before calling this function. | ||
283 | */ | ||
284 | int log_do_checkpoint(journal_t *journal) | ||
285 | { | ||
286 | int result; | ||
287 | int batch_count = 0; | ||
288 | struct buffer_head *bhs[NR_BATCH]; | ||
289 | |||
290 | jbd_debug(1, "Start checkpoint\n"); | ||
291 | |||
292 | /* | ||
293 | * First thing: if there are any transactions in the log which | ||
294 | * don't need checkpointing, just eliminate them from the | ||
295 | * journal straight away. | ||
296 | */ | ||
297 | result = cleanup_journal_tail(journal); | ||
298 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); | ||
299 | if (result <= 0) | ||
300 | return result; | ||
301 | |||
302 | /* | ||
303 | * OK, we need to start writing disk blocks. Try to free up a | ||
304 | * quarter of the log in a single checkpoint if we can. | ||
305 | */ | ||
306 | /* | ||
307 | * AKPM: check this code. I had a feeling a while back that it | ||
308 | * degenerates into a busy loop at unmount time. | ||
309 | */ | ||
310 | spin_lock(&journal->j_list_lock); | ||
311 | while (journal->j_checkpoint_transactions) { | ||
312 | transaction_t *transaction; | ||
313 | struct journal_head *jh, *last_jh, *next_jh; | ||
314 | int drop_count = 0; | ||
315 | int cleanup_ret, retry = 0; | ||
316 | tid_t this_tid; | ||
317 | |||
318 | transaction = journal->j_checkpoint_transactions; | ||
319 | this_tid = transaction->t_tid; | ||
320 | jh = transaction->t_checkpoint_list; | ||
321 | last_jh = jh->b_cpprev; | ||
322 | next_jh = jh; | ||
323 | do { | ||
324 | struct buffer_head *bh; | ||
325 | |||
326 | jh = next_jh; | ||
327 | next_jh = jh->b_cpnext; | ||
328 | bh = jh2bh(jh); | ||
329 | if (!jbd_trylock_bh_state(bh)) { | ||
330 | jbd_sync_bh(journal, bh); | ||
331 | spin_lock(&journal->j_list_lock); | ||
332 | retry = 1; | ||
333 | break; | ||
334 | } | ||
335 | retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); | ||
336 | if (cond_resched_lock(&journal->j_list_lock)) { | ||
337 | retry = 1; | ||
338 | break; | ||
339 | } | ||
340 | } while (jh != last_jh && !retry); | ||
341 | |||
342 | if (batch_count) | ||
343 | __flush_batch(journal, bhs, &batch_count); | ||
344 | |||
345 | /* | ||
346 | * If someone cleaned up this transaction while we slept, we're | ||
347 | * done | ||
348 | */ | ||
349 | if (journal->j_checkpoint_transactions != transaction) | ||
350 | break; | ||
351 | if (retry) | ||
352 | continue; | ||
353 | /* | ||
354 | * Maybe it's a new transaction, but it fell at the same | ||
355 | * address | ||
356 | */ | ||
357 | if (transaction->t_tid != this_tid) | ||
358 | continue; | ||
359 | /* | ||
360 | * We have walked the whole transaction list without | ||
361 | * finding anything to write to disk. We had better be | ||
362 | * able to make some progress or we are in trouble. | ||
363 | */ | ||
364 | cleanup_ret = __cleanup_transaction(journal, transaction); | ||
365 | J_ASSERT(drop_count != 0 || cleanup_ret != 0); | ||
366 | if (journal->j_checkpoint_transactions != transaction) | ||
367 | break; | ||
368 | } | ||
369 | spin_unlock(&journal->j_list_lock); | ||
370 | result = cleanup_journal_tail(journal); | ||
371 | if (result < 0) | ||
372 | return result; | ||
373 | |||
374 | return 0; | ||
375 | } | ||
376 | |||
377 | /* | ||
378 | * Check the list of checkpoint transactions for the journal to see if | ||
379 | * we have already got rid of any since the last update of the log tail | ||
380 | * in the journal superblock. If so, we can instantly roll the | ||
381 | * superblock forward to remove those transactions from the log. | ||
382 | * | ||
383 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. | ||
384 | * | ||
385 | * Called with the journal lock held. | ||
386 | * | ||
387 | * This is the only part of the journaling code which really needs to be | ||
388 | * aware of transaction aborts. Checkpointing involves writing to the | ||
389 | * main filesystem area rather than to the journal, so it can proceed | ||
390 | * even in abort state, but we must not update the journal superblock if | ||
391 | * we have an abort error outstanding. | ||
392 | */ | ||
393 | |||
394 | int cleanup_journal_tail(journal_t *journal) | ||
395 | { | ||
396 | transaction_t * transaction; | ||
397 | tid_t first_tid; | ||
398 | unsigned long blocknr, freed; | ||
399 | |||
400 | /* OK, work out the oldest transaction remaining in the log, and | ||
401 | * the log block it starts at. | ||
402 | * | ||
403 | * If the log is now empty, we need to work out which is the | ||
404 | * next transaction ID we will write, and where it will | ||
405 | * start. */ | ||
406 | |||
407 | spin_lock(&journal->j_state_lock); | ||
408 | spin_lock(&journal->j_list_lock); | ||
409 | transaction = journal->j_checkpoint_transactions; | ||
410 | if (transaction) { | ||
411 | first_tid = transaction->t_tid; | ||
412 | blocknr = transaction->t_log_start; | ||
413 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | ||
414 | first_tid = transaction->t_tid; | ||
415 | blocknr = transaction->t_log_start; | ||
416 | } else if ((transaction = journal->j_running_transaction) != NULL) { | ||
417 | first_tid = transaction->t_tid; | ||
418 | blocknr = journal->j_head; | ||
419 | } else { | ||
420 | first_tid = journal->j_transaction_sequence; | ||
421 | blocknr = journal->j_head; | ||
422 | } | ||
423 | spin_unlock(&journal->j_list_lock); | ||
424 | J_ASSERT(blocknr != 0); | ||
425 | |||
426 | /* If the oldest pinned transaction is at the tail of the log | ||
427 | already then there's not much we can do right now. */ | ||
428 | if (journal->j_tail_sequence == first_tid) { | ||
429 | spin_unlock(&journal->j_state_lock); | ||
430 | return 1; | ||
431 | } | ||
432 | |||
433 | /* OK, update the superblock to recover the freed space. | ||
434 | * Physical blocks come first: have we wrapped beyond the end of | ||
435 | * the log? */ | ||
436 | freed = blocknr - journal->j_tail; | ||
437 | if (blocknr < journal->j_tail) | ||
438 | freed = freed + journal->j_last - journal->j_first; | ||
439 | |||
440 | jbd_debug(1, | ||
441 | "Cleaning journal tail from %d to %d (offset %lu), " | ||
442 | "freeing %lu\n", | ||
443 | journal->j_tail_sequence, first_tid, blocknr, freed); | ||
444 | |||
445 | journal->j_free += freed; | ||
446 | journal->j_tail_sequence = first_tid; | ||
447 | journal->j_tail = blocknr; | ||
448 | spin_unlock(&journal->j_state_lock); | ||
449 | if (!(journal->j_flags & JFS_ABORT)) | ||
450 | journal_update_superblock(journal, 1); | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | |||
455 | /* Checkpoint list management */ | ||
456 | |||
457 | /* | ||
458 | * journal_clean_checkpoint_list | ||
459 | * | ||
460 | * Find all the written-back checkpoint buffers in the journal and release them. | ||
461 | * | ||
462 | * Called with the journal locked. | ||
463 | * Called with j_list_lock held. | ||
464 | * Returns number of bufers reaped (for debug) | ||
465 | */ | ||
466 | |||
467 | int __journal_clean_checkpoint_list(journal_t *journal) | ||
468 | { | ||
469 | transaction_t *transaction, *last_transaction, *next_transaction; | ||
470 | int ret = 0; | ||
471 | |||
472 | transaction = journal->j_checkpoint_transactions; | ||
473 | if (transaction == 0) | ||
474 | goto out; | ||
475 | |||
476 | last_transaction = transaction->t_cpprev; | ||
477 | next_transaction = transaction; | ||
478 | do { | ||
479 | struct journal_head *jh; | ||
480 | |||
481 | transaction = next_transaction; | ||
482 | next_transaction = transaction->t_cpnext; | ||
483 | jh = transaction->t_checkpoint_list; | ||
484 | if (jh) { | ||
485 | struct journal_head *last_jh = jh->b_cpprev; | ||
486 | struct journal_head *next_jh = jh; | ||
487 | |||
488 | do { | ||
489 | jh = next_jh; | ||
490 | next_jh = jh->b_cpnext; | ||
491 | /* Use trylock because of the ranknig */ | ||
492 | if (jbd_trylock_bh_state(jh2bh(jh))) | ||
493 | ret += __try_to_free_cp_buf(jh); | ||
494 | /* | ||
495 | * This function only frees up some memory | ||
496 | * if possible so we dont have an obligation | ||
497 | * to finish processing. Bail out if preemption | ||
498 | * requested: | ||
499 | */ | ||
500 | if (need_resched()) | ||
501 | goto out; | ||
502 | } while (jh != last_jh); | ||
503 | } | ||
504 | } while (transaction != last_transaction); | ||
505 | out: | ||
506 | return ret; | ||
507 | } | ||
508 | |||
509 | /* | ||
510 | * journal_remove_checkpoint: called after a buffer has been committed | ||
511 | * to disk (either by being write-back flushed to disk, or being | ||
512 | * committed to the log). | ||
513 | * | ||
514 | * We cannot safely clean a transaction out of the log until all of the | ||
515 | * buffer updates committed in that transaction have safely been stored | ||
516 | * elsewhere on disk. To achieve this, all of the buffers in a | ||
517 | * transaction need to be maintained on the transaction's checkpoint | ||
518 | * list until they have been rewritten, at which point this function is | ||
519 | * called to remove the buffer from the existing transaction's | ||
520 | * checkpoint list. | ||
521 | * | ||
522 | * This function is called with the journal locked. | ||
523 | * This function is called with j_list_lock held. | ||
524 | */ | ||
525 | |||
526 | void __journal_remove_checkpoint(struct journal_head *jh) | ||
527 | { | ||
528 | transaction_t *transaction; | ||
529 | journal_t *journal; | ||
530 | |||
531 | JBUFFER_TRACE(jh, "entry"); | ||
532 | |||
533 | if ((transaction = jh->b_cp_transaction) == NULL) { | ||
534 | JBUFFER_TRACE(jh, "not on transaction"); | ||
535 | goto out; | ||
536 | } | ||
537 | journal = transaction->t_journal; | ||
538 | |||
539 | __buffer_unlink(jh); | ||
540 | |||
541 | if (transaction->t_checkpoint_list != NULL) | ||
542 | goto out; | ||
543 | JBUFFER_TRACE(jh, "transaction has no more buffers"); | ||
544 | |||
545 | /* | ||
546 | * There is one special case to worry about: if we have just pulled the | ||
547 | * buffer off a committing transaction's forget list, then even if the | ||
548 | * checkpoint list is empty, the transaction obviously cannot be | ||
549 | * dropped! | ||
550 | * | ||
551 | * The locking here around j_committing_transaction is a bit sleazy. | ||
552 | * See the comment at the end of journal_commit_transaction(). | ||
553 | */ | ||
554 | if (transaction == journal->j_committing_transaction) { | ||
555 | JBUFFER_TRACE(jh, "belongs to committing transaction"); | ||
556 | goto out; | ||
557 | } | ||
558 | |||
559 | /* OK, that was the last buffer for the transaction: we can now | ||
560 | safely remove this transaction from the log */ | ||
561 | |||
562 | __journal_drop_transaction(journal, transaction); | ||
563 | |||
564 | /* Just in case anybody was waiting for more transactions to be | ||
565 | checkpointed... */ | ||
566 | wake_up(&journal->j_wait_logspace); | ||
567 | out: | ||
568 | JBUFFER_TRACE(jh, "exit"); | ||
569 | } | ||
570 | |||
571 | /* | ||
572 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint | ||
573 | * list so that we know when it is safe to clean the transaction out of | ||
574 | * the log. | ||
575 | * | ||
576 | * Called with the journal locked. | ||
577 | * Called with j_list_lock held. | ||
578 | */ | ||
579 | void __journal_insert_checkpoint(struct journal_head *jh, | ||
580 | transaction_t *transaction) | ||
581 | { | ||
582 | JBUFFER_TRACE(jh, "entry"); | ||
583 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); | ||
584 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); | ||
585 | |||
586 | jh->b_cp_transaction = transaction; | ||
587 | |||
588 | if (!transaction->t_checkpoint_list) { | ||
589 | jh->b_cpnext = jh->b_cpprev = jh; | ||
590 | } else { | ||
591 | jh->b_cpnext = transaction->t_checkpoint_list; | ||
592 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; | ||
593 | jh->b_cpprev->b_cpnext = jh; | ||
594 | jh->b_cpnext->b_cpprev = jh; | ||
595 | } | ||
596 | transaction->t_checkpoint_list = jh; | ||
597 | } | ||
598 | |||
599 | /* | ||
600 | * We've finished with this transaction structure: adios... | ||
601 | * | ||
602 | * The transaction must have no links except for the checkpoint by this | ||
603 | * point. | ||
604 | * | ||
605 | * Called with the journal locked. | ||
606 | * Called with j_list_lock held. | ||
607 | */ | ||
608 | |||
609 | void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) | ||
610 | { | ||
611 | assert_spin_locked(&journal->j_list_lock); | ||
612 | if (transaction->t_cpnext) { | ||
613 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; | ||
614 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; | ||
615 | if (journal->j_checkpoint_transactions == transaction) | ||
616 | journal->j_checkpoint_transactions = | ||
617 | transaction->t_cpnext; | ||
618 | if (journal->j_checkpoint_transactions == transaction) | ||
619 | journal->j_checkpoint_transactions = NULL; | ||
620 | } | ||
621 | |||
622 | J_ASSERT(transaction->t_state == T_FINISHED); | ||
623 | J_ASSERT(transaction->t_buffers == NULL); | ||
624 | J_ASSERT(transaction->t_sync_datalist == NULL); | ||
625 | J_ASSERT(transaction->t_forget == NULL); | ||
626 | J_ASSERT(transaction->t_iobuf_list == NULL); | ||
627 | J_ASSERT(transaction->t_shadow_list == NULL); | ||
628 | J_ASSERT(transaction->t_log_list == NULL); | ||
629 | J_ASSERT(transaction->t_checkpoint_list == NULL); | ||
630 | J_ASSERT(transaction->t_updates == 0); | ||
631 | J_ASSERT(journal->j_committing_transaction != transaction); | ||
632 | J_ASSERT(journal->j_running_transaction != transaction); | ||
633 | |||
634 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | ||
635 | kfree(transaction); | ||
636 | } | ||
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c new file mode 100644 index 000000000000..dac720c837ab --- /dev/null +++ b/fs/jbd/commit.c | |||
@@ -0,0 +1,844 @@ | |||
1 | /* | ||
2 | * linux/fs/commit.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal commit routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | */ | ||
15 | |||
16 | #include <linux/time.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/jbd.h> | ||
19 | #include <linux/errno.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/pagemap.h> | ||
23 | #include <linux/smp_lock.h> | ||
24 | |||
/*
 * Default IO end handler for temporary BJ_IO buffer_heads: record the
 * outcome of the IO in the buffer's uptodate flag, then unlock the
 * buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
37 | |||
38 | /* | ||
39 | * When an ext3-ordered file is truncated, it is possible that many pages are | ||
40 | * not sucessfully freed, because they are attached to a committing transaction. | ||
41 | * After the transaction commits, these pages are left on the LRU, with no | ||
42 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | ||
43 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | ||
44 | * the numbers in /proc/meminfo look odd. | ||
45 | * | ||
46 | * So here, we have a buffer which has just come off the forget list. Look to | ||
47 | * see if we can strip all buffers from the backing page. | ||
48 | * | ||
49 | * Called under lock_journal(), and possibly under journal_datalist_lock. The | ||
50 | * caller provided us with a ref against the buffer, and we drop that here. | ||
51 | */ | ||
52 | static void release_buffer_page(struct buffer_head *bh) | ||
53 | { | ||
54 | struct page *page; | ||
55 | |||
56 | if (buffer_dirty(bh)) | ||
57 | goto nope; | ||
58 | if (atomic_read(&bh->b_count) != 1) | ||
59 | goto nope; | ||
60 | page = bh->b_page; | ||
61 | if (!page) | ||
62 | goto nope; | ||
63 | if (page->mapping) | ||
64 | goto nope; | ||
65 | |||
66 | /* OK, it's a truncated page */ | ||
67 | if (TestSetPageLocked(page)) | ||
68 | goto nope; | ||
69 | |||
70 | page_cache_get(page); | ||
71 | __brelse(bh); | ||
72 | try_to_free_buffers(page); | ||
73 | unlock_page(page); | ||
74 | page_cache_release(page); | ||
75 | return; | ||
76 | |||
77 | nope: | ||
78 | __brelse(bh); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | ||
83 | * held. For ranking reasons we must trylock. If we lose, schedule away and | ||
84 | * return 0. j_list_lock is dropped in this case. | ||
85 | */ | ||
86 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | ||
87 | { | ||
88 | if (!jbd_trylock_bh_state(bh)) { | ||
89 | spin_unlock(&journal->j_list_lock); | ||
90 | schedule(); | ||
91 | return 0; | ||
92 | } | ||
93 | return 1; | ||
94 | } | ||
95 | |||
96 | /* Done it all: now write the commit record. We should have | ||
97 | * cleaned up our previous buffers by now, so if we are in abort | ||
98 | * mode we can now just skip the rest of the journal write | ||
99 | * entirely. | ||
100 | * | ||
101 | * Returns 1 if the journal needs to be aborted or 0 on success | ||
102 | */ | ||
103 | static int journal_write_commit_record(journal_t *journal, | ||
104 | transaction_t *commit_transaction) | ||
105 | { | ||
106 | struct journal_head *descriptor; | ||
107 | struct buffer_head *bh; | ||
108 | int i, ret; | ||
109 | int barrier_done = 0; | ||
110 | |||
111 | if (is_journal_aborted(journal)) | ||
112 | return 0; | ||
113 | |||
114 | descriptor = journal_get_descriptor_buffer(journal); | ||
115 | if (!descriptor) | ||
116 | return 1; | ||
117 | |||
118 | bh = jh2bh(descriptor); | ||
119 | |||
120 | /* AKPM: buglet - add `i' to tmp! */ | ||
121 | for (i = 0; i < bh->b_size; i += 512) { | ||
122 | journal_header_t *tmp = (journal_header_t*)bh->b_data; | ||
123 | tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
124 | tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); | ||
125 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
126 | } | ||
127 | |||
128 | JBUFFER_TRACE(descriptor, "write commit block"); | ||
129 | set_buffer_dirty(bh); | ||
130 | if (journal->j_flags & JFS_BARRIER) { | ||
131 | set_buffer_ordered(bh); | ||
132 | barrier_done = 1; | ||
133 | } | ||
134 | ret = sync_dirty_buffer(bh); | ||
135 | /* is it possible for another commit to fail at roughly | ||
136 | * the same time as this one? If so, we don't want to | ||
137 | * trust the barrier flag in the super, but instead want | ||
138 | * to remember if we sent a barrier request | ||
139 | */ | ||
140 | if (ret == -EOPNOTSUPP && barrier_done) { | ||
141 | char b[BDEVNAME_SIZE]; | ||
142 | |||
143 | printk(KERN_WARNING | ||
144 | "JBD: barrier-based sync failed on %s - " | ||
145 | "disabling barriers\n", | ||
146 | bdevname(journal->j_dev, b)); | ||
147 | spin_lock(&journal->j_state_lock); | ||
148 | journal->j_flags &= ~JFS_BARRIER; | ||
149 | spin_unlock(&journal->j_state_lock); | ||
150 | |||
151 | /* And try again, without the barrier */ | ||
152 | clear_buffer_ordered(bh); | ||
153 | set_buffer_uptodate(bh); | ||
154 | set_buffer_dirty(bh); | ||
155 | ret = sync_dirty_buffer(bh); | ||
156 | } | ||
157 | put_bh(bh); /* One for getblk() */ | ||
158 | journal_put_journal_head(descriptor); | ||
159 | |||
160 | return (ret == -EIO); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * journal_commit_transaction - commit journal->j_running_transaction to the log | ||
165 | * @journal: journal to commit; it must have a running transaction and no committing transaction (both are asserted below). | ||
166 | * The primary function for committing a transaction to the log. This | ||
167 | * function is called by the journal thread to begin a complete commit. The transaction is set to T_LOCKED while t_updates drains, then T_FLUSH while sync data buffers are written and journal_write_revoke_records() is called, then T_COMMIT while metadata is logged behind descriptor blocks and journal_write_commit_record() is called. IO errors and failures to obtain a descriptor or log block abort the journal via __journal_abort_hard(). Finally the t_forget list is processed, the transaction becomes T_FINISHED and is either dropped or linked onto j_checkpoint_transactions, and j_wait_done_commit is woken. | ||
168 | */ | ||
169 | void journal_commit_transaction(journal_t *journal) | ||
170 | { | ||
171 | transaction_t *commit_transaction; | ||
172 | struct journal_head *jh, *new_jh, *descriptor; | ||
173 | struct buffer_head **wbuf = journal->j_wbuf; | ||
174 | int bufs; | ||
175 | int flags; | ||
176 | int err; | ||
177 | unsigned long blocknr; | ||
178 | char *tagp = NULL; | ||
179 | journal_header_t *header; | ||
180 | journal_block_tag_t *tag = NULL; | ||
181 | int space_left = 0; | ||
182 | int first_tag = 0; | ||
183 | int tag_flag; | ||
184 | int i; | ||
185 | |||
186 | /* | ||
187 | * First job: lock down the current transaction and wait for | ||
188 | * all outstanding updates to complete. | ||
189 | */ | ||
190 | |||
191 | #ifdef COMMIT_STATS | ||
192 | spin_lock(&journal->j_list_lock); | ||
193 | summarise_journal_usage(journal); | ||
194 | spin_unlock(&journal->j_list_lock); | ||
195 | #endif | ||
196 | |||
197 | /* Do we need to erase the effects of a prior journal_flush? */ | ||
198 | if (journal->j_flags & JFS_FLUSHED) { | ||
199 | jbd_debug(3, "super block updated\n"); | ||
200 | journal_update_superblock(journal, 1); | ||
201 | } else { | ||
202 | jbd_debug(3, "superblock not updated\n"); | ||
203 | } | ||
204 | |||
205 | J_ASSERT(journal->j_running_transaction != NULL); | ||
206 | J_ASSERT(journal->j_committing_transaction == NULL); | ||
207 | |||
208 | commit_transaction = journal->j_running_transaction; | ||
209 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
210 | |||
211 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | ||
212 | commit_transaction->t_tid); | ||
213 | |||
214 | spin_lock(&journal->j_state_lock); | ||
215 | commit_transaction->t_state = T_LOCKED; | ||
216 | |||
217 | spin_lock(&commit_transaction->t_handle_lock); | ||
218 | while (commit_transaction->t_updates) { | ||
219 | DEFINE_WAIT(wait); | ||
220 | |||
221 | prepare_to_wait(&journal->j_wait_updates, &wait, | ||
222 | TASK_UNINTERRUPTIBLE); | ||
223 | if (commit_transaction->t_updates) { | ||
224 | spin_unlock(&commit_transaction->t_handle_lock); | ||
225 | spin_unlock(&journal->j_state_lock); | ||
226 | schedule(); | ||
227 | spin_lock(&journal->j_state_lock); | ||
228 | spin_lock(&commit_transaction->t_handle_lock); | ||
229 | } | ||
230 | finish_wait(&journal->j_wait_updates, &wait); | ||
231 | } | ||
232 | spin_unlock(&commit_transaction->t_handle_lock); | ||
233 | |||
234 | J_ASSERT (commit_transaction->t_outstanding_credits <= | ||
235 | journal->j_max_transaction_buffers); | ||
236 | |||
237 | /* | ||
238 | * First thing we are allowed to do is to discard any remaining | ||
239 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | ||
240 | * that there are no such buffers: if a large filesystem | ||
241 | * operation like a truncate needs to split itself over multiple | ||
242 | * transactions, then it may try to do a journal_restart() while | ||
243 | * there are still BJ_Reserved buffers outstanding. These must | ||
244 | * be released cleanly from the current transaction. | ||
245 | * | ||
246 | * In this case, the filesystem must still reserve write access | ||
247 | * again before modifying the buffer in the new transaction, but | ||
248 | * we do not require it to remember exactly which old buffers it | ||
249 | * has reserved. This is consistent with the existing behaviour | ||
250 | * that multiple journal_get_write_access() calls to the same | ||
251 | * buffer are perfectly permissible. | ||
252 | */ | ||
253 | while (commit_transaction->t_reserved_list) { | ||
254 | jh = commit_transaction->t_reserved_list; | ||
255 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | ||
256 | /* | ||
257 | * A journal_get_undo_access()+journal_release_buffer() may | ||
258 | * leave undo-committed data. | ||
259 | */ | ||
260 | if (jh->b_committed_data) { | ||
261 | struct buffer_head *bh = jh2bh(jh); | ||
262 | |||
263 | jbd_lock_bh_state(bh); | ||
264 | if (jh->b_committed_data) { | ||
265 | kfree(jh->b_committed_data); | ||
266 | jh->b_committed_data = NULL; | ||
267 | } | ||
268 | jbd_unlock_bh_state(bh); | ||
269 | } | ||
270 | journal_refile_buffer(journal, jh); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * Now try to drop any written-back buffers from the journal's | ||
275 | * checkpoint lists. We do this *before* commit because it potentially | ||
276 | * frees some memory | ||
277 | */ | ||
278 | spin_lock(&journal->j_list_lock); | ||
279 | __journal_clean_checkpoint_list(journal); | ||
280 | spin_unlock(&journal->j_list_lock); | ||
281 | |||
282 | jbd_debug (3, "JBD: commit phase 1\n"); | ||
283 | |||
284 | /* | ||
285 | * Switch to a new revoke table. | ||
286 | */ | ||
287 | journal_switch_revoke_table(journal); | ||
288 | |||
289 | commit_transaction->t_state = T_FLUSH; | ||
290 | journal->j_committing_transaction = commit_transaction; | ||
291 | journal->j_running_transaction = NULL; | ||
292 | commit_transaction->t_log_start = journal->j_head; | ||
293 | wake_up(&journal->j_wait_transaction_locked); | ||
294 | spin_unlock(&journal->j_state_lock); | ||
295 | |||
296 | jbd_debug (3, "JBD: commit phase 2\n"); | ||
297 | |||
298 | /* | ||
299 | * First, drop modified flag: all accesses to the buffers | ||
300 | * will be tracked for a new transaction only -bzzz | ||
301 | */ | ||
302 | spin_lock(&journal->j_list_lock); | ||
303 | if (commit_transaction->t_buffers) { | ||
304 | new_jh = jh = commit_transaction->t_buffers->b_tnext; | ||
305 | do { | ||
306 | J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || | ||
307 | new_jh->b_modified == 0); | ||
308 | new_jh->b_modified = 0; | ||
309 | new_jh = new_jh->b_tnext; | ||
310 | } while (new_jh != jh); | ||
311 | } | ||
312 | spin_unlock(&journal->j_list_lock); | ||
313 | |||
314 | /* | ||
315 | * Now start flushing things to disk, in the order they appear | ||
316 | * on the transaction lists. Data blocks go first. | ||
317 | */ | ||
318 | |||
319 | err = 0; | ||
320 | /* | ||
321 | * Whenever we unlock the journal and sleep, things can get added | ||
322 | * onto ->t_sync_datalist, so we have to keep looping back to | ||
323 | * write_out_data until we *know* that the list is empty. | ||
324 | */ | ||
325 | bufs = 0; | ||
326 | /* | ||
327 | * Clean up any flushed data buffers from the data list. Even in | ||
328 | * abort mode, we want to flush this out as soon as possible. | ||
329 | */ | ||
330 | write_out_data: | ||
331 | cond_resched(); | ||
332 | spin_lock(&journal->j_list_lock); | ||
333 | |||
334 | while (commit_transaction->t_sync_datalist) { | ||
335 | struct buffer_head *bh; | ||
336 | |||
337 | jh = commit_transaction->t_sync_datalist; | ||
338 | commit_transaction->t_sync_datalist = jh->b_tnext; | ||
339 | bh = jh2bh(jh); | ||
340 | if (buffer_locked(bh)) { | ||
341 | BUFFER_TRACE(bh, "locked"); | ||
342 | if (!inverted_lock(journal, bh)) | ||
343 | goto write_out_data; | ||
344 | __journal_temp_unlink_buffer(jh); | ||
345 | __journal_file_buffer(jh, commit_transaction, | ||
346 | BJ_Locked); | ||
347 | jbd_unlock_bh_state(bh); | ||
348 | if (lock_need_resched(&journal->j_list_lock)) { | ||
349 | spin_unlock(&journal->j_list_lock); | ||
350 | goto write_out_data; | ||
351 | } | ||
352 | } else { | ||
353 | if (buffer_dirty(bh)) { | ||
354 | BUFFER_TRACE(bh, "start journal writeout"); | ||
355 | get_bh(bh); | ||
356 | wbuf[bufs++] = bh; | ||
357 | if (bufs == journal->j_wbufsize) { | ||
358 | jbd_debug(2, "submit %d writes\n", | ||
359 | bufs); | ||
360 | spin_unlock(&journal->j_list_lock); | ||
361 | ll_rw_block(WRITE, bufs, wbuf); | ||
362 | journal_brelse_array(wbuf, bufs); | ||
363 | bufs = 0; | ||
364 | goto write_out_data; | ||
365 | } | ||
366 | } else { | ||
367 | BUFFER_TRACE(bh, "writeout complete: unfile"); | ||
368 | if (!inverted_lock(journal, bh)) | ||
369 | goto write_out_data; | ||
370 | __journal_unfile_buffer(jh); | ||
371 | jbd_unlock_bh_state(bh); | ||
372 | journal_remove_journal_head(bh); | ||
373 | put_bh(bh); | ||
374 | if (lock_need_resched(&journal->j_list_lock)) { | ||
375 | spin_unlock(&journal->j_list_lock); | ||
376 | goto write_out_data; | ||
377 | } | ||
378 | } | ||
379 | } | ||
380 | } | ||
381 | |||
382 | if (bufs) { | ||
383 | spin_unlock(&journal->j_list_lock); | ||
384 | ll_rw_block(WRITE, bufs, wbuf); | ||
385 | journal_brelse_array(wbuf, bufs); | ||
386 | spin_lock(&journal->j_list_lock); | ||
387 | } | ||
388 | |||
389 | /* | ||
390 | * Wait for all previously submitted IO to complete. A buffer that is not uptodate after the wait is recorded in err as -EIO. | ||
391 | */ | ||
392 | while (commit_transaction->t_locked_list) { | ||
393 | struct buffer_head *bh; | ||
394 | |||
395 | jh = commit_transaction->t_locked_list->b_tprev; | ||
396 | bh = jh2bh(jh); | ||
397 | get_bh(bh); | ||
398 | if (buffer_locked(bh)) { | ||
399 | spin_unlock(&journal->j_list_lock); | ||
400 | wait_on_buffer(bh); | ||
401 | if (unlikely(!buffer_uptodate(bh))) | ||
402 | err = -EIO; | ||
403 | spin_lock(&journal->j_list_lock); | ||
404 | } | ||
405 | if (!inverted_lock(journal, bh)) { | ||
406 | put_bh(bh); | ||
407 | spin_lock(&journal->j_list_lock); | ||
408 | continue; | ||
409 | } | ||
410 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
411 | __journal_unfile_buffer(jh); | ||
412 | jbd_unlock_bh_state(bh); | ||
413 | journal_remove_journal_head(bh); | ||
414 | put_bh(bh); | ||
415 | } else { | ||
416 | jbd_unlock_bh_state(bh); | ||
417 | } | ||
418 | put_bh(bh); | ||
419 | cond_resched_lock(&journal->j_list_lock); | ||
420 | } | ||
421 | spin_unlock(&journal->j_list_lock); | ||
422 | |||
423 | if (err) | ||
424 | __journal_abort_hard(journal); | ||
425 | |||
426 | journal_write_revoke_records(journal, commit_transaction); | ||
427 | |||
428 | jbd_debug(3, "JBD: commit phase 2\n"); | ||
429 | |||
430 | /* | ||
431 | * If we found any dirty or locked buffers, then we should have | ||
432 | * looped back up to the write_out_data label. If there weren't | ||
433 | * any then journal_clean_data_list should have wiped the list | ||
434 | * clean by now, so check that it is in fact empty. NOTE(review): no call to journal_clean_data_list is visible in this function -- the name may be stale; confirm. | ||
435 | */ | ||
436 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | ||
437 | |||
438 | jbd_debug (3, "JBD: commit phase 3\n"); | ||
439 | |||
440 | /* | ||
441 | * Way to go: we have now written out all of the data for a | ||
442 | * transaction! Now comes the tricky part: we need to write out | ||
443 | * metadata. Loop over the transaction's entire buffer list: | ||
444 | */ | ||
445 | commit_transaction->t_state = T_COMMIT; | ||
446 | |||
447 | descriptor = NULL; | ||
448 | bufs = 0; | ||
449 | while (commit_transaction->t_buffers) { | ||
450 | |||
451 | /* Find the next buffer to be journaled... */ | ||
452 | |||
453 | jh = commit_transaction->t_buffers; | ||
454 | |||
455 | /* If we're in abort mode, we just un-journal the buffer and | ||
456 | release it for background writing. */ | ||
457 | |||
458 | if (is_journal_aborted(journal)) { | ||
459 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | ||
460 | journal_refile_buffer(journal, jh); | ||
461 | /* If that was the last one, we need to clean up | ||
462 | * any descriptor buffers which may have been | ||
463 | * already allocated, even if we are now | ||
464 | * aborting. */ | ||
465 | if (!commit_transaction->t_buffers) | ||
466 | goto start_journal_io; | ||
467 | continue; | ||
468 | } | ||
469 | |||
470 | /* Make sure we have a descriptor block in which to | ||
471 | record the metadata buffer. */ | ||
472 | |||
473 | if (!descriptor) { | ||
474 | struct buffer_head *bh; | ||
475 | |||
476 | J_ASSERT (bufs == 0); | ||
477 | |||
478 | jbd_debug(4, "JBD: get descriptor\n"); | ||
479 | |||
480 | descriptor = journal_get_descriptor_buffer(journal); | ||
481 | if (!descriptor) { | ||
482 | __journal_abort_hard(journal); | ||
483 | continue; | ||
484 | } | ||
485 | |||
486 | bh = jh2bh(descriptor); | ||
487 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | ||
488 | (unsigned long long)bh->b_blocknr, bh->b_data); | ||
489 | header = (journal_header_t *)&bh->b_data[0]; | ||
490 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
491 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); | ||
492 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
493 | |||
494 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
495 | space_left = bh->b_size - sizeof(journal_header_t); | ||
496 | first_tag = 1; | ||
497 | set_buffer_jwrite(bh); | ||
498 | set_buffer_dirty(bh); | ||
499 | wbuf[bufs++] = bh; | ||
500 | |||
501 | /* Record it so that we can wait for IO | ||
502 | completion later */ | ||
503 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | ||
504 | journal_file_buffer(descriptor, commit_transaction, | ||
505 | BJ_LogCtl); | ||
506 | } | ||
507 | |||
508 | /* Where is the buffer to be written? */ | ||
509 | |||
510 | err = journal_next_log_block(journal, &blocknr); | ||
511 | /* If the block mapping failed, just abandon the buffer | ||
512 | and repeat this loop: we'll fall into the | ||
513 | refile-on-abort condition above. */ | ||
514 | if (err) { | ||
515 | __journal_abort_hard(journal); | ||
516 | continue; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * start_this_handle() uses t_outstanding_credits to determine | ||
521 | * the free space in the log, but this counter is changed | ||
522 | * by journal_next_log_block() also. | ||
523 | */ | ||
524 | commit_transaction->t_outstanding_credits--; | ||
525 | |||
526 | /* Bump b_count to prevent truncate from stumbling over | ||
527 | the shadowed buffer! @@@ This can go if we ever get | ||
528 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | ||
529 | atomic_inc(&jh2bh(jh)->b_count); | ||
530 | |||
531 | /* Make a temporary IO buffer with which to write it out | ||
532 | (this will requeue both the metadata buffer and the | ||
533 | temporary IO buffer). new_bh goes on BJ_IO */ | ||
534 | |||
535 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | ||
536 | /* | ||
537 | * akpm: journal_write_metadata_buffer() sets | ||
538 | * new_bh->b_transaction to commit_transaction. | ||
539 | * We need to clean this up before we release new_bh | ||
540 | * (which is of type BJ_IO) | ||
541 | */ | ||
542 | JBUFFER_TRACE(jh, "ph3: write metadata"); | ||
543 | flags = journal_write_metadata_buffer(commit_transaction, | ||
544 | jh, &new_jh, blocknr); | ||
545 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | ||
546 | wbuf[bufs++] = jh2bh(new_jh); | ||
547 | |||
548 | /* Record the new block's tag in the current descriptor | ||
549 | buffer */ | ||
550 | |||
551 | tag_flag = 0; | ||
552 | if (flags & 1) | ||
553 | tag_flag |= JFS_FLAG_ESCAPE; | ||
554 | if (!first_tag) | ||
555 | tag_flag |= JFS_FLAG_SAME_UUID; | ||
556 | |||
557 | tag = (journal_block_tag_t *) tagp; | ||
558 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); | ||
559 | tag->t_flags = cpu_to_be32(tag_flag); | ||
560 | tagp += sizeof(journal_block_tag_t); | ||
561 | space_left -= sizeof(journal_block_tag_t); | ||
562 | |||
563 | if (first_tag) { | ||
564 | memcpy (tagp, journal->j_uuid, 16); | ||
565 | tagp += 16; | ||
566 | space_left -= 16; | ||
567 | first_tag = 0; | ||
568 | } | ||
569 | |||
570 | /* If there's no more to do, or if the descriptor is full, | ||
571 | let the IO rip! */ | ||
572 | |||
573 | if (bufs == journal->j_wbufsize || | ||
574 | commit_transaction->t_buffers == NULL || | ||
575 | space_left < sizeof(journal_block_tag_t) + 16) { | ||
576 | |||
577 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | ||
578 | |||
579 | /* Write an end-of-descriptor marker before | ||
580 | submitting the IOs. "tag" still points to | ||
581 | the last tag we set up. */ | ||
582 | |||
583 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); | ||
584 | |||
585 | start_journal_io: | ||
586 | for (i = 0; i < bufs; i++) { | ||
587 | struct buffer_head *bh = wbuf[i]; | ||
588 | lock_buffer(bh); | ||
589 | clear_buffer_dirty(bh); | ||
590 | set_buffer_uptodate(bh); | ||
591 | bh->b_end_io = journal_end_buffer_io_sync; | ||
592 | submit_bh(WRITE, bh); | ||
593 | } | ||
594 | cond_resched(); | ||
595 | |||
596 | /* Force a new descriptor to be generated next | ||
597 | time round the loop. */ | ||
598 | descriptor = NULL; | ||
599 | bufs = 0; | ||
600 | } | ||
601 | } | ||
602 | |||
603 | /* Lo and behold: we have just managed to send a transaction to | ||
604 | the log. Before we can commit it, wait for the IO so far to | ||
605 | complete. Control buffers being written are on the | ||
606 | transaction's t_log_list queue, and metadata buffers are on | ||
607 | the t_iobuf_list queue. | ||
608 | |||
609 | Wait for the buffers in reverse order. That way we are | ||
610 | less likely to be woken up until all IOs have completed, and | ||
611 | so we incur less scheduling load. | ||
612 | */ | ||
613 | |||
614 | jbd_debug(3, "JBD: commit phase 4\n"); | ||
615 | |||
616 | /* | ||
617 | * akpm: these are BJ_IO, and j_list_lock is not needed. | ||
618 | * See __journal_try_to_free_buffer. | ||
619 | */ | ||
620 | wait_for_iobuf: | ||
621 | while (commit_transaction->t_iobuf_list != NULL) { | ||
622 | struct buffer_head *bh; | ||
623 | |||
624 | jh = commit_transaction->t_iobuf_list->b_tprev; | ||
625 | bh = jh2bh(jh); | ||
626 | if (buffer_locked(bh)) { | ||
627 | wait_on_buffer(bh); | ||
628 | goto wait_for_iobuf; | ||
629 | } | ||
630 | if (cond_resched()) | ||
631 | goto wait_for_iobuf; | ||
632 | |||
633 | if (unlikely(!buffer_uptodate(bh))) | ||
634 | err = -EIO; | ||
635 | |||
636 | clear_buffer_jwrite(bh); | ||
637 | |||
638 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | ||
639 | journal_unfile_buffer(journal, jh); | ||
640 | |||
641 | /* | ||
642 | * ->t_iobuf_list should contain only dummy buffer_heads | ||
643 | * which were created by journal_write_metadata_buffer(). | ||
644 | */ | ||
645 | BUFFER_TRACE(bh, "dumping temporary bh"); | ||
646 | journal_put_journal_head(jh); | ||
647 | __brelse(bh); | ||
648 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | ||
649 | free_buffer_head(bh); | ||
650 | |||
651 | /* We also have to unlock and free the corresponding | ||
652 | shadowed buffer */ | ||
653 | jh = commit_transaction->t_shadow_list->b_tprev; | ||
654 | bh = jh2bh(jh); | ||
655 | clear_bit(BH_JWrite, &bh->b_state); | ||
656 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | ||
657 | |||
658 | /* The metadata is now released for reuse, but we need | ||
659 | to remember it against this transaction so that when | ||
660 | we finally commit, we can do any checkpointing | ||
661 | required. */ | ||
662 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | ||
663 | journal_file_buffer(jh, commit_transaction, BJ_Forget); | ||
664 | /* Wake up any transactions which were waiting for this | ||
665 | IO to complete */ | ||
666 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
667 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | ||
668 | __brelse(bh); | ||
669 | } | ||
670 | |||
671 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | ||
672 | |||
673 | jbd_debug(3, "JBD: commit phase 5\n"); | ||
674 | |||
675 | /* Here we wait for the revoke record and descriptor record buffers */ | ||
676 | wait_for_ctlbuf: | ||
677 | while (commit_transaction->t_log_list != NULL) { | ||
678 | struct buffer_head *bh; | ||
679 | |||
680 | jh = commit_transaction->t_log_list->b_tprev; | ||
681 | bh = jh2bh(jh); | ||
682 | if (buffer_locked(bh)) { | ||
683 | wait_on_buffer(bh); | ||
684 | goto wait_for_ctlbuf; | ||
685 | } | ||
686 | if (cond_resched()) | ||
687 | goto wait_for_ctlbuf; | ||
688 | |||
689 | if (unlikely(!buffer_uptodate(bh))) | ||
690 | err = -EIO; | ||
691 | |||
692 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | ||
693 | clear_buffer_jwrite(bh); | ||
694 | journal_unfile_buffer(journal, jh); | ||
695 | journal_put_journal_head(jh); | ||
696 | __brelse(bh); /* One for getblk */ | ||
697 | /* AKPM: bforget here */ | ||
698 | } | ||
699 | |||
700 | jbd_debug(3, "JBD: commit phase 6\n"); | ||
701 | |||
702 | if (journal_write_commit_record(journal, commit_transaction)) | ||
703 | err = -EIO; | ||
704 | |||
705 | if (err) | ||
706 | __journal_abort_hard(journal); | ||
707 | |||
708 | /* End of a transaction! Finally, we can do checkpoint | ||
709 | processing: any buffers committed as a result of this | ||
710 | transaction can be removed from any checkpoint list it was on | ||
711 | before. */ | ||
712 | |||
713 | jbd_debug(3, "JBD: commit phase 7\n"); | ||
714 | |||
715 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | ||
716 | J_ASSERT(commit_transaction->t_buffers == NULL); | ||
717 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | ||
718 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | ||
719 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | ||
720 | J_ASSERT(commit_transaction->t_log_list == NULL); | ||
721 | |||
722 | restart_loop: | ||
723 | while (commit_transaction->t_forget) { | ||
724 | transaction_t *cp_transaction; | ||
725 | struct buffer_head *bh; | ||
726 | |||
727 | jh = commit_transaction->t_forget; | ||
728 | bh = jh2bh(jh); | ||
729 | jbd_lock_bh_state(bh); | ||
730 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || | ||
731 | jh->b_transaction == journal->j_running_transaction); | ||
732 | |||
733 | /* | ||
734 | * If there is undo-protected committed data against | ||
735 | * this buffer, then we can remove it now. If it is a | ||
736 | * buffer needing such protection, the old frozen_data | ||
737 | * field now points to a committed version of the | ||
738 | * buffer, so rotate that field to the new committed | ||
739 | * data. | ||
740 | * | ||
741 | * Otherwise, we can just throw away the frozen data now. | ||
742 | */ | ||
743 | if (jh->b_committed_data) { | ||
744 | kfree(jh->b_committed_data); | ||
745 | jh->b_committed_data = NULL; | ||
746 | if (jh->b_frozen_data) { | ||
747 | jh->b_committed_data = jh->b_frozen_data; | ||
748 | jh->b_frozen_data = NULL; | ||
749 | } | ||
750 | } else if (jh->b_frozen_data) { | ||
751 | kfree(jh->b_frozen_data); | ||
752 | jh->b_frozen_data = NULL; | ||
753 | } | ||
754 | |||
755 | spin_lock(&journal->j_list_lock); | ||
756 | cp_transaction = jh->b_cp_transaction; | ||
757 | if (cp_transaction) { | ||
758 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | ||
759 | __journal_remove_checkpoint(jh); | ||
760 | } | ||
761 | |||
762 | /* Only re-checkpoint the buffer_head if it is marked | ||
763 | * dirty. If the buffer was added to the BJ_Forget list | ||
764 | * by journal_forget, it may no longer be dirty and | ||
765 | * there's no point in keeping a checkpoint record for | ||
766 | * it. */ | ||
767 | |||
768 | /* A buffer which has been freed while still being | ||
769 | * journaled by a previous transaction may end up still | ||
770 | * being dirty here, but we want to avoid writing back | ||
771 | * that buffer in the future now that the last use has | ||
772 | * been committed. That's not only a performance gain, | ||
773 | * it also stops aliasing problems if the buffer is left | ||
774 | * behind for writeback and gets reallocated for another | ||
775 | * use in a different page. */ | ||
776 | if (buffer_freed(bh)) { | ||
777 | clear_buffer_freed(bh); | ||
778 | clear_buffer_jbddirty(bh); | ||
779 | } | ||
780 | |||
781 | if (buffer_jbddirty(bh)) { | ||
782 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | ||
783 | __journal_insert_checkpoint(jh, commit_transaction); | ||
784 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); | ||
785 | __journal_refile_buffer(jh); | ||
786 | jbd_unlock_bh_state(bh); | ||
787 | } else { | ||
788 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | ||
789 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
790 | __journal_unfile_buffer(jh); | ||
791 | jbd_unlock_bh_state(bh); | ||
792 | journal_remove_journal_head(bh); /* needs a brelse */ | ||
793 | release_buffer_page(bh); | ||
794 | } | ||
795 | spin_unlock(&journal->j_list_lock); | ||
796 | if (cond_resched()) | ||
797 | goto restart_loop; | ||
798 | } | ||
799 | |||
800 | /* Done with this transaction! */ | ||
801 | |||
802 | jbd_debug(3, "JBD: commit phase 8\n"); | ||
803 | |||
804 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | ||
805 | |||
806 | /* | ||
807 | * This is a bit sleazy. We borrow j_list_lock to protect | ||
808 | * journal->j_committing_transaction in __journal_remove_checkpoint. | ||
809 | * Really, __journal_remove_checkpoint should be using j_state_lock but | ||
810 | * it's a bit of a hassle to hold that across __journal_remove_checkpoint. | ||
811 | */ | ||
812 | spin_lock(&journal->j_state_lock); | ||
813 | spin_lock(&journal->j_list_lock); | ||
814 | commit_transaction->t_state = T_FINISHED; | ||
815 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | ||
816 | journal->j_commit_sequence = commit_transaction->t_tid; | ||
817 | journal->j_committing_transaction = NULL; | ||
818 | spin_unlock(&journal->j_state_lock); | ||
819 | |||
820 | if (commit_transaction->t_checkpoint_list == NULL) { | ||
821 | __journal_drop_transaction(journal, commit_transaction); | ||
822 | } else { | ||
823 | if (journal->j_checkpoint_transactions == NULL) { | ||
824 | journal->j_checkpoint_transactions = commit_transaction; | ||
825 | commit_transaction->t_cpnext = commit_transaction; | ||
826 | commit_transaction->t_cpprev = commit_transaction; | ||
827 | } else { | ||
828 | commit_transaction->t_cpnext = | ||
829 | journal->j_checkpoint_transactions; | ||
830 | commit_transaction->t_cpprev = | ||
831 | commit_transaction->t_cpnext->t_cpprev; | ||
832 | commit_transaction->t_cpnext->t_cpprev = | ||
833 | commit_transaction; | ||
834 | commit_transaction->t_cpprev->t_cpnext = | ||
835 | commit_transaction; | ||
836 | } | ||
837 | } | ||
838 | spin_unlock(&journal->j_list_lock); | ||
839 | |||
840 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | ||
841 | journal->j_commit_sequence, journal->j_tail_sequence); | ||
842 | |||
843 | wake_up(&journal->j_wait_done_commit); | ||
844 | } | ||
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c new file mode 100644 index 000000000000..1e6f2e2ad4a3 --- /dev/null +++ b/fs/jbd/journal.c | |||
@@ -0,0 +1,2003 @@ | |||
1 | /* | ||
2 | * linux/fs/journal.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Generic filesystem journal-writing code; part of the ext2fs | ||
13 | * journaling system. | ||
14 | * | ||
15 | * This file manages journals: areas of disk reserved for logging | ||
16 | * transactional updates. This includes the kernel journaling thread | ||
17 | * which is responsible for scheduling updates to the log. | ||
18 | * | ||
19 | * We do not actually manage the physical storage of the journal in this | ||
20 | * file: that is left to a per-journal policy function, which allows us | ||
21 | * to store the journal within a filesystem-specified area for ext2 | ||
22 | * journaling (ext2 can use a reserved inode for storing the log). | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/jbd.h> | ||
29 | #include <linux/errno.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/suspend.h> | ||
35 | #include <linux/pagemap.h> | ||
36 | #include <asm/uaccess.h> | ||
37 | #include <asm/page.h> | ||
38 | #include <linux/proc_fs.h> | ||
39 | |||
40 | EXPORT_SYMBOL(journal_start); | ||
41 | EXPORT_SYMBOL(journal_restart); | ||
42 | EXPORT_SYMBOL(journal_extend); | ||
43 | EXPORT_SYMBOL(journal_stop); | ||
44 | EXPORT_SYMBOL(journal_lock_updates); | ||
45 | EXPORT_SYMBOL(journal_unlock_updates); | ||
46 | EXPORT_SYMBOL(journal_get_write_access); | ||
47 | EXPORT_SYMBOL(journal_get_create_access); | ||
48 | EXPORT_SYMBOL(journal_get_undo_access); | ||
49 | EXPORT_SYMBOL(journal_dirty_data); | ||
50 | EXPORT_SYMBOL(journal_dirty_metadata); | ||
51 | EXPORT_SYMBOL(journal_release_buffer); | ||
52 | EXPORT_SYMBOL(journal_forget); | ||
53 | #if 0 | ||
54 | EXPORT_SYMBOL(journal_sync_buffer); | ||
55 | #endif | ||
56 | EXPORT_SYMBOL(journal_flush); | ||
57 | EXPORT_SYMBOL(journal_revoke); | ||
58 | |||
59 | EXPORT_SYMBOL(journal_init_dev); | ||
60 | EXPORT_SYMBOL(journal_init_inode); | ||
61 | EXPORT_SYMBOL(journal_update_format); | ||
62 | EXPORT_SYMBOL(journal_check_used_features); | ||
63 | EXPORT_SYMBOL(journal_check_available_features); | ||
64 | EXPORT_SYMBOL(journal_set_features); | ||
65 | EXPORT_SYMBOL(journal_create); | ||
66 | EXPORT_SYMBOL(journal_load); | ||
67 | EXPORT_SYMBOL(journal_destroy); | ||
68 | EXPORT_SYMBOL(journal_recover); | ||
69 | EXPORT_SYMBOL(journal_update_superblock); | ||
70 | EXPORT_SYMBOL(journal_abort); | ||
71 | EXPORT_SYMBOL(journal_errno); | ||
72 | EXPORT_SYMBOL(journal_ack_err); | ||
73 | EXPORT_SYMBOL(journal_clear_err); | ||
74 | EXPORT_SYMBOL(log_wait_commit); | ||
75 | EXPORT_SYMBOL(journal_start_commit); | ||
76 | EXPORT_SYMBOL(journal_force_commit_nested); | ||
77 | EXPORT_SYMBOL(journal_wipe); | ||
78 | EXPORT_SYMBOL(journal_blocks_per_page); | ||
79 | EXPORT_SYMBOL(journal_invalidatepage); | ||
80 | EXPORT_SYMBOL(journal_try_to_free_buffers); | ||
81 | EXPORT_SYMBOL(journal_force_commit); | ||
82 | |||
83 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | ||
84 | |||
/*
 * Timer callback for the commit-interval timer.
 *
 * @__data is the value stored in timer.data by kjournald(), i.e. the
 * task_struct pointer of the journal thread; that task is woken up.
 */

static void commit_timeout(unsigned long __data)
{
	wake_up_process((struct task_struct *) __data);
}
95 | |||
96 | /* Static check for data structure consistency. There's no code | ||
97 | * invoked --- we'll just get a linker failure if things aren't right. | ||
98 | */ | ||
99 | void __journal_internal_check(void) | ||
100 | { | ||
101 | extern void journal_bad_superblock_size(void); | ||
102 | if (sizeof(struct journal_superblock_s) != 1024) | ||
103 | journal_bad_superblock_size(); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * kjournald: The main thread function used to manage a logging device | ||
108 | * journal. | ||
109 | * | ||
110 | * This kernel thread is responsible for two things: | ||
111 | * | ||
112 | * 1) COMMIT: Every so often we need to commit the current state of the | ||
113 | * filesystem to disk. The journal thread is responsible for writing | ||
114 | * all of the metadata buffers to disk. | ||
115 | * | ||
116 | * 2) CHECKPOINT: We cannot reuse a used section of the log file until all | ||
117 | * of the data in that part of the log has been rewritten elsewhere on | ||
118 | * the disk. Flushing these old buffers to reclaim space in the log is | ||
119 | * known as checkpointing, and this thread is responsible for that job. | ||
120 | */ | ||
121 | |||
122 | journal_t *current_journal; // AKPM: debug | ||
123 | |||
124 | int kjournald(void *arg) | ||
125 | { | ||
126 | journal_t *journal = (journal_t *) arg; | ||
127 | transaction_t *transaction; | ||
128 | struct timer_list timer; | ||
129 | |||
130 | current_journal = journal; | ||
131 | |||
132 | daemonize("kjournald"); | ||
133 | |||
134 | /* Set up an interval timer which can be used to trigger a | ||
135 | commit wakeup after the commit interval expires */ | ||
136 | init_timer(&timer); | ||
137 | timer.data = (unsigned long) current; | ||
138 | timer.function = commit_timeout; | ||
139 | journal->j_commit_timer = &timer; | ||
140 | |||
141 | /* Record that the journal thread is running */ | ||
142 | journal->j_task = current; | ||
143 | wake_up(&journal->j_wait_done_commit); | ||
144 | |||
145 | printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", | ||
146 | journal->j_commit_interval / HZ); | ||
147 | |||
148 | /* | ||
149 | * And now, wait forever for commit wakeup events. | ||
150 | */ | ||
151 | spin_lock(&journal->j_state_lock); | ||
152 | |||
153 | loop: | ||
154 | if (journal->j_flags & JFS_UNMOUNT) | ||
155 | goto end_loop; | ||
156 | |||
157 | jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", | ||
158 | journal->j_commit_sequence, journal->j_commit_request); | ||
159 | |||
160 | if (journal->j_commit_sequence != journal->j_commit_request) { | ||
161 | jbd_debug(1, "OK, requests differ\n"); | ||
162 | spin_unlock(&journal->j_state_lock); | ||
163 | del_timer_sync(journal->j_commit_timer); | ||
164 | journal_commit_transaction(journal); | ||
165 | spin_lock(&journal->j_state_lock); | ||
166 | goto loop; | ||
167 | } | ||
168 | |||
169 | wake_up(&journal->j_wait_done_commit); | ||
170 | if (current->flags & PF_FREEZE) { | ||
171 | /* | ||
172 | * The simpler the better. Flushing journal isn't a | ||
173 | * good idea, because that depends on threads that may | ||
174 | * be already stopped. | ||
175 | */ | ||
176 | jbd_debug(1, "Now suspending kjournald\n"); | ||
177 | spin_unlock(&journal->j_state_lock); | ||
178 | refrigerator(PF_FREEZE); | ||
179 | spin_lock(&journal->j_state_lock); | ||
180 | } else { | ||
181 | /* | ||
182 | * We assume on resume that commits are already there, | ||
183 | * so we don't sleep | ||
184 | */ | ||
185 | DEFINE_WAIT(wait); | ||
186 | int should_sleep = 1; | ||
187 | |||
188 | prepare_to_wait(&journal->j_wait_commit, &wait, | ||
189 | TASK_INTERRUPTIBLE); | ||
190 | if (journal->j_commit_sequence != journal->j_commit_request) | ||
191 | should_sleep = 0; | ||
192 | transaction = journal->j_running_transaction; | ||
193 | if (transaction && time_after_eq(jiffies, | ||
194 | transaction->t_expires)) | ||
195 | should_sleep = 0; | ||
196 | if (should_sleep) { | ||
197 | spin_unlock(&journal->j_state_lock); | ||
198 | schedule(); | ||
199 | spin_lock(&journal->j_state_lock); | ||
200 | } | ||
201 | finish_wait(&journal->j_wait_commit, &wait); | ||
202 | } | ||
203 | |||
204 | jbd_debug(1, "kjournald wakes\n"); | ||
205 | |||
206 | /* | ||
207 | * Were we woken up by a commit wakeup event? | ||
208 | */ | ||
209 | transaction = journal->j_running_transaction; | ||
210 | if (transaction && time_after_eq(jiffies, transaction->t_expires)) { | ||
211 | journal->j_commit_request = transaction->t_tid; | ||
212 | jbd_debug(1, "woke because of timeout\n"); | ||
213 | } | ||
214 | goto loop; | ||
215 | |||
216 | end_loop: | ||
217 | spin_unlock(&journal->j_state_lock); | ||
218 | del_timer_sync(journal->j_commit_timer); | ||
219 | journal->j_task = NULL; | ||
220 | wake_up(&journal->j_wait_done_commit); | ||
221 | jbd_debug(1, "Journal thread exiting.\n"); | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | static void journal_start_thread(journal_t *journal) | ||
226 | { | ||
227 | kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES); | ||
228 | wait_event(journal->j_wait_done_commit, journal->j_task != 0); | ||
229 | } | ||
230 | |||
231 | static void journal_kill_thread(journal_t *journal) | ||
232 | { | ||
233 | spin_lock(&journal->j_state_lock); | ||
234 | journal->j_flags |= JFS_UNMOUNT; | ||
235 | |||
236 | while (journal->j_task) { | ||
237 | wake_up(&journal->j_wait_commit); | ||
238 | spin_unlock(&journal->j_state_lock); | ||
239 | wait_event(journal->j_wait_done_commit, journal->j_task == 0); | ||
240 | spin_lock(&journal->j_state_lock); | ||
241 | } | ||
242 | spin_unlock(&journal->j_state_lock); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * journal_write_metadata_buffer: write a metadata buffer to the journal. | ||
247 | * | ||
248 | * Writes a metadata buffer to a given disk block. The actual IO is not | ||
249 | * performed but a new buffer_head is constructed which labels the data | ||
250 | * to be written with the correct destination disk block. | ||
251 | * | ||
252 | * Any magic-number escaping which needs to be done will cause a | ||
253 | * copy-out here. If the buffer happens to start with the | ||
254 | * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the | ||
255 | * magic number is only written to the log for descripter blocks. In | ||
256 | * this case, we copy the data and replace the first word with 0, and we | ||
257 | * return a result code which indicates that this buffer needs to be | ||
258 | * marked as an escaped buffer in the corresponding log descriptor | ||
259 | * block. The missing word can then be restored when the block is read | ||
260 | * during recovery. | ||
261 | * | ||
262 | * If the source buffer has already been modified by a new transaction | ||
263 | * since we took the last commit snapshot, we use the frozen copy of | ||
264 | * that data for IO. If we end up using the existing buffer_head's data | ||
265 | * for the write, then we *have* to lock the buffer to prevent anyone | ||
266 | * else from using and possibly modifying it while the IO is in | ||
267 | * progress. | ||
268 | * | ||
269 | * The function returns a pointer to the buffer_heads to be used for IO. | ||
270 | * | ||
271 | * We assume that the journal has already been locked in this function. | ||
272 | * | ||
273 | * Return value: | ||
274 | * <0: Error | ||
275 | * >=0: Finished OK | ||
276 | * | ||
277 | * On success: | ||
278 | * Bit 0 set == escape performed on the data | ||
279 | * Bit 1 set == buffer copy-out performed (kfree the data after IO) | ||
280 | */ | ||
281 | |||
282 | int journal_write_metadata_buffer(transaction_t *transaction, | ||
283 | struct journal_head *jh_in, | ||
284 | struct journal_head **jh_out, | ||
285 | int blocknr) | ||
286 | { | ||
287 | int need_copy_out = 0; | ||
288 | int done_copy_out = 0; | ||
289 | int do_escape = 0; | ||
290 | char *mapped_data; | ||
291 | struct buffer_head *new_bh; | ||
292 | struct journal_head *new_jh; | ||
293 | struct page *new_page; | ||
294 | unsigned int new_offset; | ||
295 | struct buffer_head *bh_in = jh2bh(jh_in); | ||
296 | |||
297 | /* | ||
298 | * The buffer really shouldn't be locked: only the current committing | ||
299 | * transaction is allowed to write it, so nobody else is allowed | ||
300 | * to do any IO. | ||
301 | * | ||
302 | * akpm: except if we're journalling data, and write() output is | ||
303 | * also part of a shared mapping, and another thread has | ||
304 | * decided to launch a writepage() against this buffer. | ||
305 | */ | ||
306 | J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); | ||
307 | |||
308 | new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); | ||
309 | |||
310 | /* | ||
311 | * If a new transaction has already done a buffer copy-out, then | ||
312 | * we use that version of the data for the commit. | ||
313 | */ | ||
314 | jbd_lock_bh_state(bh_in); | ||
315 | repeat: | ||
316 | if (jh_in->b_frozen_data) { | ||
317 | done_copy_out = 1; | ||
318 | new_page = virt_to_page(jh_in->b_frozen_data); | ||
319 | new_offset = offset_in_page(jh_in->b_frozen_data); | ||
320 | } else { | ||
321 | new_page = jh2bh(jh_in)->b_page; | ||
322 | new_offset = offset_in_page(jh2bh(jh_in)->b_data); | ||
323 | } | ||
324 | |||
325 | mapped_data = kmap_atomic(new_page, KM_USER0); | ||
326 | /* | ||
327 | * Check for escaping | ||
328 | */ | ||
329 | if (*((__be32 *)(mapped_data + new_offset)) == | ||
330 | cpu_to_be32(JFS_MAGIC_NUMBER)) { | ||
331 | need_copy_out = 1; | ||
332 | do_escape = 1; | ||
333 | } | ||
334 | kunmap_atomic(mapped_data, KM_USER0); | ||
335 | |||
336 | /* | ||
337 | * Do we need to do a data copy? | ||
338 | */ | ||
339 | if (need_copy_out && !done_copy_out) { | ||
340 | char *tmp; | ||
341 | |||
342 | jbd_unlock_bh_state(bh_in); | ||
343 | tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS); | ||
344 | jbd_lock_bh_state(bh_in); | ||
345 | if (jh_in->b_frozen_data) { | ||
346 | kfree(tmp); | ||
347 | goto repeat; | ||
348 | } | ||
349 | |||
350 | jh_in->b_frozen_data = tmp; | ||
351 | mapped_data = kmap_atomic(new_page, KM_USER0); | ||
352 | memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); | ||
353 | kunmap_atomic(mapped_data, KM_USER0); | ||
354 | |||
355 | new_page = virt_to_page(tmp); | ||
356 | new_offset = offset_in_page(tmp); | ||
357 | done_copy_out = 1; | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Did we need to do an escaping? Now we've done all the | ||
362 | * copying, we can finally do so. | ||
363 | */ | ||
364 | if (do_escape) { | ||
365 | mapped_data = kmap_atomic(new_page, KM_USER0); | ||
366 | *((unsigned int *)(mapped_data + new_offset)) = 0; | ||
367 | kunmap_atomic(mapped_data, KM_USER0); | ||
368 | } | ||
369 | |||
370 | /* keep subsequent assertions sane */ | ||
371 | new_bh->b_state = 0; | ||
372 | init_buffer(new_bh, NULL, NULL); | ||
373 | atomic_set(&new_bh->b_count, 1); | ||
374 | jbd_unlock_bh_state(bh_in); | ||
375 | |||
376 | new_jh = journal_add_journal_head(new_bh); /* This sleeps */ | ||
377 | |||
378 | set_bh_page(new_bh, new_page, new_offset); | ||
379 | new_jh->b_transaction = NULL; | ||
380 | new_bh->b_size = jh2bh(jh_in)->b_size; | ||
381 | new_bh->b_bdev = transaction->t_journal->j_dev; | ||
382 | new_bh->b_blocknr = blocknr; | ||
383 | set_buffer_mapped(new_bh); | ||
384 | set_buffer_dirty(new_bh); | ||
385 | |||
386 | *jh_out = new_jh; | ||
387 | |||
388 | /* | ||
389 | * The to-be-written buffer needs to get moved to the io queue, | ||
390 | * and the original buffer whose contents we are shadowing or | ||
391 | * copying is moved to the transaction's shadow queue. | ||
392 | */ | ||
393 | JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); | ||
394 | journal_file_buffer(jh_in, transaction, BJ_Shadow); | ||
395 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); | ||
396 | journal_file_buffer(new_jh, transaction, BJ_IO); | ||
397 | |||
398 | return do_escape | (done_copy_out << 1); | ||
399 | } | ||
400 | |||
401 | /* | ||
402 | * Allocation code for the journal file. Manage the space left in the | ||
403 | * journal, so that we can begin checkpointing when appropriate. | ||
404 | */ | ||
405 | |||
406 | /* | ||
407 | * __log_space_left: Return the number of free blocks left in the journal. | ||
408 | * | ||
409 | * Called with the journal already locked. | ||
410 | * | ||
411 | * Called under j_state_lock | ||
412 | */ | ||
413 | |||
414 | int __log_space_left(journal_t *journal) | ||
415 | { | ||
416 | int left = journal->j_free; | ||
417 | |||
418 | assert_spin_locked(&journal->j_state_lock); | ||
419 | |||
420 | /* | ||
421 | * Be pessimistic here about the number of those free blocks which | ||
422 | * might be required for log descriptor control blocks. | ||
423 | */ | ||
424 | |||
425 | #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ | ||
426 | |||
427 | left -= MIN_LOG_RESERVED_BLOCKS; | ||
428 | |||
429 | if (left <= 0) | ||
430 | return 0; | ||
431 | left -= (left >> 3); | ||
432 | return left; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Called under j_state_lock. Returns true if a transaction was started. | ||
437 | */ | ||
438 | int __log_start_commit(journal_t *journal, tid_t target) | ||
439 | { | ||
440 | /* | ||
441 | * Are we already doing a recent enough commit? | ||
442 | */ | ||
443 | if (!tid_geq(journal->j_commit_request, target)) { | ||
444 | /* | ||
445 | * We want a new commit: OK, mark the request and wakup the | ||
446 | * commit thread. We do _not_ do the commit ourselves. | ||
447 | */ | ||
448 | |||
449 | journal->j_commit_request = target; | ||
450 | jbd_debug(1, "JBD: requesting commit %d/%d\n", | ||
451 | journal->j_commit_request, | ||
452 | journal->j_commit_sequence); | ||
453 | wake_up(&journal->j_wait_commit); | ||
454 | return 1; | ||
455 | } | ||
456 | return 0; | ||
457 | } | ||
458 | |||
459 | int log_start_commit(journal_t *journal, tid_t tid) | ||
460 | { | ||
461 | int ret; | ||
462 | |||
463 | spin_lock(&journal->j_state_lock); | ||
464 | ret = __log_start_commit(journal, tid); | ||
465 | spin_unlock(&journal->j_state_lock); | ||
466 | return ret; | ||
467 | } | ||
468 | |||
469 | /* | ||
470 | * Force and wait upon a commit if the calling process is not within | ||
471 | * transaction. This is used for forcing out undo-protected data which contains | ||
472 | * bitmaps, when the fs is running out of space. | ||
473 | * | ||
474 | * We can only force the running transaction if we don't have an active handle; | ||
475 | * otherwise, we will deadlock. | ||
476 | * | ||
477 | * Returns true if a transaction was started. | ||
478 | */ | ||
479 | int journal_force_commit_nested(journal_t *journal) | ||
480 | { | ||
481 | transaction_t *transaction = NULL; | ||
482 | tid_t tid; | ||
483 | |||
484 | spin_lock(&journal->j_state_lock); | ||
485 | if (journal->j_running_transaction && !current->journal_info) { | ||
486 | transaction = journal->j_running_transaction; | ||
487 | __log_start_commit(journal, transaction->t_tid); | ||
488 | } else if (journal->j_committing_transaction) | ||
489 | transaction = journal->j_committing_transaction; | ||
490 | |||
491 | if (!transaction) { | ||
492 | spin_unlock(&journal->j_state_lock); | ||
493 | return 0; /* Nothing to retry */ | ||
494 | } | ||
495 | |||
496 | tid = transaction->t_tid; | ||
497 | spin_unlock(&journal->j_state_lock); | ||
498 | log_wait_commit(journal, tid); | ||
499 | return 1; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Start a commit of the current running transaction (if any). Returns true | ||
504 | * if a transaction was started, and fills its tid in at *ptid | ||
505 | */ | ||
506 | int journal_start_commit(journal_t *journal, tid_t *ptid) | ||
507 | { | ||
508 | int ret = 0; | ||
509 | |||
510 | spin_lock(&journal->j_state_lock); | ||
511 | if (journal->j_running_transaction) { | ||
512 | tid_t tid = journal->j_running_transaction->t_tid; | ||
513 | |||
514 | ret = __log_start_commit(journal, tid); | ||
515 | if (ret && ptid) | ||
516 | *ptid = tid; | ||
517 | } else if (journal->j_committing_transaction && ptid) { | ||
518 | /* | ||
519 | * If ext3_write_super() recently started a commit, then we | ||
520 | * have to wait for completion of that transaction | ||
521 | */ | ||
522 | *ptid = journal->j_committing_transaction->t_tid; | ||
523 | ret = 1; | ||
524 | } | ||
525 | spin_unlock(&journal->j_state_lock); | ||
526 | return ret; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * Wait for a specified commit to complete. | ||
531 | * The caller may not hold the journal lock. | ||
532 | */ | ||
533 | int log_wait_commit(journal_t *journal, tid_t tid) | ||
534 | { | ||
535 | int err = 0; | ||
536 | |||
537 | #ifdef CONFIG_JBD_DEBUG | ||
538 | spin_lock(&journal->j_state_lock); | ||
539 | if (!tid_geq(journal->j_commit_request, tid)) { | ||
540 | printk(KERN_EMERG | ||
541 | "%s: error: j_commit_request=%d, tid=%d\n", | ||
542 | __FUNCTION__, journal->j_commit_request, tid); | ||
543 | } | ||
544 | spin_unlock(&journal->j_state_lock); | ||
545 | #endif | ||
546 | spin_lock(&journal->j_state_lock); | ||
547 | while (tid_gt(tid, journal->j_commit_sequence)) { | ||
548 | jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", | ||
549 | tid, journal->j_commit_sequence); | ||
550 | wake_up(&journal->j_wait_commit); | ||
551 | spin_unlock(&journal->j_state_lock); | ||
552 | wait_event(journal->j_wait_done_commit, | ||
553 | !tid_gt(tid, journal->j_commit_sequence)); | ||
554 | spin_lock(&journal->j_state_lock); | ||
555 | } | ||
556 | spin_unlock(&journal->j_state_lock); | ||
557 | |||
558 | if (unlikely(is_journal_aborted(journal))) { | ||
559 | printk(KERN_EMERG "journal commit I/O error\n"); | ||
560 | err = -EIO; | ||
561 | } | ||
562 | return err; | ||
563 | } | ||
564 | |||
565 | /* | ||
566 | * Log buffer allocation routines: | ||
567 | */ | ||
568 | |||
569 | int journal_next_log_block(journal_t *journal, unsigned long *retp) | ||
570 | { | ||
571 | unsigned long blocknr; | ||
572 | |||
573 | spin_lock(&journal->j_state_lock); | ||
574 | J_ASSERT(journal->j_free > 1); | ||
575 | |||
576 | blocknr = journal->j_head; | ||
577 | journal->j_head++; | ||
578 | journal->j_free--; | ||
579 | if (journal->j_head == journal->j_last) | ||
580 | journal->j_head = journal->j_first; | ||
581 | spin_unlock(&journal->j_state_lock); | ||
582 | return journal_bmap(journal, blocknr, retp); | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Conversion of logical to physical block numbers for the journal | ||
587 | * | ||
588 | * On external journals the journal blocks are identity-mapped, so | ||
589 | * this is a no-op. If needed, we can use j_blk_offset - everything is | ||
590 | * ready. | ||
591 | */ | ||
592 | int journal_bmap(journal_t *journal, unsigned long blocknr, | ||
593 | unsigned long *retp) | ||
594 | { | ||
595 | int err = 0; | ||
596 | unsigned long ret; | ||
597 | |||
598 | if (journal->j_inode) { | ||
599 | ret = bmap(journal->j_inode, blocknr); | ||
600 | if (ret) | ||
601 | *retp = ret; | ||
602 | else { | ||
603 | char b[BDEVNAME_SIZE]; | ||
604 | |||
605 | printk(KERN_ALERT "%s: journal block not found " | ||
606 | "at offset %lu on %s\n", | ||
607 | __FUNCTION__, | ||
608 | blocknr, | ||
609 | bdevname(journal->j_dev, b)); | ||
610 | err = -EIO; | ||
611 | __journal_abort_soft(journal, err); | ||
612 | } | ||
613 | } else { | ||
614 | *retp = blocknr; /* +journal->j_blk_offset */ | ||
615 | } | ||
616 | return err; | ||
617 | } | ||
618 | |||
619 | /* | ||
620 | * We play buffer_head aliasing tricks to write data/metadata blocks to | ||
621 | * the journal without copying their contents, but for journal | ||
622 | * descriptor blocks we do need to generate bona fide buffers. | ||
623 | * | ||
624 | * After the caller of journal_get_descriptor_buffer() has finished modifying | ||
625 | * the buffer's contents they really should run flush_dcache_page(bh->b_page). | ||
626 | * But we don't bother doing that, so there will be coherency problems with | ||
627 | * mmaps of blockdevs which hold live JBD-controlled filesystems. | ||
628 | */ | ||
629 | struct journal_head *journal_get_descriptor_buffer(journal_t *journal) | ||
630 | { | ||
631 | struct buffer_head *bh; | ||
632 | unsigned long blocknr; | ||
633 | int err; | ||
634 | |||
635 | err = journal_next_log_block(journal, &blocknr); | ||
636 | |||
637 | if (err) | ||
638 | return NULL; | ||
639 | |||
640 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
641 | lock_buffer(bh); | ||
642 | memset(bh->b_data, 0, journal->j_blocksize); | ||
643 | set_buffer_uptodate(bh); | ||
644 | unlock_buffer(bh); | ||
645 | BUFFER_TRACE(bh, "return this buffer"); | ||
646 | return journal_add_journal_head(bh); | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * Management for journal control blocks: functions to create and | ||
651 | * destroy journal_t structures, and to initialise and read existing | ||
652 | * journal blocks from disk. */ | ||
653 | |||
654 | /* First: create and setup a journal_t object in memory. We initialise | ||
655 | * very few fields yet: that has to wait until we have created the | ||
656 | * journal structures from from scratch, or loaded them from disk. */ | ||
657 | |||
658 | static journal_t * journal_init_common (void) | ||
659 | { | ||
660 | journal_t *journal; | ||
661 | int err; | ||
662 | |||
663 | journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); | ||
664 | if (!journal) | ||
665 | goto fail; | ||
666 | memset(journal, 0, sizeof(*journal)); | ||
667 | |||
668 | init_waitqueue_head(&journal->j_wait_transaction_locked); | ||
669 | init_waitqueue_head(&journal->j_wait_logspace); | ||
670 | init_waitqueue_head(&journal->j_wait_done_commit); | ||
671 | init_waitqueue_head(&journal->j_wait_checkpoint); | ||
672 | init_waitqueue_head(&journal->j_wait_commit); | ||
673 | init_waitqueue_head(&journal->j_wait_updates); | ||
674 | init_MUTEX(&journal->j_barrier); | ||
675 | init_MUTEX(&journal->j_checkpoint_sem); | ||
676 | spin_lock_init(&journal->j_revoke_lock); | ||
677 | spin_lock_init(&journal->j_list_lock); | ||
678 | spin_lock_init(&journal->j_state_lock); | ||
679 | |||
680 | journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); | ||
681 | |||
682 | /* The journal is marked for error until we succeed with recovery! */ | ||
683 | journal->j_flags = JFS_ABORT; | ||
684 | |||
685 | /* Set up a default-sized revoke table for the new mount. */ | ||
686 | err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); | ||
687 | if (err) { | ||
688 | kfree(journal); | ||
689 | goto fail; | ||
690 | } | ||
691 | return journal; | ||
692 | fail: | ||
693 | return NULL; | ||
694 | } | ||
695 | |||
696 | /* journal_init_dev and journal_init_inode: | ||
697 | * | ||
698 | * Create a journal structure assigned some fixed set of disk blocks to | ||
699 | * the journal. We don't actually touch those disk blocks yet, but we | ||
700 | * need to set up all of the mapping information to tell the journaling | ||
701 | * system where the journal blocks are. | ||
702 | * | ||
703 | */ | ||
704 | |||
705 | /** | ||
706 | * journal_t * journal_init_dev() - creates an initialises a journal structure | ||
707 | * @bdev: Block device on which to create the journal | ||
708 | * @fs_dev: Device which hold journalled filesystem for this journal. | ||
709 | * @start: Block nr Start of journal. | ||
710 | * @len: Lenght of the journal in blocks. | ||
711 | * @blocksize: blocksize of journalling device | ||
712 | * @returns: a newly created journal_t * | ||
713 | * | ||
714 | * journal_init_dev creates a journal which maps a fixed contiguous | ||
715 | * range of blocks on an arbitrary block device. | ||
716 | * | ||
717 | */ | ||
718 | journal_t * journal_init_dev(struct block_device *bdev, | ||
719 | struct block_device *fs_dev, | ||
720 | int start, int len, int blocksize) | ||
721 | { | ||
722 | journal_t *journal = journal_init_common(); | ||
723 | struct buffer_head *bh; | ||
724 | int n; | ||
725 | |||
726 | if (!journal) | ||
727 | return NULL; | ||
728 | |||
729 | journal->j_dev = bdev; | ||
730 | journal->j_fs_dev = fs_dev; | ||
731 | journal->j_blk_offset = start; | ||
732 | journal->j_maxlen = len; | ||
733 | journal->j_blocksize = blocksize; | ||
734 | |||
735 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); | ||
736 | J_ASSERT(bh != NULL); | ||
737 | journal->j_sb_buffer = bh; | ||
738 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | ||
739 | |||
740 | /* journal descriptor can store up to n blocks -bzzz */ | ||
741 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | ||
742 | journal->j_wbufsize = n; | ||
743 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | ||
744 | if (!journal->j_wbuf) { | ||
745 | printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", | ||
746 | __FUNCTION__); | ||
747 | kfree(journal); | ||
748 | journal = NULL; | ||
749 | } | ||
750 | |||
751 | return journal; | ||
752 | } | ||
753 | |||
754 | /** | ||
755 | * journal_t * journal_init_inode () - creates a journal which maps to a inode. | ||
756 | * @inode: An inode to create the journal in | ||
757 | * | ||
758 | * journal_init_inode creates a journal which maps an on-disk inode as | ||
759 | * the journal. The inode must exist already, must support bmap() and | ||
760 | * must have all data blocks preallocated. | ||
761 | */ | ||
762 | journal_t * journal_init_inode (struct inode *inode) | ||
763 | { | ||
764 | struct buffer_head *bh; | ||
765 | journal_t *journal = journal_init_common(); | ||
766 | int err; | ||
767 | int n; | ||
768 | unsigned long blocknr; | ||
769 | |||
770 | if (!journal) | ||
771 | return NULL; | ||
772 | |||
773 | journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; | ||
774 | journal->j_inode = inode; | ||
775 | jbd_debug(1, | ||
776 | "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", | ||
777 | journal, inode->i_sb->s_id, inode->i_ino, | ||
778 | (long long) inode->i_size, | ||
779 | inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); | ||
780 | |||
781 | journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; | ||
782 | journal->j_blocksize = inode->i_sb->s_blocksize; | ||
783 | |||
784 | /* journal descriptor can store up to n blocks -bzzz */ | ||
785 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | ||
786 | journal->j_wbufsize = n; | ||
787 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | ||
788 | if (!journal->j_wbuf) { | ||
789 | printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", | ||
790 | __FUNCTION__); | ||
791 | kfree(journal); | ||
792 | return NULL; | ||
793 | } | ||
794 | |||
795 | err = journal_bmap(journal, 0, &blocknr); | ||
796 | /* If that failed, give up */ | ||
797 | if (err) { | ||
798 | printk(KERN_ERR "%s: Cannnot locate journal superblock\n", | ||
799 | __FUNCTION__); | ||
800 | kfree(journal); | ||
801 | return NULL; | ||
802 | } | ||
803 | |||
804 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
805 | J_ASSERT(bh != NULL); | ||
806 | journal->j_sb_buffer = bh; | ||
807 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | ||
808 | |||
809 | return journal; | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * If the journal init or create aborts, we need to mark the journal | ||
814 | * superblock as being NULL to prevent the journal destroy from writing | ||
815 | * back a bogus superblock. | ||
816 | */ | ||
817 | static void journal_fail_superblock (journal_t *journal) | ||
818 | { | ||
819 | struct buffer_head *bh = journal->j_sb_buffer; | ||
820 | brelse(bh); | ||
821 | journal->j_sb_buffer = NULL; | ||
822 | } | ||
823 | |||
824 | /* | ||
825 | * Given a journal_t structure, initialise the various fields for | ||
826 | * startup of a new journaling session. We use this both when creating | ||
827 | * a journal, and after recovering an old journal to reset it for | ||
828 | * subsequent use. | ||
829 | */ | ||
830 | |||
831 | static int journal_reset(journal_t *journal) | ||
832 | { | ||
833 | journal_superblock_t *sb = journal->j_superblock; | ||
834 | unsigned int first, last; | ||
835 | |||
836 | first = be32_to_cpu(sb->s_first); | ||
837 | last = be32_to_cpu(sb->s_maxlen); | ||
838 | |||
839 | journal->j_first = first; | ||
840 | journal->j_last = last; | ||
841 | |||
842 | journal->j_head = first; | ||
843 | journal->j_tail = first; | ||
844 | journal->j_free = last - first; | ||
845 | |||
846 | journal->j_tail_sequence = journal->j_transaction_sequence; | ||
847 | journal->j_commit_sequence = journal->j_transaction_sequence - 1; | ||
848 | journal->j_commit_request = journal->j_commit_sequence; | ||
849 | |||
850 | journal->j_max_transaction_buffers = journal->j_maxlen / 4; | ||
851 | |||
852 | /* Add the dynamic fields and write it to disk. */ | ||
853 | journal_update_superblock(journal, 1); | ||
854 | journal_start_thread(journal); | ||
855 | return 0; | ||
856 | } | ||
857 | |||
858 | /** | ||
859 | * int journal_create() - Initialise the new journal file | ||
860 | * @journal: Journal to create. This structure must have been initialised | ||
861 | * | ||
862 | * Given a journal_t structure which tells us which disk blocks we can | ||
863 | * use, create a new journal superblock and initialise all of the | ||
864 | * journal fields from scratch. | ||
865 | **/ | ||
866 | int journal_create(journal_t *journal) | ||
867 | { | ||
868 | unsigned long blocknr; | ||
869 | struct buffer_head *bh; | ||
870 | journal_superblock_t *sb; | ||
871 | int i, err; | ||
872 | |||
873 | if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { | ||
874 | printk (KERN_ERR "Journal length (%d blocks) too short.\n", | ||
875 | journal->j_maxlen); | ||
876 | journal_fail_superblock(journal); | ||
877 | return -EINVAL; | ||
878 | } | ||
879 | |||
880 | if (journal->j_inode == NULL) { | ||
881 | /* | ||
882 | * We don't know what block to start at! | ||
883 | */ | ||
884 | printk(KERN_EMERG | ||
885 | "%s: creation of journal on external device!\n", | ||
886 | __FUNCTION__); | ||
887 | BUG(); | ||
888 | } | ||
889 | |||
890 | /* Zero out the entire journal on disk. We cannot afford to | ||
891 | have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ | ||
892 | jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); | ||
893 | for (i = 0; i < journal->j_maxlen; i++) { | ||
894 | err = journal_bmap(journal, i, &blocknr); | ||
895 | if (err) | ||
896 | return err; | ||
897 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
898 | lock_buffer(bh); | ||
899 | memset (bh->b_data, 0, journal->j_blocksize); | ||
900 | BUFFER_TRACE(bh, "marking dirty"); | ||
901 | mark_buffer_dirty(bh); | ||
902 | BUFFER_TRACE(bh, "marking uptodate"); | ||
903 | set_buffer_uptodate(bh); | ||
904 | unlock_buffer(bh); | ||
905 | __brelse(bh); | ||
906 | } | ||
907 | |||
908 | sync_blockdev(journal->j_dev); | ||
909 | jbd_debug(1, "JBD: journal cleared.\n"); | ||
910 | |||
911 | /* OK, fill in the initial static fields in the new superblock */ | ||
912 | sb = journal->j_superblock; | ||
913 | |||
914 | sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
915 | sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); | ||
916 | |||
917 | sb->s_blocksize = cpu_to_be32(journal->j_blocksize); | ||
918 | sb->s_maxlen = cpu_to_be32(journal->j_maxlen); | ||
919 | sb->s_first = cpu_to_be32(1); | ||
920 | |||
921 | journal->j_transaction_sequence = 1; | ||
922 | |||
923 | journal->j_flags &= ~JFS_ABORT; | ||
924 | journal->j_format_version = 2; | ||
925 | |||
926 | return journal_reset(journal); | ||
927 | } | ||
928 | |||
929 | /** | ||
930 | * void journal_update_superblock() - Update journal sb on disk. | ||
931 | * @journal: The journal to update. | ||
932 | * @wait: Set to '0' if you don't want to wait for IO completion. | ||
933 | * | ||
934 | * Update a journal's dynamic superblock fields and write it to disk, | ||
935 | * optionally waiting for the IO to complete. | ||
936 | */ | ||
937 | void journal_update_superblock(journal_t *journal, int wait) | ||
938 | { | ||
939 | journal_superblock_t *sb = journal->j_superblock; | ||
940 | struct buffer_head *bh = journal->j_sb_buffer; | ||
941 | |||
942 | /* | ||
943 | * As a special case, if the on-disk copy is already marked as needing | ||
944 | * no recovery (s_start == 0) and there are no outstanding transactions | ||
945 | * in the filesystem, then we can safely defer the superblock update | ||
946 | * until the next commit by setting JFS_FLUSHED. This avoids | ||
947 | * attempting a write to a potential-readonly device. | ||
948 | */ | ||
949 | if (sb->s_start == 0 && journal->j_tail_sequence == | ||
950 | journal->j_transaction_sequence) { | ||
951 | jbd_debug(1,"JBD: Skipping superblock update on recovered sb " | ||
952 | "(start %ld, seq %d, errno %d)\n", | ||
953 | journal->j_tail, journal->j_tail_sequence, | ||
954 | journal->j_errno); | ||
955 | goto out; | ||
956 | } | ||
957 | |||
958 | spin_lock(&journal->j_state_lock); | ||
959 | jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", | ||
960 | journal->j_tail, journal->j_tail_sequence, journal->j_errno); | ||
961 | |||
962 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); | ||
963 | sb->s_start = cpu_to_be32(journal->j_tail); | ||
964 | sb->s_errno = cpu_to_be32(journal->j_errno); | ||
965 | spin_unlock(&journal->j_state_lock); | ||
966 | |||
967 | BUFFER_TRACE(bh, "marking dirty"); | ||
968 | mark_buffer_dirty(bh); | ||
969 | if (wait) | ||
970 | sync_dirty_buffer(bh); | ||
971 | else | ||
972 | ll_rw_block(WRITE, 1, &bh); | ||
973 | |||
974 | out: | ||
975 | /* If we have just flushed the log (by marking s_start==0), then | ||
976 | * any future commit will have to be careful to update the | ||
977 | * superblock again to re-record the true start of the log. */ | ||
978 | |||
979 | spin_lock(&journal->j_state_lock); | ||
980 | if (sb->s_start) | ||
981 | journal->j_flags &= ~JFS_FLUSHED; | ||
982 | else | ||
983 | journal->j_flags |= JFS_FLUSHED; | ||
984 | spin_unlock(&journal->j_state_lock); | ||
985 | } | ||
986 | |||
987 | /* | ||
988 | * Read the superblock for a given journal, performing initial | ||
989 | * validation of the format. | ||
990 | */ | ||
991 | |||
992 | static int journal_get_superblock(journal_t *journal) | ||
993 | { | ||
994 | struct buffer_head *bh; | ||
995 | journal_superblock_t *sb; | ||
996 | int err = -EIO; | ||
997 | |||
998 | bh = journal->j_sb_buffer; | ||
999 | |||
1000 | J_ASSERT(bh != NULL); | ||
1001 | if (!buffer_uptodate(bh)) { | ||
1002 | ll_rw_block(READ, 1, &bh); | ||
1003 | wait_on_buffer(bh); | ||
1004 | if (!buffer_uptodate(bh)) { | ||
1005 | printk (KERN_ERR | ||
1006 | "JBD: IO error reading journal superblock\n"); | ||
1007 | goto out; | ||
1008 | } | ||
1009 | } | ||
1010 | |||
1011 | sb = journal->j_superblock; | ||
1012 | |||
1013 | err = -EINVAL; | ||
1014 | |||
1015 | if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || | ||
1016 | sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { | ||
1017 | printk(KERN_WARNING "JBD: no valid journal superblock found\n"); | ||
1018 | goto out; | ||
1019 | } | ||
1020 | |||
1021 | switch(be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1022 | case JFS_SUPERBLOCK_V1: | ||
1023 | journal->j_format_version = 1; | ||
1024 | break; | ||
1025 | case JFS_SUPERBLOCK_V2: | ||
1026 | journal->j_format_version = 2; | ||
1027 | break; | ||
1028 | default: | ||
1029 | printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); | ||
1030 | goto out; | ||
1031 | } | ||
1032 | |||
1033 | if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) | ||
1034 | journal->j_maxlen = be32_to_cpu(sb->s_maxlen); | ||
1035 | else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { | ||
1036 | printk (KERN_WARNING "JBD: journal file too short\n"); | ||
1037 | goto out; | ||
1038 | } | ||
1039 | |||
1040 | return 0; | ||
1041 | |||
1042 | out: | ||
1043 | journal_fail_superblock(journal); | ||
1044 | return err; | ||
1045 | } | ||
1046 | |||
1047 | /* | ||
1048 | * Load the on-disk journal superblock and read the key fields into the | ||
1049 | * journal_t. | ||
1050 | */ | ||
1051 | |||
1052 | static int load_superblock(journal_t *journal) | ||
1053 | { | ||
1054 | int err; | ||
1055 | journal_superblock_t *sb; | ||
1056 | |||
1057 | err = journal_get_superblock(journal); | ||
1058 | if (err) | ||
1059 | return err; | ||
1060 | |||
1061 | sb = journal->j_superblock; | ||
1062 | |||
1063 | journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); | ||
1064 | journal->j_tail = be32_to_cpu(sb->s_start); | ||
1065 | journal->j_first = be32_to_cpu(sb->s_first); | ||
1066 | journal->j_last = be32_to_cpu(sb->s_maxlen); | ||
1067 | journal->j_errno = be32_to_cpu(sb->s_errno); | ||
1068 | |||
1069 | return 0; | ||
1070 | } | ||
1071 | |||
1072 | |||
1073 | /** | ||
1074 | * int journal_load() - Read journal from disk. | ||
1075 | * @journal: Journal to act on. | ||
1076 | * | ||
1077 | * Given a journal_t structure which tells us which disk blocks contain | ||
1078 | * a journal, read the journal from disk to initialise the in-memory | ||
1079 | * structures. | ||
1080 | */ | ||
1081 | int journal_load(journal_t *journal) | ||
1082 | { | ||
1083 | int err; | ||
1084 | |||
1085 | err = load_superblock(journal); | ||
1086 | if (err) | ||
1087 | return err; | ||
1088 | |||
1089 | /* If this is a V2 superblock, then we have to check the | ||
1090 | * features flags on it. */ | ||
1091 | |||
1092 | if (journal->j_format_version >= 2) { | ||
1093 | journal_superblock_t *sb = journal->j_superblock; | ||
1094 | |||
1095 | if ((sb->s_feature_ro_compat & | ||
1096 | ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || | ||
1097 | (sb->s_feature_incompat & | ||
1098 | ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { | ||
1099 | printk (KERN_WARNING | ||
1100 | "JBD: Unrecognised features on journal\n"); | ||
1101 | return -EINVAL; | ||
1102 | } | ||
1103 | } | ||
1104 | |||
1105 | /* Let the recovery code check whether it needs to recover any | ||
1106 | * data from the journal. */ | ||
1107 | if (journal_recover(journal)) | ||
1108 | goto recovery_error; | ||
1109 | |||
1110 | /* OK, we've finished with the dynamic journal bits: | ||
1111 | * reinitialise the dynamic contents of the superblock in memory | ||
1112 | * and reset them on disk. */ | ||
1113 | if (journal_reset(journal)) | ||
1114 | goto recovery_error; | ||
1115 | |||
1116 | journal->j_flags &= ~JFS_ABORT; | ||
1117 | journal->j_flags |= JFS_LOADED; | ||
1118 | return 0; | ||
1119 | |||
1120 | recovery_error: | ||
1121 | printk (KERN_WARNING "JBD: recovery failed\n"); | ||
1122 | return -EIO; | ||
1123 | } | ||
1124 | |||
1125 | /** | ||
1126 | * void journal_destroy() - Release a journal_t structure. | ||
1127 | * @journal: Journal to act on. | ||
1128 | * | ||
1129 | * Release a journal_t structure once it is no longer in use by the | ||
1130 | * journaled object. | ||
1131 | */ | ||
1132 | void journal_destroy(journal_t *journal) | ||
1133 | { | ||
1134 | /* Wait for the commit thread to wake up and die. */ | ||
1135 | journal_kill_thread(journal); | ||
1136 | |||
1137 | /* Force a final log commit */ | ||
1138 | if (journal->j_running_transaction) | ||
1139 | journal_commit_transaction(journal); | ||
1140 | |||
1141 | /* Force any old transactions to disk */ | ||
1142 | |||
1143 | /* Totally anal locking here... */ | ||
1144 | spin_lock(&journal->j_list_lock); | ||
1145 | while (journal->j_checkpoint_transactions != NULL) { | ||
1146 | spin_unlock(&journal->j_list_lock); | ||
1147 | log_do_checkpoint(journal); | ||
1148 | spin_lock(&journal->j_list_lock); | ||
1149 | } | ||
1150 | |||
1151 | J_ASSERT(journal->j_running_transaction == NULL); | ||
1152 | J_ASSERT(journal->j_committing_transaction == NULL); | ||
1153 | J_ASSERT(journal->j_checkpoint_transactions == NULL); | ||
1154 | spin_unlock(&journal->j_list_lock); | ||
1155 | |||
1156 | /* We can now mark the journal as empty. */ | ||
1157 | journal->j_tail = 0; | ||
1158 | journal->j_tail_sequence = ++journal->j_transaction_sequence; | ||
1159 | if (journal->j_sb_buffer) { | ||
1160 | journal_update_superblock(journal, 1); | ||
1161 | brelse(journal->j_sb_buffer); | ||
1162 | } | ||
1163 | |||
1164 | if (journal->j_inode) | ||
1165 | iput(journal->j_inode); | ||
1166 | if (journal->j_revoke) | ||
1167 | journal_destroy_revoke(journal); | ||
1168 | kfree(journal->j_wbuf); | ||
1169 | kfree(journal); | ||
1170 | } | ||
1171 | |||
1172 | |||
1173 | /** | ||
1174 | *int journal_check_used_features () - Check if features specified are used. | ||
1175 | * @journal: Journal to check. | ||
1176 | * @compat: bitmask of compatible features | ||
1177 | * @ro: bitmask of features that force read-only mount | ||
1178 | * @incompat: bitmask of incompatible features | ||
1179 | * | ||
1180 | * Check whether the journal uses all of a given set of | ||
1181 | * features. Return true (non-zero) if it does. | ||
1182 | **/ | ||
1183 | |||
1184 | int journal_check_used_features (journal_t *journal, unsigned long compat, | ||
1185 | unsigned long ro, unsigned long incompat) | ||
1186 | { | ||
1187 | journal_superblock_t *sb; | ||
1188 | |||
1189 | if (!compat && !ro && !incompat) | ||
1190 | return 1; | ||
1191 | if (journal->j_format_version == 1) | ||
1192 | return 0; | ||
1193 | |||
1194 | sb = journal->j_superblock; | ||
1195 | |||
1196 | if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && | ||
1197 | ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && | ||
1198 | ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) | ||
1199 | return 1; | ||
1200 | |||
1201 | return 0; | ||
1202 | } | ||
1203 | |||
1204 | /** | ||
1205 | * int journal_check_available_features() - Check feature set in journalling layer | ||
1206 | * @journal: Journal to check. | ||
1207 | * @compat: bitmask of compatible features | ||
1208 | * @ro: bitmask of features that force read-only mount | ||
1209 | * @incompat: bitmask of incompatible features | ||
1210 | * | ||
1211 | * Check whether the journaling code supports the use of | ||
1212 | * all of a given set of features on this journal. Return true | ||
1213 | * (non-zero) if it can. */ | ||
1214 | |||
1215 | int journal_check_available_features (journal_t *journal, unsigned long compat, | ||
1216 | unsigned long ro, unsigned long incompat) | ||
1217 | { | ||
1218 | journal_superblock_t *sb; | ||
1219 | |||
1220 | if (!compat && !ro && !incompat) | ||
1221 | return 1; | ||
1222 | |||
1223 | sb = journal->j_superblock; | ||
1224 | |||
1225 | /* We can support any known requested features iff the | ||
1226 | * superblock is in version 2. Otherwise we fail to support any | ||
1227 | * extended sb features. */ | ||
1228 | |||
1229 | if (journal->j_format_version != 2) | ||
1230 | return 0; | ||
1231 | |||
1232 | if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && | ||
1233 | (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && | ||
1234 | (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) | ||
1235 | return 1; | ||
1236 | |||
1237 | return 0; | ||
1238 | } | ||
1239 | |||
1240 | /** | ||
1241 | * int journal_set_features () - Mark a given journal feature in the superblock | ||
1242 | * @journal: Journal to act on. | ||
1243 | * @compat: bitmask of compatible features | ||
1244 | * @ro: bitmask of features that force read-only mount | ||
1245 | * @incompat: bitmask of incompatible features | ||
1246 | * | ||
1247 | * Mark a given journal feature as present on the | ||
1248 | * superblock. Returns true if the requested features could be set. | ||
1249 | * | ||
1250 | */ | ||
1251 | |||
1252 | int journal_set_features (journal_t *journal, unsigned long compat, | ||
1253 | unsigned long ro, unsigned long incompat) | ||
1254 | { | ||
1255 | journal_superblock_t *sb; | ||
1256 | |||
1257 | if (journal_check_used_features(journal, compat, ro, incompat)) | ||
1258 | return 1; | ||
1259 | |||
1260 | if (!journal_check_available_features(journal, compat, ro, incompat)) | ||
1261 | return 0; | ||
1262 | |||
1263 | jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", | ||
1264 | compat, ro, incompat); | ||
1265 | |||
1266 | sb = journal->j_superblock; | ||
1267 | |||
1268 | sb->s_feature_compat |= cpu_to_be32(compat); | ||
1269 | sb->s_feature_ro_compat |= cpu_to_be32(ro); | ||
1270 | sb->s_feature_incompat |= cpu_to_be32(incompat); | ||
1271 | |||
1272 | return 1; | ||
1273 | } | ||
1274 | |||
1275 | |||
1276 | /** | ||
1277 | * int journal_update_format () - Update on-disk journal structure. | ||
1278 | * @journal: Journal to act on. | ||
1279 | * | ||
1280 | * Given an initialised but unloaded journal struct, poke about in the | ||
1281 | * on-disk structure to update it to the most recent supported version. | ||
1282 | */ | ||
1283 | int journal_update_format (journal_t *journal) | ||
1284 | { | ||
1285 | journal_superblock_t *sb; | ||
1286 | int err; | ||
1287 | |||
1288 | err = journal_get_superblock(journal); | ||
1289 | if (err) | ||
1290 | return err; | ||
1291 | |||
1292 | sb = journal->j_superblock; | ||
1293 | |||
1294 | switch (be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1295 | case JFS_SUPERBLOCK_V2: | ||
1296 | return 0; | ||
1297 | case JFS_SUPERBLOCK_V1: | ||
1298 | return journal_convert_superblock_v1(journal, sb); | ||
1299 | default: | ||
1300 | break; | ||
1301 | } | ||
1302 | return -EINVAL; | ||
1303 | } | ||
1304 | |||
1305 | static int journal_convert_superblock_v1(journal_t *journal, | ||
1306 | journal_superblock_t *sb) | ||
1307 | { | ||
1308 | int offset, blocksize; | ||
1309 | struct buffer_head *bh; | ||
1310 | |||
1311 | printk(KERN_WARNING | ||
1312 | "JBD: Converting superblock from version 1 to 2.\n"); | ||
1313 | |||
1314 | /* Pre-initialise new fields to zero */ | ||
1315 | offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); | ||
1316 | blocksize = be32_to_cpu(sb->s_blocksize); | ||
1317 | memset(&sb->s_feature_compat, 0, blocksize-offset); | ||
1318 | |||
1319 | sb->s_nr_users = cpu_to_be32(1); | ||
1320 | sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); | ||
1321 | journal->j_format_version = 2; | ||
1322 | |||
1323 | bh = journal->j_sb_buffer; | ||
1324 | BUFFER_TRACE(bh, "marking dirty"); | ||
1325 | mark_buffer_dirty(bh); | ||
1326 | sync_dirty_buffer(bh); | ||
1327 | return 0; | ||
1328 | } | ||
1329 | |||
1330 | |||
1331 | /** | ||
1332 | * int journal_flush () - Flush journal | ||
1333 | * @journal: Journal to act on. | ||
1334 | * | ||
1335 | * Flush all data for a given journal to disk and empty the journal. | ||
1336 | * Filesystems can use this when remounting readonly to ensure that | ||
1337 | * recovery does not need to happen on remount. | ||
1338 | */ | ||
1339 | |||
1340 | int journal_flush(journal_t *journal) | ||
1341 | { | ||
1342 | int err = 0; | ||
1343 | transaction_t *transaction = NULL; | ||
1344 | unsigned long old_tail; | ||
1345 | |||
1346 | spin_lock(&journal->j_state_lock); | ||
1347 | |||
1348 | /* Force everything buffered to the log... */ | ||
1349 | if (journal->j_running_transaction) { | ||
1350 | transaction = journal->j_running_transaction; | ||
1351 | __log_start_commit(journal, transaction->t_tid); | ||
1352 | } else if (journal->j_committing_transaction) | ||
1353 | transaction = journal->j_committing_transaction; | ||
1354 | |||
1355 | /* Wait for the log commit to complete... */ | ||
1356 | if (transaction) { | ||
1357 | tid_t tid = transaction->t_tid; | ||
1358 | |||
1359 | spin_unlock(&journal->j_state_lock); | ||
1360 | log_wait_commit(journal, tid); | ||
1361 | } else { | ||
1362 | spin_unlock(&journal->j_state_lock); | ||
1363 | } | ||
1364 | |||
1365 | /* ...and flush everything in the log out to disk. */ | ||
1366 | spin_lock(&journal->j_list_lock); | ||
1367 | while (!err && journal->j_checkpoint_transactions != NULL) { | ||
1368 | spin_unlock(&journal->j_list_lock); | ||
1369 | err = log_do_checkpoint(journal); | ||
1370 | spin_lock(&journal->j_list_lock); | ||
1371 | } | ||
1372 | spin_unlock(&journal->j_list_lock); | ||
1373 | cleanup_journal_tail(journal); | ||
1374 | |||
1375 | /* Finally, mark the journal as really needing no recovery. | ||
1376 | * This sets s_start==0 in the underlying superblock, which is | ||
1377 | * the magic code for a fully-recovered superblock. Any future | ||
1378 | * commits of data to the journal will restore the current | ||
1379 | * s_start value. */ | ||
1380 | spin_lock(&journal->j_state_lock); | ||
1381 | old_tail = journal->j_tail; | ||
1382 | journal->j_tail = 0; | ||
1383 | spin_unlock(&journal->j_state_lock); | ||
1384 | journal_update_superblock(journal, 1); | ||
1385 | spin_lock(&journal->j_state_lock); | ||
1386 | journal->j_tail = old_tail; | ||
1387 | |||
1388 | J_ASSERT(!journal->j_running_transaction); | ||
1389 | J_ASSERT(!journal->j_committing_transaction); | ||
1390 | J_ASSERT(!journal->j_checkpoint_transactions); | ||
1391 | J_ASSERT(journal->j_head == journal->j_tail); | ||
1392 | J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); | ||
1393 | spin_unlock(&journal->j_state_lock); | ||
1394 | return err; | ||
1395 | } | ||
1396 | |||
1397 | /** | ||
1398 | * int journal_wipe() - Wipe journal contents | ||
1399 | * @journal: Journal to act on. | ||
1400 | * @write: flag (see below) | ||
1401 | * | ||
1402 | * Wipe out all of the contents of a journal, safely. This will produce | ||
1403 | * a warning if the journal contains any valid recovery information. | ||
1404 | * Must be called between journal_init_*() and journal_load(). | ||
1405 | * | ||
1406 | * If 'write' is non-zero, then we wipe out the journal on disk; otherwise | ||
1407 | * we merely suppress recovery. | ||
1408 | */ | ||
1409 | |||
1410 | int journal_wipe(journal_t *journal, int write) | ||
1411 | { | ||
1412 | journal_superblock_t *sb; | ||
1413 | int err = 0; | ||
1414 | |||
1415 | J_ASSERT (!(journal->j_flags & JFS_LOADED)); | ||
1416 | |||
1417 | err = load_superblock(journal); | ||
1418 | if (err) | ||
1419 | return err; | ||
1420 | |||
1421 | sb = journal->j_superblock; | ||
1422 | |||
1423 | if (!journal->j_tail) | ||
1424 | goto no_recovery; | ||
1425 | |||
1426 | printk (KERN_WARNING "JBD: %s recovery information on journal\n", | ||
1427 | write ? "Clearing" : "Ignoring"); | ||
1428 | |||
1429 | err = journal_skip_recovery(journal); | ||
1430 | if (write) | ||
1431 | journal_update_superblock(journal, 1); | ||
1432 | |||
1433 | no_recovery: | ||
1434 | return err; | ||
1435 | } | ||
1436 | |||
1437 | /* | ||
1438 | * journal_dev_name: format a character string to describe on what | ||
1439 | * device this journal is present. | ||
1440 | */ | ||
1441 | |||
1442 | const char *journal_dev_name(journal_t *journal, char *buffer) | ||
1443 | { | ||
1444 | struct block_device *bdev; | ||
1445 | |||
1446 | if (journal->j_inode) | ||
1447 | bdev = journal->j_inode->i_sb->s_bdev; | ||
1448 | else | ||
1449 | bdev = journal->j_dev; | ||
1450 | |||
1451 | return bdevname(bdev, buffer); | ||
1452 | } | ||
1453 | |||
1454 | /* | ||
1455 | * Journal abort has very specific semantics, which we describe | ||
1456 | * for journal abort. | ||
1457 | * | ||
1458 | * Two internal function, which provide abort to te jbd layer | ||
1459 | * itself are here. | ||
1460 | */ | ||
1461 | |||
1462 | /* | ||
1463 | * Quick version for internal journal use (doesn't lock the journal). | ||
1464 | * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, | ||
1465 | * and don't attempt to make any other journal updates. | ||
1466 | */ | ||
1467 | void __journal_abort_hard(journal_t *journal) | ||
1468 | { | ||
1469 | transaction_t *transaction; | ||
1470 | char b[BDEVNAME_SIZE]; | ||
1471 | |||
1472 | if (journal->j_flags & JFS_ABORT) | ||
1473 | return; | ||
1474 | |||
1475 | printk(KERN_ERR "Aborting journal on device %s.\n", | ||
1476 | journal_dev_name(journal, b)); | ||
1477 | |||
1478 | spin_lock(&journal->j_state_lock); | ||
1479 | journal->j_flags |= JFS_ABORT; | ||
1480 | transaction = journal->j_running_transaction; | ||
1481 | if (transaction) | ||
1482 | __log_start_commit(journal, transaction->t_tid); | ||
1483 | spin_unlock(&journal->j_state_lock); | ||
1484 | } | ||
1485 | |||
1486 | /* Soft abort: record the abort error status in the journal superblock, | ||
1487 | * but don't do any other IO. */ | ||
1488 | void __journal_abort_soft (journal_t *journal, int errno) | ||
1489 | { | ||
1490 | if (journal->j_flags & JFS_ABORT) | ||
1491 | return; | ||
1492 | |||
1493 | if (!journal->j_errno) | ||
1494 | journal->j_errno = errno; | ||
1495 | |||
1496 | __journal_abort_hard(journal); | ||
1497 | |||
1498 | if (errno) | ||
1499 | journal_update_superblock(journal, 1); | ||
1500 | } | ||
1501 | |||
1502 | /** | ||
1503 | * void journal_abort () - Shutdown the journal immediately. | ||
1504 | * @journal: the journal to shutdown. | ||
1505 | * @errno: an error number to record in the journal indicating | ||
1506 | * the reason for the shutdown. | ||
1507 | * | ||
1508 | * Perform a complete, immediate shutdown of the ENTIRE | ||
1509 | * journal (not of a single transaction). This operation cannot be | ||
1510 | * undone without closing and reopening the journal. | ||
1511 | * | ||
1512 | * The journal_abort function is intended to support higher level error | ||
1513 | * recovery mechanisms such as the ext2/ext3 remount-readonly error | ||
1514 | * mode. | ||
1515 | * | ||
1516 | * Journal abort has very specific semantics. Any existing dirty, | ||
1517 | * unjournaled buffers in the main filesystem will still be written to | ||
1518 | * disk by bdflush, but the journaling mechanism will be suspended | ||
1519 | * immediately and no further transaction commits will be honoured. | ||
1520 | * | ||
1521 | * Any dirty, journaled buffers will be written back to disk without | ||
1522 | * hitting the journal. Atomicity cannot be guaranteed on an aborted | ||
1523 | * filesystem, but we _do_ attempt to leave as much data as possible | ||
1524 | * behind for fsck to use for cleanup. | ||
1525 | * | ||
1526 | * Any attempt to get a new transaction handle on a journal which is in | ||
1527 | * ABORT state will just result in an -EROFS error return. A | ||
1528 | * journal_stop on an existing handle will return -EIO if we have | ||
1529 | * entered abort state during the update. | ||
1530 | * | ||
1531 | * Recursive transactions are not disturbed by journal abort until the | ||
1532 | * final journal_stop, which will receive the -EIO error. | ||
1533 | * | ||
1534 | * Finally, the journal_abort call allows the caller to supply an errno | ||
1535 | * which will be recorded (if possible) in the journal superblock. This | ||
1536 | * allows a client to record failure conditions in the middle of a | ||
1537 | * transaction without having to complete the transaction to record the | ||
1538 | * failure to disk. ext3_error, for example, now uses this | ||
1539 | * functionality. | ||
1540 | * | ||
1541 | * Errors which originate from within the journaling layer will NOT | ||
1542 | * supply an errno; a null errno implies that absolutely no further | ||
1543 | * writes are done to the journal (unless there are any already in | ||
1544 | * progress). | ||
1545 | * | ||
1546 | */ | ||
1547 | |||
1548 | void journal_abort(journal_t *journal, int errno) | ||
1549 | { | ||
1550 | __journal_abort_soft(journal, errno); | ||
1551 | } | ||
1552 | |||
1553 | /** | ||
1554 | * int journal_errno () - returns the journal's error state. | ||
1555 | * @journal: journal to examine. | ||
1556 | * | ||
1557 | * This is the errno numbet set with journal_abort(), the last | ||
1558 | * time the journal was mounted - if the journal was stopped | ||
1559 | * without calling abort this will be 0. | ||
1560 | * | ||
1561 | * If the journal has been aborted on this mount time -EROFS will | ||
1562 | * be returned. | ||
1563 | */ | ||
1564 | int journal_errno(journal_t *journal) | ||
1565 | { | ||
1566 | int err; | ||
1567 | |||
1568 | spin_lock(&journal->j_state_lock); | ||
1569 | if (journal->j_flags & JFS_ABORT) | ||
1570 | err = -EROFS; | ||
1571 | else | ||
1572 | err = journal->j_errno; | ||
1573 | spin_unlock(&journal->j_state_lock); | ||
1574 | return err; | ||
1575 | } | ||
1576 | |||
1577 | /** | ||
1578 | * int journal_clear_err () - clears the journal's error state | ||
1579 | * @journal: journal to act on. | ||
1580 | * | ||
1581 | * An error must be cleared or Acked to take a FS out of readonly | ||
1582 | * mode. | ||
1583 | */ | ||
1584 | int journal_clear_err(journal_t *journal) | ||
1585 | { | ||
1586 | int err = 0; | ||
1587 | |||
1588 | spin_lock(&journal->j_state_lock); | ||
1589 | if (journal->j_flags & JFS_ABORT) | ||
1590 | err = -EROFS; | ||
1591 | else | ||
1592 | journal->j_errno = 0; | ||
1593 | spin_unlock(&journal->j_state_lock); | ||
1594 | return err; | ||
1595 | } | ||
1596 | |||
1597 | /** | ||
1598 | * void journal_ack_err() - Ack journal err. | ||
1599 | * @journal: journal to act on. | ||
1600 | * | ||
1601 | * An error must be cleared or Acked to take a FS out of readonly | ||
1602 | * mode. | ||
1603 | */ | ||
1604 | void journal_ack_err(journal_t *journal) | ||
1605 | { | ||
1606 | spin_lock(&journal->j_state_lock); | ||
1607 | if (journal->j_errno) | ||
1608 | journal->j_flags |= JFS_ACK_ERR; | ||
1609 | spin_unlock(&journal->j_state_lock); | ||
1610 | } | ||
1611 | |||
1612 | int journal_blocks_per_page(struct inode *inode) | ||
1613 | { | ||
1614 | return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | ||
1615 | } | ||
1616 | |||
1617 | /* | ||
1618 | * Simple support for retrying memory allocations. Introduced to help to | ||
1619 | * debug different VM deadlock avoidance strategies. | ||
1620 | */ | ||
1621 | void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry) | ||
1622 | { | ||
1623 | return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); | ||
1624 | } | ||
1625 | |||
1626 | /* | ||
1627 | * Journal_head storage management | ||
1628 | */ | ||
1629 | static kmem_cache_t *journal_head_cache; | ||
1630 | #ifdef CONFIG_JBD_DEBUG | ||
1631 | static atomic_t nr_journal_heads = ATOMIC_INIT(0); | ||
1632 | #endif | ||
1633 | |||
1634 | static int journal_init_journal_head_cache(void) | ||
1635 | { | ||
1636 | int retval; | ||
1637 | |||
1638 | J_ASSERT(journal_head_cache == 0); | ||
1639 | journal_head_cache = kmem_cache_create("journal_head", | ||
1640 | sizeof(struct journal_head), | ||
1641 | 0, /* offset */ | ||
1642 | 0, /* flags */ | ||
1643 | NULL, /* ctor */ | ||
1644 | NULL); /* dtor */ | ||
1645 | retval = 0; | ||
1646 | if (journal_head_cache == 0) { | ||
1647 | retval = -ENOMEM; | ||
1648 | printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); | ||
1649 | } | ||
1650 | return retval; | ||
1651 | } | ||
1652 | |||
1653 | static void journal_destroy_journal_head_cache(void) | ||
1654 | { | ||
1655 | J_ASSERT(journal_head_cache != NULL); | ||
1656 | kmem_cache_destroy(journal_head_cache); | ||
1657 | journal_head_cache = NULL; | ||
1658 | } | ||
1659 | |||
1660 | /* | ||
1661 | * journal_head splicing and dicing | ||
1662 | */ | ||
1663 | static struct journal_head *journal_alloc_journal_head(void) | ||
1664 | { | ||
1665 | struct journal_head *ret; | ||
1666 | static unsigned long last_warning; | ||
1667 | |||
1668 | #ifdef CONFIG_JBD_DEBUG | ||
1669 | atomic_inc(&nr_journal_heads); | ||
1670 | #endif | ||
1671 | ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); | ||
1672 | if (ret == 0) { | ||
1673 | jbd_debug(1, "out of memory for journal_head\n"); | ||
1674 | if (time_after(jiffies, last_warning + 5*HZ)) { | ||
1675 | printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", | ||
1676 | __FUNCTION__); | ||
1677 | last_warning = jiffies; | ||
1678 | } | ||
1679 | while (ret == 0) { | ||
1680 | yield(); | ||
1681 | ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); | ||
1682 | } | ||
1683 | } | ||
1684 | return ret; | ||
1685 | } | ||
1686 | |||
1687 | static void journal_free_journal_head(struct journal_head *jh) | ||
1688 | { | ||
1689 | #ifdef CONFIG_JBD_DEBUG | ||
1690 | atomic_dec(&nr_journal_heads); | ||
1691 | memset(jh, 0x5b, sizeof(*jh)); | ||
1692 | #endif | ||
1693 | kmem_cache_free(journal_head_cache, jh); | ||
1694 | } | ||
1695 | |||
1696 | /* | ||
1697 | * A journal_head is attached to a buffer_head whenever JBD has an | ||
1698 | * interest in the buffer. | ||
1699 | * | ||
1700 | * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit | ||
1701 | * is set. This bit is tested in core kernel code where we need to take | ||
1702 | * JBD-specific actions. Testing the zeroness of ->b_private is not reliable | ||
1703 | * there. | ||
1704 | * | ||
1705 | * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. | ||
1706 | * | ||
1707 | * When a buffer has its BH_JBD bit set it is immune from being released by | ||
1708 | * core kernel code, mainly via ->b_count. | ||
1709 | * | ||
1710 | * A journal_head may be detached from its buffer_head when the journal_head's | ||
1711 | * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. | ||
1712 | * Various places in JBD call journal_remove_journal_head() to indicate that the | ||
1713 | * journal_head can be dropped if needed. | ||
1714 | * | ||
1715 | * Various places in the kernel want to attach a journal_head to a buffer_head | ||
1716 | * _before_ attaching the journal_head to a transaction. To protect the | ||
1717 | * journal_head in this situation, journal_add_journal_head elevates the | ||
1718 | * journal_head's b_jcount refcount by one. The caller must call | ||
1719 | * journal_put_journal_head() to undo this. | ||
1720 | * | ||
1721 | * So the typical usage would be: | ||
1722 | * | ||
1723 | * (Attach a journal_head if needed. Increments b_jcount) | ||
1724 | * struct journal_head *jh = journal_add_journal_head(bh); | ||
1725 | * ... | ||
1726 | * jh->b_transaction = xxx; | ||
1727 | * journal_put_journal_head(jh); | ||
1728 | * | ||
1729 | * Now, the journal_head's b_jcount is zero, but it is safe from being released | ||
1730 | * because it has a non-zero b_transaction. | ||
1731 | */ | ||
1732 | |||
1733 | /* | ||
1734 | * Give a buffer_head a journal_head. | ||
1735 | * | ||
1736 | * Doesn't need the journal lock. | ||
1737 | * May sleep. | ||
1738 | */ | ||
1739 | struct journal_head *journal_add_journal_head(struct buffer_head *bh) | ||
1740 | { | ||
1741 | struct journal_head *jh; | ||
1742 | struct journal_head *new_jh = NULL; | ||
1743 | |||
1744 | repeat: | ||
1745 | if (!buffer_jbd(bh)) { | ||
1746 | new_jh = journal_alloc_journal_head(); | ||
1747 | memset(new_jh, 0, sizeof(*new_jh)); | ||
1748 | } | ||
1749 | |||
1750 | jbd_lock_bh_journal_head(bh); | ||
1751 | if (buffer_jbd(bh)) { | ||
1752 | jh = bh2jh(bh); | ||
1753 | } else { | ||
1754 | J_ASSERT_BH(bh, | ||
1755 | (atomic_read(&bh->b_count) > 0) || | ||
1756 | (bh->b_page && bh->b_page->mapping)); | ||
1757 | |||
1758 | if (!new_jh) { | ||
1759 | jbd_unlock_bh_journal_head(bh); | ||
1760 | goto repeat; | ||
1761 | } | ||
1762 | |||
1763 | jh = new_jh; | ||
1764 | new_jh = NULL; /* We consumed it */ | ||
1765 | set_buffer_jbd(bh); | ||
1766 | bh->b_private = jh; | ||
1767 | jh->b_bh = bh; | ||
1768 | get_bh(bh); | ||
1769 | BUFFER_TRACE(bh, "added journal_head"); | ||
1770 | } | ||
1771 | jh->b_jcount++; | ||
1772 | jbd_unlock_bh_journal_head(bh); | ||
1773 | if (new_jh) | ||
1774 | journal_free_journal_head(new_jh); | ||
1775 | return bh->b_private; | ||
1776 | } | ||
1777 | |||
1778 | /* | ||
1779 | * Grab a ref against this buffer_head's journal_head. If it ended up not | ||
1780 | * having a journal_head, return NULL | ||
1781 | */ | ||
1782 | struct journal_head *journal_grab_journal_head(struct buffer_head *bh) | ||
1783 | { | ||
1784 | struct journal_head *jh = NULL; | ||
1785 | |||
1786 | jbd_lock_bh_journal_head(bh); | ||
1787 | if (buffer_jbd(bh)) { | ||
1788 | jh = bh2jh(bh); | ||
1789 | jh->b_jcount++; | ||
1790 | } | ||
1791 | jbd_unlock_bh_journal_head(bh); | ||
1792 | return jh; | ||
1793 | } | ||
1794 | |||
1795 | static void __journal_remove_journal_head(struct buffer_head *bh) | ||
1796 | { | ||
1797 | struct journal_head *jh = bh2jh(bh); | ||
1798 | |||
1799 | J_ASSERT_JH(jh, jh->b_jcount >= 0); | ||
1800 | |||
1801 | get_bh(bh); | ||
1802 | if (jh->b_jcount == 0) { | ||
1803 | if (jh->b_transaction == NULL && | ||
1804 | jh->b_next_transaction == NULL && | ||
1805 | jh->b_cp_transaction == NULL) { | ||
1806 | J_ASSERT_JH(jh, jh->b_jlist == BJ_None); | ||
1807 | J_ASSERT_BH(bh, buffer_jbd(bh)); | ||
1808 | J_ASSERT_BH(bh, jh2bh(jh) == bh); | ||
1809 | BUFFER_TRACE(bh, "remove journal_head"); | ||
1810 | if (jh->b_frozen_data) { | ||
1811 | printk(KERN_WARNING "%s: freeing " | ||
1812 | "b_frozen_data\n", | ||
1813 | __FUNCTION__); | ||
1814 | kfree(jh->b_frozen_data); | ||
1815 | } | ||
1816 | if (jh->b_committed_data) { | ||
1817 | printk(KERN_WARNING "%s: freeing " | ||
1818 | "b_committed_data\n", | ||
1819 | __FUNCTION__); | ||
1820 | kfree(jh->b_committed_data); | ||
1821 | } | ||
1822 | bh->b_private = NULL; | ||
1823 | jh->b_bh = NULL; /* debug, really */ | ||
1824 | clear_buffer_jbd(bh); | ||
1825 | __brelse(bh); | ||
1826 | journal_free_journal_head(jh); | ||
1827 | } else { | ||
1828 | BUFFER_TRACE(bh, "journal_head was locked"); | ||
1829 | } | ||
1830 | } | ||
1831 | } | ||
1832 | |||
1833 | /* | ||
1834 | * journal_remove_journal_head(): if the buffer isn't attached to a transaction | ||
1835 | * and has a zero b_jcount then remove and release its journal_head. If we did | ||
1836 | * see that the buffer is not used by any transaction we also "logically" | ||
1837 | * decrement ->b_count. | ||
1838 | * | ||
1839 | * We in fact take an additional increment on ->b_count as a convenience, | ||
1840 | * because the caller usually wants to do additional things with the bh | ||
1841 | * after calling here. | ||
1842 | * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some | ||
1843 | * time. Once the caller has run __brelse(), the buffer is eligible for | ||
1844 | * reaping by try_to_free_buffers(). | ||
1845 | */ | ||
void journal_remove_journal_head(struct buffer_head *bh)
{
	/* The removal helper must run under the journal_head bit lock. */
	jbd_lock_bh_journal_head(bh);

	__journal_remove_journal_head(bh);

	jbd_unlock_bh_journal_head(bh);
}
1852 | |||
1853 | /* | ||
1854 | * Drop a reference on the passed journal_head. If it fell to zero then try to | ||
1855 | * release the journal_head from the buffer_head. | ||
1856 | */ | ||
1857 | void journal_put_journal_head(struct journal_head *jh) | ||
1858 | { | ||
1859 | struct buffer_head *bh = jh2bh(jh); | ||
1860 | |||
1861 | jbd_lock_bh_journal_head(bh); | ||
1862 | J_ASSERT_JH(jh, jh->b_jcount > 0); | ||
1863 | --jh->b_jcount; | ||
1864 | if (!jh->b_jcount && !jh->b_transaction) { | ||
1865 | __journal_remove_journal_head(bh); | ||
1866 | __brelse(bh); | ||
1867 | } | ||
1868 | jbd_unlock_bh_journal_head(bh); | ||
1869 | } | ||
1870 | |||
1871 | /* | ||
1872 | * /proc tunables | ||
1873 | */ | ||
1874 | #if defined(CONFIG_JBD_DEBUG) | ||
1875 | int journal_enable_debug; | ||
1876 | EXPORT_SYMBOL(journal_enable_debug); | ||
1877 | #endif | ||
1878 | |||
1879 | #if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) | ||
1880 | |||
1881 | static struct proc_dir_entry *proc_jbd_debug; | ||
1882 | |||
1883 | int read_jbd_debug(char *page, char **start, off_t off, | ||
1884 | int count, int *eof, void *data) | ||
1885 | { | ||
1886 | int ret; | ||
1887 | |||
1888 | ret = sprintf(page + off, "%d\n", journal_enable_debug); | ||
1889 | *eof = 1; | ||
1890 | return ret; | ||
1891 | } | ||
1892 | |||
1893 | int write_jbd_debug(struct file *file, const char __user *buffer, | ||
1894 | unsigned long count, void *data) | ||
1895 | { | ||
1896 | char buf[32]; | ||
1897 | |||
1898 | if (count > ARRAY_SIZE(buf) - 1) | ||
1899 | count = ARRAY_SIZE(buf) - 1; | ||
1900 | if (copy_from_user(buf, buffer, count)) | ||
1901 | return -EFAULT; | ||
1902 | buf[ARRAY_SIZE(buf) - 1] = '\0'; | ||
1903 | journal_enable_debug = simple_strtoul(buf, NULL, 10); | ||
1904 | return count; | ||
1905 | } | ||
1906 | |||
1907 | #define JBD_PROC_NAME "sys/fs/jbd-debug" | ||
1908 | |||
1909 | static void __init create_jbd_proc_entry(void) | ||
1910 | { | ||
1911 | proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); | ||
1912 | if (proc_jbd_debug) { | ||
1913 | /* Why is this so hard? */ | ||
1914 | proc_jbd_debug->read_proc = read_jbd_debug; | ||
1915 | proc_jbd_debug->write_proc = write_jbd_debug; | ||
1916 | } | ||
1917 | } | ||
1918 | |||
1919 | static void __exit remove_jbd_proc_entry(void) | ||
1920 | { | ||
1921 | if (proc_jbd_debug) | ||
1922 | remove_proc_entry(JBD_PROC_NAME, NULL); | ||
1923 | } | ||
1924 | |||
1925 | #else | ||
1926 | |||
1927 | #define create_jbd_proc_entry() do {} while (0) | ||
1928 | #define remove_jbd_proc_entry() do {} while (0) | ||
1929 | |||
1930 | #endif | ||
1931 | |||
1932 | kmem_cache_t *jbd_handle_cache; | ||
1933 | |||
1934 | static int __init journal_init_handle_cache(void) | ||
1935 | { | ||
1936 | jbd_handle_cache = kmem_cache_create("journal_handle", | ||
1937 | sizeof(handle_t), | ||
1938 | 0, /* offset */ | ||
1939 | 0, /* flags */ | ||
1940 | NULL, /* ctor */ | ||
1941 | NULL); /* dtor */ | ||
1942 | if (jbd_handle_cache == NULL) { | ||
1943 | printk(KERN_EMERG "JBD: failed to create handle cache\n"); | ||
1944 | return -ENOMEM; | ||
1945 | } | ||
1946 | return 0; | ||
1947 | } | ||
1948 | |||
1949 | static void journal_destroy_handle_cache(void) | ||
1950 | { | ||
1951 | if (jbd_handle_cache) | ||
1952 | kmem_cache_destroy(jbd_handle_cache); | ||
1953 | } | ||
1954 | |||
1955 | /* | ||
1956 | * Module startup and shutdown | ||
1957 | */ | ||
1958 | |||
1959 | static int __init journal_init_caches(void) | ||
1960 | { | ||
1961 | int ret; | ||
1962 | |||
1963 | ret = journal_init_revoke_caches(); | ||
1964 | if (ret == 0) | ||
1965 | ret = journal_init_journal_head_cache(); | ||
1966 | if (ret == 0) | ||
1967 | ret = journal_init_handle_cache(); | ||
1968 | return ret; | ||
1969 | } | ||
1970 | |||
/* Release the revoke, journal_head and handle caches, in that order. */
static void journal_destroy_caches(void)
{
	journal_destroy_revoke_caches();

	journal_destroy_journal_head_cache();

	journal_destroy_handle_cache();
}
1977 | |||
1978 | static int __init journal_init(void) | ||
1979 | { | ||
1980 | int ret; | ||
1981 | |||
1982 | ret = journal_init_caches(); | ||
1983 | if (ret != 0) | ||
1984 | journal_destroy_caches(); | ||
1985 | create_jbd_proc_entry(); | ||
1986 | return ret; | ||
1987 | } | ||
1988 | |||
1989 | static void __exit journal_exit(void) | ||
1990 | { | ||
1991 | #ifdef CONFIG_JBD_DEBUG | ||
1992 | int n = atomic_read(&nr_journal_heads); | ||
1993 | if (n) | ||
1994 | printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); | ||
1995 | #endif | ||
1996 | remove_jbd_proc_entry(); | ||
1997 | journal_destroy_caches(); | ||
1998 | } | ||
1999 | |||
2000 | MODULE_LICENSE("GPL"); | ||
2001 | module_init(journal_init); | ||
2002 | module_exit(journal_exit); | ||
2003 | |||
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c new file mode 100644 index 000000000000..103c34e4fb28 --- /dev/null +++ b/fs/jbd/recovery.c | |||
@@ -0,0 +1,591 @@ | |||
1 | /* | ||
2 | * linux/fs/recovery.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | ||
5 | * | ||
6 | * Copyright 1999-2000 Red Hat Software --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal recovery routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | */ | ||
15 | |||
16 | #ifndef __KERNEL__ | ||
17 | #include "jfs_user.h" | ||
18 | #else | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/jbd.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/slab.h> | ||
24 | #endif | ||
25 | |||
26 | /* | ||
27 | * Maintain information about the progress of the recovery job, so that | ||
28 | * the different passes can carry information between them. | ||
29 | */ | ||
30 | struct recovery_info | ||
31 | { | ||
32 | tid_t start_transaction; | ||
33 | tid_t end_transaction; | ||
34 | |||
35 | int nr_replays; | ||
36 | int nr_revokes; | ||
37 | int nr_revoke_hits; | ||
38 | }; | ||
39 | |||
40 | enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; | ||
41 | static int do_one_pass(journal_t *journal, | ||
42 | struct recovery_info *info, enum passtype pass); | ||
43 | static int scan_revoke_records(journal_t *, struct buffer_head *, | ||
44 | tid_t, struct recovery_info *); | ||
45 | |||
46 | #ifdef __KERNEL__ | ||
47 | |||
48 | /* Release readahead buffers after use */ | ||
void journal_brelse_array(struct buffer_head *b[], int n)
{
	int i;

	/* Drop the references last-to-first. */
	for (i = n - 1; i >= 0; i--)
		brelse(b[i]);
}
54 | |||
55 | |||
56 | /* | ||
57 | * When reading from the journal, we are going through the block device | ||
58 | * layer directly and so there is no readahead being done for us. We | ||
59 | * need to implement any readahead ourselves if we want it to happen at | ||
60 | * all. Recovery is basically one long sequential read, so make sure we | ||
61 | * do the IO in reasonably large chunks. | ||
62 | * | ||
63 | * This is not so critical that we need to be enormously clever about | ||
64 | * the readahead size, though. 128K is a purely arbitrary, good-enough | ||
65 | * fixed value. | ||
66 | */ | ||
67 | |||
68 | #define MAXBUF 8 | ||
69 | static int do_readahead(journal_t *journal, unsigned int start) | ||
70 | { | ||
71 | int err; | ||
72 | unsigned int max, nbufs, next; | ||
73 | unsigned long blocknr; | ||
74 | struct buffer_head *bh; | ||
75 | |||
76 | struct buffer_head * bufs[MAXBUF]; | ||
77 | |||
78 | /* Do up to 128K of readahead */ | ||
79 | max = start + (128 * 1024 / journal->j_blocksize); | ||
80 | if (max > journal->j_maxlen) | ||
81 | max = journal->j_maxlen; | ||
82 | |||
83 | /* Do the readahead itself. We'll submit MAXBUF buffer_heads at | ||
84 | * a time to the block device IO layer. */ | ||
85 | |||
86 | nbufs = 0; | ||
87 | |||
88 | for (next = start; next < max; next++) { | ||
89 | err = journal_bmap(journal, next, &blocknr); | ||
90 | |||
91 | if (err) { | ||
92 | printk (KERN_ERR "JBD: bad block at offset %u\n", | ||
93 | next); | ||
94 | goto failed; | ||
95 | } | ||
96 | |||
97 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
98 | if (!bh) { | ||
99 | err = -ENOMEM; | ||
100 | goto failed; | ||
101 | } | ||
102 | |||
103 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) { | ||
104 | bufs[nbufs++] = bh; | ||
105 | if (nbufs == MAXBUF) { | ||
106 | ll_rw_block(READ, nbufs, bufs); | ||
107 | journal_brelse_array(bufs, nbufs); | ||
108 | nbufs = 0; | ||
109 | } | ||
110 | } else | ||
111 | brelse(bh); | ||
112 | } | ||
113 | |||
114 | if (nbufs) | ||
115 | ll_rw_block(READ, nbufs, bufs); | ||
116 | err = 0; | ||
117 | |||
118 | failed: | ||
119 | if (nbufs) | ||
120 | journal_brelse_array(bufs, nbufs); | ||
121 | return err; | ||
122 | } | ||
123 | |||
124 | #endif /* __KERNEL__ */ | ||
125 | |||
126 | |||
127 | /* | ||
128 | * Read a block from the journal | ||
129 | */ | ||
130 | |||
131 | static int jread(struct buffer_head **bhp, journal_t *journal, | ||
132 | unsigned int offset) | ||
133 | { | ||
134 | int err; | ||
135 | unsigned long blocknr; | ||
136 | struct buffer_head *bh; | ||
137 | |||
138 | *bhp = NULL; | ||
139 | |||
140 | if (offset >= journal->j_maxlen) { | ||
141 | printk(KERN_ERR "JBD: corrupted journal superblock\n"); | ||
142 | return -EIO; | ||
143 | } | ||
144 | |||
145 | err = journal_bmap(journal, offset, &blocknr); | ||
146 | |||
147 | if (err) { | ||
148 | printk (KERN_ERR "JBD: bad block at offset %u\n", | ||
149 | offset); | ||
150 | return err; | ||
151 | } | ||
152 | |||
153 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
154 | if (!bh) | ||
155 | return -ENOMEM; | ||
156 | |||
157 | if (!buffer_uptodate(bh)) { | ||
158 | /* If this is a brand new buffer, start readahead. | ||
159 | Otherwise, we assume we are already reading it. */ | ||
160 | if (!buffer_req(bh)) | ||
161 | do_readahead(journal, offset); | ||
162 | wait_on_buffer(bh); | ||
163 | } | ||
164 | |||
165 | if (!buffer_uptodate(bh)) { | ||
166 | printk (KERN_ERR "JBD: Failed to read block at offset %u\n", | ||
167 | offset); | ||
168 | brelse(bh); | ||
169 | return -EIO; | ||
170 | } | ||
171 | |||
172 | *bhp = bh; | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | |||
177 | /* | ||
178 | * Count the number of in-use tags in a journal descriptor block. | ||
179 | */ | ||
180 | |||
181 | static int count_tags(struct buffer_head *bh, int size) | ||
182 | { | ||
183 | char * tagp; | ||
184 | journal_block_tag_t * tag; | ||
185 | int nr = 0; | ||
186 | |||
187 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
188 | |||
189 | while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { | ||
190 | tag = (journal_block_tag_t *) tagp; | ||
191 | |||
192 | nr++; | ||
193 | tagp += sizeof(journal_block_tag_t); | ||
194 | if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) | ||
195 | tagp += 16; | ||
196 | |||
197 | if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) | ||
198 | break; | ||
199 | } | ||
200 | |||
201 | return nr; | ||
202 | } | ||
203 | |||
204 | |||
205 | /* Make sure we wrap around the log correctly! */ | ||
206 | #define wrap(journal, var) \ | ||
207 | do { \ | ||
208 | if (var >= (journal)->j_last) \ | ||
209 | var -= ((journal)->j_last - (journal)->j_first); \ | ||
210 | } while (0) | ||
211 | |||
212 | /** | ||
213 | * int journal_recover(journal_t *journal) - recovers an on-disk journal | ||
214 | * @journal: the journal to recover | ||
215 | * | ||
216 | * The primary function for recovering the log contents when mounting a | ||
217 | * journaled device. | ||
218 | * | ||
219 | * Recovery is done in three passes. In the first pass, we look for the | ||
220 | * end of the log. In the second, we assemble the list of revoke | ||
221 | * blocks. In the third and final pass, we replay any un-revoked blocks | ||
222 | * in the log. | ||
223 | */ | ||
224 | int journal_recover(journal_t *journal) | ||
225 | { | ||
226 | int err; | ||
227 | journal_superblock_t * sb; | ||
228 | |||
229 | struct recovery_info info; | ||
230 | |||
231 | memset(&info, 0, sizeof(info)); | ||
232 | sb = journal->j_superblock; | ||
233 | |||
234 | /* | ||
235 | * The journal superblock's s_start field (the current log head) | ||
236 | * is always zero if, and only if, the journal was cleanly | ||
237 | * unmounted. | ||
238 | */ | ||
239 | |||
240 | if (!sb->s_start) { | ||
241 | jbd_debug(1, "No recovery required, last transaction %d\n", | ||
242 | be32_to_cpu(sb->s_sequence)); | ||
243 | journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; | ||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | err = do_one_pass(journal, &info, PASS_SCAN); | ||
248 | if (!err) | ||
249 | err = do_one_pass(journal, &info, PASS_REVOKE); | ||
250 | if (!err) | ||
251 | err = do_one_pass(journal, &info, PASS_REPLAY); | ||
252 | |||
253 | jbd_debug(0, "JBD: recovery, exit status %d, " | ||
254 | "recovered transactions %u to %u\n", | ||
255 | err, info.start_transaction, info.end_transaction); | ||
256 | jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", | ||
257 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); | ||
258 | |||
259 | /* Restart the log at the next transaction ID, thus invalidating | ||
260 | * any existing commit records in the log. */ | ||
261 | journal->j_transaction_sequence = ++info.end_transaction; | ||
262 | |||
263 | journal_clear_revoke(journal); | ||
264 | sync_blockdev(journal->j_fs_dev); | ||
265 | return err; | ||
266 | } | ||
267 | |||
268 | /** | ||
269 | * int journal_skip_recovery() - Start journal and wipe existing records | ||
270 | * @journal: journal to startup | ||
271 | * | ||
272 | * Locate any valid recovery information from the journal and set up the | ||
273 | * journal structures in memory to ignore it (presumably because the | ||
274 | * caller has evidence that it is out of date). | ||
275 | * This function doesn't appear to be exported. | ||
276 | * | ||
277 | * We perform one pass over the journal to allow us to tell the user how | ||
278 | * much recovery information is being erased, and to let us initialise | ||
279 | * the journal transaction sequence numbers to the next unused ID. | ||
280 | */ | ||
281 | int journal_skip_recovery(journal_t *journal) | ||
282 | { | ||
283 | int err; | ||
284 | journal_superblock_t * sb; | ||
285 | |||
286 | struct recovery_info info; | ||
287 | |||
288 | memset (&info, 0, sizeof(info)); | ||
289 | sb = journal->j_superblock; | ||
290 | |||
291 | err = do_one_pass(journal, &info, PASS_SCAN); | ||
292 | |||
293 | if (err) { | ||
294 | printk(KERN_ERR "JBD: error %d scanning journal\n", err); | ||
295 | ++journal->j_transaction_sequence; | ||
296 | } else { | ||
297 | #ifdef CONFIG_JBD_DEBUG | ||
298 | int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); | ||
299 | #endif | ||
300 | jbd_debug(0, | ||
301 | "JBD: ignoring %d transaction%s from the journal.\n", | ||
302 | dropped, (dropped == 1) ? "" : "s"); | ||
303 | journal->j_transaction_sequence = ++info.end_transaction; | ||
304 | } | ||
305 | |||
306 | journal->j_tail = 0; | ||
307 | return err; | ||
308 | } | ||
309 | |||
310 | static int do_one_pass(journal_t *journal, | ||
311 | struct recovery_info *info, enum passtype pass) | ||
312 | { | ||
313 | unsigned int first_commit_ID, next_commit_ID; | ||
314 | unsigned long next_log_block; | ||
315 | int err, success = 0; | ||
316 | journal_superblock_t * sb; | ||
317 | journal_header_t * tmp; | ||
318 | struct buffer_head * bh; | ||
319 | unsigned int sequence; | ||
320 | int blocktype; | ||
321 | |||
322 | /* Precompute the maximum metadata descriptors in a descriptor block */ | ||
323 | int MAX_BLOCKS_PER_DESC; | ||
324 | MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) | ||
325 | / sizeof(journal_block_tag_t)); | ||
326 | |||
327 | /* | ||
328 | * First thing is to establish what we expect to find in the log | ||
329 | * (in terms of transaction IDs), and where (in terms of log | ||
330 | * block offsets): query the superblock. | ||
331 | */ | ||
332 | |||
333 | sb = journal->j_superblock; | ||
334 | next_commit_ID = be32_to_cpu(sb->s_sequence); | ||
335 | next_log_block = be32_to_cpu(sb->s_start); | ||
336 | |||
337 | first_commit_ID = next_commit_ID; | ||
338 | if (pass == PASS_SCAN) | ||
339 | info->start_transaction = first_commit_ID; | ||
340 | |||
341 | jbd_debug(1, "Starting recovery pass %d\n", pass); | ||
342 | |||
343 | /* | ||
344 | * Now we walk through the log, transaction by transaction, | ||
345 | * making sure that each transaction has a commit block in the | ||
346 | * expected place. Each complete transaction gets replayed back | ||
347 | * into the main filesystem. | ||
348 | */ | ||
349 | |||
350 | while (1) { | ||
351 | int flags; | ||
352 | char * tagp; | ||
353 | journal_block_tag_t * tag; | ||
354 | struct buffer_head * obh; | ||
355 | struct buffer_head * nbh; | ||
356 | |||
357 | cond_resched(); /* We're under lock_kernel() */ | ||
358 | |||
359 | /* If we already know where to stop the log traversal, | ||
360 | * check right now that we haven't gone past the end of | ||
361 | * the log. */ | ||
362 | |||
363 | if (pass != PASS_SCAN) | ||
364 | if (tid_geq(next_commit_ID, info->end_transaction)) | ||
365 | break; | ||
366 | |||
367 | jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", | ||
368 | next_commit_ID, next_log_block, journal->j_last); | ||
369 | |||
370 | /* Skip over each chunk of the transaction looking | ||
371 | * either the next descriptor block or the final commit | ||
372 | * record. */ | ||
373 | |||
374 | jbd_debug(3, "JBD: checking block %ld\n", next_log_block); | ||
375 | err = jread(&bh, journal, next_log_block); | ||
376 | if (err) | ||
377 | goto failed; | ||
378 | |||
379 | next_log_block++; | ||
380 | wrap(journal, next_log_block); | ||
381 | |||
382 | /* What kind of buffer is it? | ||
383 | * | ||
384 | * If it is a descriptor block, check that it has the | ||
385 | * expected sequence number. Otherwise, we're all done | ||
386 | * here. */ | ||
387 | |||
388 | tmp = (journal_header_t *)bh->b_data; | ||
389 | |||
390 | if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { | ||
391 | brelse(bh); | ||
392 | break; | ||
393 | } | ||
394 | |||
395 | blocktype = be32_to_cpu(tmp->h_blocktype); | ||
396 | sequence = be32_to_cpu(tmp->h_sequence); | ||
397 | jbd_debug(3, "Found magic %d, sequence %d\n", | ||
398 | blocktype, sequence); | ||
399 | |||
400 | if (sequence != next_commit_ID) { | ||
401 | brelse(bh); | ||
402 | break; | ||
403 | } | ||
404 | |||
405 | /* OK, we have a valid descriptor block which matches | ||
406 | * all of the sequence number checks. What are we going | ||
407 | * to do with it? That depends on the pass... */ | ||
408 | |||
409 | switch(blocktype) { | ||
410 | case JFS_DESCRIPTOR_BLOCK: | ||
411 | /* If it is a valid descriptor block, replay it | ||
412 | * in pass REPLAY; otherwise, just skip over the | ||
413 | * blocks it describes. */ | ||
414 | if (pass != PASS_REPLAY) { | ||
415 | next_log_block += | ||
416 | count_tags(bh, journal->j_blocksize); | ||
417 | wrap(journal, next_log_block); | ||
418 | brelse(bh); | ||
419 | continue; | ||
420 | } | ||
421 | |||
422 | /* A descriptor block: we can now write all of | ||
423 | * the data blocks. Yay, useful work is finally | ||
424 | * getting done here! */ | ||
425 | |||
426 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
427 | while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) | ||
428 | <= journal->j_blocksize) { | ||
429 | unsigned long io_block; | ||
430 | |||
431 | tag = (journal_block_tag_t *) tagp; | ||
432 | flags = be32_to_cpu(tag->t_flags); | ||
433 | |||
434 | io_block = next_log_block++; | ||
435 | wrap(journal, next_log_block); | ||
436 | err = jread(&obh, journal, io_block); | ||
437 | if (err) { | ||
438 | /* Recover what we can, but | ||
439 | * report failure at the end. */ | ||
440 | success = err; | ||
441 | printk (KERN_ERR | ||
442 | "JBD: IO error %d recovering " | ||
443 | "block %ld in log\n", | ||
444 | err, io_block); | ||
445 | } else { | ||
446 | unsigned long blocknr; | ||
447 | |||
448 | J_ASSERT(obh != NULL); | ||
449 | blocknr = be32_to_cpu(tag->t_blocknr); | ||
450 | |||
451 | /* If the block has been | ||
452 | * revoked, then we're all done | ||
453 | * here. */ | ||
454 | if (journal_test_revoke | ||
455 | (journal, blocknr, | ||
456 | next_commit_ID)) { | ||
457 | brelse(obh); | ||
458 | ++info->nr_revoke_hits; | ||
459 | goto skip_write; | ||
460 | } | ||
461 | |||
462 | /* Find a buffer for the new | ||
463 | * data being restored */ | ||
464 | nbh = __getblk(journal->j_fs_dev, | ||
465 | blocknr, | ||
466 | journal->j_blocksize); | ||
467 | if (nbh == NULL) { | ||
468 | printk(KERN_ERR | ||
469 | "JBD: Out of memory " | ||
470 | "during recovery.\n"); | ||
471 | err = -ENOMEM; | ||
472 | brelse(bh); | ||
473 | brelse(obh); | ||
474 | goto failed; | ||
475 | } | ||
476 | |||
477 | lock_buffer(nbh); | ||
478 | memcpy(nbh->b_data, obh->b_data, | ||
479 | journal->j_blocksize); | ||
480 | if (flags & JFS_FLAG_ESCAPE) { | ||
481 | *((__be32 *)bh->b_data) = | ||
482 | cpu_to_be32(JFS_MAGIC_NUMBER); | ||
483 | } | ||
484 | |||
485 | BUFFER_TRACE(nbh, "marking dirty"); | ||
486 | set_buffer_uptodate(nbh); | ||
487 | mark_buffer_dirty(nbh); | ||
488 | BUFFER_TRACE(nbh, "marking uptodate"); | ||
489 | ++info->nr_replays; | ||
490 | /* ll_rw_block(WRITE, 1, &nbh); */ | ||
491 | unlock_buffer(nbh); | ||
492 | brelse(obh); | ||
493 | brelse(nbh); | ||
494 | } | ||
495 | |||
496 | skip_write: | ||
497 | tagp += sizeof(journal_block_tag_t); | ||
498 | if (!(flags & JFS_FLAG_SAME_UUID)) | ||
499 | tagp += 16; | ||
500 | |||
501 | if (flags & JFS_FLAG_LAST_TAG) | ||
502 | break; | ||
503 | } | ||
504 | |||
505 | brelse(bh); | ||
506 | continue; | ||
507 | |||
508 | case JFS_COMMIT_BLOCK: | ||
509 | /* Found an expected commit block: not much to | ||
510 | * do other than move on to the next sequence | ||
511 | * number. */ | ||
512 | brelse(bh); | ||
513 | next_commit_ID++; | ||
514 | continue; | ||
515 | |||
516 | case JFS_REVOKE_BLOCK: | ||
517 | /* If we aren't in the REVOKE pass, then we can | ||
518 | * just skip over this block. */ | ||
519 | if (pass != PASS_REVOKE) { | ||
520 | brelse(bh); | ||
521 | continue; | ||
522 | } | ||
523 | |||
524 | err = scan_revoke_records(journal, bh, | ||
525 | next_commit_ID, info); | ||
526 | brelse(bh); | ||
527 | if (err) | ||
528 | goto failed; | ||
529 | continue; | ||
530 | |||
531 | default: | ||
532 | jbd_debug(3, "Unrecognised magic %d, end of scan.\n", | ||
533 | blocktype); | ||
534 | goto done; | ||
535 | } | ||
536 | } | ||
537 | |||
538 | done: | ||
539 | /* | ||
540 | * We broke out of the log scan loop: either we came to the | ||
541 | * known end of the log or we found an unexpected block in the | ||
542 | * log. If the latter happened, then we know that the "current" | ||
543 | * transaction marks the end of the valid log. | ||
544 | */ | ||
545 | |||
546 | if (pass == PASS_SCAN) | ||
547 | info->end_transaction = next_commit_ID; | ||
548 | else { | ||
549 | /* It's really bad news if different passes end up at | ||
550 | * different places (but possible due to IO errors). */ | ||
551 | if (info->end_transaction != next_commit_ID) { | ||
552 | printk (KERN_ERR "JBD: recovery pass %d ended at " | ||
553 | "transaction %u, expected %u\n", | ||
554 | pass, next_commit_ID, info->end_transaction); | ||
555 | if (!success) | ||
556 | success = -EIO; | ||
557 | } | ||
558 | } | ||
559 | |||
560 | return success; | ||
561 | |||
562 | failed: | ||
563 | return err; | ||
564 | } | ||
565 | |||
566 | |||
567 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ | ||
568 | |||
569 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | ||
570 | tid_t sequence, struct recovery_info *info) | ||
571 | { | ||
572 | journal_revoke_header_t *header; | ||
573 | int offset, max; | ||
574 | |||
575 | header = (journal_revoke_header_t *) bh->b_data; | ||
576 | offset = sizeof(journal_revoke_header_t); | ||
577 | max = be32_to_cpu(header->r_count); | ||
578 | |||
579 | while (offset < max) { | ||
580 | unsigned long blocknr; | ||
581 | int err; | ||
582 | |||
583 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); | ||
584 | offset += 4; | ||
585 | err = journal_set_revoke(journal, blocknr, sequence); | ||
586 | if (err) | ||
587 | return err; | ||
588 | ++info->nr_revokes; | ||
589 | } | ||
590 | return 0; | ||
591 | } | ||
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c new file mode 100644 index 000000000000..d327a598f861 --- /dev/null +++ b/fs/jbd/revoke.c | |||
@@ -0,0 +1,702 @@ | |||
1 | /* | ||
2 | * linux/fs/revoke.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 | ||
5 | * | ||
6 | * Copyright 2000 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal revoke routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | * | ||
15 | * Revoke is the mechanism used to prevent old log records for deleted | ||
16 | * metadata from being replayed on top of newer data using the same | ||
17 | * blocks. The revoke mechanism is used in two separate places: | ||
18 | * | ||
19 | * + Commit: during commit we write the entire list of the current | ||
20 | * transaction's revoked blocks to the journal | ||
21 | * | ||
22 | * + Recovery: during recovery we record the transaction ID of all | ||
23 | * revoked blocks. If there are multiple revoke records in the log | ||
24 | * for a single block, only the last one counts, and if there is a log | ||
25 | * entry for a block beyond the last revoke, then that log entry still | ||
26 | * gets replayed. | ||
27 | * | ||
28 | * We can get interactions between revokes and new log data within a | ||
29 | * single transaction: | ||
30 | * | ||
31 | * Block is revoked and then journaled: | ||
32 | * The desired end result is the journaling of the new block, so we | ||
33 | * cancel the revoke before the transaction commits. | ||
34 | * | ||
35 | * Block is journaled and then revoked: | ||
36 | * The revoke must take precedence over the write of the block, so we | ||
37 | * need either to cancel the journal entry or to write the revoke | ||
38 | * later in the log than the log block. In this case, we choose the | ||
39 | * latter: journaling a block cancels any revoke record for that block | ||
40 | * in the current transaction, so any revoke for that block in the | ||
41 | * transaction must have happened after the block was journaled and so | ||
42 | * the revoke must take precedence. | ||
43 | * | ||
44 | * Block is revoked and then written as data: | ||
45 | * The data write is allowed to succeed, but the revoke is _not_ | ||
46 | * cancelled. We still need to prevent old log records from | ||
47 | * overwriting the new data. We don't even need to clear the revoke | ||
48 | * bit here. | ||
49 | * | ||
50 | * Revoke information on buffers is a tri-state value: | ||
51 | * | ||
52 | * RevokeValid clear: no cached revoke status, need to look it up | ||
53 | * RevokeValid set, Revoked clear: | ||
54 | * buffer has not been revoked, and cancel_revoke | ||
55 | * need do nothing. | ||
56 | * RevokeValid set, Revoked set: | ||
57 | * buffer has been revoked. | ||
58 | */ | ||
59 | |||
60 | #ifndef __KERNEL__ | ||
61 | #include "jfs_user.h" | ||
62 | #else | ||
63 | #include <linux/time.h> | ||
64 | #include <linux/fs.h> | ||
65 | #include <linux/jbd.h> | ||
66 | #include <linux/errno.h> | ||
67 | #include <linux/slab.h> | ||
68 | #include <linux/list.h> | ||
69 | #include <linux/smp_lock.h> | ||
70 | #include <linux/init.h> | ||
71 | #endif | ||
72 | |||
/* Slab caches for revoke records and for revoke table headers; both are
 * set up by journal_init_revoke_caches() and torn down by
 * journal_destroy_revoke_caches(). */
73 | static kmem_cache_t *revoke_record_cache; | ||
74 | static kmem_cache_t *revoke_table_cache; | ||
75 | |||
76 | /* Each revoke record represents one single revoked block. During | ||
77 | journal replay, this involves recording the transaction ID of the | ||
78 | last transaction to revoke this block. */ | ||
79 | |||
80 | struct jbd_revoke_record_s | ||
81 | { | ||
82 | struct list_head hash; | ||
83 | tid_t sequence; /* Used for recovery only */ | ||
84 | unsigned long blocknr; | ||
85 | }; | ||
86 | |||
87 | |||
/*
 * The revoke table: nothing more than a hash table whose buckets are
 * lists of revoke records.
 */
struct jbd_revoke_table_s {
	/* Number of buckets.  Must be a power of two.  (A larger table
	 * might conceivably be wanted for recovery.) */
	int		  hash_size;
	int		  hash_shift;	/* log2 of hash_size */
	struct list_head *hash_table;	/* Array of hash_size bucket heads */
};
97 | |||
98 | |||
/* Forward declarations of the commit-time helpers that are defined
 * further down in this file; only built when __KERNEL__ is defined. */
99 | #ifdef __KERNEL__ | ||
100 | static void write_one_revoke_record(journal_t *, transaction_t *, | ||
101 | struct journal_head **, int *, | ||
102 | struct jbd_revoke_record_s *); | ||
103 | static void flush_descriptor(journal_t *, struct journal_head *, int); | ||
104 | #endif | ||
105 | |||
106 | /* Utility functions to maintain the revoke table */ | ||
107 | |||
108 | /* Borrowed from buffer.c: this is a tried and tested block hash function */ | ||
109 | static inline int hash(journal_t *journal, unsigned long block) | ||
110 | { | ||
111 | struct jbd_revoke_table_s *table = journal->j_revoke; | ||
112 | int hash_shift = table->hash_shift; | ||
113 | |||
114 | return ((block << (hash_shift - 6)) ^ | ||
115 | (block >> 13) ^ | ||
116 | (block << (hash_shift - 12))) & (table->hash_size - 1); | ||
117 | } | ||
118 | |||
119 | int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq) | ||
120 | { | ||
121 | struct list_head *hash_list; | ||
122 | struct jbd_revoke_record_s *record; | ||
123 | |||
124 | repeat: | ||
125 | record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); | ||
126 | if (!record) | ||
127 | goto oom; | ||
128 | |||
129 | record->sequence = seq; | ||
130 | record->blocknr = blocknr; | ||
131 | hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; | ||
132 | spin_lock(&journal->j_revoke_lock); | ||
133 | list_add(&record->hash, hash_list); | ||
134 | spin_unlock(&journal->j_revoke_lock); | ||
135 | return 0; | ||
136 | |||
137 | oom: | ||
138 | if (!journal_oom_retry) | ||
139 | return -ENOMEM; | ||
140 | jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); | ||
141 | yield(); | ||
142 | goto repeat; | ||
143 | } | ||
144 | |||
145 | /* Find a revoke record in the journal's hash table. */ | ||
146 | |||
147 | static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, | ||
148 | unsigned long blocknr) | ||
149 | { | ||
150 | struct list_head *hash_list; | ||
151 | struct jbd_revoke_record_s *record; | ||
152 | |||
153 | hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; | ||
154 | |||
155 | spin_lock(&journal->j_revoke_lock); | ||
156 | record = (struct jbd_revoke_record_s *) hash_list->next; | ||
157 | while (&(record->hash) != hash_list) { | ||
158 | if (record->blocknr == blocknr) { | ||
159 | spin_unlock(&journal->j_revoke_lock); | ||
160 | return record; | ||
161 | } | ||
162 | record = (struct jbd_revoke_record_s *) record->hash.next; | ||
163 | } | ||
164 | spin_unlock(&journal->j_revoke_lock); | ||
165 | return NULL; | ||
166 | } | ||
167 | |||
168 | int __init journal_init_revoke_caches(void) | ||
169 | { | ||
170 | revoke_record_cache = kmem_cache_create("revoke_record", | ||
171 | sizeof(struct jbd_revoke_record_s), | ||
172 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
173 | if (revoke_record_cache == 0) | ||
174 | return -ENOMEM; | ||
175 | |||
176 | revoke_table_cache = kmem_cache_create("revoke_table", | ||
177 | sizeof(struct jbd_revoke_table_s), | ||
178 | 0, 0, NULL, NULL); | ||
179 | if (revoke_table_cache == 0) { | ||
180 | kmem_cache_destroy(revoke_record_cache); | ||
181 | revoke_record_cache = NULL; | ||
182 | return -ENOMEM; | ||
183 | } | ||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | void journal_destroy_revoke_caches(void) | ||
188 | { | ||
189 | kmem_cache_destroy(revoke_record_cache); | ||
190 | revoke_record_cache = NULL; | ||
191 | kmem_cache_destroy(revoke_table_cache); | ||
192 | revoke_table_cache = NULL; | ||
193 | } | ||
194 | |||
195 | /* Initialise the revoke table for a given journal to a given size. */ | ||
196 | |||
197 | int journal_init_revoke(journal_t *journal, int hash_size) | ||
198 | { | ||
199 | int shift, tmp; | ||
200 | |||
201 | J_ASSERT (journal->j_revoke_table[0] == NULL); | ||
202 | |||
203 | shift = 0; | ||
204 | tmp = hash_size; | ||
205 | while((tmp >>= 1UL) != 0UL) | ||
206 | shift++; | ||
207 | |||
208 | journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | ||
209 | if (!journal->j_revoke_table[0]) | ||
210 | return -ENOMEM; | ||
211 | journal->j_revoke = journal->j_revoke_table[0]; | ||
212 | |||
213 | /* Check that the hash_size is a power of two */ | ||
214 | J_ASSERT ((hash_size & (hash_size-1)) == 0); | ||
215 | |||
216 | journal->j_revoke->hash_size = hash_size; | ||
217 | |||
218 | journal->j_revoke->hash_shift = shift; | ||
219 | |||
220 | journal->j_revoke->hash_table = | ||
221 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | ||
222 | if (!journal->j_revoke->hash_table) { | ||
223 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
224 | journal->j_revoke = NULL; | ||
225 | return -ENOMEM; | ||
226 | } | ||
227 | |||
228 | for (tmp = 0; tmp < hash_size; tmp++) | ||
229 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); | ||
230 | |||
231 | journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | ||
232 | if (!journal->j_revoke_table[1]) { | ||
233 | kfree(journal->j_revoke_table[0]->hash_table); | ||
234 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
235 | return -ENOMEM; | ||
236 | } | ||
237 | |||
238 | journal->j_revoke = journal->j_revoke_table[1]; | ||
239 | |||
240 | /* Check that the hash_size is a power of two */ | ||
241 | J_ASSERT ((hash_size & (hash_size-1)) == 0); | ||
242 | |||
243 | journal->j_revoke->hash_size = hash_size; | ||
244 | |||
245 | journal->j_revoke->hash_shift = shift; | ||
246 | |||
247 | journal->j_revoke->hash_table = | ||
248 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | ||
249 | if (!journal->j_revoke->hash_table) { | ||
250 | kfree(journal->j_revoke_table[0]->hash_table); | ||
251 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
252 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); | ||
253 | journal->j_revoke = NULL; | ||
254 | return -ENOMEM; | ||
255 | } | ||
256 | |||
257 | for (tmp = 0; tmp < hash_size; tmp++) | ||
258 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); | ||
259 | |||
260 | spin_lock_init(&journal->j_revoke_lock); | ||
261 | |||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | /* Destoy a journal's revoke table. The table must already be empty! */ | ||
266 | |||
267 | void journal_destroy_revoke(journal_t *journal) | ||
268 | { | ||
269 | struct jbd_revoke_table_s *table; | ||
270 | struct list_head *hash_list; | ||
271 | int i; | ||
272 | |||
273 | table = journal->j_revoke_table[0]; | ||
274 | if (!table) | ||
275 | return; | ||
276 | |||
277 | for (i=0; i<table->hash_size; i++) { | ||
278 | hash_list = &table->hash_table[i]; | ||
279 | J_ASSERT (list_empty(hash_list)); | ||
280 | } | ||
281 | |||
282 | kfree(table->hash_table); | ||
283 | kmem_cache_free(revoke_table_cache, table); | ||
284 | journal->j_revoke = NULL; | ||
285 | |||
286 | table = journal->j_revoke_table[1]; | ||
287 | if (!table) | ||
288 | return; | ||
289 | |||
290 | for (i=0; i<table->hash_size; i++) { | ||
291 | hash_list = &table->hash_table[i]; | ||
292 | J_ASSERT (list_empty(hash_list)); | ||
293 | } | ||
294 | |||
295 | kfree(table->hash_table); | ||
296 | kmem_cache_free(revoke_table_cache, table); | ||
297 | journal->j_revoke = NULL; | ||
298 | } | ||
299 | |||
300 | |||
301 | #ifdef __KERNEL__ | ||
302 | |||
303 | /* | ||
304 | * journal_revoke: revoke a given buffer_head from the journal. This | ||
305 | * prevents the block from being replayed during recovery if we take a | ||
306 | * crash after this current transaction commits. Any subsequent | ||
307 | * metadata writes of the buffer in this transaction cancel the | ||
308 | * revoke. | ||
309 | * | ||
310 | * Note that this call may block --- it is up to the caller to make | ||
311 | * sure that there are no further calls to journal_write_metadata | ||
312 | * before the revoke is complete. In ext3, this implies calling the | ||
313 | * revoke before clearing the block bitmap when we are deleting | ||
314 | * metadata. | ||
315 | * | ||
316 | * Revoke performs a journal_forget on any buffer_head passed in as a | ||
317 | * parameter, but does _not_ forget the buffer_head if the bh was only | ||
318 | * found implicitly. | ||
319 | * | ||
320 | * bh_in may not be a journalled buffer - it may have come off | ||
321 | * the hash tables without an attached journal_head. | ||
322 | * | ||
323 | * If bh_in is non-zero, journal_revoke() will decrement its b_count | ||
324 | * by one. | ||
325 | */ | ||
326 | |||
327 | int journal_revoke(handle_t *handle, unsigned long blocknr, | ||
328 | struct buffer_head *bh_in) | ||
329 | { | ||
330 | struct buffer_head *bh = NULL; | ||
331 | journal_t *journal; | ||
332 | struct block_device *bdev; | ||
333 | int err; | ||
334 | |||
335 | might_sleep(); | ||
336 | if (bh_in) | ||
337 | BUFFER_TRACE(bh_in, "enter"); | ||
338 | |||
339 | journal = handle->h_transaction->t_journal; | ||
340 | if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ | ||
341 | J_ASSERT (!"Cannot set revoke feature!"); | ||
342 | return -EINVAL; | ||
343 | } | ||
344 | |||
345 | bdev = journal->j_fs_dev; | ||
346 | bh = bh_in; | ||
347 | |||
348 | if (!bh) { | ||
349 | bh = __find_get_block(bdev, blocknr, journal->j_blocksize); | ||
350 | if (bh) | ||
351 | BUFFER_TRACE(bh, "found on hash"); | ||
352 | } | ||
353 | #ifdef JBD_EXPENSIVE_CHECKING | ||
354 | else { | ||
355 | struct buffer_head *bh2; | ||
356 | |||
357 | /* If there is a different buffer_head lying around in | ||
358 | * memory anywhere... */ | ||
359 | bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); | ||
360 | if (bh2) { | ||
361 | /* ... and it has RevokeValid status... */ | ||
362 | if (bh2 != bh && buffer_revokevalid(bh2)) | ||
363 | /* ...then it better be revoked too, | ||
364 | * since it's illegal to create a revoke | ||
365 | * record against a buffer_head which is | ||
366 | * not marked revoked --- that would | ||
367 | * risk missing a subsequent revoke | ||
368 | * cancel. */ | ||
369 | J_ASSERT_BH(bh2, buffer_revoked(bh2)); | ||
370 | put_bh(bh2); | ||
371 | } | ||
372 | } | ||
373 | #endif | ||
374 | |||
375 | /* We really ought not ever to revoke twice in a row without | ||
376 | first having the revoke cancelled: it's illegal to free a | ||
377 | block twice without allocating it in between! */ | ||
378 | if (bh) { | ||
379 | if (!J_EXPECT_BH(bh, !buffer_revoked(bh), | ||
380 | "inconsistent data on disk")) { | ||
381 | if (!bh_in) | ||
382 | brelse(bh); | ||
383 | return -EIO; | ||
384 | } | ||
385 | set_buffer_revoked(bh); | ||
386 | set_buffer_revokevalid(bh); | ||
387 | if (bh_in) { | ||
388 | BUFFER_TRACE(bh_in, "call journal_forget"); | ||
389 | journal_forget(handle, bh_in); | ||
390 | } else { | ||
391 | BUFFER_TRACE(bh, "call brelse"); | ||
392 | __brelse(bh); | ||
393 | } | ||
394 | } | ||
395 | |||
396 | jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); | ||
397 | err = insert_revoke_hash(journal, blocknr, | ||
398 | handle->h_transaction->t_tid); | ||
399 | BUFFER_TRACE(bh_in, "exit"); | ||
400 | return err; | ||
401 | } | ||
402 | |||
403 | /* | ||
404 | * Cancel an outstanding revoke. For use only internally by the | ||
405 | * journaling code (called from journal_get_write_access). | ||
406 | * | ||
407 | * We trust buffer_revoked() on the buffer if the buffer is already | ||
408 | * being journaled: if there is no revoke pending on the buffer, then we | ||
409 | * don't do anything here. | ||
410 | * | ||
411 | * This would break if it were possible for a buffer to be revoked and | ||
412 | * discarded, and then reallocated within the same transaction. In such | ||
413 | * a case we would have lost the revoked bit, but when we arrived here | ||
414 | * the second time we would still have a pending revoke to cancel. So, | ||
415 | * do not trust the Revoked bit on buffers unless RevokeValid is also | ||
416 | * set. | ||
417 | * | ||
418 | * The caller must have the journal locked. | ||
419 | */ | ||
420 | int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) | ||
421 | { | ||
422 | struct jbd_revoke_record_s *record; | ||
423 | journal_t *journal = handle->h_transaction->t_journal; | ||
424 | int need_cancel; | ||
425 | int did_revoke = 0; /* akpm: debug */ | ||
426 | struct buffer_head *bh = jh2bh(jh); | ||
427 | |||
428 | jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); | ||
429 | |||
430 | /* Is the existing Revoke bit valid? If so, we trust it, and | ||
431 | * only perform the full cancel if the revoke bit is set. If | ||
432 | * not, we can't trust the revoke bit, and we need to do the | ||
433 | * full search for a revoke record. */ | ||
434 | if (test_set_buffer_revokevalid(bh)) { | ||
435 | need_cancel = test_clear_buffer_revoked(bh); | ||
436 | } else { | ||
437 | need_cancel = 1; | ||
438 | clear_buffer_revoked(bh); | ||
439 | } | ||
440 | |||
441 | if (need_cancel) { | ||
442 | record = find_revoke_record(journal, bh->b_blocknr); | ||
443 | if (record) { | ||
444 | jbd_debug(4, "cancelled existing revoke on " | ||
445 | "blocknr %llu\n", (unsigned long long)bh->b_blocknr); | ||
446 | spin_lock(&journal->j_revoke_lock); | ||
447 | list_del(&record->hash); | ||
448 | spin_unlock(&journal->j_revoke_lock); | ||
449 | kmem_cache_free(revoke_record_cache, record); | ||
450 | did_revoke = 1; | ||
451 | } | ||
452 | } | ||
453 | |||
454 | #ifdef JBD_EXPENSIVE_CHECKING | ||
455 | /* There better not be one left behind by now! */ | ||
456 | record = find_revoke_record(journal, bh->b_blocknr); | ||
457 | J_ASSERT_JH(jh, record == NULL); | ||
458 | #endif | ||
459 | |||
460 | /* Finally, have we just cleared revoke on an unhashed | ||
461 | * buffer_head? If so, we'd better make sure we clear the | ||
462 | * revoked status on any hashed alias too, otherwise the revoke | ||
463 | * state machine will get very upset later on. */ | ||
464 | if (need_cancel) { | ||
465 | struct buffer_head *bh2; | ||
466 | bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); | ||
467 | if (bh2) { | ||
468 | if (bh2 != bh) | ||
469 | clear_buffer_revoked(bh2); | ||
470 | __brelse(bh2); | ||
471 | } | ||
472 | } | ||
473 | return did_revoke; | ||
474 | } | ||
475 | |||
476 | /* journal_switch_revoke table select j_revoke for next transaction | ||
477 | * we do not want to suspend any processing until all revokes are | ||
478 | * written -bzzz | ||
479 | */ | ||
480 | void journal_switch_revoke_table(journal_t *journal) | ||
481 | { | ||
482 | int i; | ||
483 | |||
484 | if (journal->j_revoke == journal->j_revoke_table[0]) | ||
485 | journal->j_revoke = journal->j_revoke_table[1]; | ||
486 | else | ||
487 | journal->j_revoke = journal->j_revoke_table[0]; | ||
488 | |||
489 | for (i = 0; i < journal->j_revoke->hash_size; i++) | ||
490 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); | ||
491 | } | ||
492 | |||
493 | /* | ||
494 | * Write revoke records to the journal for all entries in the current | ||
495 | * revoke hash, deleting the entries as we go. | ||
496 | * | ||
497 | * Called with the journal lock held. | ||
498 | */ | ||
499 | |||
500 | void journal_write_revoke_records(journal_t *journal, | ||
501 | transaction_t *transaction) | ||
502 | { | ||
503 | struct journal_head *descriptor; | ||
504 | struct jbd_revoke_record_s *record; | ||
505 | struct jbd_revoke_table_s *revoke; | ||
506 | struct list_head *hash_list; | ||
507 | int i, offset, count; | ||
508 | |||
509 | descriptor = NULL; | ||
510 | offset = 0; | ||
511 | count = 0; | ||
512 | |||
513 | /* select revoke table for committing transaction */ | ||
514 | revoke = journal->j_revoke == journal->j_revoke_table[0] ? | ||
515 | journal->j_revoke_table[1] : journal->j_revoke_table[0]; | ||
516 | |||
517 | for (i = 0; i < revoke->hash_size; i++) { | ||
518 | hash_list = &revoke->hash_table[i]; | ||
519 | |||
520 | while (!list_empty(hash_list)) { | ||
521 | record = (struct jbd_revoke_record_s *) | ||
522 | hash_list->next; | ||
523 | write_one_revoke_record(journal, transaction, | ||
524 | &descriptor, &offset, | ||
525 | record); | ||
526 | count++; | ||
527 | list_del(&record->hash); | ||
528 | kmem_cache_free(revoke_record_cache, record); | ||
529 | } | ||
530 | } | ||
531 | if (descriptor) | ||
532 | flush_descriptor(journal, descriptor, offset); | ||
533 | jbd_debug(1, "Wrote %d revoke records\n", count); | ||
534 | } | ||
535 | |||
536 | /* | ||
537 | * Write out one revoke record. We need to create a new descriptor | ||
538 | * block if the old one is full or if we have not already created one. | ||
539 | */ | ||
540 | |||
541 | static void write_one_revoke_record(journal_t *journal, | ||
542 | transaction_t *transaction, | ||
543 | struct journal_head **descriptorp, | ||
544 | int *offsetp, | ||
545 | struct jbd_revoke_record_s *record) | ||
546 | { | ||
547 | struct journal_head *descriptor; | ||
548 | int offset; | ||
549 | journal_header_t *header; | ||
550 | |||
551 | /* If we are already aborting, this all becomes a noop. We | ||
552 | still need to go round the loop in | ||
553 | journal_write_revoke_records in order to free all of the | ||
554 | revoke records: only the IO to the journal is omitted. */ | ||
555 | if (is_journal_aborted(journal)) | ||
556 | return; | ||
557 | |||
558 | descriptor = *descriptorp; | ||
559 | offset = *offsetp; | ||
560 | |||
561 | /* Make sure we have a descriptor with space left for the record */ | ||
562 | if (descriptor) { | ||
563 | if (offset == journal->j_blocksize) { | ||
564 | flush_descriptor(journal, descriptor, offset); | ||
565 | descriptor = NULL; | ||
566 | } | ||
567 | } | ||
568 | |||
569 | if (!descriptor) { | ||
570 | descriptor = journal_get_descriptor_buffer(journal); | ||
571 | if (!descriptor) | ||
572 | return; | ||
573 | header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; | ||
574 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
575 | header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); | ||
576 | header->h_sequence = cpu_to_be32(transaction->t_tid); | ||
577 | |||
578 | /* Record it so that we can wait for IO completion later */ | ||
579 | JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); | ||
580 | journal_file_buffer(descriptor, transaction, BJ_LogCtl); | ||
581 | |||
582 | offset = sizeof(journal_revoke_header_t); | ||
583 | *descriptorp = descriptor; | ||
584 | } | ||
585 | |||
586 | * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = | ||
587 | cpu_to_be32(record->blocknr); | ||
588 | offset += 4; | ||
589 | *offsetp = offset; | ||
590 | } | ||
591 | |||
592 | /* | ||
593 | * Flush a revoke descriptor out to the journal. If we are aborting, | ||
594 | * this is a noop; otherwise we are generating a buffer which needs to | ||
595 | * be waited for during commit, so it has to go onto the appropriate | ||
596 | * journal buffer list. | ||
597 | */ | ||
598 | |||
599 | static void flush_descriptor(journal_t *journal, | ||
600 | struct journal_head *descriptor, | ||
601 | int offset) | ||
602 | { | ||
603 | journal_revoke_header_t *header; | ||
604 | struct buffer_head *bh = jh2bh(descriptor); | ||
605 | |||
606 | if (is_journal_aborted(journal)) { | ||
607 | put_bh(bh); | ||
608 | return; | ||
609 | } | ||
610 | |||
611 | header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; | ||
612 | header->r_count = cpu_to_be32(offset); | ||
613 | set_buffer_jwrite(bh); | ||
614 | BUFFER_TRACE(bh, "write"); | ||
615 | set_buffer_dirty(bh); | ||
616 | ll_rw_block(WRITE, 1, &bh); | ||
617 | } | ||
618 | #endif | ||
619 | |||
620 | /* | ||
621 | * Revoke support for recovery. | ||
622 | * | ||
623 | * Recovery needs to be able to: | ||
624 | * | ||
625 | * record all revoke records, including the tid of the latest instance | ||
626 | * of each revoke in the journal | ||
627 | * | ||
628 | * check whether a given block in a given transaction should be replayed | ||
629 | * (ie. has not been revoked by a revoke record in that or a subsequent | ||
630 | * transaction) | ||
631 | * | ||
632 | * empty the revoke table after recovery. | ||
633 | */ | ||
634 | |||
635 | /* | ||
636 | * First, setting revoke records. We create a new revoke record for | ||
637 | * every block ever revoked in the log as we scan it for recovery, and | ||
638 | * we update the existing records if we find multiple revokes for a | ||
639 | * single block. | ||
640 | */ | ||
641 | |||
642 | int journal_set_revoke(journal_t *journal, | ||
643 | unsigned long blocknr, | ||
644 | tid_t sequence) | ||
645 | { | ||
646 | struct jbd_revoke_record_s *record; | ||
647 | |||
648 | record = find_revoke_record(journal, blocknr); | ||
649 | if (record) { | ||
650 | /* If we have multiple occurrences, only record the | ||
651 | * latest sequence number in the hashed record */ | ||
652 | if (tid_gt(sequence, record->sequence)) | ||
653 | record->sequence = sequence; | ||
654 | return 0; | ||
655 | } | ||
656 | return insert_revoke_hash(journal, blocknr, sequence); | ||
657 | } | ||
658 | |||
659 | /* | ||
660 | * Test revoke records. For a given block referenced in the log, has | ||
661 | * that block been revoked? A revoke record with a given transaction | ||
662 | * sequence number revokes all blocks in that transaction and earlier | ||
663 | * ones, but later transactions still need replayed. | ||
664 | */ | ||
665 | |||
666 | int journal_test_revoke(journal_t *journal, | ||
667 | unsigned long blocknr, | ||
668 | tid_t sequence) | ||
669 | { | ||
670 | struct jbd_revoke_record_s *record; | ||
671 | |||
672 | record = find_revoke_record(journal, blocknr); | ||
673 | if (!record) | ||
674 | return 0; | ||
675 | if (tid_gt(sequence, record->sequence)) | ||
676 | return 0; | ||
677 | return 1; | ||
678 | } | ||
679 | |||
680 | /* | ||
681 | * Finally, once recovery is over, we need to clear the revoke table so | ||
682 | * that it can be reused by the running filesystem. | ||
683 | */ | ||
684 | |||
685 | void journal_clear_revoke(journal_t *journal) | ||
686 | { | ||
687 | int i; | ||
688 | struct list_head *hash_list; | ||
689 | struct jbd_revoke_record_s *record; | ||
690 | struct jbd_revoke_table_s *revoke; | ||
691 | |||
692 | revoke = journal->j_revoke; | ||
693 | |||
694 | for (i = 0; i < revoke->hash_size; i++) { | ||
695 | hash_list = &revoke->hash_table[i]; | ||
696 | while (!list_empty(hash_list)) { | ||
697 | record = (struct jbd_revoke_record_s*) hash_list->next; | ||
698 | list_del(&record->hash); | ||
699 | kmem_cache_free(revoke_record_cache, record); | ||
700 | } | ||
701 | } | ||
702 | } | ||
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c new file mode 100644 index 000000000000..932e7c1ef4a1 --- /dev/null +++ b/fs/jbd/transaction.c | |||
@@ -0,0 +1,2062 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/transaction.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Generic filesystem transaction handling code; part of the ext2fs | ||
13 | * journaling system. | ||
14 | * | ||
15 | * This file manages transactions (compound commits managed by the | ||
16 | * journaling code) and handles (individual atomic operations by the | ||
17 | * filesystem). | ||
18 | */ | ||
19 | |||
20 | #include <linux/time.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/jbd.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/smp_lock.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/highmem.h> | ||
29 | |||
30 | /* | ||
31 | * get_transaction: set up a new transaction_t object. | ||
32 | * | ||
33 | * Initialise the caller-supplied transaction (nothing is allocated here): | ||
34 | * put it in T_RUNNING state, give it the next journal sequence number, | ||
35 | * arm the journal's commit timer for its expiry and install it as the | ||
36 | * journal's running transaction (asserted NULL first). Returns the transaction passed in. | ||
37 | * | ||
38 | * Preconditions: | ||
39 | * The journal MUST be locked. We don't perform atomic mallocs on the | ||
40 | * new transaction and we can't block without protecting against other | ||
41 | * processes trying to touch the journal while it is in transition. | ||
42 | * | ||
43 | * Called under j_state_lock | ||
44 | */ | ||
45 | |||
46 | static transaction_t * | ||
47 | get_transaction(journal_t *journal, transaction_t *transaction) | ||
48 | { | ||
49 | transaction->t_journal = journal; | ||
50 | transaction->t_state = T_RUNNING; | ||
51 | transaction->t_tid = journal->j_transaction_sequence++; | ||
52 | transaction->t_expires = jiffies + journal->j_commit_interval; | ||
53 | spin_lock_init(&transaction->t_handle_lock); | ||
54 | |||
55 | /* Set up the commit timer for the new transaction. */ | ||
56 | journal->j_commit_timer->expires = transaction->t_expires; | ||
57 | add_timer(journal->j_commit_timer); | ||
58 | |||
59 | J_ASSERT(journal->j_running_transaction == NULL); | ||
60 | journal->j_running_transaction = transaction; | ||
61 | |||
62 | return transaction; | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * Handle management. | ||
67 | * | ||
68 | * A handle_t is an object which represents a single atomic update to a | ||
69 | * filesystem, and which tracks all of the modifications which form part | ||
70 | * of that one update. | ||
71 | */ | ||
72 | |||
73 | /* | ||
74 | * start_this_handle: stall as needed until the journal has room for the | ||
75 | * handle's credits, then attach the handle to the running transaction | ||
76 | * (installing a freshly allocated one if none is running). Returns 0, | ||
77 | * -ENOSPC (request exceeds j_max_transaction_buffers), -ENOMEM or -EROFS. | ||
78 | */ | ||
79 | |||
80 | static int start_this_handle(journal_t *journal, handle_t *handle) | ||
81 | { | ||
82 | transaction_t *transaction; | ||
83 | int needed; | ||
84 | int nblocks = handle->h_buffer_credits; | ||
85 | transaction_t *new_transaction = NULL; | ||
86 | int ret = 0; | ||
87 | |||
88 | if (nblocks > journal->j_max_transaction_buffers) { | ||
89 | printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", | ||
90 | current->comm, nblocks, | ||
91 | journal->j_max_transaction_buffers); | ||
92 | ret = -ENOSPC; | ||
93 | goto out; | ||
94 | } | ||
95 | |||
96 | alloc_transaction: | ||
97 | if (!journal->j_running_transaction) { /* unlocked peek; rechecked under j_state_lock below */ | ||
98 | new_transaction = jbd_kmalloc(sizeof(*new_transaction), | ||
99 | GFP_NOFS); | ||
100 | if (!new_transaction) { | ||
101 | ret = -ENOMEM; | ||
102 | goto out; | ||
103 | } | ||
104 | memset(new_transaction, 0, sizeof(*new_transaction)); | ||
105 | } | ||
106 | |||
107 | jbd_debug(3, "New handle %p going live.\n", handle); | ||
108 | |||
109 | repeat: | ||
110 | |||
111 | /* | ||
112 | * We need to hold j_state_lock until t_updates has been incremented, | ||
113 | * for proper journal barrier handling | ||
114 | */ | ||
115 | spin_lock(&journal->j_state_lock); | ||
116 | repeat_locked: | ||
117 | if (is_journal_aborted(journal) || | ||
118 | (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { | ||
119 | spin_unlock(&journal->j_state_lock); | ||
120 | ret = -EROFS; | ||
121 | goto out; | ||
122 | } | ||
123 | |||
124 | /* Wait on the journal's transaction barrier if necessary */ | ||
125 | if (journal->j_barrier_count) { | ||
126 | spin_unlock(&journal->j_state_lock); | ||
127 | wait_event(journal->j_wait_transaction_locked, | ||
128 | journal->j_barrier_count == 0); | ||
129 | goto repeat; | ||
130 | } | ||
131 | |||
132 | if (!journal->j_running_transaction) { | ||
133 | if (!new_transaction) { | ||
134 | spin_unlock(&journal->j_state_lock); | ||
135 | goto alloc_transaction; /* allocate with the spinlock dropped, then retry */ | ||
136 | } | ||
137 | get_transaction(journal, new_transaction); | ||
138 | new_transaction = NULL; /* now installed in the journal: must not be freed at out: */ | ||
139 | } | ||
140 | |||
141 | transaction = journal->j_running_transaction; | ||
142 | |||
143 | /* | ||
144 | * If the current transaction is locked down for commit, wait for the | ||
145 | * lock to be released. | ||
146 | */ | ||
147 | if (transaction->t_state == T_LOCKED) { | ||
148 | DEFINE_WAIT(wait); | ||
149 | |||
150 | prepare_to_wait(&journal->j_wait_transaction_locked, | ||
151 | &wait, TASK_UNINTERRUPTIBLE); | ||
152 | spin_unlock(&journal->j_state_lock); | ||
153 | schedule(); | ||
154 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
155 | goto repeat; | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * If there is not enough space left in the log to write all potential | ||
160 | * buffers requested by this operation, we need to stall pending a log | ||
161 | * checkpoint to free some more log space. | ||
162 | */ | ||
163 | spin_lock(&transaction->t_handle_lock); | ||
164 | needed = transaction->t_outstanding_credits + nblocks; | ||
165 | |||
166 | if (needed > journal->j_max_transaction_buffers) { | ||
167 | /* | ||
168 | * If the current transaction is already too large, then start | ||
169 | * to commit it: we can then go back and attach this handle to | ||
170 | * a new transaction. | ||
171 | */ | ||
172 | DEFINE_WAIT(wait); | ||
173 | |||
174 | jbd_debug(2, "Handle %p starting new commit...\n", handle); | ||
175 | spin_unlock(&transaction->t_handle_lock); | ||
176 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
177 | TASK_UNINTERRUPTIBLE); | ||
178 | __log_start_commit(journal, transaction->t_tid); | ||
179 | spin_unlock(&journal->j_state_lock); | ||
180 | schedule(); | ||
181 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
182 | goto repeat; | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * The commit code assumes that it can get enough log space | ||
187 | * without forcing a checkpoint. This is *critical* for | ||
188 | * correctness: a checkpoint of a buffer which is also | ||
189 | * associated with a committing transaction creates a deadlock, | ||
190 | * so commit simply cannot force through checkpoints. | ||
191 | * | ||
192 | * We must therefore ensure the necessary space in the journal | ||
193 | * *before* starting to dirty potentially checkpointed buffers | ||
194 | * in the new transaction. | ||
195 | * | ||
196 | * The worst part is, any transaction currently committing can | ||
197 | * reduce the free space arbitrarily. Be careful to account for | ||
198 | * those buffers when checkpointing. | ||
199 | */ | ||
200 | |||
201 | /* | ||
202 | * @@@ AKPM: This seems rather over-defensive. We're giving commit | ||
203 | * a _lot_ of headroom: 1/4 of the journal plus the size of | ||
204 | * the committing transaction. Really, we only need to give it | ||
205 | * committing_transaction->t_outstanding_credits plus "enough" for | ||
206 | * the log control blocks. | ||
207 | * Also, this test is inconsistent with the matching one in | ||
208 | * journal_extend(). | ||
209 | */ | ||
210 | if (__log_space_left(journal) < jbd_space_needed(journal)) { | ||
211 | jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); | ||
212 | spin_unlock(&transaction->t_handle_lock); | ||
213 | __log_wait_for_space(journal); | ||
214 | goto repeat_locked; /* j_state_lock is not released on this path, hence repeat_locked rather than repeat */ | ||
215 | } | ||
216 | |||
217 | /* OK, account for the buffers that this operation expects to | ||
218 | * use and add the handle to the running transaction. */ | ||
219 | |||
220 | handle->h_transaction = transaction; | ||
221 | transaction->t_outstanding_credits += nblocks; | ||
222 | transaction->t_updates++; | ||
223 | transaction->t_handle_count++; | ||
224 | jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", | ||
225 | handle, nblocks, transaction->t_outstanding_credits, | ||
226 | __log_space_left(journal)); | ||
227 | spin_unlock(&transaction->t_handle_lock); | ||
228 | spin_unlock(&journal->j_state_lock); | ||
229 | out: | ||
230 | if (new_transaction) /* still set only if the allocation was never handed to get_transaction() */ | ||
231 | kfree(new_transaction); | ||
232 | return ret; | ||
233 | } | ||
234 | |||
235 | /* Allocate a zeroed handle with nblocks credits and one reference; NULL on allocation failure. */ | ||
236 | static handle_t *new_handle(int nblocks) | ||
237 | { | ||
238 | handle_t *fresh = jbd_alloc_handle(GFP_NOFS); | ||
239 | |||
240 | if (fresh != NULL) { | ||
241 | memset(fresh, 0, sizeof(*fresh)); | ||
242 | fresh->h_ref = 1; | ||
243 | fresh->h_buffer_credits = nblocks; | ||
244 | } | ||
245 | return fresh; | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * handle_t *journal_start() - Obtain a new handle. | ||
250 | * @journal: Journal to start transaction on. | ||
251 | * @nblocks: number of block buffers we might modify | ||
252 | * | ||
253 | * We make sure that the transaction can guarantee at least nblocks of | ||
254 | * modified buffers in the log. We block until the log can guarantee | ||
255 | * that much space. If the task already has a current handle, its h_ref | ||
256 | * is bumped and that handle is returned instead (nblocks is then unused). | ||
257 | * | ||
258 | * Visible to journal users (like ext3fs), so not called with the journal locked. | ||
259 | * | ||
260 | * Return a pointer to the handle, or an ERR_PTR() error (never NULL) on failure | ||
261 | */ | ||
262 | handle_t *journal_start(journal_t *journal, int nblocks) | ||
263 | { | ||
264 | handle_t *handle = journal_current_handle(); | ||
265 | int err; | ||
266 | |||
267 | if (!journal) | ||
268 | return ERR_PTR(-EROFS); | ||
269 | |||
270 | if (handle) { /* nested start: share the existing handle */ | ||
271 | J_ASSERT(handle->h_transaction->t_journal == journal); | ||
272 | handle->h_ref++; | ||
273 | return handle; | ||
274 | } | ||
275 | |||
276 | handle = new_handle(nblocks); | ||
277 | if (!handle) | ||
278 | return ERR_PTR(-ENOMEM); | ||
279 | |||
280 | current->journal_info = handle; | ||
281 | |||
282 | err = start_this_handle(journal, handle); | ||
283 | if (err < 0) { /* undo: free the handle and detach it from the task */ | ||
284 | jbd_free_handle(handle); | ||
285 | current->journal_info = NULL; | ||
286 | handle = ERR_PTR(err); | ||
287 | } | ||
288 | return handle; | ||
289 | } | ||
290 | |||
291 | /** | ||
292 | * int journal_extend() - extend buffer credits. | ||
293 | * @handle: handle to 'extend' | ||
294 | * @nblocks: nr blocks to try to extend by. | ||
295 | * | ||
296 | * Some transactions, such as large extends and truncates, can be done | ||
297 | * atomically all at once or in several stages. The operation requests | ||
298 | * a credit for a number of buffer modifications in advance, but can | ||
299 | * extend its credit if it needs more. | ||
300 | * | ||
301 | * journal_extend tries to give the running handle more buffer credits. | ||
302 | * It does not guarantee that allocation - this is a best-effort only. | ||
303 | * The calling process MUST be able to deal cleanly with a failure to | ||
304 | * extend here. | ||
305 | * | ||
306 | * Return 0 on success, non-zero on failure. | ||
307 | * | ||
308 | * return code < 0 implies an error (-EIO when the handle is aborted) | ||
309 | * return code > 0 implies normal transaction-full status (request denied). | ||
310 | */ | ||
311 | int journal_extend(handle_t *handle, int nblocks) | ||
312 | { | ||
313 | transaction_t *transaction = handle->h_transaction; | ||
314 | journal_t *journal = transaction->t_journal; | ||
315 | int result; | ||
316 | int wanted; | ||
317 | |||
318 | result = -EIO; | ||
319 | if (is_handle_aborted(handle)) | ||
320 | goto out; | ||
321 | |||
322 | result = 1; /* "denied" unless every check below passes */ | ||
323 | |||
324 | spin_lock(&journal->j_state_lock); | ||
325 | |||
326 | /* Don't extend a locked-down transaction! */ | ||
327 | if (handle->h_transaction->t_state != T_RUNNING) { | ||
328 | jbd_debug(3, "denied handle %p %d blocks: " | ||
329 | "transaction not running\n", handle, nblocks); | ||
330 | goto error_out; | ||
331 | } | ||
332 | |||
333 | spin_lock(&transaction->t_handle_lock); | ||
334 | wanted = transaction->t_outstanding_credits + nblocks; | ||
335 | |||
336 | if (wanted > journal->j_max_transaction_buffers) { | ||
337 | jbd_debug(3, "denied handle %p %d blocks: " | ||
338 | "transaction too large\n", handle, nblocks); | ||
339 | goto unlock; | ||
340 | } | ||
341 | |||
342 | if (wanted > __log_space_left(journal)) { | ||
343 | jbd_debug(3, "denied handle %p %d blocks: " | ||
344 | "insufficient log space\n", handle, nblocks); | ||
345 | goto unlock; | ||
346 | } | ||
347 | |||
348 | handle->h_buffer_credits += nblocks; | ||
349 | transaction->t_outstanding_credits += nblocks; | ||
350 | result = 0; | ||
351 | |||
352 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); | ||
353 | unlock: | ||
354 | spin_unlock(&transaction->t_handle_lock); | ||
355 | error_out: | ||
356 | spin_unlock(&journal->j_state_lock); | ||
357 | out: | ||
358 | return result; | ||
359 | } | ||
360 | |||
361 | |||
362 | /** | ||
363 | * int journal_restart() - restart a handle. | ||
364 | * @handle: handle to restart | ||
365 | * @nblocks: nr credits requested | ||
366 | * | ||
367 | * Restart a handle for a multi-transaction filesystem | ||
368 | * operation. | ||
369 | * | ||
370 | * If the journal_extend() call above fails to grant new buffer credits | ||
371 | * to a running handle, a call to journal_restart will commit the | ||
372 | * handle's transaction so far and reattach the handle to a new | ||
373 | * transaction capable of guaranteeing the requested number of credits. | ||
374 | * Returns 0 if the handle is aborted (no restart), else start_this_handle()'s result. | ||
375 | */ | ||
376 | |||
377 | int journal_restart(handle_t *handle, int nblocks) | ||
378 | { | ||
379 | transaction_t *transaction = handle->h_transaction; | ||
380 | journal_t *journal = transaction->t_journal; | ||
381 | int ret; | ||
382 | |||
383 | /* If we've had an abort of any type, don't even think about | ||
384 | * actually doing the restart! */ | ||
385 | if (is_handle_aborted(handle)) | ||
386 | return 0; | ||
387 | |||
388 | /* | ||
389 | * First unlink the handle from its current transaction, and start the | ||
390 | * commit on that. | ||
391 | */ | ||
392 | J_ASSERT(transaction->t_updates > 0); | ||
393 | J_ASSERT(journal_current_handle() == handle); | ||
394 | |||
395 | spin_lock(&journal->j_state_lock); | ||
396 | spin_lock(&transaction->t_handle_lock); | ||
397 | transaction->t_outstanding_credits -= handle->h_buffer_credits; /* hand back the credits this handle still holds */ | ||
398 | transaction->t_updates--; | ||
399 | |||
400 | if (!transaction->t_updates) | ||
401 | wake_up(&journal->j_wait_updates); | ||
402 | spin_unlock(&transaction->t_handle_lock); | ||
403 | |||
404 | jbd_debug(2, "restarting handle %p\n", handle); | ||
405 | __log_start_commit(journal, transaction->t_tid); | ||
406 | spin_unlock(&journal->j_state_lock); | ||
407 | |||
408 | handle->h_buffer_credits = nblocks; | ||
409 | ret = start_this_handle(journal, handle); | ||
410 | return ret; | ||
411 | } | ||
412 | |||
413 | |||
414 | /** | ||
415 | * void journal_lock_updates () - establish a transaction barrier. | ||
416 | * @journal: Journal to establish a barrier on. | ||
417 | * | ||
418 | * This locks out any further updates from being started, and blocks | ||
419 | * until all existing updates have completed, returning only once the | ||
420 | * journal is in a quiescent state with no updates running. | ||
421 | * | ||
422 | * The journal lock should not be held on entry. On return the caller holds the j_barrier semaphore; pair with journal_unlock_updates(). | ||
423 | */ | ||
424 | void journal_lock_updates(journal_t *journal) | ||
425 | { | ||
426 | DEFINE_WAIT(wait); | ||
427 | |||
428 | spin_lock(&journal->j_state_lock); | ||
429 | ++journal->j_barrier_count; /* start_this_handle() stalls new handles while this is non-zero */ | ||
430 | |||
431 | /* Wait until there are no running updates */ | ||
432 | while (1) { | ||
433 | transaction_t *transaction = journal->j_running_transaction; | ||
434 | |||
435 | if (!transaction) | ||
436 | break; | ||
437 | |||
438 | spin_lock(&transaction->t_handle_lock); | ||
439 | if (!transaction->t_updates) { | ||
440 | spin_unlock(&transaction->t_handle_lock); | ||
441 | break; | ||
442 | } | ||
443 | prepare_to_wait(&journal->j_wait_updates, &wait, | ||
444 | TASK_UNINTERRUPTIBLE); | ||
445 | spin_unlock(&transaction->t_handle_lock); | ||
446 | spin_unlock(&journal->j_state_lock); | ||
447 | schedule(); | ||
448 | finish_wait(&journal->j_wait_updates, &wait); | ||
449 | spin_lock(&journal->j_state_lock); /* retake before re-reading j_running_transaction */ | ||
450 | } | ||
451 | spin_unlock(&journal->j_state_lock); | ||
452 | |||
453 | /* | ||
454 | * We have now established a barrier against other normal updates, but | ||
455 | * we also need to barrier against other journal_lock_updates() calls | ||
456 | * to make sure that we serialise special journal-locked operations | ||
457 | * too. | ||
458 | */ | ||
459 | down(&journal->j_barrier); | ||
460 | } | ||
461 | |||
462 | /** | ||
463 | * void journal_unlock_updates (journal_t* journal) - release barrier | ||
464 | * @journal: Journal to release the barrier on. | ||
465 | * | ||
466 | * Release a transaction barrier obtained with journal_lock_updates(): | ||
467 | * ups j_barrier, drops j_barrier_count and wakes j_wait_transaction_locked. | ||
468 | * Should be called without the journal lock held. | ||
469 | */ | ||
470 | void journal_unlock_updates (journal_t *journal) | ||
471 | { | ||
472 | J_ASSERT(journal->j_barrier_count != 0); | ||
473 | |||
474 | up(&journal->j_barrier); | ||
475 | spin_lock(&journal->j_state_lock); | ||
476 | --journal->j_barrier_count; | ||
477 | spin_unlock(&journal->j_state_lock); | ||
478 | wake_up(&journal->j_wait_transaction_locked); /* handles stalled on the barrier wait on this queue */ | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * Handle any unexpected dirty buffers which turn up. Normally those | ||
483 | * indicate an error, but they can occur if the user is running (say) | ||
484 | * tune2fs to modify the live filesystem, so we continue as gracefully as | ||
485 | * possible: for a buffer on the Metadata, Reserved, Shadow or Forget list | ||
486 | * the dirty bit is moved to BH_JBDDirty; nothing is actually reported. | ||
487 | * | ||
488 | * The caller should already hold the journal lock and j_list_lock spinlock. | ||
489 | * NOTE(review): do_get_write_access() calls this under the buffer lock and bh_state lock only -- confirm. | ||
490 | */ | ||
491 | static void jbd_unexpected_dirty_buffer(struct journal_head *jh) | ||
492 | { | ||
493 | struct buffer_head *bh = jh2bh(jh); | ||
494 | int jlist; | ||
495 | |||
496 | if (buffer_dirty(bh)) { | ||
497 | /* If this buffer is one which might reasonably be dirty | ||
498 | * --- ie. data, or not part of this journal --- then | ||
499 | * we're OK to leave it alone, but otherwise we need to | ||
500 | * move the dirty bit to the journal's own internal | ||
501 | * JBDDirty bit. */ | ||
502 | jlist = jh->b_jlist; | ||
503 | |||
504 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | ||
505 | jlist == BJ_Shadow || jlist == BJ_Forget) { | ||
506 | if (test_clear_buffer_dirty(jh2bh(jh))) { | ||
507 | set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); | ||
508 | } | ||
509 | } | ||
510 | } | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * If the buffer is already part of the current transaction, then there | ||
515 | * is nothing we need to do. If it is already part of a prior | ||
516 | * transaction which we are still committing to disk, then we need to | ||
517 | * make sure that we do not overwrite the old copy: we do copy-out to | ||
518 | * preserve the copy going to disk. We also account the buffer against | ||
519 | * the handle's metadata buffer credits (unless the buffer is already | ||
520 | * part of the transaction, that is). | ||
521 | * Returns 0, -EROFS (handle aborted) or -ENOMEM (no memory for the frozen copy). NOTE(review): no credit accounting is visible in this function -- confirm the sentence above. | ||
522 | */ | ||
523 | static int | ||
524 | do_get_write_access(handle_t *handle, struct journal_head *jh, | ||
525 | int force_copy) | ||
526 | { | ||
527 | struct buffer_head *bh; | ||
528 | transaction_t *transaction; | ||
529 | journal_t *journal; | ||
530 | int error; | ||
531 | char *frozen_buffer = NULL; | ||
532 | int need_copy = 0; | ||
533 | |||
534 | if (is_handle_aborted(handle)) | ||
535 | return -EROFS; | ||
536 | |||
537 | transaction = handle->h_transaction; | ||
538 | journal = transaction->t_journal; | ||
539 | |||
540 | jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); | ||
541 | |||
542 | JBUFFER_TRACE(jh, "entry"); | ||
543 | repeat: | ||
544 | bh = jh2bh(jh); | ||
545 | |||
546 | /* @@@ Need to check for errors here at some point. */ | ||
547 | |||
548 | lock_buffer(bh); | ||
549 | jbd_lock_bh_state(bh); | ||
550 | |||
551 | /* We now hold the buffer lock so it is safe to query the buffer | ||
552 | * state. Is the buffer dirty? | ||
553 | * | ||
554 | * If so, there are two possibilities. The buffer may be | ||
555 | * non-journaled, and undergoing a quite legitimate writeback. | ||
556 | * Otherwise, it is journaled, and we don't expect dirty buffers | ||
557 | * in that state (the buffers should be marked JBD_Dirty | ||
558 | * instead.) So either the IO is being done under our own | ||
559 | * control and this is a bug, or it's a third party IO such as | ||
560 | * dump(8) (which may leave the buffer scheduled for read --- | ||
561 | * ie. locked but not dirty) or tune2fs (which may actually have | ||
562 | * the buffer dirtied, ugh.) */ | ||
563 | |||
564 | if (buffer_dirty(bh)) { | ||
565 | /* | ||
566 | * First question: is this buffer already part of the current | ||
567 | * transaction or the existing committing transaction? | ||
568 | */ | ||
569 | if (jh->b_transaction) { | ||
570 | J_ASSERT_JH(jh, | ||
571 | jh->b_transaction == transaction || | ||
572 | jh->b_transaction == | ||
573 | journal->j_committing_transaction); | ||
574 | if (jh->b_next_transaction) | ||
575 | J_ASSERT_JH(jh, jh->b_next_transaction == | ||
576 | transaction); | ||
577 | JBUFFER_TRACE(jh, "Unexpected dirty buffer"); | ||
578 | jbd_unexpected_dirty_buffer(jh); | ||
579 | } | ||
580 | } | ||
581 | |||
582 | unlock_buffer(bh); /* the bh_state lock stays held from here on */ | ||
583 | |||
584 | error = -EROFS; | ||
585 | if (is_handle_aborted(handle)) { | ||
586 | jbd_unlock_bh_state(bh); | ||
587 | goto out; | ||
588 | } | ||
589 | error = 0; | ||
590 | |||
591 | /* | ||
592 | * The buffer is already part of this transaction if b_transaction or | ||
593 | * b_next_transaction points to it | ||
594 | */ | ||
595 | if (jh->b_transaction == transaction || | ||
596 | jh->b_next_transaction == transaction) | ||
597 | goto done; | ||
598 | |||
599 | /* | ||
600 | * If there is already a copy-out version of this buffer, then we don't | ||
601 | * need to make another one | ||
602 | */ | ||
603 | if (jh->b_frozen_data) { | ||
604 | JBUFFER_TRACE(jh, "has frozen data"); | ||
605 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
606 | jh->b_next_transaction = transaction; | ||
607 | goto done; | ||
608 | } | ||
609 | |||
610 | /* Is there data here we need to preserve? */ | ||
611 | |||
612 | if (jh->b_transaction && jh->b_transaction != transaction) { | ||
613 | JBUFFER_TRACE(jh, "owned by older transaction"); | ||
614 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
615 | J_ASSERT_JH(jh, jh->b_transaction == | ||
616 | journal->j_committing_transaction); | ||
617 | |||
618 | /* There is one case we have to be very careful about. | ||
619 | * If the committing transaction is currently writing | ||
620 | * this buffer out to disk and has NOT made a copy-out, | ||
621 | * then we cannot modify the buffer contents at all | ||
622 | * right now. The essence of copy-out is that it is the | ||
623 | * extra copy, not the primary copy, which gets | ||
624 | * journaled. If the primary copy is already going to | ||
625 | * disk then we cannot do copy-out here. */ | ||
626 | |||
627 | if (jh->b_jlist == BJ_Shadow) { | ||
628 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
629 | wait_queue_head_t *wqh; | ||
630 | |||
631 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
632 | |||
633 | JBUFFER_TRACE(jh, "on shadow: sleep"); | ||
634 | jbd_unlock_bh_state(bh); | ||
635 | /* commit wakes up all shadow buffers after IO */ | ||
636 | for ( ; ; ) { | ||
637 | prepare_to_wait(wqh, &wait.wait, | ||
638 | TASK_UNINTERRUPTIBLE); | ||
639 | if (jh->b_jlist != BJ_Shadow) | ||
640 | break; | ||
641 | schedule(); | ||
642 | } | ||
643 | finish_wait(wqh, &wait.wait); | ||
644 | goto repeat; | ||
645 | } | ||
646 | |||
647 | /* Only do the copy if the currently-owning transaction | ||
648 | * still needs it. If it is on the Forget list, the | ||
649 | * committing transaction is past that stage. The | ||
650 | * buffer had better remain locked during the kmalloc, | ||
651 | * but that should be true --- we hold the journal lock | ||
652 | * still and the buffer is already on the BUF_JOURNAL | ||
653 | * list so won't be flushed. | ||
654 | * | ||
655 | * Subtle point, though: if this is a get_undo_access, | ||
656 | * then we will be relying on the frozen_data to contain | ||
657 | * the new value of the committed_data record after the | ||
658 | * transaction, so we HAVE to force the frozen_data copy | ||
659 | * in that case. */ | ||
660 | |||
661 | if (jh->b_jlist != BJ_Forget || force_copy) { | ||
662 | JBUFFER_TRACE(jh, "generate frozen data"); | ||
663 | if (!frozen_buffer) { | ||
664 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | ||
665 | jbd_unlock_bh_state(bh); | ||
666 | frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, | ||
667 | GFP_NOFS); | ||
668 | if (!frozen_buffer) { | ||
669 | printk(KERN_EMERG | ||
670 | "%s: OOM for frozen_buffer\n", | ||
671 | __FUNCTION__); | ||
672 | JBUFFER_TRACE(jh, "oom!"); | ||
673 | error = -ENOMEM; | ||
674 | jbd_lock_bh_state(bh); /* the done: path unlocks bh_state, so retake it first */ | ||
675 | goto done; | ||
676 | } | ||
677 | goto repeat; /* state lock was dropped for the allocation: re-evaluate from the top */ | ||
678 | } | ||
679 | jh->b_frozen_data = frozen_buffer; | ||
680 | frozen_buffer = NULL; | ||
681 | need_copy = 1; | ||
682 | } | ||
683 | jh->b_next_transaction = transaction; | ||
684 | } | ||
685 | |||
686 | |||
687 | /* | ||
688 | * Finally, if the buffer is not journaled right now, we need to make | ||
689 | * sure it doesn't get written to disk before the caller actually | ||
690 | * commits the new data | ||
691 | */ | ||
692 | if (!jh->b_transaction) { | ||
693 | JBUFFER_TRACE(jh, "no transaction"); | ||
694 | J_ASSERT_JH(jh, !jh->b_next_transaction); | ||
695 | jh->b_transaction = transaction; | ||
696 | JBUFFER_TRACE(jh, "file as BJ_Reserved"); | ||
697 | spin_lock(&journal->j_list_lock); | ||
698 | __journal_file_buffer(jh, transaction, BJ_Reserved); | ||
699 | spin_unlock(&journal->j_list_lock); | ||
700 | } | ||
701 | |||
702 | done: | ||
703 | if (need_copy) { | ||
704 | struct page *page; | ||
705 | int offset; | ||
706 | char *source; | ||
707 | |||
708 | J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), | ||
709 | "Possible IO failure.\n"); | ||
710 | page = jh2bh(jh)->b_page; | ||
711 | offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; /* byte offset of b_data within its page */ | ||
712 | source = kmap_atomic(page, KM_USER0); | ||
713 | memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); | ||
714 | kunmap_atomic(source, KM_USER0); | ||
715 | } | ||
716 | jbd_unlock_bh_state(bh); | ||
717 | |||
718 | /* | ||
719 | * If we are about to journal a buffer, then any revoke pending on it is | ||
720 | * no longer valid | ||
721 | */ | ||
722 | journal_cancel_revoke(handle, jh); | ||
723 | |||
724 | out: | ||
725 | if (frozen_buffer) | ||
726 | kfree(frozen_buffer); | ||
727 | |||
728 | JBUFFER_TRACE(jh, "exit"); | ||
729 | return error; | ||
730 | } | ||
731 | |||
732 | /** | ||
733 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. | ||
734 | * @handle: transaction to add buffer modifications to | ||
735 | * @bh: bh to be used for metadata writes | ||
736 | * | ||
737 | * Takes a journal_head reference on bh around the call to do_get_write_access() | ||
738 | * (force_copy == 0) and returns its result: an error code or 0 on success. | ||
739 | * | ||
740 | * In full data journalling mode the buffer may be of type BJ_AsyncData, | ||
741 | * because we're write()ing a buffer which is also part of a shared mapping. | ||
742 | */ | ||
743 | |||
744 | int journal_get_write_access(handle_t *handle, struct buffer_head *bh) | ||
745 | { | ||
746 | struct journal_head *jh = journal_add_journal_head(bh); | ||
747 | int rc; | ||
748 | |||
749 | /* We do not want to get caught playing with fields which the | ||
750 | * log thread also manipulates. Make sure that the buffer | ||
751 | * completes any outstanding IO before proceeding. */ | ||
752 | rc = do_get_write_access(handle, jh, 0); | ||
753 | journal_put_journal_head(jh); /* dropped on success and on error alike */ | ||
754 | return rc; | ||
755 | } | ||
756 | |||
757 | |||
758 | /* | ||
759 | * When the user wants to journal a newly created buffer_head | ||
760 | * (ie. getblk() returned a new buffer and we are going to populate it | ||
761 | * manually rather than reading off disk), then we need to keep the | ||
762 | * buffer_head locked until it has been completely filled with new | ||
763 | * data. In this case, we should be able to make the assertion that | ||
764 | * the bh is not already part of an existing transaction. | ||
765 | * | ||
766 | * The buffer should already be locked by the caller by this point. | ||
767 | * There is no lock ranking violation: it was a newly created, | ||
768 | * unlocked buffer beforehand. */ | ||
769 | |||
770 | /** | ||
771 | * int journal_get_create_access () - notify intent to use newly created bh | ||
772 | * @handle: transaction to add the new buffer to | ||
773 | * @bh: new buffer. | ||
774 | * | ||
775 | * Call this if you create a new bh. Returns 0 on success, or -EROFS if the handle is aborted. | ||
776 | */ | ||
777 | int journal_get_create_access(handle_t *handle, struct buffer_head *bh) | ||
778 | { | ||
779 | transaction_t *transaction = handle->h_transaction; | ||
780 | journal_t *journal = transaction->t_journal; | ||
781 | struct journal_head *jh = journal_add_journal_head(bh); /* reference is dropped at out: on every path */ | ||
782 | int err; | ||
783 | |||
784 | jbd_debug(5, "journal_head %p\n", jh); | ||
785 | err = -EROFS; | ||
786 | if (is_handle_aborted(handle)) | ||
787 | goto out; | ||
788 | err = 0; | ||
789 | |||
790 | JBUFFER_TRACE(jh, "entry"); | ||
791 | /* | ||
792 | * The buffer may already belong to this transaction due to pre-zeroing | ||
793 | * in the filesystem's new_block code. It may also be on the previous, | ||
794 | * committing transaction's lists, but it HAS to be in Forget state in | ||
795 | * that case: the transaction must have deleted the buffer for it to be | ||
796 | * reused here. | ||
797 | */ | ||
798 | jbd_lock_bh_state(bh); | ||
799 | spin_lock(&journal->j_list_lock); | ||
800 | J_ASSERT_JH(jh, (jh->b_transaction == transaction || | ||
801 | jh->b_transaction == NULL || | ||
802 | (jh->b_transaction == journal->j_committing_transaction && | ||
803 | jh->b_jlist == BJ_Forget))); | ||
804 | |||
805 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
806 | J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); | ||
807 | |||
808 | if (jh->b_transaction == NULL) { | ||
809 | jh->b_transaction = transaction; | ||
810 | JBUFFER_TRACE(jh, "file as BJ_Reserved"); | ||
811 | __journal_file_buffer(jh, transaction, BJ_Reserved); | ||
812 | } else if (jh->b_transaction == journal->j_committing_transaction) { | ||
813 | JBUFFER_TRACE(jh, "set next transaction"); | ||
814 | jh->b_next_transaction = transaction; | ||
815 | } | ||
816 | spin_unlock(&journal->j_list_lock); | ||
817 | jbd_unlock_bh_state(bh); | ||
818 | |||
819 | /* | ||
820 | * akpm: I added this. ext3_alloc_branch can pick up new indirect | ||
821 | * blocks which contain freed but then revoked metadata. We need | ||
822 | * to cancel the revoke in case we end up freeing it yet again | ||
823 | * and then reallocating as data - this would cause a second revoke, | ||
824 | * which hits an assertion error. | ||
825 | */ | ||
826 | JBUFFER_TRACE(jh, "cancelling revoke"); | ||
827 | journal_cancel_revoke(handle, jh); | ||
828 | out: /* also reached when the handle is aborted: the journal_head reference must still be dropped */ | ||
829 | journal_put_journal_head(jh); | ||
830 | return err; | ||
831 | } | ||
832 | |||
833 | /** | ||
834 | * int journal_get_undo_access() - Notify intent to modify metadata with | ||
835 | * non-rewindable consequences | ||
836 | * @handle: transaction | ||
837 | * @bh: buffer to undo | ||
838 | * @credits: store the number of taken credits here (if not NULL) | ||
839 | * | ||
840 | * Sometimes there is a need to distinguish between metadata which has | ||
841 | * been committed to disk and that which has not. The ext3fs code uses | ||
842 | * this for freeing and allocating space, we have to make sure that we | ||
843 | * do not reuse freed space until the deallocation has been committed, | ||
844 | * since if we overwrote that space we would make the delete | ||
845 | * un-rewindable in case of a crash. | ||
846 | * | ||
847 | * To deal with that, journal_get_undo_access requests write access to a | ||
848 | * buffer for parts of non-rewindable operations such as delete | ||
849 | * operations on the bitmaps. The journaling code must keep a copy of | ||
850 | * the buffer's contents prior to the undo_access call until such time | ||
851 | * as we know that the buffer has definitely been committed to disk. | ||
852 | * | ||
853 | * We never need to know which transaction the committed data is part | ||
854 | * of, buffers touched here are guaranteed to be dirtied later and so | ||
855 | * will be committed to a new transaction in due course, at which point | ||
856 | * we can discard the old committed data pointer. | ||
857 | * | ||
858 | * Returns error number or 0 on success. | ||
859 | */ | ||
860 | int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) | ||
861 | { | ||
862 | int err; | ||
863 | struct journal_head *jh = journal_add_journal_head(bh); | ||
864 | char *committed_data = NULL; | ||
865 | |||
866 | JBUFFER_TRACE(jh, "entry"); | ||
867 | |||
868 | /* | ||
869 | * Do this first --- it can drop the journal lock, so we want to | ||
870 | * make sure that obtaining the committed_data is done | ||
871 | * atomically wrt. completion of any outstanding commits. | ||
872 | */ | ||
873 | err = do_get_write_access(handle, jh, 1); | ||
874 | if (err) | ||
875 | goto out; | ||
876 | |||
877 | repeat: | ||
878 | if (!jh->b_committed_data) { | ||
879 | committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); | ||
880 | if (!committed_data) { | ||
881 | printk(KERN_EMERG "%s: No memory for committed data\n", | ||
882 | __FUNCTION__); | ||
883 | err = -ENOMEM; | ||
884 | goto out; | ||
885 | } | ||
886 | } | ||
887 | |||
888 | jbd_lock_bh_state(bh); | ||
889 | if (!jh->b_committed_data) { | ||
890 | /* Copy out the current buffer contents into the | ||
891 | * preserved, committed copy. */ | ||
892 | JBUFFER_TRACE(jh, "generate b_committed data"); | ||
893 | if (!committed_data) { | ||
894 | jbd_unlock_bh_state(bh); | ||
895 | goto repeat; | ||
896 | } | ||
897 | |||
898 | jh->b_committed_data = committed_data; | ||
899 | committed_data = NULL; | ||
900 | memcpy(jh->b_committed_data, bh->b_data, bh->b_size); | ||
901 | } | ||
902 | jbd_unlock_bh_state(bh); | ||
903 | out: | ||
904 | journal_put_journal_head(jh); | ||
905 | if (committed_data) | ||
906 | kfree(committed_data); | ||
907 | return err; | ||
908 | } | ||
909 | |||
910 | /** | ||
911 | * int journal_dirty_data() - mark a buffer as containing dirty data which | ||
912 | * needs to be flushed before we can commit the | ||
913 | * current transaction. | ||
914 | * @handle: transaction | ||
915 | * @bh: bufferhead to mark | ||
916 | * | ||
917 | * The buffer is placed on the transaction's data list and is marked as | ||
918 | * belonging to the transaction. | ||
919 | * | ||
920 | * Returns error number or 0 on success. | ||
921 | * | ||
922 | * journal_dirty_data() can be called via page_launder->ext3_writepage | ||
923 | * by kswapd. | ||
924 | */ | ||
925 | int journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
926 | { | ||
927 | journal_t *journal = handle->h_transaction->t_journal; | ||
928 | int need_brelse = 0; | ||
929 | struct journal_head *jh; | ||
930 | |||
931 | if (is_handle_aborted(handle)) | ||
932 | return 0; | ||
933 | |||
934 | jh = journal_add_journal_head(bh); | ||
935 | JBUFFER_TRACE(jh, "entry"); | ||
936 | |||
937 | /* | ||
938 | * The buffer could *already* be dirty. Writeout can start | ||
939 | * at any time. | ||
940 | */ | ||
941 | jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | ||
942 | |||
943 | /* | ||
944 | * What if the buffer is already part of a running transaction? | ||
945 | * | ||
946 | * There are two cases: | ||
947 | * 1) It is part of the current running transaction. Refile it, | ||
948 | * just in case we have allocated it as metadata, deallocated | ||
949 | * it, then reallocated it as data. | ||
950 | * 2) It is part of the previous, still-committing transaction. | ||
951 | * If all we want to do is to guarantee that the buffer will be | ||
952 | * written to disk before this new transaction commits, then | ||
953 | * being sure that the *previous* transaction has this same | ||
954 | * property is sufficient for us! Just leave it on its old | ||
955 | * transaction. | ||
956 | * | ||
957 | * In case (2), the buffer must not already exist as metadata | ||
958 | * --- that would violate write ordering (a transaction is free | ||
959 | * to write its data at any point, even before the previous | ||
960 | * committing transaction has committed). The caller must | ||
961 | * never, ever allow this to happen: there's nothing we can do | ||
962 | * about it in this layer. | ||
963 | */ | ||
964 | jbd_lock_bh_state(bh); | ||
965 | spin_lock(&journal->j_list_lock); | ||
966 | if (jh->b_transaction) { | ||
967 | JBUFFER_TRACE(jh, "has transaction"); | ||
968 | if (jh->b_transaction != handle->h_transaction) { | ||
969 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
970 | J_ASSERT_JH(jh, jh->b_transaction == | ||
971 | journal->j_committing_transaction); | ||
972 | |||
973 | /* @@@ IS THIS TRUE ? */ | ||
974 | /* | ||
975 | * Not any more. Scenario: someone does a write() | ||
976 | * in data=journal mode. The buffer's transaction has | ||
977 | * moved into commit. Then someone does another | ||
978 | * write() to the file. We do the frozen data copyout | ||
979 | * and set b_next_transaction to point to j_running_t. | ||
980 | * And while we're in that state, someone does a | ||
981 | * writepage() in an attempt to pageout the same area | ||
982 | * of the file via a shared mapping. At present that | ||
983 | * calls journal_dirty_data(), and we get right here. | ||
984 | * It may be too late to journal the data. Simply | ||
985 | * falling through to the next test will suffice: the | ||
986 | * data will be dirty and wil be checkpointed. The | ||
987 | * ordering comments in the next comment block still | ||
988 | * apply. | ||
989 | */ | ||
990 | //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
991 | |||
992 | /* | ||
993 | * If we're journalling data, and this buffer was | ||
994 | * subject to a write(), it could be metadata, forget | ||
995 | * or shadow against the committing transaction. Now, | ||
996 | * someone has dirtied the same darn page via a mapping | ||
997 | * and it is being writepage()'d. | ||
998 | * We *could* just steal the page from commit, with some | ||
999 | * fancy locking there. Instead, we just skip it - | ||
1000 | * don't tie the page's buffers to the new transaction | ||
1001 | * at all. | ||
1002 | * Implication: if we crash before the writepage() data | ||
1003 | * is written into the filesystem, recovery will replay | ||
1004 | * the write() data. | ||
1005 | */ | ||
1006 | if (jh->b_jlist != BJ_None && | ||
1007 | jh->b_jlist != BJ_SyncData && | ||
1008 | jh->b_jlist != BJ_Locked) { | ||
1009 | JBUFFER_TRACE(jh, "Not stealing"); | ||
1010 | goto no_journal; | ||
1011 | } | ||
1012 | |||
1013 | /* | ||
1014 | * This buffer may be undergoing writeout in commit. We | ||
1015 | * can't return from here and let the caller dirty it | ||
1016 | * again because that can cause the write-out loop in | ||
1017 | * commit to never terminate. | ||
1018 | */ | ||
1019 | if (buffer_dirty(bh)) { | ||
1020 | get_bh(bh); | ||
1021 | spin_unlock(&journal->j_list_lock); | ||
1022 | jbd_unlock_bh_state(bh); | ||
1023 | need_brelse = 1; | ||
1024 | sync_dirty_buffer(bh); | ||
1025 | jbd_lock_bh_state(bh); | ||
1026 | spin_lock(&journal->j_list_lock); | ||
1027 | /* The buffer may become locked again at any | ||
1028 | time if it is redirtied */ | ||
1029 | } | ||
1030 | |||
1031 | /* journal_clean_data_list() may have got there first */ | ||
1032 | if (jh->b_transaction != NULL) { | ||
1033 | JBUFFER_TRACE(jh, "unfile from commit"); | ||
1034 | __journal_temp_unlink_buffer(jh); | ||
1035 | /* It still points to the committing | ||
1036 | * transaction; move it to this one so | ||
1037 | * that the refile assert checks are | ||
1038 | * happy. */ | ||
1039 | jh->b_transaction = handle->h_transaction; | ||
1040 | } | ||
1041 | /* The buffer will be refiled below */ | ||
1042 | |||
1043 | } | ||
1044 | /* | ||
1045 | * Special case --- the buffer might actually have been | ||
1046 | * allocated and then immediately deallocated in the previous, | ||
1047 | * committing transaction, so might still be left on that | ||
1048 | * transaction's metadata lists. | ||
1049 | */ | ||
1050 | if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | ||
1051 | JBUFFER_TRACE(jh, "not on correct data list: unfile"); | ||
1052 | J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | ||
1053 | __journal_temp_unlink_buffer(jh); | ||
1054 | jh->b_transaction = handle->h_transaction; | ||
1055 | JBUFFER_TRACE(jh, "file as data"); | ||
1056 | __journal_file_buffer(jh, handle->h_transaction, | ||
1057 | BJ_SyncData); | ||
1058 | } | ||
1059 | } else { | ||
1060 | JBUFFER_TRACE(jh, "not on a transaction"); | ||
1061 | __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | ||
1062 | } | ||
1063 | no_journal: | ||
1064 | spin_unlock(&journal->j_list_lock); | ||
1065 | jbd_unlock_bh_state(bh); | ||
1066 | if (need_brelse) { | ||
1067 | BUFFER_TRACE(bh, "brelse"); | ||
1068 | __brelse(bh); | ||
1069 | } | ||
1070 | JBUFFER_TRACE(jh, "exit"); | ||
1071 | journal_put_journal_head(jh); | ||
1072 | return 0; | ||
1073 | } | ||
1074 | |||
1075 | /** | ||
1076 | * int journal_dirty_metadata() - mark a buffer as containing dirty metadata | ||
1077 | * @handle: transaction to add buffer to. | ||
1078 | * @bh: buffer to mark | ||
1079 | * | ||
1080 | * mark dirty metadata which needs to be journaled as part of the current | ||
1081 | * transaction. | ||
1082 | * | ||
1083 | * The buffer is placed on the transaction's metadata list and is marked | ||
1084 | * as belonging to the transaction. | ||
1085 | * | ||
1086 | * Returns error number or 0 on success. | ||
1087 | * | ||
1088 | * Special care needs to be taken if the buffer already belongs to the | ||
1089 | * current committing transaction (in which case we should have frozen | ||
1090 | * data present for that commit). In that case, we don't relink the | ||
1091 | * buffer: that only gets done when the old transaction finally | ||
1092 | * completes its commit. | ||
1093 | */ | ||
1094 | int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | ||
1095 | { | ||
1096 | transaction_t *transaction = handle->h_transaction; | ||
1097 | journal_t *journal = transaction->t_journal; | ||
1098 | struct journal_head *jh = bh2jh(bh); | ||
1099 | |||
1100 | jbd_debug(5, "journal_head %p\n", jh); | ||
1101 | JBUFFER_TRACE(jh, "entry"); | ||
1102 | if (is_handle_aborted(handle)) | ||
1103 | goto out; | ||
1104 | |||
1105 | jbd_lock_bh_state(bh); | ||
1106 | |||
1107 | if (jh->b_modified == 0) { | ||
1108 | /* | ||
1109 | * This buffer's got modified and becoming part | ||
1110 | * of the transaction. This needs to be done | ||
1111 | * once a transaction -bzzz | ||
1112 | */ | ||
1113 | jh->b_modified = 1; | ||
1114 | J_ASSERT_JH(jh, handle->h_buffer_credits > 0); | ||
1115 | handle->h_buffer_credits--; | ||
1116 | } | ||
1117 | |||
1118 | /* | ||
1119 | * fastpath, to avoid expensive locking. If this buffer is already | ||
1120 | * on the running transaction's metadata list there is nothing to do. | ||
1121 | * Nobody can take it off again because there is a handle open. | ||
1122 | * I _think_ we're OK here with SMP barriers - a mistaken decision will | ||
1123 | * result in this test being false, so we go in and take the locks. | ||
1124 | */ | ||
1125 | if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { | ||
1126 | JBUFFER_TRACE(jh, "fastpath"); | ||
1127 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1128 | journal->j_running_transaction); | ||
1129 | goto out_unlock_bh; | ||
1130 | } | ||
1131 | |||
1132 | set_buffer_jbddirty(bh); | ||
1133 | |||
1134 | /* | ||
1135 | * Metadata already on the current transaction list doesn't | ||
1136 | * need to be filed. Metadata on another transaction's list must | ||
1137 | * be committing, and will be refiled once the commit completes: | ||
1138 | * leave it alone for now. | ||
1139 | */ | ||
1140 | if (jh->b_transaction != transaction) { | ||
1141 | JBUFFER_TRACE(jh, "already on other transaction"); | ||
1142 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1143 | journal->j_committing_transaction); | ||
1144 | J_ASSERT_JH(jh, jh->b_next_transaction == transaction); | ||
1145 | /* And this case is illegal: we can't reuse another | ||
1146 | * transaction's data buffer, ever. */ | ||
1147 | goto out_unlock_bh; | ||
1148 | } | ||
1149 | |||
1150 | /* That test should have eliminated the following case: */ | ||
1151 | J_ASSERT_JH(jh, jh->b_frozen_data == 0); | ||
1152 | |||
1153 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); | ||
1154 | spin_lock(&journal->j_list_lock); | ||
1155 | __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); | ||
1156 | spin_unlock(&journal->j_list_lock); | ||
1157 | out_unlock_bh: | ||
1158 | jbd_unlock_bh_state(bh); | ||
1159 | out: | ||
1160 | JBUFFER_TRACE(jh, "exit"); | ||
1161 | return 0; | ||
1162 | } | ||
1163 | |||
1164 | /* | ||
1165 | * journal_release_buffer: undo a get_write_access without any buffer | ||
1166 | * updates, if the update decided in the end that it didn't need access. | ||
1167 | * | ||
1168 | */ | ||
1169 | void | ||
1170 | journal_release_buffer(handle_t *handle, struct buffer_head *bh) | ||
1171 | { | ||
1172 | BUFFER_TRACE(bh, "entry"); | ||
1173 | } | ||
1174 | |||
1175 | /** | ||
1176 | * void journal_forget() - bforget() for potentially-journaled buffers. | ||
1177 | * @handle: transaction handle | ||
1178 | * @bh: bh to 'forget' | ||
1179 | * | ||
1180 | * We can only do the bforget if there are no commits pending against the | ||
1181 | * buffer. If the buffer is dirty in the current running transaction we | ||
1182 | * can safely unlink it. | ||
1183 | * | ||
1184 | * bh may not be a journalled buffer at all - it may be a non-JBD | ||
1185 | * buffer which came off the hashtable. Check for this. | ||
1186 | * | ||
1187 | * Decrements bh->b_count by one. | ||
1188 | * | ||
1189 | * Allow this call even if the handle has aborted --- it may be part of | ||
1190 | * the caller's cleanup after an abort. | ||
1191 | */ | ||
1192 | int journal_forget (handle_t *handle, struct buffer_head *bh) | ||
1193 | { | ||
1194 | transaction_t *transaction = handle->h_transaction; | ||
1195 | journal_t *journal = transaction->t_journal; | ||
1196 | struct journal_head *jh; | ||
1197 | int drop_reserve = 0; | ||
1198 | int err = 0; | ||
1199 | |||
1200 | BUFFER_TRACE(bh, "entry"); | ||
1201 | |||
1202 | jbd_lock_bh_state(bh); | ||
1203 | spin_lock(&journal->j_list_lock); | ||
1204 | |||
1205 | if (!buffer_jbd(bh)) | ||
1206 | goto not_jbd; | ||
1207 | jh = bh2jh(bh); | ||
1208 | |||
1209 | /* Critical error: attempting to delete a bitmap buffer, maybe? | ||
1210 | * Don't do any jbd operations, and return an error. */ | ||
1211 | if (!J_EXPECT_JH(jh, !jh->b_committed_data, | ||
1212 | "inconsistent data on disk")) { | ||
1213 | err = -EIO; | ||
1214 | goto not_jbd; | ||
1215 | } | ||
1216 | |||
1217 | /* | ||
1218 | * The buffer's going from the transaction, we must drop | ||
1219 | * all references -bzzz | ||
1220 | */ | ||
1221 | jh->b_modified = 0; | ||
1222 | |||
1223 | if (jh->b_transaction == handle->h_transaction) { | ||
1224 | J_ASSERT_JH(jh, !jh->b_frozen_data); | ||
1225 | |||
1226 | /* If we are forgetting a buffer which is already part | ||
1227 | * of this transaction, then we can just drop it from | ||
1228 | * the transaction immediately. */ | ||
1229 | clear_buffer_dirty(bh); | ||
1230 | clear_buffer_jbddirty(bh); | ||
1231 | |||
1232 | JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); | ||
1233 | |||
1234 | drop_reserve = 1; | ||
1235 | |||
1236 | /* | ||
1237 | * We are no longer going to journal this buffer. | ||
1238 | * However, the commit of this transaction is still | ||
1239 | * important to the buffer: the delete that we are now | ||
1240 | * processing might obsolete an old log entry, so by | ||
1241 | * committing, we can satisfy the buffer's checkpoint. | ||
1242 | * | ||
1243 | * So, if we have a checkpoint on the buffer, we should | ||
1244 | * now refile the buffer on our BJ_Forget list so that | ||
1245 | * we know to remove the checkpoint after we commit. | ||
1246 | */ | ||
1247 | |||
1248 | if (jh->b_cp_transaction) { | ||
1249 | __journal_temp_unlink_buffer(jh); | ||
1250 | __journal_file_buffer(jh, transaction, BJ_Forget); | ||
1251 | } else { | ||
1252 | __journal_unfile_buffer(jh); | ||
1253 | journal_remove_journal_head(bh); | ||
1254 | __brelse(bh); | ||
1255 | if (!buffer_jbd(bh)) { | ||
1256 | spin_unlock(&journal->j_list_lock); | ||
1257 | jbd_unlock_bh_state(bh); | ||
1258 | __bforget(bh); | ||
1259 | goto drop; | ||
1260 | } | ||
1261 | } | ||
1262 | } else if (jh->b_transaction) { | ||
1263 | J_ASSERT_JH(jh, (jh->b_transaction == | ||
1264 | journal->j_committing_transaction)); | ||
1265 | /* However, if the buffer is still owned by a prior | ||
1266 | * (committing) transaction, we can't drop it yet... */ | ||
1267 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
1268 | /* ... but we CAN drop it from the new transaction if we | ||
1269 | * have also modified it since the original commit. */ | ||
1270 | |||
1271 | if (jh->b_next_transaction) { | ||
1272 | J_ASSERT(jh->b_next_transaction == transaction); | ||
1273 | jh->b_next_transaction = NULL; | ||
1274 | drop_reserve = 1; | ||
1275 | } | ||
1276 | } | ||
1277 | |||
1278 | not_jbd: | ||
1279 | spin_unlock(&journal->j_list_lock); | ||
1280 | jbd_unlock_bh_state(bh); | ||
1281 | __brelse(bh); | ||
1282 | drop: | ||
1283 | if (drop_reserve) { | ||
1284 | /* no need to reserve log space for this block -bzzz */ | ||
1285 | handle->h_buffer_credits++; | ||
1286 | } | ||
1287 | return err; | ||
1288 | } | ||
1289 | |||
1290 | /** | ||
1291 | * int journal_stop() - complete a transaction | ||
1292 | * @handle: tranaction to complete. | ||
1293 | * | ||
1294 | * All done for a particular handle. | ||
1295 | * | ||
1296 | * There is not much action needed here. We just return any remaining | ||
1297 | * buffer credits to the transaction and remove the handle. The only | ||
1298 | * complication is that we need to start a commit operation if the | ||
1299 | * filesystem is marked for synchronous update. | ||
1300 | * | ||
1301 | * journal_stop itself will not usually return an error, but it may | ||
1302 | * do so in unusual circumstances. In particular, expect it to | ||
1303 | * return -EIO if a journal_abort has been executed since the | ||
1304 | * transaction began. | ||
1305 | */ | ||
1306 | int journal_stop(handle_t *handle) | ||
1307 | { | ||
1308 | transaction_t *transaction = handle->h_transaction; | ||
1309 | journal_t *journal = transaction->t_journal; | ||
1310 | int old_handle_count, err; | ||
1311 | |||
1312 | J_ASSERT(transaction->t_updates > 0); | ||
1313 | J_ASSERT(journal_current_handle() == handle); | ||
1314 | |||
1315 | if (is_handle_aborted(handle)) | ||
1316 | err = -EIO; | ||
1317 | else | ||
1318 | err = 0; | ||
1319 | |||
1320 | if (--handle->h_ref > 0) { | ||
1321 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, | ||
1322 | handle->h_ref); | ||
1323 | return err; | ||
1324 | } | ||
1325 | |||
1326 | jbd_debug(4, "Handle %p going down\n", handle); | ||
1327 | |||
1328 | /* | ||
1329 | * Implement synchronous transaction batching. If the handle | ||
1330 | * was synchronous, don't force a commit immediately. Let's | ||
1331 | * yield and let another thread piggyback onto this transaction. | ||
1332 | * Keep doing that while new threads continue to arrive. | ||
1333 | * It doesn't cost much - we're about to run a commit and sleep | ||
1334 | * on IO anyway. Speeds up many-threaded, many-dir operations | ||
1335 | * by 30x or more... | ||
1336 | */ | ||
1337 | if (handle->h_sync) { | ||
1338 | do { | ||
1339 | old_handle_count = transaction->t_handle_count; | ||
1340 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1341 | schedule_timeout(1); | ||
1342 | } while (old_handle_count != transaction->t_handle_count); | ||
1343 | } | ||
1344 | |||
1345 | current->journal_info = NULL; | ||
1346 | spin_lock(&journal->j_state_lock); | ||
1347 | spin_lock(&transaction->t_handle_lock); | ||
1348 | transaction->t_outstanding_credits -= handle->h_buffer_credits; | ||
1349 | transaction->t_updates--; | ||
1350 | if (!transaction->t_updates) { | ||
1351 | wake_up(&journal->j_wait_updates); | ||
1352 | if (journal->j_barrier_count) | ||
1353 | wake_up(&journal->j_wait_transaction_locked); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * If the handle is marked SYNC, we need to set another commit | ||
1358 | * going! We also want to force a commit if the current | ||
1359 | * transaction is occupying too much of the log, or if the | ||
1360 | * transaction is too old now. | ||
1361 | */ | ||
1362 | if (handle->h_sync || | ||
1363 | transaction->t_outstanding_credits > | ||
1364 | journal->j_max_transaction_buffers || | ||
1365 | time_after_eq(jiffies, transaction->t_expires)) { | ||
1366 | /* Do this even for aborted journals: an abort still | ||
1367 | * completes the commit thread, it just doesn't write | ||
1368 | * anything to disk. */ | ||
1369 | tid_t tid = transaction->t_tid; | ||
1370 | |||
1371 | spin_unlock(&transaction->t_handle_lock); | ||
1372 | jbd_debug(2, "transaction too old, requesting commit for " | ||
1373 | "handle %p\n", handle); | ||
1374 | /* This is non-blocking */ | ||
1375 | __log_start_commit(journal, transaction->t_tid); | ||
1376 | spin_unlock(&journal->j_state_lock); | ||
1377 | |||
1378 | /* | ||
1379 | * Special case: JFS_SYNC synchronous updates require us | ||
1380 | * to wait for the commit to complete. | ||
1381 | */ | ||
1382 | if (handle->h_sync && !(current->flags & PF_MEMALLOC)) | ||
1383 | err = log_wait_commit(journal, tid); | ||
1384 | } else { | ||
1385 | spin_unlock(&transaction->t_handle_lock); | ||
1386 | spin_unlock(&journal->j_state_lock); | ||
1387 | } | ||
1388 | |||
1389 | jbd_free_handle(handle); | ||
1390 | return err; | ||
1391 | } | ||
1392 | |||
1393 | /**int journal_force_commit() - force any uncommitted transactions | ||
1394 | * @journal: journal to force | ||
1395 | * | ||
1396 | * For synchronous operations: force any uncommitted transactions | ||
1397 | * to disk. May seem kludgy, but it reuses all the handle batching | ||
1398 | * code in a very simple manner. | ||
1399 | */ | ||
1400 | int journal_force_commit(journal_t *journal) | ||
1401 | { | ||
1402 | handle_t *handle; | ||
1403 | int ret; | ||
1404 | |||
1405 | handle = journal_start(journal, 1); | ||
1406 | if (IS_ERR(handle)) { | ||
1407 | ret = PTR_ERR(handle); | ||
1408 | } else { | ||
1409 | handle->h_sync = 1; | ||
1410 | ret = journal_stop(handle); | ||
1411 | } | ||
1412 | return ret; | ||
1413 | } | ||
1414 | |||
1415 | /* | ||
1416 | * | ||
1417 | * List management code snippets: various functions for manipulating the | ||
1418 | * transaction buffer lists. | ||
1419 | * | ||
1420 | */ | ||
1421 | |||
1422 | /* | ||
1423 | * Append a buffer to a transaction list, given the transaction's list head | ||
1424 | * pointer. | ||
1425 | * | ||
1426 | * j_list_lock is held. | ||
1427 | * | ||
1428 | * jbd_lock_bh_state(jh2bh(jh)) is held. | ||
1429 | */ | ||
1430 | |||
1431 | static inline void | ||
1432 | __blist_add_buffer(struct journal_head **list, struct journal_head *jh) | ||
1433 | { | ||
1434 | if (!*list) { | ||
1435 | jh->b_tnext = jh->b_tprev = jh; | ||
1436 | *list = jh; | ||
1437 | } else { | ||
1438 | /* Insert at the tail of the list to preserve order */ | ||
1439 | struct journal_head *first = *list, *last = first->b_tprev; | ||
1440 | jh->b_tprev = last; | ||
1441 | jh->b_tnext = first; | ||
1442 | last->b_tnext = first->b_tprev = jh; | ||
1443 | } | ||
1444 | } | ||
1445 | |||
1446 | /* | ||
1447 | * Remove a buffer from a transaction list, given the transaction's list | ||
1448 | * head pointer. | ||
1449 | * | ||
1450 | * Called with j_list_lock held, and the journal may not be locked. | ||
1451 | * | ||
1452 | * jbd_lock_bh_state(jh2bh(jh)) is held. | ||
1453 | */ | ||
1454 | |||
1455 | static inline void | ||
1456 | __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | ||
1457 | { | ||
1458 | if (*list == jh) { | ||
1459 | *list = jh->b_tnext; | ||
1460 | if (*list == jh) | ||
1461 | *list = NULL; | ||
1462 | } | ||
1463 | jh->b_tprev->b_tnext = jh->b_tnext; | ||
1464 | jh->b_tnext->b_tprev = jh->b_tprev; | ||
1465 | } | ||
1466 | |||
1467 | /* | ||
1468 | * Remove a buffer from the appropriate transaction list. | ||
1469 | * | ||
1470 | * Note that this function can *change* the value of | ||
1471 | * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | ||
1472 | * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | ||
1473 | * is holding onto a copy of one of thee pointers, it could go bad. | ||
1474 | * Generally the caller needs to re-read the pointer from the transaction_t. | ||
1475 | * | ||
1476 | * Called under j_list_lock. The journal may not be locked. | ||
1477 | */ | ||
1478 | void __journal_temp_unlink_buffer(struct journal_head *jh) | ||
1479 | { | ||
1480 | struct journal_head **list = NULL; | ||
1481 | transaction_t *transaction; | ||
1482 | struct buffer_head *bh = jh2bh(jh); | ||
1483 | |||
1484 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
1485 | transaction = jh->b_transaction; | ||
1486 | if (transaction) | ||
1487 | assert_spin_locked(&transaction->t_journal->j_list_lock); | ||
1488 | |||
1489 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | ||
1490 | if (jh->b_jlist != BJ_None) | ||
1491 | J_ASSERT_JH(jh, transaction != 0); | ||
1492 | |||
1493 | switch (jh->b_jlist) { | ||
1494 | case BJ_None: | ||
1495 | return; | ||
1496 | case BJ_SyncData: | ||
1497 | list = &transaction->t_sync_datalist; | ||
1498 | break; | ||
1499 | case BJ_Metadata: | ||
1500 | transaction->t_nr_buffers--; | ||
1501 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | ||
1502 | list = &transaction->t_buffers; | ||
1503 | break; | ||
1504 | case BJ_Forget: | ||
1505 | list = &transaction->t_forget; | ||
1506 | break; | ||
1507 | case BJ_IO: | ||
1508 | list = &transaction->t_iobuf_list; | ||
1509 | break; | ||
1510 | case BJ_Shadow: | ||
1511 | list = &transaction->t_shadow_list; | ||
1512 | break; | ||
1513 | case BJ_LogCtl: | ||
1514 | list = &transaction->t_log_list; | ||
1515 | break; | ||
1516 | case BJ_Reserved: | ||
1517 | list = &transaction->t_reserved_list; | ||
1518 | break; | ||
1519 | case BJ_Locked: | ||
1520 | list = &transaction->t_locked_list; | ||
1521 | break; | ||
1522 | } | ||
1523 | |||
1524 | __blist_del_buffer(list, jh); | ||
1525 | jh->b_jlist = BJ_None; | ||
1526 | if (test_clear_buffer_jbddirty(bh)) | ||
1527 | mark_buffer_dirty(bh); /* Expose it to the VM */ | ||
1528 | } | ||
1529 | |||
1530 | void __journal_unfile_buffer(struct journal_head *jh) | ||
1531 | { | ||
1532 | __journal_temp_unlink_buffer(jh); | ||
1533 | jh->b_transaction = NULL; | ||
1534 | } | ||
1535 | |||
1536 | void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) | ||
1537 | { | ||
1538 | jbd_lock_bh_state(jh2bh(jh)); | ||
1539 | spin_lock(&journal->j_list_lock); | ||
1540 | __journal_unfile_buffer(jh); | ||
1541 | spin_unlock(&journal->j_list_lock); | ||
1542 | jbd_unlock_bh_state(jh2bh(jh)); | ||
1543 | } | ||
1544 | |||
1545 | /* | ||
1546 | * Called from journal_try_to_free_buffers(). | ||
1547 | * | ||
1548 | * Called under jbd_lock_bh_state(bh) | ||
1549 | */ | ||
1550 | static void | ||
1551 | __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | ||
1552 | { | ||
1553 | struct journal_head *jh; | ||
1554 | |||
1555 | jh = bh2jh(bh); | ||
1556 | |||
1557 | if (buffer_locked(bh) || buffer_dirty(bh)) | ||
1558 | goto out; | ||
1559 | |||
1560 | if (jh->b_next_transaction != 0) | ||
1561 | goto out; | ||
1562 | |||
1563 | spin_lock(&journal->j_list_lock); | ||
1564 | if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { | ||
1565 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | ||
1566 | /* A written-back ordered data buffer */ | ||
1567 | JBUFFER_TRACE(jh, "release data"); | ||
1568 | __journal_unfile_buffer(jh); | ||
1569 | journal_remove_journal_head(bh); | ||
1570 | __brelse(bh); | ||
1571 | } | ||
1572 | } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { | ||
1573 | /* written-back checkpointed metadata buffer */ | ||
1574 | if (jh->b_jlist == BJ_None) { | ||
1575 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | ||
1576 | __journal_remove_checkpoint(jh); | ||
1577 | journal_remove_journal_head(bh); | ||
1578 | __brelse(bh); | ||
1579 | } | ||
1580 | } | ||
1581 | spin_unlock(&journal->j_list_lock); | ||
1582 | out: | ||
1583 | return; | ||
1584 | } | ||
1585 | |||
1586 | |||
1587 | /** | ||
1588 | * int journal_try_to_free_buffers() - try to free page buffers. | ||
1589 | * @journal: journal for operation | ||
1590 | * @page: to try and free | ||
1591 | * @unused_gfp_mask: unused | ||
1592 | * | ||
1593 | * | ||
1594 | * For all the buffers on this page, | ||
1595 | * if they are fully written out ordered data, move them onto BUF_CLEAN | ||
1596 | * so try_to_free_buffers() can reap them. | ||
1597 | * | ||
1598 | * This function returns non-zero if we wish try_to_free_buffers() | ||
1599 | * to be called. We do this if the page is releasable by try_to_free_buffers(). | ||
1600 | * We also do it if the page has locked or dirty buffers and the caller wants | ||
1601 | * us to perform sync or async writeout. | ||
1602 | * | ||
1603 | * This complicates JBD locking somewhat. We aren't protected by the | ||
1604 | * BKL here. We wish to remove the buffer from its committing or | ||
1605 | * running transaction's ->t_datalist via __journal_unfile_buffer. | ||
1606 | * | ||
1607 | * This may *change* the value of transaction_t->t_datalist, so anyone | ||
1608 | * who looks at t_datalist needs to lock against this function. | ||
1609 | * | ||
1610 | * Even worse, someone may be doing a journal_dirty_data on this | ||
1611 | * buffer. So we need to lock against that. journal_dirty_data() | ||
1612 | * will come out of the lock with the buffer dirty, which makes it | ||
1613 | * ineligible for release here. | ||
1614 | * | ||
1615 | * Who else is affected by this? hmm... Really the only contender | ||
1616 | * is do_get_write_access() - it could be looking at the buffer while | ||
1617 | * journal_try_to_free_buffer() is changing its state. But that | ||
1618 | * cannot happen because we never reallocate freed data as metadata | ||
1619 | * while the data is part of a transaction. Yes? | ||
1620 | */ | ||
1621 | int journal_try_to_free_buffers(journal_t *journal, | ||
1622 | struct page *page, int unused_gfp_mask) | ||
1623 | { | ||
1624 | struct buffer_head *head; | ||
1625 | struct buffer_head *bh; | ||
1626 | int ret = 0; | ||
1627 | |||
1628 | J_ASSERT(PageLocked(page)); | ||
1629 | |||
1630 | head = page_buffers(page); | ||
1631 | bh = head; | ||
1632 | do { | ||
1633 | struct journal_head *jh; | ||
1634 | |||
1635 | /* | ||
1636 | * We take our own ref against the journal_head here to avoid | ||
1637 | * having to add tons of locking around each instance of | ||
1638 | * journal_remove_journal_head() and journal_put_journal_head(). | ||
1639 | */ | ||
1640 | jh = journal_grab_journal_head(bh); | ||
1641 | if (!jh) | ||
1642 | continue; | ||
1643 | |||
1644 | jbd_lock_bh_state(bh); | ||
1645 | __journal_try_to_free_buffer(journal, bh); | ||
1646 | journal_put_journal_head(jh); | ||
1647 | jbd_unlock_bh_state(bh); | ||
1648 | if (buffer_jbd(bh)) | ||
1649 | goto busy; | ||
1650 | } while ((bh = bh->b_this_page) != head); | ||
1651 | ret = try_to_free_buffers(page); | ||
1652 | busy: | ||
1653 | return ret; | ||
1654 | } | ||
1655 | |||
1656 | /* | ||
1657 | * This buffer is no longer needed. If it is on an older transaction's | ||
1658 | * checkpoint list we need to record it on this transaction's forget list | ||
1659 | * to pin this buffer (and hence its checkpointing transaction) down until | ||
1660 | * this transaction commits. If the buffer isn't on a checkpoint list, we | ||
1661 | * release it. | ||
1662 | * Returns non-zero if JBD no longer has an interest in the buffer. | ||
1663 | * | ||
1664 | * Called under j_list_lock. | ||
1665 | * | ||
1666 | * Called under jbd_lock_bh_state(bh). | ||
1667 | */ | ||
1668 | static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) | ||
1669 | { | ||
1670 | int may_free = 1; | ||
1671 | struct buffer_head *bh = jh2bh(jh); | ||
1672 | |||
1673 | __journal_unfile_buffer(jh); | ||
1674 | |||
1675 | if (jh->b_cp_transaction) { | ||
1676 | JBUFFER_TRACE(jh, "on running+cp transaction"); | ||
1677 | __journal_file_buffer(jh, transaction, BJ_Forget); | ||
1678 | clear_buffer_jbddirty(bh); | ||
1679 | may_free = 0; | ||
1680 | } else { | ||
1681 | JBUFFER_TRACE(jh, "on running transaction"); | ||
1682 | journal_remove_journal_head(bh); | ||
1683 | __brelse(bh); | ||
1684 | } | ||
1685 | return may_free; | ||
1686 | } | ||
1687 | |||
1688 | /* | ||
1689 | * journal_invalidatepage | ||
1690 | * | ||
1691 | * This code is tricky. It has a number of cases to deal with. | ||
1692 | * | ||
1693 | * There are two invariants which this code relies on: | ||
1694 | * | ||
1695 | * i_size must be updated on disk before we start calling invalidatepage on the | ||
1696 | * data. | ||
1697 | * | ||
1698 | * This is done in ext3 by defining an ext3_setattr method which | ||
1699 | * updates i_size before truncate gets going. By maintaining this | ||
1700 | * invariant, we can be sure that it is safe to throw away any buffers | ||
1701 | * attached to the current transaction: once the transaction commits, | ||
1702 | * we know that the data will not be needed. | ||
1703 | * | ||
1704 | * Note however that we can *not* throw away data belonging to the | ||
1705 | * previous, committing transaction! | ||
1706 | * | ||
1707 | * Any disk blocks which *are* part of the previous, committing | ||
1708 | * transaction (and which therefore cannot be discarded immediately) are | ||
1709 | * not going to be reused in the new running transaction | ||
1710 | * | ||
1711 | * The bitmap committed_data images guarantee this: any block which is | ||
1712 | * allocated in one transaction and removed in the next will be marked | ||
1713 | * as in-use in the committed_data bitmap, so cannot be reused until | ||
1714 | * the next transaction to delete the block commits. This means that | ||
1715 | * leaving committing buffers dirty is quite safe: the disk blocks | ||
1716 | * cannot be reallocated to a different file and so buffer aliasing is | ||
1717 | * not possible. | ||
1718 | * | ||
1719 | * | ||
1720 | * The above applies mainly to ordered data mode. In writeback mode we | ||
1721 | * don't make guarantees about the order in which data hits disk --- in | ||
1722 | * particular we don't guarantee that new dirty data is flushed before | ||
1723 | * transaction commit --- so it is always safe just to discard data | ||
1724 | * immediately in that mode. --sct | ||
1725 | */ | ||
1726 | |||
1727 | /* | ||
1728 | * The journal_unmap_buffer helper function returns zero if the buffer | ||
1729 | * concerned remains pinned as an anonymous buffer belonging to an older | ||
1730 | * transaction. | ||
1731 | * | ||
1732 | * We're outside-transaction here. Either or both of j_running_transaction | ||
1733 | * and j_committing_transaction may be NULL. | ||
1734 | */ | ||
1735 | static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | ||
1736 | { | ||
1737 | transaction_t *transaction; | ||
1738 | struct journal_head *jh; | ||
1739 | int may_free = 1; | ||
1740 | int ret; | ||
1741 | |||
1742 | BUFFER_TRACE(bh, "entry"); | ||
1743 | |||
1744 | /* | ||
1745 | * It is safe to proceed here without the j_list_lock because the | ||
1746 | * buffers cannot be stolen by try_to_free_buffers as long as we are | ||
1747 | * holding the page lock. --sct | ||
1748 | */ | ||
1749 | |||
1750 | if (!buffer_jbd(bh)) | ||
1751 | goto zap_buffer_unlocked; | ||
1752 | |||
1753 | spin_lock(&journal->j_state_lock); | ||
1754 | jbd_lock_bh_state(bh); | ||
1755 | spin_lock(&journal->j_list_lock); | ||
1756 | |||
1757 | jh = journal_grab_journal_head(bh); | ||
1758 | if (!jh) | ||
1759 | goto zap_buffer_no_jh; | ||
1760 | |||
1761 | transaction = jh->b_transaction; | ||
1762 | if (transaction == NULL) { | ||
1763 | /* First case: not on any transaction. If it | ||
1764 | * has no checkpoint link, then we can zap it: | ||
1765 | * it's a writeback-mode buffer so we don't care | ||
1766 | * if it hits disk safely. */ | ||
1767 | if (!jh->b_cp_transaction) { | ||
1768 | JBUFFER_TRACE(jh, "not on any transaction: zap"); | ||
1769 | goto zap_buffer; | ||
1770 | } | ||
1771 | |||
1772 | if (!buffer_dirty(bh)) { | ||
1773 | /* bdflush has written it. We can drop it now */ | ||
1774 | goto zap_buffer; | ||
1775 | } | ||
1776 | |||
1777 | /* OK, it must be in the journal but still not | ||
1778 | * written fully to disk: it's metadata or | ||
1779 | * journaled data... */ | ||
1780 | |||
1781 | if (journal->j_running_transaction) { | ||
1782 | /* ... and once the current transaction has | ||
1783 | * committed, the buffer won't be needed any | ||
1784 | * longer. */ | ||
1785 | JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); | ||
1786 | ret = __dispose_buffer(jh, | ||
1787 | journal->j_running_transaction); | ||
1788 | journal_put_journal_head(jh); | ||
1789 | spin_unlock(&journal->j_list_lock); | ||
1790 | jbd_unlock_bh_state(bh); | ||
1791 | spin_unlock(&journal->j_state_lock); | ||
1792 | return ret; | ||
1793 | } else { | ||
1794 | /* There is no currently-running transaction. So the | ||
1795 | * orphan record which we wrote for this file must have | ||
1796 | * passed into commit. We must attach this buffer to | ||
1797 | * the committing transaction, if it exists. */ | ||
1798 | if (journal->j_committing_transaction) { | ||
1799 | JBUFFER_TRACE(jh, "give to committing trans"); | ||
1800 | ret = __dispose_buffer(jh, | ||
1801 | journal->j_committing_transaction); | ||
1802 | journal_put_journal_head(jh); | ||
1803 | spin_unlock(&journal->j_list_lock); | ||
1804 | jbd_unlock_bh_state(bh); | ||
1805 | spin_unlock(&journal->j_state_lock); | ||
1806 | return ret; | ||
1807 | } else { | ||
1808 | /* The orphan record's transaction has | ||
1809 | * committed. We can cleanse this buffer */ | ||
1810 | clear_buffer_jbddirty(bh); | ||
1811 | goto zap_buffer; | ||
1812 | } | ||
1813 | } | ||
1814 | } else if (transaction == journal->j_committing_transaction) { | ||
1815 | /* If it is committing, we simply cannot touch it. We | ||
1816 | * can remove it's next_transaction pointer from the | ||
1817 | * running transaction if that is set, but nothing | ||
1818 | * else. */ | ||
1819 | JBUFFER_TRACE(jh, "on committing transaction"); | ||
1820 | set_buffer_freed(bh); | ||
1821 | if (jh->b_next_transaction) { | ||
1822 | J_ASSERT(jh->b_next_transaction == | ||
1823 | journal->j_running_transaction); | ||
1824 | jh->b_next_transaction = NULL; | ||
1825 | } | ||
1826 | journal_put_journal_head(jh); | ||
1827 | spin_unlock(&journal->j_list_lock); | ||
1828 | jbd_unlock_bh_state(bh); | ||
1829 | spin_unlock(&journal->j_state_lock); | ||
1830 | return 0; | ||
1831 | } else { | ||
1832 | /* Good, the buffer belongs to the running transaction. | ||
1833 | * We are writing our own transaction's data, not any | ||
1834 | * previous one's, so it is safe to throw it away | ||
1835 | * (remember that we expect the filesystem to have set | ||
1836 | * i_size already for this truncate so recovery will not | ||
1837 | * expose the disk blocks we are discarding here.) */ | ||
1838 | J_ASSERT_JH(jh, transaction == journal->j_running_transaction); | ||
1839 | may_free = __dispose_buffer(jh, transaction); | ||
1840 | } | ||
1841 | |||
1842 | zap_buffer: | ||
1843 | journal_put_journal_head(jh); | ||
1844 | zap_buffer_no_jh: | ||
1845 | spin_unlock(&journal->j_list_lock); | ||
1846 | jbd_unlock_bh_state(bh); | ||
1847 | spin_unlock(&journal->j_state_lock); | ||
1848 | zap_buffer_unlocked: | ||
1849 | clear_buffer_dirty(bh); | ||
1850 | J_ASSERT_BH(bh, !buffer_jbddirty(bh)); | ||
1851 | clear_buffer_mapped(bh); | ||
1852 | clear_buffer_req(bh); | ||
1853 | clear_buffer_new(bh); | ||
1854 | bh->b_bdev = NULL; | ||
1855 | return may_free; | ||
1856 | } | ||
1857 | |||
1858 | /** | ||
1859 | * int journal_invalidatepage() | ||
1860 | * @journal: journal to use for flush... | ||
1861 | * @page: page to flush | ||
1862 | * @offset: length of page to invalidate. | ||
1863 | * | ||
1864 | * Reap page buffers containing data after offset in page. | ||
1865 | * | ||
1866 | * Return non-zero if the page's buffers were successfully reaped. | ||
1867 | */ | ||
1868 | int journal_invalidatepage(journal_t *journal, | ||
1869 | struct page *page, | ||
1870 | unsigned long offset) | ||
1871 | { | ||
1872 | struct buffer_head *head, *bh, *next; | ||
1873 | unsigned int curr_off = 0; | ||
1874 | int may_free = 1; | ||
1875 | |||
1876 | if (!PageLocked(page)) | ||
1877 | BUG(); | ||
1878 | if (!page_has_buffers(page)) | ||
1879 | return 1; | ||
1880 | |||
1881 | /* We will potentially be playing with lists other than just the | ||
1882 | * data lists (especially for journaled data mode), so be | ||
1883 | * cautious in our locking. */ | ||
1884 | |||
1885 | head = bh = page_buffers(page); | ||
1886 | do { | ||
1887 | unsigned int next_off = curr_off + bh->b_size; | ||
1888 | next = bh->b_this_page; | ||
1889 | |||
1890 | /* AKPM: doing lock_buffer here may be overly paranoid */ | ||
1891 | if (offset <= curr_off) { | ||
1892 | /* This block is wholly outside the truncation point */ | ||
1893 | lock_buffer(bh); | ||
1894 | may_free &= journal_unmap_buffer(journal, bh); | ||
1895 | unlock_buffer(bh); | ||
1896 | } | ||
1897 | curr_off = next_off; | ||
1898 | bh = next; | ||
1899 | |||
1900 | } while (bh != head); | ||
1901 | |||
1902 | if (!offset) { | ||
1903 | if (!may_free || !try_to_free_buffers(page)) | ||
1904 | return 0; | ||
1905 | J_ASSERT(!page_has_buffers(page)); | ||
1906 | } | ||
1907 | return 1; | ||
1908 | } | ||
1909 | |||
1910 | /* | ||
1911 | * File a buffer on the given transaction list. | ||
1912 | */ | ||
1913 | void __journal_file_buffer(struct journal_head *jh, | ||
1914 | transaction_t *transaction, int jlist) | ||
1915 | { | ||
1916 | struct journal_head **list = NULL; | ||
1917 | int was_dirty = 0; | ||
1918 | struct buffer_head *bh = jh2bh(jh); | ||
1919 | |||
1920 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
1921 | assert_spin_locked(&transaction->t_journal->j_list_lock); | ||
1922 | |||
1923 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | ||
1924 | J_ASSERT_JH(jh, jh->b_transaction == transaction || | ||
1925 | jh->b_transaction == 0); | ||
1926 | |||
1927 | if (jh->b_transaction && jh->b_jlist == jlist) | ||
1928 | return; | ||
1929 | |||
1930 | /* The following list of buffer states needs to be consistent | ||
1931 | * with __jbd_unexpected_dirty_buffer()'s handling of dirty | ||
1932 | * state. */ | ||
1933 | |||
1934 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | ||
1935 | jlist == BJ_Shadow || jlist == BJ_Forget) { | ||
1936 | if (test_clear_buffer_dirty(bh) || | ||
1937 | test_clear_buffer_jbddirty(bh)) | ||
1938 | was_dirty = 1; | ||
1939 | } | ||
1940 | |||
1941 | if (jh->b_transaction) | ||
1942 | __journal_temp_unlink_buffer(jh); | ||
1943 | jh->b_transaction = transaction; | ||
1944 | |||
1945 | switch (jlist) { | ||
1946 | case BJ_None: | ||
1947 | J_ASSERT_JH(jh, !jh->b_committed_data); | ||
1948 | J_ASSERT_JH(jh, !jh->b_frozen_data); | ||
1949 | return; | ||
1950 | case BJ_SyncData: | ||
1951 | list = &transaction->t_sync_datalist; | ||
1952 | break; | ||
1953 | case BJ_Metadata: | ||
1954 | transaction->t_nr_buffers++; | ||
1955 | list = &transaction->t_buffers; | ||
1956 | break; | ||
1957 | case BJ_Forget: | ||
1958 | list = &transaction->t_forget; | ||
1959 | break; | ||
1960 | case BJ_IO: | ||
1961 | list = &transaction->t_iobuf_list; | ||
1962 | break; | ||
1963 | case BJ_Shadow: | ||
1964 | list = &transaction->t_shadow_list; | ||
1965 | break; | ||
1966 | case BJ_LogCtl: | ||
1967 | list = &transaction->t_log_list; | ||
1968 | break; | ||
1969 | case BJ_Reserved: | ||
1970 | list = &transaction->t_reserved_list; | ||
1971 | break; | ||
1972 | case BJ_Locked: | ||
1973 | list = &transaction->t_locked_list; | ||
1974 | break; | ||
1975 | } | ||
1976 | |||
1977 | __blist_add_buffer(list, jh); | ||
1978 | jh->b_jlist = jlist; | ||
1979 | |||
1980 | if (was_dirty) | ||
1981 | set_buffer_jbddirty(bh); | ||
1982 | } | ||
1983 | |||
1984 | void journal_file_buffer(struct journal_head *jh, | ||
1985 | transaction_t *transaction, int jlist) | ||
1986 | { | ||
1987 | jbd_lock_bh_state(jh2bh(jh)); | ||
1988 | spin_lock(&transaction->t_journal->j_list_lock); | ||
1989 | __journal_file_buffer(jh, transaction, jlist); | ||
1990 | spin_unlock(&transaction->t_journal->j_list_lock); | ||
1991 | jbd_unlock_bh_state(jh2bh(jh)); | ||
1992 | } | ||
1993 | |||
1994 | /* | ||
1995 | * Remove a buffer from its current buffer list in preparation for | ||
1996 | * dropping it from its current transaction entirely. If the buffer has | ||
1997 | * already started to be used by a subsequent transaction, refile the | ||
1998 | * buffer on that transaction's metadata list. | ||
1999 | * | ||
2000 | * Called under journal->j_list_lock | ||
2001 | * | ||
2002 | * Called under jbd_lock_bh_state(jh2bh(jh)) | ||
2003 | */ | ||
2004 | void __journal_refile_buffer(struct journal_head *jh) | ||
2005 | { | ||
2006 | int was_dirty; | ||
2007 | struct buffer_head *bh = jh2bh(jh); | ||
2008 | |||
2009 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
2010 | if (jh->b_transaction) | ||
2011 | assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); | ||
2012 | |||
2013 | /* If the buffer is now unused, just drop it. */ | ||
2014 | if (jh->b_next_transaction == NULL) { | ||
2015 | __journal_unfile_buffer(jh); | ||
2016 | return; | ||
2017 | } | ||
2018 | |||
2019 | /* | ||
2020 | * It has been modified by a later transaction: add it to the new | ||
2021 | * transaction's metadata list. | ||
2022 | */ | ||
2023 | |||
2024 | was_dirty = test_clear_buffer_jbddirty(bh); | ||
2025 | __journal_temp_unlink_buffer(jh); | ||
2026 | jh->b_transaction = jh->b_next_transaction; | ||
2027 | jh->b_next_transaction = NULL; | ||
2028 | __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); | ||
2029 | J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); | ||
2030 | |||
2031 | if (was_dirty) | ||
2032 | set_buffer_jbddirty(bh); | ||
2033 | } | ||
2034 | |||
2035 | /* | ||
2036 | * For the unlocked version of this call, also make sure that any | ||
2037 | * hanging journal_head is cleaned up if necessary. | ||
2038 | * | ||
2039 | * __journal_refile_buffer is usually called as part of a single locked | ||
2040 | * operation on a buffer_head, in which the caller is probably going to | ||
2041 | * be hooking the journal_head onto other lists. In that case it is up | ||
2042 | * to the caller to remove the journal_head if necessary. For the | ||
2043 | * unlocked journal_refile_buffer call, the caller isn't going to be | ||
2044 | * doing anything else to the buffer so we need to do the cleanup | ||
2045 | * ourselves to avoid a jh leak. | ||
2046 | * | ||
2047 | * *** The journal_head may be freed by this call! *** | ||
2048 | */ | ||
2049 | void journal_refile_buffer(journal_t *journal, struct journal_head *jh) | ||
2050 | { | ||
2051 | struct buffer_head *bh = jh2bh(jh); | ||
2052 | |||
2053 | jbd_lock_bh_state(bh); | ||
2054 | spin_lock(&journal->j_list_lock); | ||
2055 | |||
2056 | __journal_refile_buffer(jh); | ||
2057 | jbd_unlock_bh_state(bh); | ||
2058 | journal_remove_journal_head(bh); | ||
2059 | |||
2060 | spin_unlock(&journal->j_list_lock); | ||
2061 | __brelse(bh); | ||
2062 | } | ||