diff options
author | Dave Kleikamp <shaggy@austin.ibm.com> | 2006-10-11 04:20:57 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-10-11 14:14:15 -0400 |
commit | 470decc613ab2048b619a01028072d932d9086ee (patch) | |
tree | 5268576f5099040db94b8e984983c0bb28b2a9a7 /fs | |
parent | 02ea2104c55b625cf5b5d9ba8586a4fc17920f5c (diff) |
[PATCH] jbd2: initial copy of files from jbd
This is a simple copy of the files in fs/jbd to fs/jbd2 and
/usr/include/linux/[ext4_]jbd.h to /usr/include/[ext4_]jbd2.h
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/jbd2/Makefile | 7 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 697 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 911 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 2072 | ||||
-rw-r--r-- | fs/jbd2/recovery.c | 592 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 703 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 2080 |
7 files changed, 7062 insertions, 0 deletions
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile new file mode 100644 index 000000000000..54aca4868a36 --- /dev/null +++ b/fs/jbd2/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | # | ||
2 | # Makefile for the linux journaling routines. | ||
3 | # NOTE: copied as-is from fs/jbd, so it still builds jbd.o under CONFIG_JBD. | ||
4 | |||
5 | obj-$(CONFIG_JBD) += jbd.o | ||
6 | |||
7 | jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o | ||
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c new file mode 100644 index 000000000000..0208cc7ac5d0 --- /dev/null +++ b/fs/jbd2/checkpoint.c | |||
@@ -0,0 +1,697 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd2/checkpoint.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | ||
5 | * | ||
6 | * Copyright 1999 Red Hat Software --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Checkpoint routines for the generic filesystem journaling code. | ||
13 | * Part of the ext2fs journaling system. | ||
14 | * | ||
15 | * Checkpointing is the process of ensuring that a section of the log is | ||
16 | * committed fully to disk, so that that portion of the log can be | ||
17 | * reused. | ||
18 | */ | ||
19 | |||
20 | #include <linux/time.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/jbd.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/slab.h> | ||
25 | |||
26 | /* | ||
27 | * Unlink a buffer from a transaction checkpoint list: splice jh out of the | ||
28 | * circular b_cpnext/b_cpprev ring and, if jh was the t_checkpoint_list head, | ||
29 | * advance the head (NULL if jh was the only entry). Called with j_list_lock held. | ||
30 | */ | ||
31 | static inline void __buffer_unlink_first(struct journal_head *jh) | ||
32 | { | ||
33 | transaction_t *transaction = jh->b_cp_transaction; | ||
34 | |||
35 | jh->b_cpnext->b_cpprev = jh->b_cpprev; | ||
36 | jh->b_cpprev->b_cpnext = jh->b_cpnext; | ||
37 | if (transaction->t_checkpoint_list == jh) { | ||
38 | transaction->t_checkpoint_list = jh->b_cpnext; | ||
39 | if (transaction->t_checkpoint_list == jh) | ||
40 | transaction->t_checkpoint_list = NULL; | ||
41 | } | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * Unlink a buffer from a transaction checkpoint(io) list. | ||
46 | * Does __buffer_unlink_first(), then the same head fix-up for t_checkpoint_io_list. | ||
47 | * Called with j_list_lock held. | ||
48 | */ | ||
49 | static inline void __buffer_unlink(struct journal_head *jh) | ||
50 | { | ||
51 | transaction_t *transaction = jh->b_cp_transaction; | ||
52 | |||
53 | __buffer_unlink_first(jh); | ||
54 | if (transaction->t_checkpoint_io_list == jh) { | ||
55 | transaction->t_checkpoint_io_list = jh->b_cpnext; | ||
56 | if (transaction->t_checkpoint_io_list == jh) | ||
57 | transaction->t_checkpoint_io_list = NULL; | ||
58 | } | ||
59 | } | ||
60 | |||
61 | /* | ||
62 | * Move a buffer from the checkpoint list to the checkpoint io list | ||
63 | * (linked in just before the current io-list head; jh then becomes the head). | ||
64 | * Called with j_list_lock held | ||
65 | */ | ||
66 | static inline void __buffer_relink_io(struct journal_head *jh) | ||
67 | { | ||
68 | transaction_t *transaction = jh->b_cp_transaction; | ||
69 | |||
70 | __buffer_unlink_first(jh); | ||
71 | |||
72 | if (!transaction->t_checkpoint_io_list) { | ||
73 | jh->b_cpnext = jh->b_cpprev = jh; | ||
74 | } else { | ||
75 | jh->b_cpnext = transaction->t_checkpoint_io_list; | ||
76 | jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; | ||
77 | jh->b_cpprev->b_cpnext = jh; | ||
78 | jh->b_cpnext->b_cpprev = jh; | ||
79 | } | ||
80 | transaction->t_checkpoint_io_list = jh; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Try to release a checkpointed buffer from its transaction. | ||
85 | * Returns 1 if we released it and 2 if we also released the | ||
86 | * whole transaction. Returns 0, leaving the buffer in place, if it is | ||
87 | * still on a journal list (b_jlist != BJ_None), locked or dirty. | ||
88 | * Requires j_list_lock | ||
89 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
90 | */ | ||
91 | static int __try_to_free_cp_buf(struct journal_head *jh) | ||
92 | { | ||
93 | int ret = 0; | ||
94 | struct buffer_head *bh = jh2bh(jh); | ||
95 | |||
96 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { | ||
97 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | ||
98 | ret = __journal_remove_checkpoint(jh) + 1; | ||
99 | jbd_unlock_bh_state(bh); | ||
100 | journal_remove_journal_head(bh); | ||
101 | BUFFER_TRACE(bh, "release"); | ||
102 | __brelse(bh); | ||
103 | } else { | ||
104 | jbd_unlock_bh_state(bh); | ||
105 | } | ||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * __log_wait_for_space: wait until there is space in the journal. | ||
111 | * Gives up early, without waiting further, once the journal has JFS_ABORT set. | ||
112 | * Called under j_state_lock *only*. It will be unlocked if we have to wait | ||
113 | * for a checkpoint to free up some space in the log, and retaken before we return. | ||
114 | */ | ||
115 | void __log_wait_for_space(journal_t *journal) | ||
116 | { | ||
117 | int nblocks; | ||
118 | assert_spin_locked(&journal->j_state_lock); | ||
119 | |||
120 | nblocks = jbd_space_needed(journal); | ||
121 | while (__log_space_left(journal) < nblocks) { | ||
122 | if (journal->j_flags & JFS_ABORT) | ||
123 | return; | ||
124 | spin_unlock(&journal->j_state_lock); | ||
125 | mutex_lock(&journal->j_checkpoint_mutex); | ||
126 | |||
127 | /* | ||
128 | * Test again, another process may have checkpointed while we | ||
129 | * were waiting for the checkpoint lock | ||
130 | */ | ||
131 | spin_lock(&journal->j_state_lock); | ||
132 | nblocks = jbd_space_needed(journal); | ||
133 | if (__log_space_left(journal) < nblocks) { | ||
134 | spin_unlock(&journal->j_state_lock); | ||
135 | log_do_checkpoint(journal); | ||
136 | spin_lock(&journal->j_state_lock); | ||
137 | } | ||
138 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
139 | } | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. | ||
144 | * The caller must restart a list walk. Wait for someone else to run | ||
145 | * jbd_unlock_bh_state(). j_list_lock is dropped here and NOT retaken; bh is pinned meanwhile. | ||
146 | */ | ||
147 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) | ||
148 | __releases(journal->j_list_lock) | ||
149 | { | ||
150 | get_bh(bh); | ||
151 | spin_unlock(&journal->j_list_lock); | ||
152 | jbd_lock_bh_state(bh); | ||
153 | jbd_unlock_bh_state(bh); | ||
154 | put_bh(bh); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Clean up transaction's list of buffers submitted for io. | ||
159 | * We wait for any pending IO to complete and remove any clean | ||
160 | * buffers. Note that we take the buffers in the opposite ordering | ||
161 | * from the one in which they were submitted for IO. | ||
162 | * Stops when the io list is empty or the transaction itself has been released. | ||
163 | * Called with j_list_lock held; it may be dropped and retaken, and is held on return. | ||
164 | */ | ||
165 | static void __wait_cp_io(journal_t *journal, transaction_t *transaction) | ||
166 | { | ||
167 | struct journal_head *jh; | ||
168 | struct buffer_head *bh; | ||
169 | tid_t this_tid; | ||
170 | int released = 0; | ||
171 | |||
172 | this_tid = transaction->t_tid; | ||
173 | restart: | ||
174 | /* Did somebody clean up the transaction in the meanwhile? */ | ||
175 | if (journal->j_checkpoint_transactions != transaction || | ||
176 | transaction->t_tid != this_tid) | ||
177 | return; | ||
178 | while (!released && transaction->t_checkpoint_io_list) { | ||
179 | jh = transaction->t_checkpoint_io_list; | ||
180 | bh = jh2bh(jh); | ||
181 | if (!jbd_trylock_bh_state(bh)) { | ||
182 | jbd_sync_bh(journal, bh); | ||
183 | spin_lock(&journal->j_list_lock); | ||
184 | goto restart; | ||
185 | } | ||
186 | if (buffer_locked(bh)) { | ||
187 | atomic_inc(&bh->b_count); | ||
188 | spin_unlock(&journal->j_list_lock); | ||
189 | jbd_unlock_bh_state(bh); | ||
190 | wait_on_buffer(bh); | ||
191 | /* the journal_head may have gone by now */ | ||
192 | BUFFER_TRACE(bh, "brelse"); | ||
193 | __brelse(bh); | ||
194 | spin_lock(&journal->j_list_lock); | ||
195 | goto restart; | ||
196 | } | ||
197 | /* | ||
198 | * Now in whatever state the buffer currently is, we know that | ||
199 | * it has been written out and so we can drop it from the list | ||
200 | */ | ||
201 | released = __journal_remove_checkpoint(jh); | ||
202 | jbd_unlock_bh_state(bh); | ||
203 | journal_remove_journal_head(bh); | ||
204 | __brelse(bh); | ||
205 | } | ||
206 | } | ||
207 | |||
208 | #define NR_BATCH 64 | ||
209 | |||
/*
 * Write out the *batch_count buffers collected in bhs[] with a single
 * ll_rw_block(SWRITE, ...) call, then clear each buffer's jwrite flag,
 * __brelse() it and reset *batch_count to 0. The journal argument is not
 * used by this function.
 */
210 | static void | ||
211 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | ||
212 | { | ||
213 | int i; | ||
214 | |||
215 | ll_rw_block(SWRITE, *batch_count, bhs); | ||
216 | for (i = 0; i < *batch_count; i++) { | ||
217 | struct buffer_head *bh = bhs[i]; | ||
218 | clear_buffer_jwrite(bh); | ||
219 | BUFFER_TRACE(bh, "brelse"); | ||
220 | __brelse(bh); | ||
221 | } | ||
222 | *batch_count = 0; | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * Try to flush one buffer from the checkpoint list to disk: wait on it if it is | ||
227 | * locked, commit its transaction if it still has one, drop it if it is clean, | ||
228 | * otherwise queue it in bhs[] (flushing the batch once it holds NR_BATCH buffers). | ||
229 | * Return 1 if something happened which requires us to abort the current scan of | ||
230 | * the checkpoint list; 0 only when the buffer was queued and the batch is not full. | ||
231 | * Called with j_list_lock held and drops it if 1 is returned | ||
232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
233 | */ | ||
234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | ||
235 | struct buffer_head **bhs, int *batch_count) | ||
236 | { | ||
237 | struct buffer_head *bh = jh2bh(jh); | ||
238 | int ret = 0; | ||
239 | |||
240 | if (buffer_locked(bh)) { | ||
241 | atomic_inc(&bh->b_count); | ||
242 | spin_unlock(&journal->j_list_lock); | ||
243 | jbd_unlock_bh_state(bh); | ||
244 | wait_on_buffer(bh); | ||
245 | /* the journal_head may have gone by now */ | ||
246 | BUFFER_TRACE(bh, "brelse"); | ||
247 | __brelse(bh); | ||
248 | ret = 1; | ||
249 | } else if (jh->b_transaction != NULL) { | ||
250 | transaction_t *t = jh->b_transaction; | ||
251 | tid_t tid = t->t_tid; | ||
252 | |||
253 | spin_unlock(&journal->j_list_lock); | ||
254 | jbd_unlock_bh_state(bh); | ||
255 | log_start_commit(journal, tid); | ||
256 | log_wait_commit(journal, tid); | ||
257 | ret = 1; | ||
258 | } else if (!buffer_dirty(bh)) { | ||
259 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); | ||
260 | BUFFER_TRACE(bh, "remove from checkpoint"); | ||
261 | __journal_remove_checkpoint(jh); | ||
262 | spin_unlock(&journal->j_list_lock); | ||
263 | jbd_unlock_bh_state(bh); | ||
264 | journal_remove_journal_head(bh); | ||
265 | __brelse(bh); | ||
266 | ret = 1; | ||
267 | } else { | ||
268 | /* | ||
269 | * Important: we are about to write the buffer, and | ||
270 | * possibly block, while still holding the journal lock. | ||
271 | * We cannot afford to let the transaction logic start | ||
272 | * messing around with this buffer before we write it to | ||
273 | * disk, as that would break recoverability. | ||
274 | */ | ||
275 | BUFFER_TRACE(bh, "queue"); | ||
276 | get_bh(bh); | ||
277 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | ||
278 | set_buffer_jwrite(bh); | ||
279 | bhs[*batch_count] = bh; | ||
280 | __buffer_relink_io(jh); | ||
281 | jbd_unlock_bh_state(bh); | ||
282 | (*batch_count)++; | ||
283 | if (*batch_count == NR_BATCH) { | ||
284 | spin_unlock(&journal->j_list_lock); | ||
285 | __flush_batch(journal, bhs, batch_count); | ||
286 | ret = 1; | ||
287 | } | ||
288 | } | ||
289 | return ret; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * Perform an actual checkpoint. We take the first transaction on the | ||
294 | * list of transactions to be checkpointed and send all its buffers | ||
295 | * to disk. We submit larger chunks of data at once (up to NR_BATCH buffers). | ||
296 | * Returns the first cleanup_journal_tail() result if it is <= 0, else 0 (or a negative final result). | ||
297 | * The journal should be locked before calling this function. | ||
298 | */ | ||
299 | int log_do_checkpoint(journal_t *journal) | ||
300 | { | ||
301 | transaction_t *transaction; | ||
302 | tid_t this_tid; | ||
303 | int result; | ||
304 | |||
305 | jbd_debug(1, "Start checkpoint\n"); | ||
306 | |||
307 | /* | ||
308 | * First thing: if there are any transactions in the log which | ||
309 | * don't need checkpointing, just eliminate them from the | ||
310 | * journal straight away. | ||
311 | */ | ||
312 | result = cleanup_journal_tail(journal); | ||
313 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); | ||
314 | if (result <= 0) | ||
315 | return result; | ||
316 | |||
317 | /* | ||
318 | * OK, we need to start writing disk blocks. Take one transaction | ||
319 | * and write it. | ||
320 | */ | ||
321 | spin_lock(&journal->j_list_lock); | ||
322 | if (!journal->j_checkpoint_transactions) | ||
323 | goto out; | ||
324 | transaction = journal->j_checkpoint_transactions; | ||
325 | this_tid = transaction->t_tid; | ||
326 | restart: | ||
327 | /* | ||
328 | * If someone cleaned up this transaction while we slept, we're | ||
329 | * done (maybe it's a new transaction, but it fell at the same | ||
330 | * address). | ||
331 | */ | ||
332 | if (journal->j_checkpoint_transactions == transaction && | ||
333 | transaction->t_tid == this_tid) { | ||
334 | int batch_count = 0; | ||
335 | struct buffer_head *bhs[NR_BATCH]; | ||
336 | struct journal_head *jh; | ||
337 | int retry = 0; | ||
338 | |||
339 | while (!retry && transaction->t_checkpoint_list) { | ||
340 | struct buffer_head *bh; | ||
341 | |||
342 | jh = transaction->t_checkpoint_list; | ||
343 | bh = jh2bh(jh); | ||
344 | if (!jbd_trylock_bh_state(bh)) { | ||
345 | jbd_sync_bh(journal, bh); | ||
346 | retry = 1; | ||
347 | break; | ||
348 | } | ||
349 | retry = __process_buffer(journal, jh, bhs,&batch_count); | ||
350 | if (!retry && lock_need_resched(&journal->j_list_lock)){ | ||
351 | spin_unlock(&journal->j_list_lock); | ||
352 | retry = 1; | ||
353 | break; | ||
354 | } | ||
355 | } | ||
356 | |||
357 | if (batch_count) { | ||
358 | if (!retry) { | ||
359 | spin_unlock(&journal->j_list_lock); | ||
360 | retry = 1; | ||
361 | } | ||
362 | __flush_batch(journal, bhs, &batch_count); | ||
363 | } | ||
364 | |||
365 | if (retry) { | ||
366 | spin_lock(&journal->j_list_lock); | ||
367 | goto restart; | ||
368 | } | ||
369 | /* | ||
370 | * Now we have cleaned up the first transaction's checkpoint | ||
371 | * list. Let's clean up the second one (its checkpoint io list) | ||
372 | */ | ||
373 | __wait_cp_io(journal, transaction); | ||
374 | } | ||
375 | out: | ||
376 | spin_unlock(&journal->j_list_lock); | ||
377 | result = cleanup_journal_tail(journal); | ||
378 | if (result < 0) | ||
379 | return result; | ||
380 | return 0; | ||
381 | } | ||
382 | |||
383 | /* | ||
384 | * Check the list of checkpoint transactions for the journal to see if | ||
385 | * we have already got rid of any since the last update of the log tail | ||
386 | * in the journal superblock. If so, we can instantly roll the | ||
387 | * superblock forward to remove those transactions from the log. | ||
388 | * | ||
389 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. | ||
390 | * (As written here, only 0 and 1 are ever returned.) | ||
391 | * Called with the journal lock held. | ||
392 | * Takes j_state_lock and j_list_lock itself. | ||
393 | * This is the only part of the journaling code which really needs to be | ||
394 | * aware of transaction aborts. Checkpointing involves writing to the | ||
395 | * main filesystem area rather than to the journal, so it can proceed | ||
396 | * even in abort state, but we must not update the journal superblock if | ||
397 | * we have an abort error outstanding. | ||
398 | */ | ||
399 | |||
400 | int cleanup_journal_tail(journal_t *journal) | ||
401 | { | ||
402 | transaction_t * transaction; | ||
403 | tid_t first_tid; | ||
404 | unsigned long blocknr, freed; | ||
405 | |||
406 | /* OK, work out the oldest transaction remaining in the log, and | ||
407 | * the log block it starts at. | ||
408 | * | ||
409 | * If the log is now empty, we need to work out which is the | ||
410 | * next transaction ID we will write, and where it will | ||
411 | * start. */ | ||
412 | |||
413 | spin_lock(&journal->j_state_lock); | ||
414 | spin_lock(&journal->j_list_lock); | ||
415 | transaction = journal->j_checkpoint_transactions; | ||
416 | if (transaction) { | ||
417 | first_tid = transaction->t_tid; | ||
418 | blocknr = transaction->t_log_start; | ||
419 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | ||
420 | first_tid = transaction->t_tid; | ||
421 | blocknr = transaction->t_log_start; | ||
422 | } else if ((transaction = journal->j_running_transaction) != NULL) { | ||
423 | first_tid = transaction->t_tid; | ||
424 | blocknr = journal->j_head; | ||
425 | } else { | ||
426 | first_tid = journal->j_transaction_sequence; | ||
427 | blocknr = journal->j_head; | ||
428 | } | ||
429 | spin_unlock(&journal->j_list_lock); | ||
430 | J_ASSERT(blocknr != 0); | ||
431 | |||
432 | /* If the oldest pinned transaction is at the tail of the log | ||
433 | already then there's not much we can do right now. */ | ||
434 | if (journal->j_tail_sequence == first_tid) { | ||
435 | spin_unlock(&journal->j_state_lock); | ||
436 | return 1; | ||
437 | } | ||
438 | |||
439 | /* OK, update the superblock to recover the freed space. | ||
440 | * Physical blocks come first: have we wrapped beyond the end of | ||
441 | * the log? */ | ||
442 | freed = blocknr - journal->j_tail; | ||
443 | if (blocknr < journal->j_tail) | ||
444 | freed = freed + journal->j_last - journal->j_first; | ||
445 | |||
446 | jbd_debug(1, | ||
447 | "Cleaning journal tail from %d to %d (offset %lu), " | ||
448 | "freeing %lu\n", | ||
449 | journal->j_tail_sequence, first_tid, blocknr, freed); | ||
450 | |||
451 | journal->j_free += freed; | ||
452 | journal->j_tail_sequence = first_tid; | ||
453 | journal->j_tail = blocknr; | ||
454 | spin_unlock(&journal->j_state_lock); | ||
455 | if (!(journal->j_flags & JFS_ABORT)) | ||
456 | journal_update_superblock(journal, 1); | ||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | |||
461 | /* Checkpoint list management */ | ||
462 | |||
463 | /* | ||
464 | * journal_clean_one_cp_list | ||
465 | * | ||
466 | * Find all the written-back checkpoint buffers in the given list and release them. | ||
467 | * *released is set to 1, and the walk stops, if that freed the whole transaction. | ||
468 | * Called with the journal locked. | ||
469 | * Called with j_list_lock held. | ||
470 | * Returns number of buffers reaped (for debug) | ||
471 | */ | ||
472 | |||
473 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | ||
474 | { | ||
475 | struct journal_head *last_jh; | ||
476 | struct journal_head *next_jh = jh; | ||
477 | int ret, freed = 0; | ||
478 | |||
479 | *released = 0; | ||
480 | if (!jh) | ||
481 | return 0; | ||
482 | |||
483 | last_jh = jh->b_cpprev; | ||
484 | do { | ||
485 | jh = next_jh; | ||
486 | next_jh = jh->b_cpnext; | ||
487 | /* Use trylock because of the ranking */ | ||
488 | if (jbd_trylock_bh_state(jh2bh(jh))) { | ||
489 | ret = __try_to_free_cp_buf(jh); | ||
490 | if (ret) { | ||
491 | freed++; | ||
492 | if (ret == 2) { | ||
493 | *released = 1; | ||
494 | return freed; | ||
495 | } | ||
496 | } | ||
497 | } | ||
498 | /* | ||
499 | * This function only frees up some memory | ||
500 | * if possible so we don't have an obligation | ||
501 | * to finish processing. Bail out if preemption | ||
502 | * requested: | ||
503 | */ | ||
504 | if (need_resched()) | ||
505 | return freed; | ||
506 | } while (jh != last_jh); | ||
507 | |||
508 | return freed; | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * journal_clean_checkpoint_list | ||
513 | * | ||
514 | * Find all the written-back checkpoint buffers in the journal and release them. | ||
515 | * For each transaction both t_checkpoint_list and t_checkpoint_io_list are scanned. | ||
516 | * Called with the journal locked. | ||
517 | * Called with j_list_lock held. | ||
518 | * Returns number of buffers reaped (for debug) | ||
519 | */ | ||
520 | |||
521 | int __journal_clean_checkpoint_list(journal_t *journal) | ||
522 | { | ||
523 | transaction_t *transaction, *last_transaction, *next_transaction; | ||
524 | int ret = 0; | ||
525 | int released; | ||
526 | |||
527 | transaction = journal->j_checkpoint_transactions; | ||
528 | if (!transaction) | ||
529 | goto out; | ||
530 | |||
531 | last_transaction = transaction->t_cpprev; | ||
532 | next_transaction = transaction; | ||
533 | do { | ||
534 | transaction = next_transaction; | ||
535 | next_transaction = transaction->t_cpnext; | ||
536 | ret += journal_clean_one_cp_list(transaction-> | ||
537 | t_checkpoint_list, &released); | ||
538 | /* | ||
539 | * This function only frees up some memory if possible so we | ||
540 | * don't have an obligation to finish processing. Bail out if | ||
541 | * preemption requested: | ||
542 | */ | ||
543 | if (need_resched()) | ||
544 | goto out; | ||
545 | if (released) | ||
546 | continue; | ||
547 | /* | ||
548 | * It is essential that we are as careful as in the case of | ||
549 | * t_checkpoint_list with removing the buffer from the list as | ||
550 | * we can possibly see not yet submitted buffers on io_list | ||
551 | */ | ||
552 | ret += journal_clean_one_cp_list(transaction-> | ||
553 | t_checkpoint_io_list, &released); | ||
554 | if (need_resched()) | ||
555 | goto out; | ||
556 | } while (transaction != last_transaction); | ||
557 | out: | ||
558 | return ret; | ||
559 | } | ||
560 | |||
561 | /* | ||
562 | * journal_remove_checkpoint: called after a buffer has been committed | ||
563 | * to disk (either by being write-back flushed to disk, or being | ||
564 | * committed to the log). | ||
565 | * | ||
566 | * We cannot safely clean a transaction out of the log until all of the | ||
567 | * buffer updates committed in that transaction have safely been stored | ||
568 | * elsewhere on disk. To achieve this, all of the buffers in a | ||
569 | * transaction need to be maintained on the transaction's checkpoint | ||
570 | * lists until they have been rewritten, at which point this function is | ||
571 | * called to remove the buffer from the existing transaction's | ||
572 | * checkpoint lists. | ||
573 | * | ||
574 | * The function returns 1 if it frees the transaction, 0 otherwise. | ||
575 | * (0 also when jh is on no checkpoint list, or the transaction is still committing.) | ||
576 | * This function is called with the journal locked. | ||
577 | * This function is called with j_list_lock held. | ||
578 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) | ||
579 | */ | ||
580 | |||
581 | int __journal_remove_checkpoint(struct journal_head *jh) | ||
582 | { | ||
583 | transaction_t *transaction; | ||
584 | journal_t *journal; | ||
585 | int ret = 0; | ||
586 | |||
587 | JBUFFER_TRACE(jh, "entry"); | ||
588 | |||
589 | if ((transaction = jh->b_cp_transaction) == NULL) { | ||
590 | JBUFFER_TRACE(jh, "not on transaction"); | ||
591 | goto out; | ||
592 | } | ||
593 | journal = transaction->t_journal; | ||
594 | |||
595 | __buffer_unlink(jh); | ||
596 | jh->b_cp_transaction = NULL; | ||
597 | |||
598 | if (transaction->t_checkpoint_list != NULL || | ||
599 | transaction->t_checkpoint_io_list != NULL) | ||
600 | goto out; | ||
601 | JBUFFER_TRACE(jh, "transaction has no more buffers"); | ||
602 | |||
603 | /* | ||
604 | * There is one special case to worry about: if we have just pulled the | ||
605 | * buffer off a committing transaction's forget list, then even if the | ||
606 | * checkpoint list is empty, the transaction obviously cannot be | ||
607 | * dropped! | ||
608 | * | ||
609 | * The locking here around j_committing_transaction is a bit sleazy. | ||
610 | * See the comment at the end of journal_commit_transaction(). | ||
611 | */ | ||
612 | if (transaction == journal->j_committing_transaction) { | ||
613 | JBUFFER_TRACE(jh, "belongs to committing transaction"); | ||
614 | goto out; | ||
615 | } | ||
616 | |||
617 | /* OK, that was the last buffer for the transaction: we can now | ||
618 | safely remove this transaction from the log */ | ||
619 | |||
620 | __journal_drop_transaction(journal, transaction); | ||
621 | |||
622 | /* Just in case anybody was waiting for more transactions to be | ||
623 | checkpointed... */ | ||
624 | wake_up(&journal->j_wait_logspace); | ||
625 | ret = 1; | ||
626 | out: | ||
627 | JBUFFER_TRACE(jh, "exit"); | ||
628 | return ret; | ||
629 | } | ||
630 | |||
631 | /* | ||
632 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint | ||
633 | * list so that we know when it is safe to clean the transaction out of | ||
634 | * the log. | ||
635 | * jh becomes the new head of the transaction's t_checkpoint_list. | ||
636 | * Called with the journal locked. | ||
637 | * Called with j_list_lock held. | ||
638 | */ | ||
639 | void __journal_insert_checkpoint(struct journal_head *jh, | ||
640 | transaction_t *transaction) | ||
641 | { | ||
642 | JBUFFER_TRACE(jh, "entry"); | ||
643 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); | ||
644 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); | ||
645 | |||
646 | jh->b_cp_transaction = transaction; | ||
647 | |||
648 | if (!transaction->t_checkpoint_list) { | ||
649 | jh->b_cpnext = jh->b_cpprev = jh; | ||
650 | } else { | ||
651 | jh->b_cpnext = transaction->t_checkpoint_list; | ||
652 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; | ||
653 | jh->b_cpprev->b_cpnext = jh; | ||
654 | jh->b_cpnext->b_cpprev = jh; | ||
655 | } | ||
656 | transaction->t_checkpoint_list = jh; | ||
657 | } | ||
658 | |||
659 | /* | ||
660 | * We've finished with this transaction structure: adios... | ||
661 | * It is unlinked from the j_checkpoint_transactions ring (if on it) and kfree()d. | ||
662 | * The transaction must have no links except for the checkpoint by this | ||
663 | * point. | ||
664 | * | ||
665 | * Called with the journal locked. | ||
666 | * Called with j_list_lock held. | ||
667 | */ | ||
668 | |||
669 | void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) | ||
670 | { | ||
671 | assert_spin_locked(&journal->j_list_lock); | ||
672 | if (transaction->t_cpnext) { | ||
673 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; | ||
674 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; | ||
675 | if (journal->j_checkpoint_transactions == transaction) | ||
676 | journal->j_checkpoint_transactions = | ||
677 | transaction->t_cpnext; | ||
678 | if (journal->j_checkpoint_transactions == transaction) | ||
679 | journal->j_checkpoint_transactions = NULL; | ||
680 | } | ||
681 | |||
682 | J_ASSERT(transaction->t_state == T_FINISHED); | ||
683 | J_ASSERT(transaction->t_buffers == NULL); | ||
684 | J_ASSERT(transaction->t_sync_datalist == NULL); | ||
685 | J_ASSERT(transaction->t_forget == NULL); | ||
686 | J_ASSERT(transaction->t_iobuf_list == NULL); | ||
687 | J_ASSERT(transaction->t_shadow_list == NULL); | ||
688 | J_ASSERT(transaction->t_log_list == NULL); | ||
689 | J_ASSERT(transaction->t_checkpoint_list == NULL); | ||
690 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); | ||
691 | J_ASSERT(transaction->t_updates == 0); | ||
692 | J_ASSERT(journal->j_committing_transaction != transaction); | ||
693 | J_ASSERT(journal->j_running_transaction != transaction); | ||
694 | |||
695 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | ||
696 | kfree(transaction); | ||
697 | } | ||
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c new file mode 100644 index 000000000000..10be51290a27 --- /dev/null +++ b/fs/jbd2/commit.c | |||
@@ -0,0 +1,911 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/commit.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal commit routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | */ | ||
15 | |||
16 | #include <linux/time.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/jbd.h> | ||
19 | #include <linux/errno.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/pagemap.h> | ||
23 | #include <linux/smp_lock.h> | ||
24 | |||
/*
 * IO completion handler for the buffers that journal_commit_transaction()
 * submits to the log: record the outcome of the IO in the buffer's
 * uptodate bit, then unlock the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
37 | |||
38 | /* | ||
39 | * When an ext3-ordered file is truncated, it is possible that many pages are | ||
40 | * not sucessfully freed, because they are attached to a committing transaction. | ||
41 | * After the transaction commits, these pages are left on the LRU, with no | ||
42 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | ||
43 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | ||
44 | * the numbers in /proc/meminfo look odd. | ||
45 | * | ||
46 | * So here, we have a buffer which has just come off the forget list. Look to | ||
47 | * see if we can strip all buffers from the backing page. | ||
48 | * | ||
49 | * Called under lock_journal(), and possibly under journal_datalist_lock. The | ||
50 | * caller provided us with a ref against the buffer, and we drop that here. | ||
51 | */ | ||
52 | static void release_buffer_page(struct buffer_head *bh) | ||
53 | { | ||
54 | struct page *page; | ||
55 | |||
56 | if (buffer_dirty(bh)) | ||
57 | goto nope; | ||
58 | if (atomic_read(&bh->b_count) != 1) | ||
59 | goto nope; | ||
60 | page = bh->b_page; | ||
61 | if (!page) | ||
62 | goto nope; | ||
63 | if (page->mapping) | ||
64 | goto nope; | ||
65 | |||
66 | /* OK, it's a truncated page */ | ||
67 | if (TestSetPageLocked(page)) | ||
68 | goto nope; | ||
69 | |||
70 | page_cache_get(page); | ||
71 | __brelse(bh); | ||
72 | try_to_free_buffers(page); | ||
73 | unlock_page(page); | ||
74 | page_cache_release(page); | ||
75 | return; | ||
76 | |||
77 | nope: | ||
78 | __brelse(bh); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | ||
83 | * held. For ranking reasons we must trylock. If we lose, schedule away and | ||
84 | * return 0. j_list_lock is dropped in this case. | ||
85 | */ | ||
86 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | ||
87 | { | ||
88 | if (!jbd_trylock_bh_state(bh)) { | ||
89 | spin_unlock(&journal->j_list_lock); | ||
90 | schedule(); | ||
91 | return 0; | ||
92 | } | ||
93 | return 1; | ||
94 | } | ||
95 | |||
96 | /* Done it all: now write the commit record. We should have | ||
97 | * cleaned up our previous buffers by now, so if we are in abort | ||
98 | * mode we can now just skip the rest of the journal write | ||
99 | * entirely. | ||
100 | * | ||
101 | * Returns 1 if the journal needs to be aborted or 0 on success | ||
102 | */ | ||
103 | static int journal_write_commit_record(journal_t *journal, | ||
104 | transaction_t *commit_transaction) | ||
105 | { | ||
106 | struct journal_head *descriptor; | ||
107 | struct buffer_head *bh; | ||
108 | int i, ret; | ||
109 | int barrier_done = 0; | ||
110 | |||
111 | if (is_journal_aborted(journal)) | ||
112 | return 0; | ||
113 | |||
114 | descriptor = journal_get_descriptor_buffer(journal); | ||
115 | if (!descriptor) | ||
116 | return 1; | ||
117 | |||
118 | bh = jh2bh(descriptor); | ||
119 | |||
120 | /* AKPM: buglet - add `i' to tmp! */ | ||
121 | for (i = 0; i < bh->b_size; i += 512) { | ||
122 | journal_header_t *tmp = (journal_header_t*)bh->b_data; | ||
123 | tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
124 | tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); | ||
125 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
126 | } | ||
127 | |||
128 | JBUFFER_TRACE(descriptor, "write commit block"); | ||
129 | set_buffer_dirty(bh); | ||
130 | if (journal->j_flags & JFS_BARRIER) { | ||
131 | set_buffer_ordered(bh); | ||
132 | barrier_done = 1; | ||
133 | } | ||
134 | ret = sync_dirty_buffer(bh); | ||
135 | /* is it possible for another commit to fail at roughly | ||
136 | * the same time as this one? If so, we don't want to | ||
137 | * trust the barrier flag in the super, but instead want | ||
138 | * to remember if we sent a barrier request | ||
139 | */ | ||
140 | if (ret == -EOPNOTSUPP && barrier_done) { | ||
141 | char b[BDEVNAME_SIZE]; | ||
142 | |||
143 | printk(KERN_WARNING | ||
144 | "JBD: barrier-based sync failed on %s - " | ||
145 | "disabling barriers\n", | ||
146 | bdevname(journal->j_dev, b)); | ||
147 | spin_lock(&journal->j_state_lock); | ||
148 | journal->j_flags &= ~JFS_BARRIER; | ||
149 | spin_unlock(&journal->j_state_lock); | ||
150 | |||
151 | /* And try again, without the barrier */ | ||
152 | clear_buffer_ordered(bh); | ||
153 | set_buffer_uptodate(bh); | ||
154 | set_buffer_dirty(bh); | ||
155 | ret = sync_dirty_buffer(bh); | ||
156 | } | ||
157 | put_bh(bh); /* One for getblk() */ | ||
158 | journal_put_journal_head(descriptor); | ||
159 | |||
160 | return (ret == -EIO); | ||
161 | } | ||
162 | |||
163 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | ||
164 | { | ||
165 | int i; | ||
166 | |||
167 | for (i = 0; i < bufs; i++) { | ||
168 | wbuf[i]->b_end_io = end_buffer_write_sync; | ||
169 | /* We use-up our safety reference in submit_bh() */ | ||
170 | submit_bh(WRITE, wbuf[i]); | ||
171 | } | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Submit all the data buffers on commit_transaction->t_sync_datalist to disk: dirty buffers are refiled to BJ_Locked and batched in journal->j_wbuf for writeout, all others are unfiled from the transaction. | ||
176 | */ | ||
177 | static void journal_submit_data_buffers(journal_t *journal, | ||
178 | transaction_t *commit_transaction) | ||
179 | { | ||
180 | struct journal_head *jh; | ||
181 | struct buffer_head *bh; | ||
182 | int locked; /* set when bh was dirty and its buffer lock was taken here; tells the later paths to unlock_buffer() */ | ||
183 | int bufs = 0; /* number of buffers currently batched in wbuf[] */ | ||
184 | struct buffer_head **wbuf = journal->j_wbuf; | ||
185 | |||
186 | /* | ||
187 | * Whenever we unlock the journal and sleep, things can get added | ||
188 | * onto ->t_sync_datalist, so we have to keep looping back to | ||
189 | * write_out_data until we *know* that the list is empty. | ||
190 | * | ||
191 | * Cleanup any flushed data buffers from the data list. Even in | ||
192 | * abort mode, we want to flush this out as soon as possible. | ||
193 | */ | ||
194 | write_out_data: | ||
195 | cond_resched(); | ||
196 | spin_lock(&journal->j_list_lock); | ||
197 | |||
198 | while (commit_transaction->t_sync_datalist) { | ||
199 | jh = commit_transaction->t_sync_datalist; | ||
200 | bh = jh2bh(jh); | ||
201 | locked = 0; | ||
202 | |||
203 | /* Get reference just to make sure buffer does not disappear | ||
204 | * when we are forced to drop various locks */ | ||
205 | get_bh(bh); | ||
206 | /* If the buffer is dirty, we need to submit IO and hence | ||
207 | * we need the buffer lock. We try to lock the buffer without | ||
208 | * blocking. If we fail, we need to drop j_list_lock and do | ||
209 | * blocking lock_buffer(). | ||
210 | */ | ||
211 | if (buffer_dirty(bh)) { | ||
212 | if (test_set_buffer_locked(bh)) { | ||
213 | BUFFER_TRACE(bh, "needs blocking lock"); | ||
214 | spin_unlock(&journal->j_list_lock); | ||
215 | /* Write out all data to prevent deadlocks */ | ||
216 | journal_do_submit_data(wbuf, bufs); | ||
217 | bufs = 0; | ||
218 | lock_buffer(bh); | ||
219 | spin_lock(&journal->j_list_lock); | ||
220 | } | ||
221 | locked = 1; | ||
222 | } | ||
223 | /* We have to get bh_state lock. Again out of order, sigh. */ | ||
224 | if (!inverted_lock(journal, bh)) { | ||
225 | jbd_lock_bh_state(bh); | ||
226 | spin_lock(&journal->j_list_lock); | ||
227 | } | ||
228 | /* Someone already cleaned up the buffer? */ | ||
229 | if (!buffer_jbd(bh) | ||
230 | || jh->b_transaction != commit_transaction | ||
231 | || jh->b_jlist != BJ_SyncData) { | ||
232 | jbd_unlock_bh_state(bh); | ||
233 | if (locked) | ||
234 | unlock_buffer(bh); | ||
235 | BUFFER_TRACE(bh, "already cleaned up"); | ||
236 | put_bh(bh); | ||
237 | continue; | ||
238 | } | ||
239 | if (locked && test_clear_buffer_dirty(bh)) { | ||
240 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | ||
241 | wbuf[bufs++] = bh; | ||
242 | __journal_file_buffer(jh, commit_transaction, | ||
243 | BJ_Locked); | ||
244 | jbd_unlock_bh_state(bh); | ||
245 | if (bufs == journal->j_wbufsize) { | ||
246 | spin_unlock(&journal->j_list_lock); | ||
247 | journal_do_submit_data(wbuf, bufs); | ||
248 | bufs = 0; | ||
249 | goto write_out_data; | ||
250 | } | ||
251 | } | ||
252 | else { | ||
253 | BUFFER_TRACE(bh, "writeout complete: unfile"); | ||
254 | __journal_unfile_buffer(jh); | ||
255 | jbd_unlock_bh_state(bh); | ||
256 | if (locked) | ||
257 | unlock_buffer(bh); | ||
258 | journal_remove_journal_head(bh); | ||
259 | /* Once for our safety reference, once for | ||
260 | * journal_remove_journal_head() */ | ||
261 | put_bh(bh); | ||
262 | put_bh(bh); | ||
263 | } | ||
264 | |||
265 | if (lock_need_resched(&journal->j_list_lock)) { | ||
266 | spin_unlock(&journal->j_list_lock); | ||
267 | goto write_out_data; | ||
268 | } | ||
269 | } | ||
270 | spin_unlock(&journal->j_list_lock); | ||
271 | journal_do_submit_data(wbuf, bufs); | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * journal_commit_transaction | ||
276 | * | ||
277 | * The primary function for committing a transaction to the log. This | ||
278 | * function is called by the journal thread to begin a complete commit. It takes journal->j_running_transaction through the T_LOCKED, T_FLUSH, T_COMMIT and T_FINISHED states and wakes j_wait_done_commit when done. | ||
279 | */ | ||
280 | void journal_commit_transaction(journal_t *journal) | ||
281 | { | ||
282 | transaction_t *commit_transaction; | ||
283 | struct journal_head *jh, *new_jh, *descriptor; | ||
284 | struct buffer_head **wbuf = journal->j_wbuf; | ||
285 | int bufs; | ||
286 | int flags; | ||
287 | int err; | ||
288 | unsigned long blocknr; | ||
289 | char *tagp = NULL; | ||
290 | journal_header_t *header; | ||
291 | journal_block_tag_t *tag = NULL; | ||
292 | int space_left = 0; | ||
293 | int first_tag = 0; | ||
294 | int tag_flag; | ||
295 | int i; | ||
296 | |||
297 | /* | ||
298 | * First job: lock down the current transaction and wait for | ||
299 | * all outstanding updates to complete. | ||
300 | */ | ||
301 | |||
302 | #ifdef COMMIT_STATS | ||
303 | spin_lock(&journal->j_list_lock); | ||
304 | summarise_journal_usage(journal); | ||
305 | spin_unlock(&journal->j_list_lock); | ||
306 | #endif | ||
307 | |||
308 | /* Do we need to erase the effects of a prior journal_flush? */ | ||
309 | if (journal->j_flags & JFS_FLUSHED) { | ||
310 | jbd_debug(3, "super block updated\n"); | ||
311 | journal_update_superblock(journal, 1); | ||
312 | } else { | ||
313 | jbd_debug(3, "superblock not updated\n"); | ||
314 | } | ||
315 | |||
316 | J_ASSERT(journal->j_running_transaction != NULL); | ||
317 | J_ASSERT(journal->j_committing_transaction == NULL); | ||
318 | |||
319 | commit_transaction = journal->j_running_transaction; | ||
320 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
321 | |||
322 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | ||
323 | commit_transaction->t_tid); | ||
324 | |||
325 | spin_lock(&journal->j_state_lock); | ||
326 | commit_transaction->t_state = T_LOCKED; | ||
327 | |||
328 | spin_lock(&commit_transaction->t_handle_lock); | ||
329 | while (commit_transaction->t_updates) { | ||
330 | DEFINE_WAIT(wait); | ||
331 | |||
332 | prepare_to_wait(&journal->j_wait_updates, &wait, | ||
333 | TASK_UNINTERRUPTIBLE); | ||
334 | if (commit_transaction->t_updates) { | ||
335 | spin_unlock(&commit_transaction->t_handle_lock); | ||
336 | spin_unlock(&journal->j_state_lock); | ||
337 | schedule(); | ||
338 | spin_lock(&journal->j_state_lock); | ||
339 | spin_lock(&commit_transaction->t_handle_lock); | ||
340 | } | ||
341 | finish_wait(&journal->j_wait_updates, &wait); | ||
342 | } | ||
343 | spin_unlock(&commit_transaction->t_handle_lock); | ||
344 | |||
345 | J_ASSERT (commit_transaction->t_outstanding_credits <= | ||
346 | journal->j_max_transaction_buffers); | ||
347 | |||
348 | /* | ||
349 | * First thing we are allowed to do is to discard any remaining | ||
350 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | ||
351 | * that there are no such buffers: if a large filesystem | ||
352 | * operation like a truncate needs to split itself over multiple | ||
353 | * transactions, then it may try to do a journal_restart() while | ||
354 | * there are still BJ_Reserved buffers outstanding. These must | ||
355 | * be released cleanly from the current transaction. | ||
356 | * | ||
357 | * In this case, the filesystem must still reserve write access | ||
358 | * again before modifying the buffer in the new transaction, but | ||
359 | * we do not require it to remember exactly which old buffers it | ||
360 | * has reserved. This is consistent with the existing behaviour | ||
361 | * that multiple journal_get_write_access() calls to the same | ||
362 | * buffer are perfectly permissible. | ||
363 | */ | ||
364 | while (commit_transaction->t_reserved_list) { | ||
365 | jh = commit_transaction->t_reserved_list; | ||
366 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | ||
367 | /* | ||
368 | * A journal_get_undo_access()+journal_release_buffer() may | ||
369 | * leave undo-committed data. | ||
370 | */ | ||
371 | if (jh->b_committed_data) { | ||
372 | struct buffer_head *bh = jh2bh(jh); | ||
373 | |||
374 | jbd_lock_bh_state(bh); | ||
375 | jbd_slab_free(jh->b_committed_data, bh->b_size); | ||
376 | jh->b_committed_data = NULL; | ||
377 | jbd_unlock_bh_state(bh); | ||
378 | } | ||
379 | journal_refile_buffer(journal, jh); | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * Now try to drop any written-back buffers from the journal's | ||
384 | * checkpoint lists. We do this *before* commit because it potentially | ||
385 | * frees some memory | ||
386 | */ | ||
387 | spin_lock(&journal->j_list_lock); | ||
388 | __journal_clean_checkpoint_list(journal); | ||
389 | spin_unlock(&journal->j_list_lock); | ||
390 | |||
391 | jbd_debug (3, "JBD: commit phase 1\n"); | ||
392 | |||
393 | /* | ||
394 | * Switch to a new revoke table. | ||
395 | */ | ||
396 | journal_switch_revoke_table(journal); | ||
397 | |||
398 | commit_transaction->t_state = T_FLUSH; | ||
399 | journal->j_committing_transaction = commit_transaction; | ||
400 | journal->j_running_transaction = NULL; | ||
401 | commit_transaction->t_log_start = journal->j_head; | ||
402 | wake_up(&journal->j_wait_transaction_locked); | ||
403 | spin_unlock(&journal->j_state_lock); | ||
404 | |||
405 | jbd_debug (3, "JBD: commit phase 2\n"); | ||
406 | |||
407 | /* | ||
408 | * First, drop modified flag: all accesses to the buffers | ||
409 | * will be tracked for a new transaction only -bzzz | ||
410 | */ | ||
411 | spin_lock(&journal->j_list_lock); | ||
412 | if (commit_transaction->t_buffers) { | ||
413 | new_jh = jh = commit_transaction->t_buffers->b_tnext; | ||
414 | do { | ||
415 | J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || | ||
416 | new_jh->b_modified == 0); | ||
417 | new_jh->b_modified = 0; | ||
418 | new_jh = new_jh->b_tnext; | ||
419 | } while (new_jh != jh); | ||
420 | } | ||
421 | spin_unlock(&journal->j_list_lock); | ||
422 | |||
423 | /* | ||
424 | * Now start flushing things to disk, in the order they appear | ||
425 | * on the transaction lists. Data blocks go first. | ||
426 | */ | ||
427 | err = 0; | ||
428 | journal_submit_data_buffers(journal, commit_transaction); | ||
429 | |||
430 | /* | ||
431 | * Wait for all previously submitted IO to complete. | ||
432 | */ | ||
433 | spin_lock(&journal->j_list_lock); | ||
434 | while (commit_transaction->t_locked_list) { | ||
435 | struct buffer_head *bh; | ||
436 | |||
437 | jh = commit_transaction->t_locked_list->b_tprev; | ||
438 | bh = jh2bh(jh); | ||
439 | get_bh(bh); | ||
440 | if (buffer_locked(bh)) { | ||
441 | spin_unlock(&journal->j_list_lock); | ||
442 | wait_on_buffer(bh); | ||
443 | if (unlikely(!buffer_uptodate(bh))) | ||
444 | err = -EIO; | ||
445 | spin_lock(&journal->j_list_lock); | ||
446 | } | ||
447 | if (!inverted_lock(journal, bh)) { | ||
448 | put_bh(bh); | ||
449 | spin_lock(&journal->j_list_lock); | ||
450 | continue; | ||
451 | } | ||
452 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
453 | __journal_unfile_buffer(jh); | ||
454 | jbd_unlock_bh_state(bh); | ||
455 | journal_remove_journal_head(bh); | ||
456 | put_bh(bh); | ||
457 | } else { | ||
458 | jbd_unlock_bh_state(bh); | ||
459 | } | ||
460 | put_bh(bh); | ||
461 | cond_resched_lock(&journal->j_list_lock); | ||
462 | } | ||
463 | spin_unlock(&journal->j_list_lock); | ||
464 | |||
465 | if (err) | ||
466 | __journal_abort_hard(journal); | ||
467 | |||
468 | journal_write_revoke_records(journal, commit_transaction); | ||
469 | |||
470 | jbd_debug(3, "JBD: commit phase 2\n"); /* NOTE(review): phase 2 is also announced earlier in this function */ | ||
471 | |||
472 | /* | ||
473 | * If journal_submit_data_buffers() found any dirty or locked buffers, | ||
474 | * it should have looped back to its write_out_data label. If there | ||
475 | * were none, the sync data list should have been wiped clean by | ||
476 | * now, so check that it is in fact empty. | ||
477 | */ | ||
478 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | ||
479 | |||
480 | jbd_debug (3, "JBD: commit phase 3\n"); | ||
481 | |||
482 | /* | ||
483 | * Way to go: we have now written out all of the data for a | ||
484 | * transaction! Now comes the tricky part: we need to write out | ||
485 | * metadata. Loop over the transaction's entire buffer list: | ||
486 | */ | ||
487 | commit_transaction->t_state = T_COMMIT; | ||
488 | |||
489 | descriptor = NULL; | ||
490 | bufs = 0; | ||
491 | while (commit_transaction->t_buffers) { | ||
492 | |||
493 | /* Find the next buffer to be journaled... */ | ||
494 | |||
495 | jh = commit_transaction->t_buffers; | ||
496 | |||
497 | /* If we're in abort mode, we just un-journal the buffer and | ||
498 | release it for background writing. */ | ||
499 | |||
500 | if (is_journal_aborted(journal)) { | ||
501 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | ||
502 | journal_refile_buffer(journal, jh); | ||
503 | /* If that was the last one, we need to clean up | ||
504 | * any descriptor buffers which may have been | ||
505 | * already allocated, even if we are now | ||
506 | * aborting. */ | ||
507 | if (!commit_transaction->t_buffers) | ||
508 | goto start_journal_io; | ||
509 | continue; | ||
510 | } | ||
511 | |||
512 | /* Make sure we have a descriptor block in which to | ||
513 | record the metadata buffer. */ | ||
514 | |||
515 | if (!descriptor) { | ||
516 | struct buffer_head *bh; | ||
517 | |||
518 | J_ASSERT (bufs == 0); | ||
519 | |||
520 | jbd_debug(4, "JBD: get descriptor\n"); | ||
521 | |||
522 | descriptor = journal_get_descriptor_buffer(journal); | ||
523 | if (!descriptor) { | ||
524 | __journal_abort_hard(journal); | ||
525 | continue; | ||
526 | } | ||
527 | |||
528 | bh = jh2bh(descriptor); | ||
529 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | ||
530 | (unsigned long long)bh->b_blocknr, bh->b_data); | ||
531 | header = (journal_header_t *)&bh->b_data[0]; | ||
532 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
533 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); | ||
534 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
535 | |||
536 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
537 | space_left = bh->b_size - sizeof(journal_header_t); | ||
538 | first_tag = 1; | ||
539 | set_buffer_jwrite(bh); | ||
540 | set_buffer_dirty(bh); | ||
541 | wbuf[bufs++] = bh; | ||
542 | |||
543 | /* Record it so that we can wait for IO | ||
544 | completion later */ | ||
545 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | ||
546 | journal_file_buffer(descriptor, commit_transaction, | ||
547 | BJ_LogCtl); | ||
548 | } | ||
549 | |||
550 | /* Where is the buffer to be written? */ | ||
551 | |||
552 | err = journal_next_log_block(journal, &blocknr); | ||
553 | /* If the block mapping failed, just abandon the buffer | ||
554 | and repeat this loop: we'll fall into the | ||
555 | refile-on-abort condition above. */ | ||
556 | if (err) { | ||
557 | __journal_abort_hard(journal); | ||
558 | continue; | ||
559 | } | ||
560 | |||
561 | /* | ||
562 | * start_this_handle() uses t_outstanding_credits to determine | ||
563 | * the free space in the log, but this counter is changed | ||
564 | * by journal_next_log_block() also. | ||
565 | */ | ||
566 | commit_transaction->t_outstanding_credits--; | ||
567 | |||
568 | /* Bump b_count to prevent truncate from stumbling over | ||
569 | the shadowed buffer! @@@ This can go if we ever get | ||
570 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | ||
571 | atomic_inc(&jh2bh(jh)->b_count); | ||
572 | |||
573 | /* Make a temporary IO buffer with which to write it out | ||
574 | (this will requeue both the metadata buffer and the | ||
575 | temporary IO buffer). new_bh goes on BJ_IO*/ | ||
576 | |||
577 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | ||
578 | /* | ||
579 | * akpm: journal_write_metadata_buffer() sets | ||
580 | * new_bh->b_transaction to commit_transaction. | ||
581 | * We need to clean this up before we release new_bh | ||
582 | * (which is of type BJ_IO) | ||
583 | */ | ||
584 | JBUFFER_TRACE(jh, "ph3: write metadata"); | ||
585 | flags = journal_write_metadata_buffer(commit_transaction, | ||
586 | jh, &new_jh, blocknr); | ||
587 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | ||
588 | wbuf[bufs++] = jh2bh(new_jh); | ||
589 | |||
590 | /* Record the new block's tag in the current descriptor | ||
591 | buffer */ | ||
592 | |||
593 | tag_flag = 0; | ||
594 | if (flags & 1) /* NOTE(review): bit 0 presumably means the block was escaped - confirm in journal_write_metadata_buffer() */ | ||
595 | tag_flag |= JFS_FLAG_ESCAPE; | ||
596 | if (!first_tag) | ||
597 | tag_flag |= JFS_FLAG_SAME_UUID; | ||
598 | |||
599 | tag = (journal_block_tag_t *) tagp; | ||
600 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); | ||
601 | tag->t_flags = cpu_to_be32(tag_flag); | ||
602 | tagp += sizeof(journal_block_tag_t); | ||
603 | space_left -= sizeof(journal_block_tag_t); | ||
604 | |||
605 | if (first_tag) { | ||
606 | memcpy (tagp, journal->j_uuid, 16); /* the first tag is followed by 16 bytes copied from j_uuid */ | ||
607 | tagp += 16; | ||
608 | space_left -= 16; | ||
609 | first_tag = 0; | ||
610 | } | ||
611 | |||
612 | /* If there's no more to do, or if the descriptor is full, | ||
613 | let the IO rip! */ | ||
614 | |||
615 | if (bufs == journal->j_wbufsize || | ||
616 | commit_transaction->t_buffers == NULL || | ||
617 | space_left < sizeof(journal_block_tag_t) + 16) { /* no room left for another tag plus 16 UUID bytes */ | ||
618 | |||
619 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | ||
620 | |||
621 | /* Write an end-of-descriptor marker before | ||
622 | submitting the IOs. "tag" still points to | ||
623 | the last tag we set up. */ | ||
624 | |||
625 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); | ||
626 | |||
627 | start_journal_io: | ||
628 | for (i = 0; i < bufs; i++) { | ||
629 | struct buffer_head *bh = wbuf[i]; | ||
630 | lock_buffer(bh); | ||
631 | clear_buffer_dirty(bh); | ||
632 | set_buffer_uptodate(bh); | ||
633 | bh->b_end_io = journal_end_buffer_io_sync; | ||
634 | submit_bh(WRITE, bh); | ||
635 | } | ||
636 | cond_resched(); | ||
637 | |||
638 | /* Force a new descriptor to be generated next | ||
639 | time round the loop. */ | ||
640 | descriptor = NULL; | ||
641 | bufs = 0; | ||
642 | } | ||
643 | } | ||
644 | |||
645 | /* Lo and behold: we have just managed to send a transaction to | ||
646 | the log. Before we can commit it, wait for the IO so far to | ||
647 | complete. Control buffers being written are on the | ||
648 | transaction's t_log_list queue, and metadata buffers are on | ||
649 | the t_iobuf_list queue. | ||
650 | |||
651 | Wait for the buffers in reverse order. That way we are | ||
652 | less likely to be woken up until all IOs have completed, and | ||
653 | so we incur less scheduling load. | ||
654 | */ | ||
655 | |||
656 | jbd_debug(3, "JBD: commit phase 4\n"); | ||
657 | |||
658 | /* | ||
659 | * akpm: these are BJ_IO, and j_list_lock is not needed. | ||
660 | * See __journal_try_to_free_buffer. | ||
661 | */ | ||
662 | wait_for_iobuf: | ||
663 | while (commit_transaction->t_iobuf_list != NULL) { | ||
664 | struct buffer_head *bh; | ||
665 | |||
666 | jh = commit_transaction->t_iobuf_list->b_tprev; | ||
667 | bh = jh2bh(jh); | ||
668 | if (buffer_locked(bh)) { | ||
669 | wait_on_buffer(bh); | ||
670 | goto wait_for_iobuf; | ||
671 | } | ||
672 | if (cond_resched()) | ||
673 | goto wait_for_iobuf; | ||
674 | |||
675 | if (unlikely(!buffer_uptodate(bh))) | ||
676 | err = -EIO; | ||
677 | |||
678 | clear_buffer_jwrite(bh); | ||
679 | |||
680 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | ||
681 | journal_unfile_buffer(journal, jh); | ||
682 | |||
683 | /* | ||
684 | * ->t_iobuf_list should contain only dummy buffer_heads | ||
685 | * which were created by journal_write_metadata_buffer(). | ||
686 | */ | ||
687 | BUFFER_TRACE(bh, "dumping temporary bh"); | ||
688 | journal_put_journal_head(jh); | ||
689 | __brelse(bh); | ||
690 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | ||
691 | free_buffer_head(bh); | ||
692 | |||
693 | /* We also have to unlock and free the corresponding | ||
694 | shadowed buffer */ | ||
695 | jh = commit_transaction->t_shadow_list->b_tprev; | ||
696 | bh = jh2bh(jh); | ||
697 | clear_bit(BH_JWrite, &bh->b_state); | ||
698 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | ||
699 | |||
700 | /* The metadata is now released for reuse, but we need | ||
701 | to remember it against this transaction so that when | ||
702 | we finally commit, we can do any checkpointing | ||
703 | required. */ | ||
704 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | ||
705 | journal_file_buffer(jh, commit_transaction, BJ_Forget); | ||
706 | /* Wake up any transactions which were waiting for this | ||
707 | IO to complete */ | ||
708 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
709 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | ||
710 | __brelse(bh); | ||
711 | } | ||
712 | |||
713 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | ||
714 | |||
715 | jbd_debug(3, "JBD: commit phase 5\n"); | ||
716 | |||
717 | /* Here we wait for the revoke record and descriptor record buffers */ | ||
718 | wait_for_ctlbuf: | ||
719 | while (commit_transaction->t_log_list != NULL) { | ||
720 | struct buffer_head *bh; | ||
721 | |||
722 | jh = commit_transaction->t_log_list->b_tprev; | ||
723 | bh = jh2bh(jh); | ||
724 | if (buffer_locked(bh)) { | ||
725 | wait_on_buffer(bh); | ||
726 | goto wait_for_ctlbuf; | ||
727 | } | ||
728 | if (cond_resched()) | ||
729 | goto wait_for_ctlbuf; | ||
730 | |||
731 | if (unlikely(!buffer_uptodate(bh))) | ||
732 | err = -EIO; | ||
733 | |||
734 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | ||
735 | clear_buffer_jwrite(bh); | ||
736 | journal_unfile_buffer(journal, jh); | ||
737 | journal_put_journal_head(jh); | ||
738 | __brelse(bh); /* One for getblk */ | ||
739 | /* AKPM: bforget here */ | ||
740 | } | ||
741 | |||
742 | jbd_debug(3, "JBD: commit phase 6\n"); | ||
743 | |||
744 | if (journal_write_commit_record(journal, commit_transaction)) | ||
745 | err = -EIO; | ||
746 | |||
747 | if (err) | ||
748 | __journal_abort_hard(journal); | ||
749 | |||
750 | /* End of a transaction! Finally, we can do checkpoint | ||
751 | processing: any buffers committed as a result of this | ||
752 | transaction can be removed from any checkpoint list it was on | ||
753 | before. */ | ||
754 | |||
755 | jbd_debug(3, "JBD: commit phase 7\n"); | ||
756 | |||
757 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | ||
758 | J_ASSERT(commit_transaction->t_buffers == NULL); | ||
759 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | ||
760 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | ||
761 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | ||
762 | J_ASSERT(commit_transaction->t_log_list == NULL); | ||
763 | |||
764 | restart_loop: | ||
765 | /* | ||
766 | * As there are other places (journal_unmap_buffer()) adding buffers | ||
767 | * to this list we have to be careful and hold the j_list_lock. | ||
768 | */ | ||
769 | spin_lock(&journal->j_list_lock); | ||
770 | while (commit_transaction->t_forget) { | ||
771 | transaction_t *cp_transaction; | ||
772 | struct buffer_head *bh; | ||
773 | |||
774 | jh = commit_transaction->t_forget; | ||
775 | spin_unlock(&journal->j_list_lock); | ||
776 | bh = jh2bh(jh); | ||
777 | jbd_lock_bh_state(bh); | ||
778 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || | ||
779 | jh->b_transaction == journal->j_running_transaction); | ||
780 | |||
781 | /* | ||
782 | * If there is undo-protected committed data against | ||
783 | * this buffer, then we can remove it now. If it is a | ||
784 | * buffer needing such protection, the old frozen_data | ||
785 | * field now points to a committed version of the | ||
786 | * buffer, so rotate that field to the new committed | ||
787 | * data. | ||
788 | * | ||
789 | * Otherwise, we can just throw away the frozen data now. | ||
790 | */ | ||
791 | if (jh->b_committed_data) { | ||
792 | jbd_slab_free(jh->b_committed_data, bh->b_size); | ||
793 | jh->b_committed_data = NULL; | ||
794 | if (jh->b_frozen_data) { | ||
795 | jh->b_committed_data = jh->b_frozen_data; | ||
796 | jh->b_frozen_data = NULL; | ||
797 | } | ||
798 | } else if (jh->b_frozen_data) { | ||
799 | jbd_slab_free(jh->b_frozen_data, bh->b_size); | ||
800 | jh->b_frozen_data = NULL; | ||
801 | } | ||
802 | |||
803 | spin_lock(&journal->j_list_lock); | ||
804 | cp_transaction = jh->b_cp_transaction; | ||
805 | if (cp_transaction) { | ||
806 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | ||
807 | __journal_remove_checkpoint(jh); | ||
808 | } | ||
809 | |||
810 | /* Only re-checkpoint the buffer_head if it is marked | ||
811 | * dirty. If the buffer was added to the BJ_Forget list | ||
812 | * by journal_forget, it may no longer be dirty and | ||
813 | * there's no point in keeping a checkpoint record for | ||
814 | * it. */ | ||
815 | |||
816 | /* A buffer which has been freed while still being | ||
817 | * journaled by a previous transaction may end up still | ||
818 | * being dirty here, but we want to avoid writing back | ||
819 | * that buffer in the future now that the last use has | ||
820 | * been committed. That's not only a performance gain, | ||
821 | * it also stops aliasing problems if the buffer is left | ||
822 | * behind for writeback and gets reallocated for another | ||
823 | * use in a different page. */ | ||
824 | if (buffer_freed(bh)) { | ||
825 | clear_buffer_freed(bh); | ||
826 | clear_buffer_jbddirty(bh); | ||
827 | } | ||
828 | |||
829 | if (buffer_jbddirty(bh)) { | ||
830 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | ||
831 | __journal_insert_checkpoint(jh, commit_transaction); | ||
832 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); | ||
833 | __journal_refile_buffer(jh); | ||
834 | jbd_unlock_bh_state(bh); | ||
835 | } else { | ||
836 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | ||
837 | /* The buffer on BJ_Forget list and not jbddirty means | ||
838 | * it has been freed by this transaction and hence it | ||
839 | * could not have been reallocated until this | ||
840 | * transaction has committed. *BUT* it could be | ||
841 | * reallocated once we have written all the data to | ||
842 | * disk and before we process the buffer on BJ_Forget | ||
843 | * list. */ | ||
844 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); | ||
845 | __journal_refile_buffer(jh); | ||
846 | if (!jh->b_transaction) { | ||
847 | jbd_unlock_bh_state(bh); | ||
848 | /* needs a brelse */ | ||
849 | journal_remove_journal_head(bh); | ||
850 | release_buffer_page(bh); | ||
851 | } else | ||
852 | jbd_unlock_bh_state(bh); | ||
853 | } | ||
854 | cond_resched_lock(&journal->j_list_lock); | ||
855 | } | ||
856 | spin_unlock(&journal->j_list_lock); | ||
857 | /* | ||
858 | * This is a bit sleazy. We borrow j_list_lock to protect | ||
859 | * journal->j_committing_transaction in __journal_remove_checkpoint. | ||
860 | * Really, __journal_remove_checkpoint should be using j_state_lock but | ||
861 | * it's a bit hassle to hold that across __journal_remove_checkpoint | ||
862 | */ | ||
863 | spin_lock(&journal->j_state_lock); | ||
864 | spin_lock(&journal->j_list_lock); | ||
865 | /* | ||
866 | * Now recheck if some buffers did not get attached to the transaction | ||
867 | * while the lock was dropped... | ||
868 | */ | ||
869 | if (commit_transaction->t_forget) { | ||
870 | spin_unlock(&journal->j_list_lock); | ||
871 | spin_unlock(&journal->j_state_lock); | ||
872 | goto restart_loop; | ||
873 | } | ||
874 | |||
875 | /* Done with this transaction! */ | ||
876 | |||
877 | jbd_debug(3, "JBD: commit phase 8\n"); | ||
878 | |||
879 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | ||
880 | |||
881 | commit_transaction->t_state = T_FINISHED; | ||
882 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | ||
883 | journal->j_commit_sequence = commit_transaction->t_tid; | ||
884 | journal->j_committing_transaction = NULL; | ||
885 | spin_unlock(&journal->j_state_lock); | ||
886 | |||
887 | if (commit_transaction->t_checkpoint_list == NULL) { | ||
888 | __journal_drop_transaction(journal, commit_transaction); | ||
889 | } else { | ||
890 | if (journal->j_checkpoint_transactions == NULL) { | ||
891 | journal->j_checkpoint_transactions = commit_transaction; | ||
892 | commit_transaction->t_cpnext = commit_transaction; | ||
893 | commit_transaction->t_cpprev = commit_transaction; | ||
894 | } else { | ||
895 | commit_transaction->t_cpnext = | ||
896 | journal->j_checkpoint_transactions; | ||
897 | commit_transaction->t_cpprev = | ||
898 | commit_transaction->t_cpnext->t_cpprev; | ||
899 | commit_transaction->t_cpnext->t_cpprev = | ||
900 | commit_transaction; | ||
901 | commit_transaction->t_cpprev->t_cpnext = | ||
902 | commit_transaction; | ||
903 | } | ||
904 | } | ||
905 | spin_unlock(&journal->j_list_lock); | ||
906 | |||
907 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | ||
908 | journal->j_commit_sequence, journal->j_tail_sequence); | ||
909 | |||
910 | wake_up(&journal->j_wait_done_commit); | ||
911 | } | ||
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c new file mode 100644 index 000000000000..c518dd8fe60a --- /dev/null +++ b/fs/jbd2/journal.c | |||
@@ -0,0 +1,2072 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/journal.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Generic filesystem journal-writing code; part of the ext2fs | ||
13 | * journaling system. | ||
14 | * | ||
15 | * This file manages journals: areas of disk reserved for logging | ||
16 | * transactional updates. This includes the kernel journaling thread | ||
17 | * which is responsible for scheduling updates to the log. | ||
18 | * | ||
19 | * We do not actually manage the physical storage of the journal in this | ||
20 | * file: that is left to a per-journal policy function, which allows us | ||
21 | * to store the journal within a filesystem-specified area for ext2 | ||
22 | * journaling (ext2 can use a reserved inode for storing the log). | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/jbd.h> | ||
29 | #include <linux/errno.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/suspend.h> | ||
35 | #include <linux/pagemap.h> | ||
36 | #include <linux/kthread.h> | ||
37 | #include <linux/poison.h> | ||
38 | #include <linux/proc_fs.h> | ||
39 | |||
40 | #include <asm/uaccess.h> | ||
41 | #include <asm/page.h> | ||
42 | |||
/*
 * Symbols exported from the journaling layer.  The first group is the
 * per-handle/transaction interface, the second group covers journal
 * setup, teardown, error state and commit control.
 */
EXPORT_SYMBOL(journal_start);
EXPORT_SYMBOL(journal_restart);
EXPORT_SYMBOL(journal_extend);
EXPORT_SYMBOL(journal_stop);
EXPORT_SYMBOL(journal_lock_updates);
EXPORT_SYMBOL(journal_unlock_updates);
EXPORT_SYMBOL(journal_get_write_access);
EXPORT_SYMBOL(journal_get_create_access);
EXPORT_SYMBOL(journal_get_undo_access);
EXPORT_SYMBOL(journal_dirty_data);
EXPORT_SYMBOL(journal_dirty_metadata);
EXPORT_SYMBOL(journal_release_buffer);
EXPORT_SYMBOL(journal_forget);
/* journal_sync_buffer is compiled out, so its export is disabled too. */
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
#endif
EXPORT_SYMBOL(journal_flush);
EXPORT_SYMBOL(journal_revoke);

EXPORT_SYMBOL(journal_init_dev);
EXPORT_SYMBOL(journal_init_inode);
EXPORT_SYMBOL(journal_update_format);
EXPORT_SYMBOL(journal_check_used_features);
EXPORT_SYMBOL(journal_check_available_features);
EXPORT_SYMBOL(journal_set_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
EXPORT_SYMBOL(journal_update_superblock);
EXPORT_SYMBOL(journal_abort);
EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
EXPORT_SYMBOL(journal_start_commit);
EXPORT_SYMBOL(journal_force_commit_nested);
EXPORT_SYMBOL(journal_wipe);
EXPORT_SYMBOL(journal_blocks_per_page);
EXPORT_SYMBOL(journal_invalidatepage);
EXPORT_SYMBOL(journal_try_to_free_buffers);
EXPORT_SYMBOL(journal_force_commit);

/* Forward declarations of file-local (static) helpers. */
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
static int journal_create_jbd_slab(size_t slab_size);
88 | |||
/*
 * Commit-interval timer callback.  The timer's data word holds the task
 * that armed it (see setup_timer() in kjournald()); all we do on expiry
 * is wake that task up.
 */
static void commit_timeout(unsigned long __data)
{
	wake_up_process((struct task_struct *)__data);
}
99 | |||
/*
 * kjournald: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT: Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 *
 * @arg: the journal_t this thread serves.
 *
 * The thread loops until JFS_UNMOUNT is set in journal->j_flags; it then
 * clears journal->j_task, wakes j_wait_done_commit and returns 0.
 *
 * Locking: j_state_lock is held whenever control is at the "loop" label.
 * Every path that commits, enters the refrigerator or sleeps drops the
 * lock first and retakes it before jumping back.
 */

static int kjournald(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	setup_timer(&journal->j_commit_timer, commit_timeout,
			(unsigned long)current);

	/*
	 * Record that the journal thread is running.  journal_start_thread()
	 * waits on j_wait_done_commit for j_task to become non-NULL.
	 */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
			journal->j_commit_interval / HZ);

	/*
	 * And now, wait forever for commit wakeup events.
	 */
	spin_lock(&journal->j_state_lock);

loop:
	if (journal->j_flags & JFS_UNMOUNT)
		goto end_loop;

	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
		journal->j_commit_sequence, journal->j_commit_request);

	/* A commit has been requested that has not been performed yet. */
	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd_debug(1, "OK, requests differ\n");
		spin_unlock(&journal->j_state_lock);
		del_timer_sync(&journal->j_commit_timer);
		journal_commit_transaction(journal);
		spin_lock(&journal->j_state_lock);
		goto loop;
	}

	/* Nothing to commit right now: let waiters re-evaluate their state. */
	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd_debug(1, "Now suspending kjournald\n");
		spin_unlock(&journal->j_state_lock);
		refrigerator();
		spin_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		/*
		 * Queue ourselves on j_wait_commit before testing the
		 * conditions below, and skip the sleep if any of them
		 * already holds: a pending commit request, an expired
		 * running transaction, or an unmount in progress.
		 */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		if (journal->j_commit_sequence != journal->j_commit_request)
			should_sleep = 0;
		transaction = journal->j_running_transaction;
		if (transaction && time_after_eq(jiffies,
						transaction->t_expires))
			should_sleep = 0;
		if (journal->j_flags & JFS_UNMOUNT)
			should_sleep = 0;
		if (should_sleep) {
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd_debug(1, "kjournald wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?
	 *
	 * If the running transaction has expired, turn that into a commit
	 * request so the check at the top of the loop picks it up.
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		journal->j_commit_request = transaction->t_tid;
		jbd_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	spin_unlock(&journal->j_state_lock);
	del_timer_sync(&journal->j_commit_timer);
	/* journal_kill_thread() waits on j_wait_done_commit for this. */
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	return 0;
}
213 | |||
214 | static void journal_start_thread(journal_t *journal) | ||
215 | { | ||
216 | kthread_run(kjournald, journal, "kjournald"); | ||
217 | wait_event(journal->j_wait_done_commit, journal->j_task != 0); | ||
218 | } | ||
219 | |||
220 | static void journal_kill_thread(journal_t *journal) | ||
221 | { | ||
222 | spin_lock(&journal->j_state_lock); | ||
223 | journal->j_flags |= JFS_UNMOUNT; | ||
224 | |||
225 | while (journal->j_task) { | ||
226 | wake_up(&journal->j_wait_commit); | ||
227 | spin_unlock(&journal->j_state_lock); | ||
228 | wait_event(journal->j_wait_done_commit, journal->j_task == 0); | ||
229 | spin_lock(&journal->j_state_lock); | ||
230 | } | ||
231 | spin_unlock(&journal->j_state_lock); | ||
232 | } | ||
233 | |||
/*
 * journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO.  If we end up using the existing buffer_head's data
 * for the write, then we *have* to lock the buffer to prevent anyone
 * else from using and possibly modifying it while the IO is in
 * progress.
 *
 * The function returns a pointer to the buffer_heads to be used for IO.
 *
 * We assume that the journal has already been locked in this function.
 *
 * Parameters:
 *  @transaction: transaction on whose BJ_Shadow / BJ_IO lists the buffers
 *                are filed
 *  @jh_in:       journal head of the metadata buffer to be logged
 *  @jh_out:      receives the journal head of the newly built IO buffer
 *  @blocknr:     block number assigned to the new buffer_head
 *
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */

int journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head *jh_in,
				  struct journal_head **jh_out,
				  unsigned long blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct journal_head *new_jh;
	struct page *new_page;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	jbd_lock_bh_state(bh_in);
repeat:
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = offset_in_page(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
	}

	mapped_data = kmap_atomic(new_page, KM_USER0);
	/*
	 * Check for escaping
	 */
	if (*((__be32 *)(mapped_data + new_offset)) ==
				cpu_to_be32(JFS_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}
	kunmap_atomic(mapped_data, KM_USER0);

	/*
	 * Do we need to do a data copy?
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		/*
		 * The bh state lock is dropped around the allocation, so
		 * b_frozen_data must be rechecked afterwards: if it was set
		 * meanwhile, discard our buffer and start over with theirs.
		 */
		jbd_unlock_bh_state(bh_in);
		tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
		jbd_lock_bh_state(bh_in);
		if (jh_in->b_frozen_data) {
			jbd_slab_free(tmp, bh_in->b_size);
			goto repeat;
		}

		jh_in->b_frozen_data = tmp;
		mapped_data = kmap_atomic(new_page, KM_USER0);
		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
		kunmap_atomic(mapped_data, KM_USER0);

		/* From here on the IO buffer points at the private copy. */
		new_page = virt_to_page(tmp);
		new_offset = offset_in_page(tmp);
		done_copy_out = 1;
	}

	/*
	 * Did we need to do an escaping?  Now we've done all the
	 * copying, we can finally do so.
	 */
	if (do_escape) {
		mapped_data = kmap_atomic(new_page, KM_USER0);
		*((unsigned int *)(mapped_data + new_offset)) = 0;
		kunmap_atomic(mapped_data, KM_USER0);
	}

	/* keep subsequent assertions sane */
	new_bh->b_state = 0;
	init_buffer(new_bh, NULL, NULL);
	atomic_set(&new_bh->b_count, 1);
	jbd_unlock_bh_state(bh_in);

	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */

	/* Point the new buffer_head at the data and at its journal block. */
	set_bh_page(new_bh, new_page, new_offset);
	new_jh->b_transaction = NULL;
	new_bh->b_size = jh2bh(jh_in)->b_size;
	new_bh->b_bdev = transaction->t_journal->j_dev;
	new_bh->b_blocknr = blocknr;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*jh_out = new_jh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	journal_file_buffer(jh_in, transaction, BJ_Shadow);
	JBUFFER_TRACE(new_jh, "file as BJ_IO");
	journal_file_buffer(new_jh, transaction, BJ_IO);

	/* Bit 0: escaped, bit 1: copy-out done (see header comment). */
	return do_escape | (done_copy_out << 1);
}
389 | |||
390 | /* | ||
391 | * Allocation code for the journal file. Manage the space left in the | ||
392 | * journal, so that we can begin checkpointing when appropriate. | ||
393 | */ | ||
394 | |||
395 | /* | ||
396 | * __log_space_left: Return the number of free blocks left in the journal. | ||
397 | * | ||
398 | * Called with the journal already locked. | ||
399 | * | ||
400 | * Called under j_state_lock | ||
401 | */ | ||
402 | |||
403 | int __log_space_left(journal_t *journal) | ||
404 | { | ||
405 | int left = journal->j_free; | ||
406 | |||
407 | assert_spin_locked(&journal->j_state_lock); | ||
408 | |||
409 | /* | ||
410 | * Be pessimistic here about the number of those free blocks which | ||
411 | * might be required for log descriptor control blocks. | ||
412 | */ | ||
413 | |||
414 | #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ | ||
415 | |||
416 | left -= MIN_LOG_RESERVED_BLOCKS; | ||
417 | |||
418 | if (left <= 0) | ||
419 | return 0; | ||
420 | left -= (left >> 3); | ||
421 | return left; | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * Called under j_state_lock. Returns true if a transaction was started. | ||
426 | */ | ||
427 | int __log_start_commit(journal_t *journal, tid_t target) | ||
428 | { | ||
429 | /* | ||
430 | * Are we already doing a recent enough commit? | ||
431 | */ | ||
432 | if (!tid_geq(journal->j_commit_request, target)) { | ||
433 | /* | ||
434 | * We want a new commit: OK, mark the request and wakup the | ||
435 | * commit thread. We do _not_ do the commit ourselves. | ||
436 | */ | ||
437 | |||
438 | journal->j_commit_request = target; | ||
439 | jbd_debug(1, "JBD: requesting commit %d/%d\n", | ||
440 | journal->j_commit_request, | ||
441 | journal->j_commit_sequence); | ||
442 | wake_up(&journal->j_wait_commit); | ||
443 | return 1; | ||
444 | } | ||
445 | return 0; | ||
446 | } | ||
447 | |||
448 | int log_start_commit(journal_t *journal, tid_t tid) | ||
449 | { | ||
450 | int ret; | ||
451 | |||
452 | spin_lock(&journal->j_state_lock); | ||
453 | ret = __log_start_commit(journal, tid); | ||
454 | spin_unlock(&journal->j_state_lock); | ||
455 | return ret; | ||
456 | } | ||
457 | |||
458 | /* | ||
459 | * Force and wait upon a commit if the calling process is not within | ||
460 | * transaction. This is used for forcing out undo-protected data which contains | ||
461 | * bitmaps, when the fs is running out of space. | ||
462 | * | ||
463 | * We can only force the running transaction if we don't have an active handle; | ||
464 | * otherwise, we will deadlock. | ||
465 | * | ||
466 | * Returns true if a transaction was started. | ||
467 | */ | ||
468 | int journal_force_commit_nested(journal_t *journal) | ||
469 | { | ||
470 | transaction_t *transaction = NULL; | ||
471 | tid_t tid; | ||
472 | |||
473 | spin_lock(&journal->j_state_lock); | ||
474 | if (journal->j_running_transaction && !current->journal_info) { | ||
475 | transaction = journal->j_running_transaction; | ||
476 | __log_start_commit(journal, transaction->t_tid); | ||
477 | } else if (journal->j_committing_transaction) | ||
478 | transaction = journal->j_committing_transaction; | ||
479 | |||
480 | if (!transaction) { | ||
481 | spin_unlock(&journal->j_state_lock); | ||
482 | return 0; /* Nothing to retry */ | ||
483 | } | ||
484 | |||
485 | tid = transaction->t_tid; | ||
486 | spin_unlock(&journal->j_state_lock); | ||
487 | log_wait_commit(journal, tid); | ||
488 | return 1; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * Start a commit of the current running transaction (if any). Returns true | ||
493 | * if a transaction was started, and fills its tid in at *ptid | ||
494 | */ | ||
495 | int journal_start_commit(journal_t *journal, tid_t *ptid) | ||
496 | { | ||
497 | int ret = 0; | ||
498 | |||
499 | spin_lock(&journal->j_state_lock); | ||
500 | if (journal->j_running_transaction) { | ||
501 | tid_t tid = journal->j_running_transaction->t_tid; | ||
502 | |||
503 | ret = __log_start_commit(journal, tid); | ||
504 | if (ret && ptid) | ||
505 | *ptid = tid; | ||
506 | } else if (journal->j_committing_transaction && ptid) { | ||
507 | /* | ||
508 | * If ext3_write_super() recently started a commit, then we | ||
509 | * have to wait for completion of that transaction | ||
510 | */ | ||
511 | *ptid = journal->j_committing_transaction->t_tid; | ||
512 | ret = 1; | ||
513 | } | ||
514 | spin_unlock(&journal->j_state_lock); | ||
515 | return ret; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Wait for a specified commit to complete. | ||
520 | * The caller may not hold the journal lock. | ||
521 | */ | ||
522 | int log_wait_commit(journal_t *journal, tid_t tid) | ||
523 | { | ||
524 | int err = 0; | ||
525 | |||
526 | #ifdef CONFIG_JBD_DEBUG | ||
527 | spin_lock(&journal->j_state_lock); | ||
528 | if (!tid_geq(journal->j_commit_request, tid)) { | ||
529 | printk(KERN_EMERG | ||
530 | "%s: error: j_commit_request=%d, tid=%d\n", | ||
531 | __FUNCTION__, journal->j_commit_request, tid); | ||
532 | } | ||
533 | spin_unlock(&journal->j_state_lock); | ||
534 | #endif | ||
535 | spin_lock(&journal->j_state_lock); | ||
536 | while (tid_gt(tid, journal->j_commit_sequence)) { | ||
537 | jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", | ||
538 | tid, journal->j_commit_sequence); | ||
539 | wake_up(&journal->j_wait_commit); | ||
540 | spin_unlock(&journal->j_state_lock); | ||
541 | wait_event(journal->j_wait_done_commit, | ||
542 | !tid_gt(tid, journal->j_commit_sequence)); | ||
543 | spin_lock(&journal->j_state_lock); | ||
544 | } | ||
545 | spin_unlock(&journal->j_state_lock); | ||
546 | |||
547 | if (unlikely(is_journal_aborted(journal))) { | ||
548 | printk(KERN_EMERG "journal commit I/O error\n"); | ||
549 | err = -EIO; | ||
550 | } | ||
551 | return err; | ||
552 | } | ||
553 | |||
554 | /* | ||
555 | * Log buffer allocation routines: | ||
556 | */ | ||
557 | |||
558 | int journal_next_log_block(journal_t *journal, unsigned long *retp) | ||
559 | { | ||
560 | unsigned long blocknr; | ||
561 | |||
562 | spin_lock(&journal->j_state_lock); | ||
563 | J_ASSERT(journal->j_free > 1); | ||
564 | |||
565 | blocknr = journal->j_head; | ||
566 | journal->j_head++; | ||
567 | journal->j_free--; | ||
568 | if (journal->j_head == journal->j_last) | ||
569 | journal->j_head = journal->j_first; | ||
570 | spin_unlock(&journal->j_state_lock); | ||
571 | return journal_bmap(journal, blocknr, retp); | ||
572 | } | ||
573 | |||
574 | /* | ||
575 | * Conversion of logical to physical block numbers for the journal | ||
576 | * | ||
577 | * On external journals the journal blocks are identity-mapped, so | ||
578 | * this is a no-op. If needed, we can use j_blk_offset - everything is | ||
579 | * ready. | ||
580 | */ | ||
581 | int journal_bmap(journal_t *journal, unsigned long blocknr, | ||
582 | unsigned long *retp) | ||
583 | { | ||
584 | int err = 0; | ||
585 | unsigned long ret; | ||
586 | |||
587 | if (journal->j_inode) { | ||
588 | ret = bmap(journal->j_inode, blocknr); | ||
589 | if (ret) | ||
590 | *retp = ret; | ||
591 | else { | ||
592 | char b[BDEVNAME_SIZE]; | ||
593 | |||
594 | printk(KERN_ALERT "%s: journal block not found " | ||
595 | "at offset %lu on %s\n", | ||
596 | __FUNCTION__, | ||
597 | blocknr, | ||
598 | bdevname(journal->j_dev, b)); | ||
599 | err = -EIO; | ||
600 | __journal_abort_soft(journal, err); | ||
601 | } | ||
602 | } else { | ||
603 | *retp = blocknr; /* +journal->j_blk_offset */ | ||
604 | } | ||
605 | return err; | ||
606 | } | ||
607 | |||
608 | /* | ||
609 | * We play buffer_head aliasing tricks to write data/metadata blocks to | ||
610 | * the journal without copying their contents, but for journal | ||
611 | * descriptor blocks we do need to generate bona fide buffers. | ||
612 | * | ||
613 | * After the caller of journal_get_descriptor_buffer() has finished modifying | ||
614 | * the buffer's contents they really should run flush_dcache_page(bh->b_page). | ||
615 | * But we don't bother doing that, so there will be coherency problems with | ||
616 | * mmaps of blockdevs which hold live JBD-controlled filesystems. | ||
617 | */ | ||
618 | struct journal_head *journal_get_descriptor_buffer(journal_t *journal) | ||
619 | { | ||
620 | struct buffer_head *bh; | ||
621 | unsigned long blocknr; | ||
622 | int err; | ||
623 | |||
624 | err = journal_next_log_block(journal, &blocknr); | ||
625 | |||
626 | if (err) | ||
627 | return NULL; | ||
628 | |||
629 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
630 | lock_buffer(bh); | ||
631 | memset(bh->b_data, 0, journal->j_blocksize); | ||
632 | set_buffer_uptodate(bh); | ||
633 | unlock_buffer(bh); | ||
634 | BUFFER_TRACE(bh, "return this buffer"); | ||
635 | return journal_add_journal_head(bh); | ||
636 | } | ||
637 | |||
638 | /* | ||
639 | * Management for journal control blocks: functions to create and | ||
640 | * destroy journal_t structures, and to initialise and read existing | ||
641 | * journal blocks from disk. */ | ||
642 | |||
643 | /* First: create and setup a journal_t object in memory. We initialise | ||
644 | * very few fields yet: that has to wait until we have created the | ||
645 | * journal structures from from scratch, or loaded them from disk. */ | ||
646 | |||
647 | static journal_t * journal_init_common (void) | ||
648 | { | ||
649 | journal_t *journal; | ||
650 | int err; | ||
651 | |||
652 | journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); | ||
653 | if (!journal) | ||
654 | goto fail; | ||
655 | memset(journal, 0, sizeof(*journal)); | ||
656 | |||
657 | init_waitqueue_head(&journal->j_wait_transaction_locked); | ||
658 | init_waitqueue_head(&journal->j_wait_logspace); | ||
659 | init_waitqueue_head(&journal->j_wait_done_commit); | ||
660 | init_waitqueue_head(&journal->j_wait_checkpoint); | ||
661 | init_waitqueue_head(&journal->j_wait_commit); | ||
662 | init_waitqueue_head(&journal->j_wait_updates); | ||
663 | mutex_init(&journal->j_barrier); | ||
664 | mutex_init(&journal->j_checkpoint_mutex); | ||
665 | spin_lock_init(&journal->j_revoke_lock); | ||
666 | spin_lock_init(&journal->j_list_lock); | ||
667 | spin_lock_init(&journal->j_state_lock); | ||
668 | |||
669 | journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); | ||
670 | |||
671 | /* The journal is marked for error until we succeed with recovery! */ | ||
672 | journal->j_flags = JFS_ABORT; | ||
673 | |||
674 | /* Set up a default-sized revoke table for the new mount. */ | ||
675 | err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); | ||
676 | if (err) { | ||
677 | kfree(journal); | ||
678 | goto fail; | ||
679 | } | ||
680 | return journal; | ||
681 | fail: | ||
682 | return NULL; | ||
683 | } | ||
684 | |||
685 | /* journal_init_dev and journal_init_inode: | ||
686 | * | ||
687 | * Create a journal structure assigned some fixed set of disk blocks to | ||
688 | * the journal. We don't actually touch those disk blocks yet, but we | ||
689 | * need to set up all of the mapping information to tell the journaling | ||
690 | * system where the journal blocks are. | ||
691 | * | ||
692 | */ | ||
693 | |||
694 | /** | ||
695 | * journal_t * journal_init_dev() - creates an initialises a journal structure | ||
696 | * @bdev: Block device on which to create the journal | ||
697 | * @fs_dev: Device which hold journalled filesystem for this journal. | ||
698 | * @start: Block nr Start of journal. | ||
699 | * @len: Length of the journal in blocks. | ||
700 | * @blocksize: blocksize of journalling device | ||
701 | * @returns: a newly created journal_t * | ||
702 | * | ||
703 | * journal_init_dev creates a journal which maps a fixed contiguous | ||
704 | * range of blocks on an arbitrary block device. | ||
705 | * | ||
706 | */ | ||
707 | journal_t * journal_init_dev(struct block_device *bdev, | ||
708 | struct block_device *fs_dev, | ||
709 | int start, int len, int blocksize) | ||
710 | { | ||
711 | journal_t *journal = journal_init_common(); | ||
712 | struct buffer_head *bh; | ||
713 | int n; | ||
714 | |||
715 | if (!journal) | ||
716 | return NULL; | ||
717 | |||
718 | /* journal descriptor can store up to n blocks -bzzz */ | ||
719 | journal->j_blocksize = blocksize; | ||
720 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | ||
721 | journal->j_wbufsize = n; | ||
722 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | ||
723 | if (!journal->j_wbuf) { | ||
724 | printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", | ||
725 | __FUNCTION__); | ||
726 | kfree(journal); | ||
727 | journal = NULL; | ||
728 | } | ||
729 | journal->j_dev = bdev; | ||
730 | journal->j_fs_dev = fs_dev; | ||
731 | journal->j_blk_offset = start; | ||
732 | journal->j_maxlen = len; | ||
733 | |||
734 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); | ||
735 | J_ASSERT(bh != NULL); | ||
736 | journal->j_sb_buffer = bh; | ||
737 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | ||
738 | |||
739 | return journal; | ||
740 | } | ||
741 | |||
742 | /** | ||
743 | * journal_t * journal_init_inode () - creates a journal which maps to a inode. | ||
744 | * @inode: An inode to create the journal in | ||
745 | * | ||
746 | * journal_init_inode creates a journal which maps an on-disk inode as | ||
747 | * the journal. The inode must exist already, must support bmap() and | ||
748 | * must have all data blocks preallocated. | ||
749 | */ | ||
750 | journal_t * journal_init_inode (struct inode *inode) | ||
751 | { | ||
752 | struct buffer_head *bh; | ||
753 | journal_t *journal = journal_init_common(); | ||
754 | int err; | ||
755 | int n; | ||
756 | unsigned long blocknr; | ||
757 | |||
758 | if (!journal) | ||
759 | return NULL; | ||
760 | |||
761 | journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; | ||
762 | journal->j_inode = inode; | ||
763 | jbd_debug(1, | ||
764 | "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", | ||
765 | journal, inode->i_sb->s_id, inode->i_ino, | ||
766 | (long long) inode->i_size, | ||
767 | inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); | ||
768 | |||
769 | journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; | ||
770 | journal->j_blocksize = inode->i_sb->s_blocksize; | ||
771 | |||
772 | /* journal descriptor can store up to n blocks -bzzz */ | ||
773 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | ||
774 | journal->j_wbufsize = n; | ||
775 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | ||
776 | if (!journal->j_wbuf) { | ||
777 | printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", | ||
778 | __FUNCTION__); | ||
779 | kfree(journal); | ||
780 | return NULL; | ||
781 | } | ||
782 | |||
783 | err = journal_bmap(journal, 0, &blocknr); | ||
784 | /* If that failed, give up */ | ||
785 | if (err) { | ||
786 | printk(KERN_ERR "%s: Cannnot locate journal superblock\n", | ||
787 | __FUNCTION__); | ||
788 | kfree(journal); | ||
789 | return NULL; | ||
790 | } | ||
791 | |||
792 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
793 | J_ASSERT(bh != NULL); | ||
794 | journal->j_sb_buffer = bh; | ||
795 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | ||
796 | |||
797 | return journal; | ||
798 | } | ||
799 | |||
800 | /* | ||
801 | * If the journal init or create aborts, we need to mark the journal | ||
802 | * superblock as being NULL to prevent the journal destroy from writing | ||
803 | * back a bogus superblock. | ||
804 | */ | ||
805 | static void journal_fail_superblock (journal_t *journal) | ||
806 | { | ||
807 | struct buffer_head *bh = journal->j_sb_buffer; | ||
808 | brelse(bh); | ||
809 | journal->j_sb_buffer = NULL; | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * Given a journal_t structure, initialise the various fields for | ||
814 | * startup of a new journaling session. We use this both when creating | ||
815 | * a journal, and after recovering an old journal to reset it for | ||
816 | * subsequent use. | ||
817 | */ | ||
818 | |||
819 | static int journal_reset(journal_t *journal) | ||
820 | { | ||
821 | journal_superblock_t *sb = journal->j_superblock; | ||
822 | unsigned long first, last; | ||
823 | |||
824 | first = be32_to_cpu(sb->s_first); | ||
825 | last = be32_to_cpu(sb->s_maxlen); | ||
826 | |||
827 | journal->j_first = first; | ||
828 | journal->j_last = last; | ||
829 | |||
830 | journal->j_head = first; | ||
831 | journal->j_tail = first; | ||
832 | journal->j_free = last - first; | ||
833 | |||
834 | journal->j_tail_sequence = journal->j_transaction_sequence; | ||
835 | journal->j_commit_sequence = journal->j_transaction_sequence - 1; | ||
836 | journal->j_commit_request = journal->j_commit_sequence; | ||
837 | |||
838 | journal->j_max_transaction_buffers = journal->j_maxlen / 4; | ||
839 | |||
840 | /* Add the dynamic fields and write it to disk. */ | ||
841 | journal_update_superblock(journal, 1); | ||
842 | journal_start_thread(journal); | ||
843 | return 0; | ||
844 | } | ||
845 | |||
846 | /** | ||
847 | * int journal_create() - Initialise the new journal file | ||
848 | * @journal: Journal to create. This structure must have been initialised | ||
849 | * | ||
850 | * Given a journal_t structure which tells us which disk blocks we can | ||
851 | * use, create a new journal superblock and initialise all of the | ||
852 | * journal fields from scratch. | ||
853 | **/ | ||
854 | int journal_create(journal_t *journal) | ||
855 | { | ||
856 | unsigned long blocknr; | ||
857 | struct buffer_head *bh; | ||
858 | journal_superblock_t *sb; | ||
859 | int i, err; | ||
860 | |||
861 | if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { | ||
862 | printk (KERN_ERR "Journal length (%d blocks) too short.\n", | ||
863 | journal->j_maxlen); | ||
864 | journal_fail_superblock(journal); | ||
865 | return -EINVAL; | ||
866 | } | ||
867 | |||
868 | if (journal->j_inode == NULL) { | ||
869 | /* | ||
870 | * We don't know what block to start at! | ||
871 | */ | ||
872 | printk(KERN_EMERG | ||
873 | "%s: creation of journal on external device!\n", | ||
874 | __FUNCTION__); | ||
875 | BUG(); | ||
876 | } | ||
877 | |||
878 | /* Zero out the entire journal on disk. We cannot afford to | ||
879 | have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ | ||
880 | jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); | ||
881 | for (i = 0; i < journal->j_maxlen; i++) { | ||
882 | err = journal_bmap(journal, i, &blocknr); | ||
883 | if (err) | ||
884 | return err; | ||
885 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
886 | lock_buffer(bh); | ||
887 | memset (bh->b_data, 0, journal->j_blocksize); | ||
888 | BUFFER_TRACE(bh, "marking dirty"); | ||
889 | mark_buffer_dirty(bh); | ||
890 | BUFFER_TRACE(bh, "marking uptodate"); | ||
891 | set_buffer_uptodate(bh); | ||
892 | unlock_buffer(bh); | ||
893 | __brelse(bh); | ||
894 | } | ||
895 | |||
896 | sync_blockdev(journal->j_dev); | ||
897 | jbd_debug(1, "JBD: journal cleared.\n"); | ||
898 | |||
899 | /* OK, fill in the initial static fields in the new superblock */ | ||
900 | sb = journal->j_superblock; | ||
901 | |||
902 | sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
903 | sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); | ||
904 | |||
905 | sb->s_blocksize = cpu_to_be32(journal->j_blocksize); | ||
906 | sb->s_maxlen = cpu_to_be32(journal->j_maxlen); | ||
907 | sb->s_first = cpu_to_be32(1); | ||
908 | |||
909 | journal->j_transaction_sequence = 1; | ||
910 | |||
911 | journal->j_flags &= ~JFS_ABORT; | ||
912 | journal->j_format_version = 2; | ||
913 | |||
914 | return journal_reset(journal); | ||
915 | } | ||
916 | |||
917 | /** | ||
918 | * void journal_update_superblock() - Update journal sb on disk. | ||
919 | * @journal: The journal to update. | ||
920 | * @wait: Set to '0' if you don't want to wait for IO completion. | ||
921 | * | ||
922 | * Update a journal's dynamic superblock fields and write it to disk, | ||
923 | * optionally waiting for the IO to complete. | ||
924 | */ | ||
925 | void journal_update_superblock(journal_t *journal, int wait) | ||
926 | { | ||
927 | journal_superblock_t *sb = journal->j_superblock; | ||
928 | struct buffer_head *bh = journal->j_sb_buffer; | ||
929 | |||
930 | /* | ||
931 | * As a special case, if the on-disk copy is already marked as needing | ||
932 | * no recovery (s_start == 0) and there are no outstanding transactions | ||
933 | * in the filesystem, then we can safely defer the superblock update | ||
934 | * until the next commit by setting JFS_FLUSHED. This avoids | ||
935 | * attempting a write to a potential-readonly device. | ||
936 | */ | ||
937 | if (sb->s_start == 0 && journal->j_tail_sequence == | ||
938 | journal->j_transaction_sequence) { | ||
939 | jbd_debug(1,"JBD: Skipping superblock update on recovered sb " | ||
940 | "(start %ld, seq %d, errno %d)\n", | ||
941 | journal->j_tail, journal->j_tail_sequence, | ||
942 | journal->j_errno); | ||
943 | goto out; | ||
944 | } | ||
945 | |||
946 | spin_lock(&journal->j_state_lock); | ||
947 | jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", | ||
948 | journal->j_tail, journal->j_tail_sequence, journal->j_errno); | ||
949 | |||
950 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); | ||
951 | sb->s_start = cpu_to_be32(journal->j_tail); | ||
952 | sb->s_errno = cpu_to_be32(journal->j_errno); | ||
953 | spin_unlock(&journal->j_state_lock); | ||
954 | |||
955 | BUFFER_TRACE(bh, "marking dirty"); | ||
956 | mark_buffer_dirty(bh); | ||
957 | if (wait) | ||
958 | sync_dirty_buffer(bh); | ||
959 | else | ||
960 | ll_rw_block(SWRITE, 1, &bh); | ||
961 | |||
962 | out: | ||
963 | /* If we have just flushed the log (by marking s_start==0), then | ||
964 | * any future commit will have to be careful to update the | ||
965 | * superblock again to re-record the true start of the log. */ | ||
966 | |||
967 | spin_lock(&journal->j_state_lock); | ||
968 | if (sb->s_start) | ||
969 | journal->j_flags &= ~JFS_FLUSHED; | ||
970 | else | ||
971 | journal->j_flags |= JFS_FLUSHED; | ||
972 | spin_unlock(&journal->j_state_lock); | ||
973 | } | ||
974 | |||
975 | /* | ||
976 | * Read the superblock for a given journal, performing initial | ||
977 | * validation of the format. | ||
978 | */ | ||
979 | |||
980 | static int journal_get_superblock(journal_t *journal) | ||
981 | { | ||
982 | struct buffer_head *bh; | ||
983 | journal_superblock_t *sb; | ||
984 | int err = -EIO; | ||
985 | |||
986 | bh = journal->j_sb_buffer; | ||
987 | |||
988 | J_ASSERT(bh != NULL); | ||
989 | if (!buffer_uptodate(bh)) { | ||
990 | ll_rw_block(READ, 1, &bh); | ||
991 | wait_on_buffer(bh); | ||
992 | if (!buffer_uptodate(bh)) { | ||
993 | printk (KERN_ERR | ||
994 | "JBD: IO error reading journal superblock\n"); | ||
995 | goto out; | ||
996 | } | ||
997 | } | ||
998 | |||
999 | sb = journal->j_superblock; | ||
1000 | |||
1001 | err = -EINVAL; | ||
1002 | |||
1003 | if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || | ||
1004 | sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { | ||
1005 | printk(KERN_WARNING "JBD: no valid journal superblock found\n"); | ||
1006 | goto out; | ||
1007 | } | ||
1008 | |||
1009 | switch(be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1010 | case JFS_SUPERBLOCK_V1: | ||
1011 | journal->j_format_version = 1; | ||
1012 | break; | ||
1013 | case JFS_SUPERBLOCK_V2: | ||
1014 | journal->j_format_version = 2; | ||
1015 | break; | ||
1016 | default: | ||
1017 | printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); | ||
1018 | goto out; | ||
1019 | } | ||
1020 | |||
1021 | if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) | ||
1022 | journal->j_maxlen = be32_to_cpu(sb->s_maxlen); | ||
1023 | else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { | ||
1024 | printk (KERN_WARNING "JBD: journal file too short\n"); | ||
1025 | goto out; | ||
1026 | } | ||
1027 | |||
1028 | return 0; | ||
1029 | |||
1030 | out: | ||
1031 | journal_fail_superblock(journal); | ||
1032 | return err; | ||
1033 | } | ||
1034 | |||
1035 | /* | ||
1036 | * Load the on-disk journal superblock and read the key fields into the | ||
1037 | * journal_t. | ||
1038 | */ | ||
1039 | |||
1040 | static int load_superblock(journal_t *journal) | ||
1041 | { | ||
1042 | int err; | ||
1043 | journal_superblock_t *sb; | ||
1044 | |||
1045 | err = journal_get_superblock(journal); | ||
1046 | if (err) | ||
1047 | return err; | ||
1048 | |||
1049 | sb = journal->j_superblock; | ||
1050 | |||
1051 | journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); | ||
1052 | journal->j_tail = be32_to_cpu(sb->s_start); | ||
1053 | journal->j_first = be32_to_cpu(sb->s_first); | ||
1054 | journal->j_last = be32_to_cpu(sb->s_maxlen); | ||
1055 | journal->j_errno = be32_to_cpu(sb->s_errno); | ||
1056 | |||
1057 | return 0; | ||
1058 | } | ||
1059 | |||
1060 | |||
1061 | /** | ||
1062 | * int journal_load() - Read journal from disk. | ||
1063 | * @journal: Journal to act on. | ||
1064 | * | ||
1065 | * Given a journal_t structure which tells us which disk blocks contain | ||
1066 | * a journal, read the journal from disk to initialise the in-memory | ||
1067 | * structures. | ||
1068 | */ | ||
1069 | int journal_load(journal_t *journal) | ||
1070 | { | ||
1071 | int err; | ||
1072 | journal_superblock_t *sb; | ||
1073 | |||
1074 | err = load_superblock(journal); | ||
1075 | if (err) | ||
1076 | return err; | ||
1077 | |||
1078 | sb = journal->j_superblock; | ||
1079 | /* If this is a V2 superblock, then we have to check the | ||
1080 | * features flags on it. */ | ||
1081 | |||
1082 | if (journal->j_format_version >= 2) { | ||
1083 | if ((sb->s_feature_ro_compat & | ||
1084 | ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || | ||
1085 | (sb->s_feature_incompat & | ||
1086 | ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { | ||
1087 | printk (KERN_WARNING | ||
1088 | "JBD: Unrecognised features on journal\n"); | ||
1089 | return -EINVAL; | ||
1090 | } | ||
1091 | } | ||
1092 | |||
1093 | /* | ||
1094 | * Create a slab for this blocksize | ||
1095 | */ | ||
1096 | err = journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize)); | ||
1097 | if (err) | ||
1098 | return err; | ||
1099 | |||
1100 | /* Let the recovery code check whether it needs to recover any | ||
1101 | * data from the journal. */ | ||
1102 | if (journal_recover(journal)) | ||
1103 | goto recovery_error; | ||
1104 | |||
1105 | /* OK, we've finished with the dynamic journal bits: | ||
1106 | * reinitialise the dynamic contents of the superblock in memory | ||
1107 | * and reset them on disk. */ | ||
1108 | if (journal_reset(journal)) | ||
1109 | goto recovery_error; | ||
1110 | |||
1111 | journal->j_flags &= ~JFS_ABORT; | ||
1112 | journal->j_flags |= JFS_LOADED; | ||
1113 | return 0; | ||
1114 | |||
1115 | recovery_error: | ||
1116 | printk (KERN_WARNING "JBD: recovery failed\n"); | ||
1117 | return -EIO; | ||
1118 | } | ||
1119 | |||
1120 | /** | ||
1121 | * void journal_destroy() - Release a journal_t structure. | ||
1122 | * @journal: Journal to act on. | ||
1123 | * | ||
1124 | * Release a journal_t structure once it is no longer in use by the | ||
1125 | * journaled object. | ||
1126 | */ | ||
1127 | void journal_destroy(journal_t *journal) | ||
1128 | { | ||
1129 | /* Wait for the commit thread to wake up and die. */ | ||
1130 | journal_kill_thread(journal); | ||
1131 | |||
1132 | /* Force a final log commit */ | ||
1133 | if (journal->j_running_transaction) | ||
1134 | journal_commit_transaction(journal); | ||
1135 | |||
1136 | /* Force any old transactions to disk */ | ||
1137 | |||
1138 | /* Totally anal locking here... */ | ||
1139 | spin_lock(&journal->j_list_lock); | ||
1140 | while (journal->j_checkpoint_transactions != NULL) { | ||
1141 | spin_unlock(&journal->j_list_lock); | ||
1142 | log_do_checkpoint(journal); | ||
1143 | spin_lock(&journal->j_list_lock); | ||
1144 | } | ||
1145 | |||
1146 | J_ASSERT(journal->j_running_transaction == NULL); | ||
1147 | J_ASSERT(journal->j_committing_transaction == NULL); | ||
1148 | J_ASSERT(journal->j_checkpoint_transactions == NULL); | ||
1149 | spin_unlock(&journal->j_list_lock); | ||
1150 | |||
1151 | /* We can now mark the journal as empty. */ | ||
1152 | journal->j_tail = 0; | ||
1153 | journal->j_tail_sequence = ++journal->j_transaction_sequence; | ||
1154 | if (journal->j_sb_buffer) { | ||
1155 | journal_update_superblock(journal, 1); | ||
1156 | brelse(journal->j_sb_buffer); | ||
1157 | } | ||
1158 | |||
1159 | if (journal->j_inode) | ||
1160 | iput(journal->j_inode); | ||
1161 | if (journal->j_revoke) | ||
1162 | journal_destroy_revoke(journal); | ||
1163 | kfree(journal->j_wbuf); | ||
1164 | kfree(journal); | ||
1165 | } | ||
1166 | |||
1167 | |||
1168 | /** | ||
1169 | *int journal_check_used_features () - Check if features specified are used. | ||
1170 | * @journal: Journal to check. | ||
1171 | * @compat: bitmask of compatible features | ||
1172 | * @ro: bitmask of features that force read-only mount | ||
1173 | * @incompat: bitmask of incompatible features | ||
1174 | * | ||
1175 | * Check whether the journal uses all of a given set of | ||
1176 | * features. Return true (non-zero) if it does. | ||
1177 | **/ | ||
1178 | |||
1179 | int journal_check_used_features (journal_t *journal, unsigned long compat, | ||
1180 | unsigned long ro, unsigned long incompat) | ||
1181 | { | ||
1182 | journal_superblock_t *sb; | ||
1183 | |||
1184 | if (!compat && !ro && !incompat) | ||
1185 | return 1; | ||
1186 | if (journal->j_format_version == 1) | ||
1187 | return 0; | ||
1188 | |||
1189 | sb = journal->j_superblock; | ||
1190 | |||
1191 | if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && | ||
1192 | ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && | ||
1193 | ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) | ||
1194 | return 1; | ||
1195 | |||
1196 | return 0; | ||
1197 | } | ||
1198 | |||
1199 | /** | ||
1200 | * int journal_check_available_features() - Check feature set in journalling layer | ||
1201 | * @journal: Journal to check. | ||
1202 | * @compat: bitmask of compatible features | ||
1203 | * @ro: bitmask of features that force read-only mount | ||
1204 | * @incompat: bitmask of incompatible features | ||
1205 | * | ||
1206 | * Check whether the journaling code supports the use of | ||
1207 | * all of a given set of features on this journal. Return true | ||
1208 | * (non-zero) if it can. */ | ||
1209 | |||
1210 | int journal_check_available_features (journal_t *journal, unsigned long compat, | ||
1211 | unsigned long ro, unsigned long incompat) | ||
1212 | { | ||
1213 | journal_superblock_t *sb; | ||
1214 | |||
1215 | if (!compat && !ro && !incompat) | ||
1216 | return 1; | ||
1217 | |||
1218 | sb = journal->j_superblock; | ||
1219 | |||
1220 | /* We can support any known requested features iff the | ||
1221 | * superblock is in version 2. Otherwise we fail to support any | ||
1222 | * extended sb features. */ | ||
1223 | |||
1224 | if (journal->j_format_version != 2) | ||
1225 | return 0; | ||
1226 | |||
1227 | if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && | ||
1228 | (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && | ||
1229 | (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) | ||
1230 | return 1; | ||
1231 | |||
1232 | return 0; | ||
1233 | } | ||
1234 | |||
1235 | /** | ||
1236 | * int journal_set_features () - Mark a given journal feature in the superblock | ||
1237 | * @journal: Journal to act on. | ||
1238 | * @compat: bitmask of compatible features | ||
1239 | * @ro: bitmask of features that force read-only mount | ||
1240 | * @incompat: bitmask of incompatible features | ||
1241 | * | ||
1242 | * Mark a given journal feature as present on the | ||
1243 | * superblock. Returns true if the requested features could be set. | ||
1244 | * | ||
1245 | */ | ||
1246 | |||
1247 | int journal_set_features (journal_t *journal, unsigned long compat, | ||
1248 | unsigned long ro, unsigned long incompat) | ||
1249 | { | ||
1250 | journal_superblock_t *sb; | ||
1251 | |||
1252 | if (journal_check_used_features(journal, compat, ro, incompat)) | ||
1253 | return 1; | ||
1254 | |||
1255 | if (!journal_check_available_features(journal, compat, ro, incompat)) | ||
1256 | return 0; | ||
1257 | |||
1258 | jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", | ||
1259 | compat, ro, incompat); | ||
1260 | |||
1261 | sb = journal->j_superblock; | ||
1262 | |||
1263 | sb->s_feature_compat |= cpu_to_be32(compat); | ||
1264 | sb->s_feature_ro_compat |= cpu_to_be32(ro); | ||
1265 | sb->s_feature_incompat |= cpu_to_be32(incompat); | ||
1266 | |||
1267 | return 1; | ||
1268 | } | ||
1269 | |||
1270 | |||
1271 | /** | ||
1272 | * int journal_update_format () - Update on-disk journal structure. | ||
1273 | * @journal: Journal to act on. | ||
1274 | * | ||
1275 | * Given an initialised but unloaded journal struct, poke about in the | ||
1276 | * on-disk structure to update it to the most recent supported version. | ||
1277 | */ | ||
1278 | int journal_update_format (journal_t *journal) | ||
1279 | { | ||
1280 | journal_superblock_t *sb; | ||
1281 | int err; | ||
1282 | |||
1283 | err = journal_get_superblock(journal); | ||
1284 | if (err) | ||
1285 | return err; | ||
1286 | |||
1287 | sb = journal->j_superblock; | ||
1288 | |||
1289 | switch (be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1290 | case JFS_SUPERBLOCK_V2: | ||
1291 | return 0; | ||
1292 | case JFS_SUPERBLOCK_V1: | ||
1293 | return journal_convert_superblock_v1(journal, sb); | ||
1294 | default: | ||
1295 | break; | ||
1296 | } | ||
1297 | return -EINVAL; | ||
1298 | } | ||
1299 | |||
1300 | static int journal_convert_superblock_v1(journal_t *journal, | ||
1301 | journal_superblock_t *sb) | ||
1302 | { | ||
1303 | int offset, blocksize; | ||
1304 | struct buffer_head *bh; | ||
1305 | |||
1306 | printk(KERN_WARNING | ||
1307 | "JBD: Converting superblock from version 1 to 2.\n"); | ||
1308 | |||
1309 | /* Pre-initialise new fields to zero */ | ||
1310 | offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); | ||
1311 | blocksize = be32_to_cpu(sb->s_blocksize); | ||
1312 | memset(&sb->s_feature_compat, 0, blocksize-offset); | ||
1313 | |||
1314 | sb->s_nr_users = cpu_to_be32(1); | ||
1315 | sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); | ||
1316 | journal->j_format_version = 2; | ||
1317 | |||
1318 | bh = journal->j_sb_buffer; | ||
1319 | BUFFER_TRACE(bh, "marking dirty"); | ||
1320 | mark_buffer_dirty(bh); | ||
1321 | sync_dirty_buffer(bh); | ||
1322 | return 0; | ||
1323 | } | ||
1324 | |||
1325 | |||
1326 | /** | ||
1327 | * int journal_flush () - Flush journal | ||
1328 | * @journal: Journal to act on. | ||
1329 | * | ||
1330 | * Flush all data for a given journal to disk and empty the journal. | ||
1331 | * Filesystems can use this when remounting readonly to ensure that | ||
1332 | * recovery does not need to happen on remount. | ||
1333 | */ | ||
1334 | |||
1335 | int journal_flush(journal_t *journal) | ||
1336 | { | ||
1337 | int err = 0; | ||
1338 | transaction_t *transaction = NULL; | ||
1339 | unsigned long old_tail; | ||
1340 | |||
1341 | spin_lock(&journal->j_state_lock); | ||
1342 | |||
1343 | /* Force everything buffered to the log... */ | ||
1344 | if (journal->j_running_transaction) { | ||
1345 | transaction = journal->j_running_transaction; | ||
1346 | __log_start_commit(journal, transaction->t_tid); | ||
1347 | } else if (journal->j_committing_transaction) | ||
1348 | transaction = journal->j_committing_transaction; | ||
1349 | |||
1350 | /* Wait for the log commit to complete... */ | ||
1351 | if (transaction) { | ||
1352 | tid_t tid = transaction->t_tid; | ||
1353 | |||
1354 | spin_unlock(&journal->j_state_lock); | ||
1355 | log_wait_commit(journal, tid); | ||
1356 | } else { | ||
1357 | spin_unlock(&journal->j_state_lock); | ||
1358 | } | ||
1359 | |||
1360 | /* ...and flush everything in the log out to disk. */ | ||
1361 | spin_lock(&journal->j_list_lock); | ||
1362 | while (!err && journal->j_checkpoint_transactions != NULL) { | ||
1363 | spin_unlock(&journal->j_list_lock); | ||
1364 | err = log_do_checkpoint(journal); | ||
1365 | spin_lock(&journal->j_list_lock); | ||
1366 | } | ||
1367 | spin_unlock(&journal->j_list_lock); | ||
1368 | cleanup_journal_tail(journal); | ||
1369 | |||
1370 | /* Finally, mark the journal as really needing no recovery. | ||
1371 | * This sets s_start==0 in the underlying superblock, which is | ||
1372 | * the magic code for a fully-recovered superblock. Any future | ||
1373 | * commits of data to the journal will restore the current | ||
1374 | * s_start value. */ | ||
1375 | spin_lock(&journal->j_state_lock); | ||
1376 | old_tail = journal->j_tail; | ||
1377 | journal->j_tail = 0; | ||
1378 | spin_unlock(&journal->j_state_lock); | ||
1379 | journal_update_superblock(journal, 1); | ||
1380 | spin_lock(&journal->j_state_lock); | ||
1381 | journal->j_tail = old_tail; | ||
1382 | |||
1383 | J_ASSERT(!journal->j_running_transaction); | ||
1384 | J_ASSERT(!journal->j_committing_transaction); | ||
1385 | J_ASSERT(!journal->j_checkpoint_transactions); | ||
1386 | J_ASSERT(journal->j_head == journal->j_tail); | ||
1387 | J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); | ||
1388 | spin_unlock(&journal->j_state_lock); | ||
1389 | return err; | ||
1390 | } | ||
1391 | |||
1392 | /** | ||
1393 | * int journal_wipe() - Wipe journal contents | ||
1394 | * @journal: Journal to act on. | ||
1395 | * @write: flag (see below) | ||
1396 | * | ||
1397 | * Wipe out all of the contents of a journal, safely. This will produce | ||
1398 | * a warning if the journal contains any valid recovery information. | ||
1399 | * Must be called between journal_init_*() and journal_load(). | ||
1400 | * | ||
1401 | * If 'write' is non-zero, then we wipe out the journal on disk; otherwise | ||
1402 | * we merely suppress recovery. | ||
1403 | */ | ||
1404 | |||
1405 | int journal_wipe(journal_t *journal, int write) | ||
1406 | { | ||
1407 | journal_superblock_t *sb; | ||
1408 | int err = 0; | ||
1409 | |||
1410 | J_ASSERT (!(journal->j_flags & JFS_LOADED)); | ||
1411 | |||
1412 | err = load_superblock(journal); | ||
1413 | if (err) | ||
1414 | return err; | ||
1415 | |||
1416 | sb = journal->j_superblock; | ||
1417 | |||
1418 | if (!journal->j_tail) | ||
1419 | goto no_recovery; | ||
1420 | |||
1421 | printk (KERN_WARNING "JBD: %s recovery information on journal\n", | ||
1422 | write ? "Clearing" : "Ignoring"); | ||
1423 | |||
1424 | err = journal_skip_recovery(journal); | ||
1425 | if (write) | ||
1426 | journal_update_superblock(journal, 1); | ||
1427 | |||
1428 | no_recovery: | ||
1429 | return err; | ||
1430 | } | ||
1431 | |||
1432 | /* | ||
1433 | * journal_dev_name: format a character string to describe on what | ||
1434 | * device this journal is present. | ||
1435 | */ | ||
1436 | |||
1437 | static const char *journal_dev_name(journal_t *journal, char *buffer) | ||
1438 | { | ||
1439 | struct block_device *bdev; | ||
1440 | |||
1441 | if (journal->j_inode) | ||
1442 | bdev = journal->j_inode->i_sb->s_bdev; | ||
1443 | else | ||
1444 | bdev = journal->j_dev; | ||
1445 | |||
1446 | return bdevname(bdev, buffer); | ||
1447 | } | ||
1448 | |||
1449 | /* | ||
1450 | * Journal abort has very specific semantics, which we describe | ||
1451 | * for journal abort. | ||
1452 | * | ||
1453 | * Two internal functions, which provide abort to the jbd layer | ||
1454 | * itself, are here. | ||
1455 | */ | ||
1456 | |||
1457 | /* | ||
1458 | * Quick version for internal journal use (doesn't lock the journal). | ||
1459 | * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, | ||
1460 | * and don't attempt to make any other journal updates. | ||
1461 | */ | ||
1462 | void __journal_abort_hard(journal_t *journal) | ||
1463 | { | ||
1464 | transaction_t *transaction; | ||
1465 | char b[BDEVNAME_SIZE]; | ||
1466 | |||
1467 | if (journal->j_flags & JFS_ABORT) | ||
1468 | return; | ||
1469 | |||
1470 | printk(KERN_ERR "Aborting journal on device %s.\n", | ||
1471 | journal_dev_name(journal, b)); | ||
1472 | |||
1473 | spin_lock(&journal->j_state_lock); | ||
1474 | journal->j_flags |= JFS_ABORT; | ||
1475 | transaction = journal->j_running_transaction; | ||
1476 | if (transaction) | ||
1477 | __log_start_commit(journal, transaction->t_tid); | ||
1478 | spin_unlock(&journal->j_state_lock); | ||
1479 | } | ||
1480 | |||
1481 | /* Soft abort: record the abort error status in the journal superblock, | ||
1482 | * but don't do any other IO. */ | ||
1483 | static void __journal_abort_soft (journal_t *journal, int errno) | ||
1484 | { | ||
1485 | if (journal->j_flags & JFS_ABORT) | ||
1486 | return; | ||
1487 | |||
1488 | if (!journal->j_errno) | ||
1489 | journal->j_errno = errno; | ||
1490 | |||
1491 | __journal_abort_hard(journal); | ||
1492 | |||
1493 | if (errno) | ||
1494 | journal_update_superblock(journal, 1); | ||
1495 | } | ||
1496 | |||
1497 | /** | ||
1498 | * void journal_abort () - Shutdown the journal immediately. | ||
1499 | * @journal: the journal to shutdown. | ||
1500 | * @errno: an error number to record in the journal indicating | ||
1501 | * the reason for the shutdown. | ||
1502 | * | ||
1503 | * Perform a complete, immediate shutdown of the ENTIRE | ||
1504 | * journal (not of a single transaction). This operation cannot be | ||
1505 | * undone without closing and reopening the journal. | ||
1506 | * | ||
1507 | * The journal_abort function is intended to support higher level error | ||
1508 | * recovery mechanisms such as the ext2/ext3 remount-readonly error | ||
1509 | * mode. | ||
1510 | * | ||
1511 | * Journal abort has very specific semantics. Any existing dirty, | ||
1512 | * unjournaled buffers in the main filesystem will still be written to | ||
1513 | * disk by bdflush, but the journaling mechanism will be suspended | ||
1514 | * immediately and no further transaction commits will be honoured. | ||
1515 | * | ||
1516 | * Any dirty, journaled buffers will be written back to disk without | ||
1517 | * hitting the journal. Atomicity cannot be guaranteed on an aborted | ||
1518 | * filesystem, but we _do_ attempt to leave as much data as possible | ||
1519 | * behind for fsck to use for cleanup. | ||
1520 | * | ||
1521 | * Any attempt to get a new transaction handle on a journal which is in | ||
1522 | * ABORT state will just result in an -EROFS error return. A | ||
1523 | * journal_stop on an existing handle will return -EIO if we have | ||
1524 | * entered abort state during the update. | ||
1525 | * | ||
1526 | * Recursive transactions are not disturbed by journal abort until the | ||
1527 | * final journal_stop, which will receive the -EIO error. | ||
1528 | * | ||
1529 | * Finally, the journal_abort call allows the caller to supply an errno | ||
1530 | * which will be recorded (if possible) in the journal superblock. This | ||
1531 | * allows a client to record failure conditions in the middle of a | ||
1532 | * transaction without having to complete the transaction to record the | ||
1533 | * failure to disk. ext3_error, for example, now uses this | ||
1534 | * functionality. | ||
1535 | * | ||
1536 | * Errors which originate from within the journaling layer will NOT | ||
1537 | * supply an errno; a null errno implies that absolutely no further | ||
1538 | * writes are done to the journal (unless there are any already in | ||
1539 | * progress). | ||
1540 | * | ||
1541 | */ | ||
1542 | |||
1543 | void journal_abort(journal_t *journal, int errno) | ||
1544 | { | ||
1545 | __journal_abort_soft(journal, errno); | ||
1546 | } | ||
1547 | |||
1548 | /** | ||
1549 | * int journal_errno () - returns the journal's error state. | ||
1550 | * @journal: journal to examine. | ||
1551 | * | ||
1552 | * This is the errno numbet set with journal_abort(), the last | ||
1553 | * time the journal was mounted - if the journal was stopped | ||
1554 | * without calling abort this will be 0. | ||
1555 | * | ||
1556 | * If the journal has been aborted on this mount time -EROFS will | ||
1557 | * be returned. | ||
1558 | */ | ||
1559 | int journal_errno(journal_t *journal) | ||
1560 | { | ||
1561 | int err; | ||
1562 | |||
1563 | spin_lock(&journal->j_state_lock); | ||
1564 | if (journal->j_flags & JFS_ABORT) | ||
1565 | err = -EROFS; | ||
1566 | else | ||
1567 | err = journal->j_errno; | ||
1568 | spin_unlock(&journal->j_state_lock); | ||
1569 | return err; | ||
1570 | } | ||
1571 | |||
1572 | /** | ||
1573 | * int journal_clear_err () - clears the journal's error state | ||
1574 | * @journal: journal to act on. | ||
1575 | * | ||
1576 | * An error must be cleared or Acked to take a FS out of readonly | ||
1577 | * mode. | ||
1578 | */ | ||
1579 | int journal_clear_err(journal_t *journal) | ||
1580 | { | ||
1581 | int err = 0; | ||
1582 | |||
1583 | spin_lock(&journal->j_state_lock); | ||
1584 | if (journal->j_flags & JFS_ABORT) | ||
1585 | err = -EROFS; | ||
1586 | else | ||
1587 | journal->j_errno = 0; | ||
1588 | spin_unlock(&journal->j_state_lock); | ||
1589 | return err; | ||
1590 | } | ||
1591 | |||
1592 | /** | ||
1593 | * void journal_ack_err() - Ack journal err. | ||
1594 | * @journal: journal to act on. | ||
1595 | * | ||
1596 | * An error must be cleared or Acked to take a FS out of readonly | ||
1597 | * mode. | ||
1598 | */ | ||
1599 | void journal_ack_err(journal_t *journal) | ||
1600 | { | ||
1601 | spin_lock(&journal->j_state_lock); | ||
1602 | if (journal->j_errno) | ||
1603 | journal->j_flags |= JFS_ACK_ERR; | ||
1604 | spin_unlock(&journal->j_state_lock); | ||
1605 | } | ||
1606 | |||
1607 | int journal_blocks_per_page(struct inode *inode) | ||
1608 | { | ||
1609 | return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | ||
1610 | } | ||
1611 | |||
1612 | /* | ||
1613 | * Simple support for retrying memory allocations. Introduced to help to | ||
1614 | * debug different VM deadlock avoidance strategies. | ||
1615 | */ | ||
1616 | void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) | ||
1617 | { | ||
1618 | return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); | ||
1619 | } | ||
1620 | |||
/*
 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
 * and allocate frozen and commit buffers from these slabs.
 *
 * The reason for doing this is to avoid SLAB_DEBUG, since it could
 * cause a bh to cross a page boundary.
 */

/* Number of slots in the slab table; slot = buffer size >> 11. */
#define JBD_MAX_SLABS 5
/*
 * Map a buffer size in bytes to its slot: 1k -> 0, 2k -> 1, 4k -> 2,
 * 8k -> 4.  The argument is parenthesised so that any expression can be
 * passed safely.
 */
#define JBD_SLAB_INDEX(size) ((size) >> 11)
1631 | |||
1632 | static kmem_cache_t *jbd_slab[JBD_MAX_SLABS]; | ||
1633 | static const char *jbd_slab_names[JBD_MAX_SLABS] = { | ||
1634 | "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k" | ||
1635 | }; | ||
1636 | |||
1637 | static void journal_destroy_jbd_slabs(void) | ||
1638 | { | ||
1639 | int i; | ||
1640 | |||
1641 | for (i = 0; i < JBD_MAX_SLABS; i++) { | ||
1642 | if (jbd_slab[i]) | ||
1643 | kmem_cache_destroy(jbd_slab[i]); | ||
1644 | jbd_slab[i] = NULL; | ||
1645 | } | ||
1646 | } | ||
1647 | |||
1648 | static int journal_create_jbd_slab(size_t slab_size) | ||
1649 | { | ||
1650 | int i = JBD_SLAB_INDEX(slab_size); | ||
1651 | |||
1652 | BUG_ON(i >= JBD_MAX_SLABS); | ||
1653 | |||
1654 | /* | ||
1655 | * Check if we already have a slab created for this size | ||
1656 | */ | ||
1657 | if (jbd_slab[i]) | ||
1658 | return 0; | ||
1659 | |||
1660 | /* | ||
1661 | * Create a slab and force alignment to be same as slabsize - | ||
1662 | * this will make sure that allocations won't cross the page | ||
1663 | * boundary. | ||
1664 | */ | ||
1665 | jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], | ||
1666 | slab_size, slab_size, 0, NULL, NULL); | ||
1667 | if (!jbd_slab[i]) { | ||
1668 | printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); | ||
1669 | return -ENOMEM; | ||
1670 | } | ||
1671 | return 0; | ||
1672 | } | ||
1673 | |||
1674 | void * jbd_slab_alloc(size_t size, gfp_t flags) | ||
1675 | { | ||
1676 | int idx; | ||
1677 | |||
1678 | idx = JBD_SLAB_INDEX(size); | ||
1679 | BUG_ON(jbd_slab[idx] == NULL); | ||
1680 | return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL); | ||
1681 | } | ||
1682 | |||
1683 | void jbd_slab_free(void *ptr, size_t size) | ||
1684 | { | ||
1685 | int idx; | ||
1686 | |||
1687 | idx = JBD_SLAB_INDEX(size); | ||
1688 | BUG_ON(jbd_slab[idx] == NULL); | ||
1689 | kmem_cache_free(jbd_slab[idx], ptr); | ||
1690 | } | ||
1691 | |||
1692 | /* | ||
1693 | * Journal_head storage management | ||
1694 | */ | ||
1695 | static kmem_cache_t *journal_head_cache; | ||
1696 | #ifdef CONFIG_JBD_DEBUG | ||
1697 | static atomic_t nr_journal_heads = ATOMIC_INIT(0); | ||
1698 | #endif | ||
1699 | |||
1700 | static int journal_init_journal_head_cache(void) | ||
1701 | { | ||
1702 | int retval; | ||
1703 | |||
1704 | J_ASSERT(journal_head_cache == 0); | ||
1705 | journal_head_cache = kmem_cache_create("journal_head", | ||
1706 | sizeof(struct journal_head), | ||
1707 | 0, /* offset */ | ||
1708 | 0, /* flags */ | ||
1709 | NULL, /* ctor */ | ||
1710 | NULL); /* dtor */ | ||
1711 | retval = 0; | ||
1712 | if (journal_head_cache == 0) { | ||
1713 | retval = -ENOMEM; | ||
1714 | printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); | ||
1715 | } | ||
1716 | return retval; | ||
1717 | } | ||
1718 | |||
1719 | static void journal_destroy_journal_head_cache(void) | ||
1720 | { | ||
1721 | J_ASSERT(journal_head_cache != NULL); | ||
1722 | kmem_cache_destroy(journal_head_cache); | ||
1723 | journal_head_cache = NULL; | ||
1724 | } | ||
1725 | |||
1726 | /* | ||
1727 | * journal_head splicing and dicing | ||
1728 | */ | ||
1729 | static struct journal_head *journal_alloc_journal_head(void) | ||
1730 | { | ||
1731 | struct journal_head *ret; | ||
1732 | static unsigned long last_warning; | ||
1733 | |||
1734 | #ifdef CONFIG_JBD_DEBUG | ||
1735 | atomic_inc(&nr_journal_heads); | ||
1736 | #endif | ||
1737 | ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); | ||
1738 | if (ret == 0) { | ||
1739 | jbd_debug(1, "out of memory for journal_head\n"); | ||
1740 | if (time_after(jiffies, last_warning + 5*HZ)) { | ||
1741 | printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", | ||
1742 | __FUNCTION__); | ||
1743 | last_warning = jiffies; | ||
1744 | } | ||
1745 | while (ret == 0) { | ||
1746 | yield(); | ||
1747 | ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); | ||
1748 | } | ||
1749 | } | ||
1750 | return ret; | ||
1751 | } | ||
1752 | |||
1753 | static void journal_free_journal_head(struct journal_head *jh) | ||
1754 | { | ||
1755 | #ifdef CONFIG_JBD_DEBUG | ||
1756 | atomic_dec(&nr_journal_heads); | ||
1757 | memset(jh, JBD_POISON_FREE, sizeof(*jh)); | ||
1758 | #endif | ||
1759 | kmem_cache_free(journal_head_cache, jh); | ||
1760 | } | ||
1761 | |||
1762 | /* | ||
1763 | * A journal_head is attached to a buffer_head whenever JBD has an | ||
1764 | * interest in the buffer. | ||
1765 | * | ||
1766 | * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit | ||
1767 | * is set. This bit is tested in core kernel code where we need to take | ||
1768 | * JBD-specific actions. Testing the zeroness of ->b_private is not reliable | ||
1769 | * there. | ||
1770 | * | ||
1771 | * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. | ||
1772 | * | ||
1773 | * When a buffer has its BH_JBD bit set it is immune from being released by | ||
1774 | * core kernel code, mainly via ->b_count. | ||
1775 | * | ||
1776 | * A journal_head may be detached from its buffer_head when the journal_head's | ||
1777 | * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. | ||
1778 | * Various places in JBD call journal_remove_journal_head() to indicate that the | ||
1779 | * journal_head can be dropped if needed. | ||
1780 | * | ||
1781 | * Various places in the kernel want to attach a journal_head to a buffer_head | ||
1782 | * _before_ attaching the journal_head to a transaction. To protect the | ||
1783 | * journal_head in this situation, journal_add_journal_head elevates the | ||
1784 | * journal_head's b_jcount refcount by one. The caller must call | ||
1785 | * journal_put_journal_head() to undo this. | ||
1786 | * | ||
1787 | * So the typical usage would be: | ||
1788 | * | ||
1789 | * (Attach a journal_head if needed. Increments b_jcount) | ||
1790 | * struct journal_head *jh = journal_add_journal_head(bh); | ||
1791 | * ... | ||
1792 | * jh->b_transaction = xxx; | ||
1793 | * journal_put_journal_head(jh); | ||
1794 | * | ||
1795 | * Now, the journal_head's b_jcount is zero, but it is safe from being released | ||
1796 | * because it has a non-zero b_transaction. | ||
1797 | */ | ||
1798 | |||
1799 | /* | ||
1800 | * Give a buffer_head a journal_head. | ||
1801 | * | ||
1802 | * Doesn't need the journal lock. | ||
1803 | * May sleep. | ||
1804 | */ | ||
1805 | struct journal_head *journal_add_journal_head(struct buffer_head *bh) | ||
1806 | { | ||
1807 | struct journal_head *jh; | ||
1808 | struct journal_head *new_jh = NULL; | ||
1809 | |||
1810 | repeat: | ||
1811 | if (!buffer_jbd(bh)) { | ||
1812 | new_jh = journal_alloc_journal_head(); | ||
1813 | memset(new_jh, 0, sizeof(*new_jh)); | ||
1814 | } | ||
1815 | |||
1816 | jbd_lock_bh_journal_head(bh); | ||
1817 | if (buffer_jbd(bh)) { | ||
1818 | jh = bh2jh(bh); | ||
1819 | } else { | ||
1820 | J_ASSERT_BH(bh, | ||
1821 | (atomic_read(&bh->b_count) > 0) || | ||
1822 | (bh->b_page && bh->b_page->mapping)); | ||
1823 | |||
1824 | if (!new_jh) { | ||
1825 | jbd_unlock_bh_journal_head(bh); | ||
1826 | goto repeat; | ||
1827 | } | ||
1828 | |||
1829 | jh = new_jh; | ||
1830 | new_jh = NULL; /* We consumed it */ | ||
1831 | set_buffer_jbd(bh); | ||
1832 | bh->b_private = jh; | ||
1833 | jh->b_bh = bh; | ||
1834 | get_bh(bh); | ||
1835 | BUFFER_TRACE(bh, "added journal_head"); | ||
1836 | } | ||
1837 | jh->b_jcount++; | ||
1838 | jbd_unlock_bh_journal_head(bh); | ||
1839 | if (new_jh) | ||
1840 | journal_free_journal_head(new_jh); | ||
1841 | return bh->b_private; | ||
1842 | } | ||
1843 | |||
1844 | /* | ||
1845 | * Grab a ref against this buffer_head's journal_head. If it ended up not | ||
1846 | * having a journal_head, return NULL | ||
1847 | */ | ||
1848 | struct journal_head *journal_grab_journal_head(struct buffer_head *bh) | ||
1849 | { | ||
1850 | struct journal_head *jh = NULL; | ||
1851 | |||
1852 | jbd_lock_bh_journal_head(bh); | ||
1853 | if (buffer_jbd(bh)) { | ||
1854 | jh = bh2jh(bh); | ||
1855 | jh->b_jcount++; | ||
1856 | } | ||
1857 | jbd_unlock_bh_journal_head(bh); | ||
1858 | return jh; | ||
1859 | } | ||
1860 | |||
1861 | static void __journal_remove_journal_head(struct buffer_head *bh) | ||
1862 | { | ||
1863 | struct journal_head *jh = bh2jh(bh); | ||
1864 | |||
1865 | J_ASSERT_JH(jh, jh->b_jcount >= 0); | ||
1866 | |||
1867 | get_bh(bh); | ||
1868 | if (jh->b_jcount == 0) { | ||
1869 | if (jh->b_transaction == NULL && | ||
1870 | jh->b_next_transaction == NULL && | ||
1871 | jh->b_cp_transaction == NULL) { | ||
1872 | J_ASSERT_JH(jh, jh->b_jlist == BJ_None); | ||
1873 | J_ASSERT_BH(bh, buffer_jbd(bh)); | ||
1874 | J_ASSERT_BH(bh, jh2bh(jh) == bh); | ||
1875 | BUFFER_TRACE(bh, "remove journal_head"); | ||
1876 | if (jh->b_frozen_data) { | ||
1877 | printk(KERN_WARNING "%s: freeing " | ||
1878 | "b_frozen_data\n", | ||
1879 | __FUNCTION__); | ||
1880 | jbd_slab_free(jh->b_frozen_data, bh->b_size); | ||
1881 | } | ||
1882 | if (jh->b_committed_data) { | ||
1883 | printk(KERN_WARNING "%s: freeing " | ||
1884 | "b_committed_data\n", | ||
1885 | __FUNCTION__); | ||
1886 | jbd_slab_free(jh->b_committed_data, bh->b_size); | ||
1887 | } | ||
1888 | bh->b_private = NULL; | ||
1889 | jh->b_bh = NULL; /* debug, really */ | ||
1890 | clear_buffer_jbd(bh); | ||
1891 | __brelse(bh); | ||
1892 | journal_free_journal_head(jh); | ||
1893 | } else { | ||
1894 | BUFFER_TRACE(bh, "journal_head was locked"); | ||
1895 | } | ||
1896 | } | ||
1897 | } | ||
1898 | |||
/*
 * journal_remove_journal_head(): release the buffer's journal_head if the
 * buffer is attached to no transaction and its b_jcount is zero.  In that
 * case the ->b_count reference held on behalf of the journal_head is
 * "logically" dropped as well.
 *
 * As a convenience to callers, which usually want to go on using the bh,
 * an additional ->b_count reference is taken here.  The caller therefore
 * *must* run __brelse(bh) at some point; once it has done so the buffer
 * becomes eligible for reaping by try_to_free_buffers().
 *
 * This is the locked wrapper around __journal_remove_journal_head().
 */
void journal_remove_journal_head(struct buffer_head *bh)
{
	jbd_lock_bh_journal_head(bh);
	__journal_remove_journal_head(bh);
	jbd_unlock_bh_journal_head(bh);
}
1918 | |||
1919 | /* | ||
1920 | * Drop a reference on the passed journal_head. If it fell to zero then try to | ||
1921 | * release the journal_head from the buffer_head. | ||
1922 | */ | ||
1923 | void journal_put_journal_head(struct journal_head *jh) | ||
1924 | { | ||
1925 | struct buffer_head *bh = jh2bh(jh); | ||
1926 | |||
1927 | jbd_lock_bh_journal_head(bh); | ||
1928 | J_ASSERT_JH(jh, jh->b_jcount > 0); | ||
1929 | --jh->b_jcount; | ||
1930 | if (!jh->b_jcount && !jh->b_transaction) { | ||
1931 | __journal_remove_journal_head(bh); | ||
1932 | __brelse(bh); | ||
1933 | } | ||
1934 | jbd_unlock_bh_journal_head(bh); | ||
1935 | } | ||
1936 | |||
1937 | /* | ||
1938 | * /proc tunables | ||
1939 | */ | ||
#ifdef CONFIG_JBD_DEBUG
/* Debug level; exposed for reading and writing through the /proc entry. */
int journal_enable_debug;
EXPORT_SYMBOL(journal_enable_debug);
#endif
1944 | |||
1945 | #if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) | ||
1946 | |||
1947 | static struct proc_dir_entry *proc_jbd_debug; | ||
1948 | |||
1949 | static int read_jbd_debug(char *page, char **start, off_t off, | ||
1950 | int count, int *eof, void *data) | ||
1951 | { | ||
1952 | int ret; | ||
1953 | |||
1954 | ret = sprintf(page + off, "%d\n", journal_enable_debug); | ||
1955 | *eof = 1; | ||
1956 | return ret; | ||
1957 | } | ||
1958 | |||
1959 | static int write_jbd_debug(struct file *file, const char __user *buffer, | ||
1960 | unsigned long count, void *data) | ||
1961 | { | ||
1962 | char buf[32]; | ||
1963 | |||
1964 | if (count > ARRAY_SIZE(buf) - 1) | ||
1965 | count = ARRAY_SIZE(buf) - 1; | ||
1966 | if (copy_from_user(buf, buffer, count)) | ||
1967 | return -EFAULT; | ||
1968 | buf[ARRAY_SIZE(buf) - 1] = '\0'; | ||
1969 | journal_enable_debug = simple_strtoul(buf, NULL, 10); | ||
1970 | return count; | ||
1971 | } | ||
1972 | |||
1973 | #define JBD_PROC_NAME "sys/fs/jbd-debug" | ||
1974 | |||
1975 | static void __init create_jbd_proc_entry(void) | ||
1976 | { | ||
1977 | proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); | ||
1978 | if (proc_jbd_debug) { | ||
1979 | /* Why is this so hard? */ | ||
1980 | proc_jbd_debug->read_proc = read_jbd_debug; | ||
1981 | proc_jbd_debug->write_proc = write_jbd_debug; | ||
1982 | } | ||
1983 | } | ||
1984 | |||
1985 | static void __exit remove_jbd_proc_entry(void) | ||
1986 | { | ||
1987 | if (proc_jbd_debug) | ||
1988 | remove_proc_entry(JBD_PROC_NAME, NULL); | ||
1989 | } | ||
1990 | |||
1991 | #else | ||
1992 | |||
1993 | #define create_jbd_proc_entry() do {} while (0) | ||
1994 | #define remove_jbd_proc_entry() do {} while (0) | ||
1995 | |||
1996 | #endif | ||
1997 | |||
1998 | kmem_cache_t *jbd_handle_cache; | ||
1999 | |||
2000 | static int __init journal_init_handle_cache(void) | ||
2001 | { | ||
2002 | jbd_handle_cache = kmem_cache_create("journal_handle", | ||
2003 | sizeof(handle_t), | ||
2004 | 0, /* offset */ | ||
2005 | 0, /* flags */ | ||
2006 | NULL, /* ctor */ | ||
2007 | NULL); /* dtor */ | ||
2008 | if (jbd_handle_cache == NULL) { | ||
2009 | printk(KERN_EMERG "JBD: failed to create handle cache\n"); | ||
2010 | return -ENOMEM; | ||
2011 | } | ||
2012 | return 0; | ||
2013 | } | ||
2014 | |||
2015 | static void journal_destroy_handle_cache(void) | ||
2016 | { | ||
2017 | if (jbd_handle_cache) | ||
2018 | kmem_cache_destroy(jbd_handle_cache); | ||
2019 | } | ||
2020 | |||
2021 | /* | ||
2022 | * Module startup and shutdown | ||
2023 | */ | ||
2024 | |||
2025 | static int __init journal_init_caches(void) | ||
2026 | { | ||
2027 | int ret; | ||
2028 | |||
2029 | ret = journal_init_revoke_caches(); | ||
2030 | if (ret == 0) | ||
2031 | ret = journal_init_journal_head_cache(); | ||
2032 | if (ret == 0) | ||
2033 | ret = journal_init_handle_cache(); | ||
2034 | return ret; | ||
2035 | } | ||
2036 | |||
/*
 * Tear down every cache owned by this module: the revoke caches, the
 * journal_head cache, the handle cache and the per-size jbd slabs.
 */
static void journal_destroy_caches(void)
{
	journal_destroy_revoke_caches();
	journal_destroy_journal_head_cache();
	journal_destroy_handle_cache();
	journal_destroy_jbd_slabs();
}
2044 | |||
2045 | static int __init journal_init(void) | ||
2046 | { | ||
2047 | int ret; | ||
2048 | |||
2049 | BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); | ||
2050 | |||
2051 | ret = journal_init_caches(); | ||
2052 | if (ret != 0) | ||
2053 | journal_destroy_caches(); | ||
2054 | create_jbd_proc_entry(); | ||
2055 | return ret; | ||
2056 | } | ||
2057 | |||
2058 | static void __exit journal_exit(void) | ||
2059 | { | ||
2060 | #ifdef CONFIG_JBD_DEBUG | ||
2061 | int n = atomic_read(&nr_journal_heads); | ||
2062 | if (n) | ||
2063 | printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); | ||
2064 | #endif | ||
2065 | remove_jbd_proc_entry(); | ||
2066 | journal_destroy_caches(); | ||
2067 | } | ||
2068 | |||
2069 | MODULE_LICENSE("GPL"); | ||
2070 | module_init(journal_init); | ||
2071 | module_exit(journal_exit); | ||
2072 | |||
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c new file mode 100644 index 000000000000..11563fe2a52b --- /dev/null +++ b/fs/jbd2/recovery.c | |||
@@ -0,0 +1,592 @@ | |||
1 | /* | ||
2 | * linux/fs/recovery.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | ||
5 | * | ||
6 | * Copyright 1999-2000 Red Hat Software --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal recovery routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | */ | ||
15 | |||
16 | #ifndef __KERNEL__ | ||
17 | #include "jfs_user.h" | ||
18 | #else | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/jbd.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/slab.h> | ||
24 | #endif | ||
25 | |||
26 | /* | ||
27 | * Maintain information about the progress of the recovery job, so that | ||
28 | * the different passes can carry information between them. | ||
29 | */ | ||
30 | struct recovery_info | ||
31 | { | ||
32 | tid_t start_transaction; | ||
33 | tid_t end_transaction; | ||
34 | |||
35 | int nr_replays; | ||
36 | int nr_revokes; | ||
37 | int nr_revoke_hits; | ||
38 | }; | ||
39 | |||
40 | enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; | ||
41 | static int do_one_pass(journal_t *journal, | ||
42 | struct recovery_info *info, enum passtype pass); | ||
43 | static int scan_revoke_records(journal_t *, struct buffer_head *, | ||
44 | tid_t, struct recovery_info *); | ||
45 | |||
46 | #ifdef __KERNEL__ | ||
47 | |||
/* Release the first n readahead buffers in b[], last to first. */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
	int i;

	for (i = n - 1; i >= 0; i--)
		brelse(b[i]);
}
54 | |||
55 | |||
56 | /* | ||
57 | * When reading from the journal, we are going through the block device | ||
58 | * layer directly and so there is no readahead being done for us. We | ||
59 | * need to implement any readahead ourselves if we want it to happen at | ||
60 | * all. Recovery is basically one long sequential read, so make sure we | ||
61 | * do the IO in reasonably large chunks. | ||
62 | * | ||
63 | * This is not so critical that we need to be enormously clever about | ||
64 | * the readahead size, though. 128K is a purely arbitrary, good-enough | ||
65 | * fixed value. | ||
66 | */ | ||
67 | |||
68 | #define MAXBUF 8 | ||
69 | static int do_readahead(journal_t *journal, unsigned int start) | ||
70 | { | ||
71 | int err; | ||
72 | unsigned int max, nbufs, next; | ||
73 | unsigned long blocknr; | ||
74 | struct buffer_head *bh; | ||
75 | |||
76 | struct buffer_head * bufs[MAXBUF]; | ||
77 | |||
78 | /* Do up to 128K of readahead */ | ||
79 | max = start + (128 * 1024 / journal->j_blocksize); | ||
80 | if (max > journal->j_maxlen) | ||
81 | max = journal->j_maxlen; | ||
82 | |||
83 | /* Do the readahead itself. We'll submit MAXBUF buffer_heads at | ||
84 | * a time to the block device IO layer. */ | ||
85 | |||
86 | nbufs = 0; | ||
87 | |||
88 | for (next = start; next < max; next++) { | ||
89 | err = journal_bmap(journal, next, &blocknr); | ||
90 | |||
91 | if (err) { | ||
92 | printk (KERN_ERR "JBD: bad block at offset %u\n", | ||
93 | next); | ||
94 | goto failed; | ||
95 | } | ||
96 | |||
97 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
98 | if (!bh) { | ||
99 | err = -ENOMEM; | ||
100 | goto failed; | ||
101 | } | ||
102 | |||
103 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) { | ||
104 | bufs[nbufs++] = bh; | ||
105 | if (nbufs == MAXBUF) { | ||
106 | ll_rw_block(READ, nbufs, bufs); | ||
107 | journal_brelse_array(bufs, nbufs); | ||
108 | nbufs = 0; | ||
109 | } | ||
110 | } else | ||
111 | brelse(bh); | ||
112 | } | ||
113 | |||
114 | if (nbufs) | ||
115 | ll_rw_block(READ, nbufs, bufs); | ||
116 | err = 0; | ||
117 | |||
118 | failed: | ||
119 | if (nbufs) | ||
120 | journal_brelse_array(bufs, nbufs); | ||
121 | return err; | ||
122 | } | ||
123 | |||
124 | #endif /* __KERNEL__ */ | ||
125 | |||
126 | |||
127 | /* | ||
128 | * Read a block from the journal | ||
129 | */ | ||
130 | |||
131 | static int jread(struct buffer_head **bhp, journal_t *journal, | ||
132 | unsigned int offset) | ||
133 | { | ||
134 | int err; | ||
135 | unsigned long blocknr; | ||
136 | struct buffer_head *bh; | ||
137 | |||
138 | *bhp = NULL; | ||
139 | |||
140 | if (offset >= journal->j_maxlen) { | ||
141 | printk(KERN_ERR "JBD: corrupted journal superblock\n"); | ||
142 | return -EIO; | ||
143 | } | ||
144 | |||
145 | err = journal_bmap(journal, offset, &blocknr); | ||
146 | |||
147 | if (err) { | ||
148 | printk (KERN_ERR "JBD: bad block at offset %u\n", | ||
149 | offset); | ||
150 | return err; | ||
151 | } | ||
152 | |||
153 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
154 | if (!bh) | ||
155 | return -ENOMEM; | ||
156 | |||
157 | if (!buffer_uptodate(bh)) { | ||
158 | /* If this is a brand new buffer, start readahead. | ||
159 | Otherwise, we assume we are already reading it. */ | ||
160 | if (!buffer_req(bh)) | ||
161 | do_readahead(journal, offset); | ||
162 | wait_on_buffer(bh); | ||
163 | } | ||
164 | |||
165 | if (!buffer_uptodate(bh)) { | ||
166 | printk (KERN_ERR "JBD: Failed to read block at offset %u\n", | ||
167 | offset); | ||
168 | brelse(bh); | ||
169 | return -EIO; | ||
170 | } | ||
171 | |||
172 | *bhp = bh; | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | |||
177 | /* | ||
178 | * Count the number of in-use tags in a journal descriptor block. | ||
179 | */ | ||
180 | |||
181 | static int count_tags(struct buffer_head *bh, int size) | ||
182 | { | ||
183 | char * tagp; | ||
184 | journal_block_tag_t * tag; | ||
185 | int nr = 0; | ||
186 | |||
187 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
188 | |||
189 | while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { | ||
190 | tag = (journal_block_tag_t *) tagp; | ||
191 | |||
192 | nr++; | ||
193 | tagp += sizeof(journal_block_tag_t); | ||
194 | if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) | ||
195 | tagp += 16; | ||
196 | |||
197 | if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) | ||
198 | break; | ||
199 | } | ||
200 | |||
201 | return nr; | ||
202 | } | ||
203 | |||
204 | |||
/*
 * Wrap a log block number around the end of the log: once 'var' reaches
 * j_last it is moved back by the length of the log (j_last - j_first).
 * 'var' must be an lvalue; it is evaluated more than once.
 */
#define wrap(journal, var)						\
do {									\
	if ((var) >= (journal)->j_last)					\
		(var) -= ((journal)->j_last - (journal)->j_first);	\
} while (0)
211 | |||
212 | /** | ||
213 | * journal_recover - recovers a on-disk journal | ||
214 | * @journal: the journal to recover | ||
215 | * | ||
216 | * The primary function for recovering the log contents when mounting a | ||
217 | * journaled device. | ||
218 | * | ||
219 | * Recovery is done in three passes. In the first pass, we look for the | ||
220 | * end of the log. In the second, we assemble the list of revoke | ||
221 | * blocks. In the third and final pass, we replay any un-revoked blocks | ||
222 | * in the log. | ||
223 | */ | ||
224 | int journal_recover(journal_t *journal) | ||
225 | { | ||
226 | int err; | ||
227 | journal_superblock_t * sb; | ||
228 | |||
229 | struct recovery_info info; | ||
230 | |||
231 | memset(&info, 0, sizeof(info)); | ||
232 | sb = journal->j_superblock; | ||
233 | |||
234 | /* | ||
235 | * The journal superblock's s_start field (the current log head) | ||
236 | * is always zero if, and only if, the journal was cleanly | ||
237 | * unmounted. | ||
238 | */ | ||
239 | |||
240 | if (!sb->s_start) { | ||
241 | jbd_debug(1, "No recovery required, last transaction %d\n", | ||
242 | be32_to_cpu(sb->s_sequence)); | ||
243 | journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; | ||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | err = do_one_pass(journal, &info, PASS_SCAN); | ||
248 | if (!err) | ||
249 | err = do_one_pass(journal, &info, PASS_REVOKE); | ||
250 | if (!err) | ||
251 | err = do_one_pass(journal, &info, PASS_REPLAY); | ||
252 | |||
253 | jbd_debug(0, "JBD: recovery, exit status %d, " | ||
254 | "recovered transactions %u to %u\n", | ||
255 | err, info.start_transaction, info.end_transaction); | ||
256 | jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", | ||
257 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); | ||
258 | |||
259 | /* Restart the log at the next transaction ID, thus invalidating | ||
260 | * any existing commit records in the log. */ | ||
261 | journal->j_transaction_sequence = ++info.end_transaction; | ||
262 | |||
263 | journal_clear_revoke(journal); | ||
264 | sync_blockdev(journal->j_fs_dev); | ||
265 | return err; | ||
266 | } | ||
267 | |||
268 | /** | ||
269 | * journal_skip_recovery - Start journal and wipe exiting records | ||
270 | * @journal: journal to startup | ||
271 | * | ||
272 | * Locate any valid recovery information from the journal and set up the | ||
273 | * journal structures in memory to ignore it (presumably because the | ||
274 | * caller has evidence that it is out of date). | ||
275 | * This function does'nt appear to be exorted.. | ||
276 | * | ||
277 | * We perform one pass over the journal to allow us to tell the user how | ||
278 | * much recovery information is being erased, and to let us initialise | ||
279 | * the journal transaction sequence numbers to the next unused ID. | ||
280 | */ | ||
281 | int journal_skip_recovery(journal_t *journal) | ||
282 | { | ||
283 | int err; | ||
284 | journal_superblock_t * sb; | ||
285 | |||
286 | struct recovery_info info; | ||
287 | |||
288 | memset (&info, 0, sizeof(info)); | ||
289 | sb = journal->j_superblock; | ||
290 | |||
291 | err = do_one_pass(journal, &info, PASS_SCAN); | ||
292 | |||
293 | if (err) { | ||
294 | printk(KERN_ERR "JBD: error %d scanning journal\n", err); | ||
295 | ++journal->j_transaction_sequence; | ||
296 | } else { | ||
297 | #ifdef CONFIG_JBD_DEBUG | ||
298 | int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); | ||
299 | #endif | ||
300 | jbd_debug(0, | ||
301 | "JBD: ignoring %d transaction%s from the journal.\n", | ||
302 | dropped, (dropped == 1) ? "" : "s"); | ||
303 | journal->j_transaction_sequence = ++info.end_transaction; | ||
304 | } | ||
305 | |||
306 | journal->j_tail = 0; | ||
307 | return err; | ||
308 | } | ||
309 | |||
310 | static int do_one_pass(journal_t *journal, | ||
311 | struct recovery_info *info, enum passtype pass) | ||
312 | { | ||
313 | unsigned int first_commit_ID, next_commit_ID; | ||
314 | unsigned long next_log_block; | ||
315 | int err, success = 0; | ||
316 | journal_superblock_t * sb; | ||
317 | journal_header_t * tmp; | ||
318 | struct buffer_head * bh; | ||
319 | unsigned int sequence; | ||
320 | int blocktype; | ||
321 | |||
322 | /* Precompute the maximum metadata descriptors in a descriptor block */ | ||
323 | int MAX_BLOCKS_PER_DESC; | ||
324 | MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) | ||
325 | / sizeof(journal_block_tag_t)); | ||
326 | |||
327 | /* | ||
328 | * First thing is to establish what we expect to find in the log | ||
329 | * (in terms of transaction IDs), and where (in terms of log | ||
330 | * block offsets): query the superblock. | ||
331 | */ | ||
332 | |||
333 | sb = journal->j_superblock; | ||
334 | next_commit_ID = be32_to_cpu(sb->s_sequence); | ||
335 | next_log_block = be32_to_cpu(sb->s_start); | ||
336 | |||
337 | first_commit_ID = next_commit_ID; | ||
338 | if (pass == PASS_SCAN) | ||
339 | info->start_transaction = first_commit_ID; | ||
340 | |||
341 | jbd_debug(1, "Starting recovery pass %d\n", pass); | ||
342 | |||
343 | /* | ||
344 | * Now we walk through the log, transaction by transaction, | ||
345 | * making sure that each transaction has a commit block in the | ||
346 | * expected place. Each complete transaction gets replayed back | ||
347 | * into the main filesystem. | ||
348 | */ | ||
349 | |||
350 | while (1) { | ||
351 | int flags; | ||
352 | char * tagp; | ||
353 | journal_block_tag_t * tag; | ||
354 | struct buffer_head * obh; | ||
355 | struct buffer_head * nbh; | ||
356 | |||
357 | cond_resched(); /* We're under lock_kernel() */ | ||
358 | |||
359 | /* If we already know where to stop the log traversal, | ||
360 | * check right now that we haven't gone past the end of | ||
361 | * the log. */ | ||
362 | |||
363 | if (pass != PASS_SCAN) | ||
364 | if (tid_geq(next_commit_ID, info->end_transaction)) | ||
365 | break; | ||
366 | |||
367 | jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", | ||
368 | next_commit_ID, next_log_block, journal->j_last); | ||
369 | |||
370 | /* Skip over each chunk of the transaction looking | ||
371 | * either the next descriptor block or the final commit | ||
372 | * record. */ | ||
373 | |||
374 | jbd_debug(3, "JBD: checking block %ld\n", next_log_block); | ||
375 | err = jread(&bh, journal, next_log_block); | ||
376 | if (err) | ||
377 | goto failed; | ||
378 | |||
379 | next_log_block++; | ||
380 | wrap(journal, next_log_block); | ||
381 | |||
382 | /* What kind of buffer is it? | ||
383 | * | ||
384 | * If it is a descriptor block, check that it has the | ||
385 | * expected sequence number. Otherwise, we're all done | ||
386 | * here. */ | ||
387 | |||
388 | tmp = (journal_header_t *)bh->b_data; | ||
389 | |||
390 | if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { | ||
391 | brelse(bh); | ||
392 | break; | ||
393 | } | ||
394 | |||
395 | blocktype = be32_to_cpu(tmp->h_blocktype); | ||
396 | sequence = be32_to_cpu(tmp->h_sequence); | ||
397 | jbd_debug(3, "Found magic %d, sequence %d\n", | ||
398 | blocktype, sequence); | ||
399 | |||
400 | if (sequence != next_commit_ID) { | ||
401 | brelse(bh); | ||
402 | break; | ||
403 | } | ||
404 | |||
405 | /* OK, we have a valid descriptor block which matches | ||
406 | * all of the sequence number checks. What are we going | ||
407 | * to do with it? That depends on the pass... */ | ||
408 | |||
409 | switch(blocktype) { | ||
410 | case JFS_DESCRIPTOR_BLOCK: | ||
411 | /* If it is a valid descriptor block, replay it | ||
412 | * in pass REPLAY; otherwise, just skip over the | ||
413 | * blocks it describes. */ | ||
414 | if (pass != PASS_REPLAY) { | ||
415 | next_log_block += | ||
416 | count_tags(bh, journal->j_blocksize); | ||
417 | wrap(journal, next_log_block); | ||
418 | brelse(bh); | ||
419 | continue; | ||
420 | } | ||
421 | |||
422 | /* A descriptor block: we can now write all of | ||
423 | * the data blocks. Yay, useful work is finally | ||
424 | * getting done here! */ | ||
425 | |||
426 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
427 | while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) | ||
428 | <= journal->j_blocksize) { | ||
429 | unsigned long io_block; | ||
430 | |||
431 | tag = (journal_block_tag_t *) tagp; | ||
432 | flags = be32_to_cpu(tag->t_flags); | ||
433 | |||
434 | io_block = next_log_block++; | ||
435 | wrap(journal, next_log_block); | ||
436 | err = jread(&obh, journal, io_block); | ||
437 | if (err) { | ||
438 | /* Recover what we can, but | ||
439 | * report failure at the end. */ | ||
440 | success = err; | ||
441 | printk (KERN_ERR | ||
442 | "JBD: IO error %d recovering " | ||
443 | "block %ld in log\n", | ||
444 | err, io_block); | ||
445 | } else { | ||
446 | unsigned long blocknr; | ||
447 | |||
448 | J_ASSERT(obh != NULL); | ||
449 | blocknr = be32_to_cpu(tag->t_blocknr); | ||
450 | |||
451 | /* If the block has been | ||
452 | * revoked, then we're all done | ||
453 | * here. */ | ||
454 | if (journal_test_revoke | ||
455 | (journal, blocknr, | ||
456 | next_commit_ID)) { | ||
457 | brelse(obh); | ||
458 | ++info->nr_revoke_hits; | ||
459 | goto skip_write; | ||
460 | } | ||
461 | |||
462 | /* Find a buffer for the new | ||
463 | * data being restored */ | ||
464 | nbh = __getblk(journal->j_fs_dev, | ||
465 | blocknr, | ||
466 | journal->j_blocksize); | ||
467 | if (nbh == NULL) { | ||
468 | printk(KERN_ERR | ||
469 | "JBD: Out of memory " | ||
470 | "during recovery.\n"); | ||
471 | err = -ENOMEM; | ||
472 | brelse(bh); | ||
473 | brelse(obh); | ||
474 | goto failed; | ||
475 | } | ||
476 | |||
477 | lock_buffer(nbh); | ||
478 | memcpy(nbh->b_data, obh->b_data, | ||
479 | journal->j_blocksize); | ||
480 | if (flags & JFS_FLAG_ESCAPE) { | ||
481 | *((__be32 *)bh->b_data) = | ||
482 | cpu_to_be32(JFS_MAGIC_NUMBER); | ||
483 | } | ||
484 | |||
485 | BUFFER_TRACE(nbh, "marking dirty"); | ||
486 | set_buffer_uptodate(nbh); | ||
487 | mark_buffer_dirty(nbh); | ||
488 | BUFFER_TRACE(nbh, "marking uptodate"); | ||
489 | ++info->nr_replays; | ||
490 | /* ll_rw_block(WRITE, 1, &nbh); */ | ||
491 | unlock_buffer(nbh); | ||
492 | brelse(obh); | ||
493 | brelse(nbh); | ||
494 | } | ||
495 | |||
496 | skip_write: | ||
497 | tagp += sizeof(journal_block_tag_t); | ||
498 | if (!(flags & JFS_FLAG_SAME_UUID)) | ||
499 | tagp += 16; | ||
500 | |||
501 | if (flags & JFS_FLAG_LAST_TAG) | ||
502 | break; | ||
503 | } | ||
504 | |||
505 | brelse(bh); | ||
506 | continue; | ||
507 | |||
508 | case JFS_COMMIT_BLOCK: | ||
509 | /* Found an expected commit block: not much to | ||
510 | * do other than move on to the next sequence | ||
511 | * number. */ | ||
512 | brelse(bh); | ||
513 | next_commit_ID++; | ||
514 | continue; | ||
515 | |||
516 | case JFS_REVOKE_BLOCK: | ||
517 | /* If we aren't in the REVOKE pass, then we can | ||
518 | * just skip over this block. */ | ||
519 | if (pass != PASS_REVOKE) { | ||
520 | brelse(bh); | ||
521 | continue; | ||
522 | } | ||
523 | |||
524 | err = scan_revoke_records(journal, bh, | ||
525 | next_commit_ID, info); | ||
526 | brelse(bh); | ||
527 | if (err) | ||
528 | goto failed; | ||
529 | continue; | ||
530 | |||
531 | default: | ||
532 | jbd_debug(3, "Unrecognised magic %d, end of scan.\n", | ||
533 | blocktype); | ||
534 | brelse(bh); | ||
535 | goto done; | ||
536 | } | ||
537 | } | ||
538 | |||
539 | done: | ||
540 | /* | ||
541 | * We broke out of the log scan loop: either we came to the | ||
542 | * known end of the log or we found an unexpected block in the | ||
543 | * log. If the latter happened, then we know that the "current" | ||
544 | * transaction marks the end of the valid log. | ||
545 | */ | ||
546 | |||
547 | if (pass == PASS_SCAN) | ||
548 | info->end_transaction = next_commit_ID; | ||
549 | else { | ||
550 | /* It's really bad news if different passes end up at | ||
551 | * different places (but possible due to IO errors). */ | ||
552 | if (info->end_transaction != next_commit_ID) { | ||
553 | printk (KERN_ERR "JBD: recovery pass %d ended at " | ||
554 | "transaction %u, expected %u\n", | ||
555 | pass, next_commit_ID, info->end_transaction); | ||
556 | if (!success) | ||
557 | success = -EIO; | ||
558 | } | ||
559 | } | ||
560 | |||
561 | return success; | ||
562 | |||
563 | failed: | ||
564 | return err; | ||
565 | } | ||
566 | |||
567 | |||
568 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ | ||
569 | |||
570 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | ||
571 | tid_t sequence, struct recovery_info *info) | ||
572 | { | ||
573 | journal_revoke_header_t *header; | ||
574 | int offset, max; | ||
575 | |||
576 | header = (journal_revoke_header_t *) bh->b_data; | ||
577 | offset = sizeof(journal_revoke_header_t); | ||
578 | max = be32_to_cpu(header->r_count); | ||
579 | |||
580 | while (offset < max) { | ||
581 | unsigned long blocknr; | ||
582 | int err; | ||
583 | |||
584 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); | ||
585 | offset += 4; | ||
586 | err = journal_set_revoke(journal, blocknr, sequence); | ||
587 | if (err) | ||
588 | return err; | ||
589 | ++info->nr_revokes; | ||
590 | } | ||
591 | return 0; | ||
592 | } | ||
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c new file mode 100644 index 000000000000..c532429d8d9b --- /dev/null +++ b/fs/jbd2/revoke.c | |||
@@ -0,0 +1,703 @@ | |||
1 | /* | ||
2 | * linux/fs/revoke.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 | ||
5 | * | ||
6 | * Copyright 2000 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal revoke routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | * | ||
15 | * Revoke is the mechanism used to prevent old log records for deleted | ||
16 | * metadata from being replayed on top of newer data using the same | ||
17 | * blocks. The revoke mechanism is used in two separate places: | ||
18 | * | ||
19 | * + Commit: during commit we write the entire list of the current | ||
20 | * transaction's revoked blocks to the journal | ||
21 | * | ||
22 | * + Recovery: during recovery we record the transaction ID of all | ||
23 | * revoked blocks. If there are multiple revoke records in the log | ||
24 | * for a single block, only the last one counts, and if there is a log | ||
25 | * entry for a block beyond the last revoke, then that log entry still | ||
26 | * gets replayed. | ||
27 | * | ||
28 | * We can get interactions between revokes and new log data within a | ||
29 | * single transaction: | ||
30 | * | ||
31 | * Block is revoked and then journaled: | ||
32 | * The desired end result is the journaling of the new block, so we | ||
33 | * cancel the revoke before the transaction commits. | ||
34 | * | ||
35 | * Block is journaled and then revoked: | ||
36 | * The revoke must take precedence over the write of the block, so we | ||
37 | * need either to cancel the journal entry or to write the revoke | ||
38 | * later in the log than the log block. In this case, we choose the | ||
39 | * latter: journaling a block cancels any revoke record for that block | ||
40 | * in the current transaction, so any revoke for that block in the | ||
41 | * transaction must have happened after the block was journaled and so | ||
42 | * the revoke must take precedence. | ||
43 | * | ||
44 | * Block is revoked and then written as data: | ||
45 | * The data write is allowed to succeed, but the revoke is _not_ | ||
46 | * cancelled. We still need to prevent old log records from | ||
47 | * overwriting the new data. We don't even need to clear the revoke | ||
48 | * bit here. | ||
49 | * | ||
50 | * Revoke information on buffers is a tri-state value: | ||
51 | * | ||
52 | * RevokeValid clear: no cached revoke status, need to look it up | ||
53 | * RevokeValid set, Revoked clear: | ||
54 | * buffer has not been revoked, and cancel_revoke | ||
55 | * need do nothing. | ||
56 | * RevokeValid set, Revoked set: | ||
57 | * buffer has been revoked. | ||
58 | */ | ||
59 | |||
60 | #ifndef __KERNEL__ | ||
61 | #include "jfs_user.h" | ||
62 | #else | ||
63 | #include <linux/time.h> | ||
64 | #include <linux/fs.h> | ||
65 | #include <linux/jbd.h> | ||
66 | #include <linux/errno.h> | ||
67 | #include <linux/slab.h> | ||
68 | #include <linux/list.h> | ||
69 | #include <linux/smp_lock.h> | ||
70 | #include <linux/init.h> | ||
71 | #endif | ||
72 | |||
73 | static kmem_cache_t *revoke_record_cache; | ||
74 | static kmem_cache_t *revoke_table_cache; | ||
75 | |||
76 | /* Each revoke record represents one single revoked block. During | ||
77 | journal replay, this involves recording the transaction ID of the | ||
78 | last transaction to revoke this block. */ | ||
79 | |||
80 | struct jbd_revoke_record_s | ||
81 | { | ||
82 | struct list_head hash; | ||
83 | tid_t sequence; /* Used for recovery only */ | ||
84 | unsigned long blocknr; | ||
85 | }; | ||
86 | |||
87 | |||
/* A revoke table is nothing more than a hash table of revoke records. */
struct jbd_revoke_table_s {
	/* Number of hash chains.  Must be a power of two.  It is
	 * conceivable that recovery might want a larger table. */
	int hash_size;
	int hash_shift;			/* log2 of hash_size */
	struct list_head *hash_table;	/* array of hash_size chain heads */
};
97 | |||
98 | |||
99 | #ifdef __KERNEL__ | ||
100 | static void write_one_revoke_record(journal_t *, transaction_t *, | ||
101 | struct journal_head **, int *, | ||
102 | struct jbd_revoke_record_s *); | ||
103 | static void flush_descriptor(journal_t *, struct journal_head *, int); | ||
104 | #endif | ||
105 | |||
106 | /* Utility functions to maintain the revoke table */ | ||
107 | |||
108 | /* Borrowed from buffer.c: this is a tried and tested block hash function */ | ||
109 | static inline int hash(journal_t *journal, unsigned long block) | ||
110 | { | ||
111 | struct jbd_revoke_table_s *table = journal->j_revoke; | ||
112 | int hash_shift = table->hash_shift; | ||
113 | |||
114 | return ((block << (hash_shift - 6)) ^ | ||
115 | (block >> 13) ^ | ||
116 | (block << (hash_shift - 12))) & (table->hash_size - 1); | ||
117 | } | ||
118 | |||
119 | static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, | ||
120 | tid_t seq) | ||
121 | { | ||
122 | struct list_head *hash_list; | ||
123 | struct jbd_revoke_record_s *record; | ||
124 | |||
125 | repeat: | ||
126 | record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); | ||
127 | if (!record) | ||
128 | goto oom; | ||
129 | |||
130 | record->sequence = seq; | ||
131 | record->blocknr = blocknr; | ||
132 | hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; | ||
133 | spin_lock(&journal->j_revoke_lock); | ||
134 | list_add(&record->hash, hash_list); | ||
135 | spin_unlock(&journal->j_revoke_lock); | ||
136 | return 0; | ||
137 | |||
138 | oom: | ||
139 | if (!journal_oom_retry) | ||
140 | return -ENOMEM; | ||
141 | jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); | ||
142 | yield(); | ||
143 | goto repeat; | ||
144 | } | ||
145 | |||
146 | /* Find a revoke record in the journal's hash table. */ | ||
147 | |||
148 | static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, | ||
149 | unsigned long blocknr) | ||
150 | { | ||
151 | struct list_head *hash_list; | ||
152 | struct jbd_revoke_record_s *record; | ||
153 | |||
154 | hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; | ||
155 | |||
156 | spin_lock(&journal->j_revoke_lock); | ||
157 | record = (struct jbd_revoke_record_s *) hash_list->next; | ||
158 | while (&(record->hash) != hash_list) { | ||
159 | if (record->blocknr == blocknr) { | ||
160 | spin_unlock(&journal->j_revoke_lock); | ||
161 | return record; | ||
162 | } | ||
163 | record = (struct jbd_revoke_record_s *) record->hash.next; | ||
164 | } | ||
165 | spin_unlock(&journal->j_revoke_lock); | ||
166 | return NULL; | ||
167 | } | ||
168 | |||
169 | int __init journal_init_revoke_caches(void) | ||
170 | { | ||
171 | revoke_record_cache = kmem_cache_create("revoke_record", | ||
172 | sizeof(struct jbd_revoke_record_s), | ||
173 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
174 | if (revoke_record_cache == 0) | ||
175 | return -ENOMEM; | ||
176 | |||
177 | revoke_table_cache = kmem_cache_create("revoke_table", | ||
178 | sizeof(struct jbd_revoke_table_s), | ||
179 | 0, 0, NULL, NULL); | ||
180 | if (revoke_table_cache == 0) { | ||
181 | kmem_cache_destroy(revoke_record_cache); | ||
182 | revoke_record_cache = NULL; | ||
183 | return -ENOMEM; | ||
184 | } | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | void journal_destroy_revoke_caches(void) | ||
189 | { | ||
190 | kmem_cache_destroy(revoke_record_cache); | ||
191 | revoke_record_cache = NULL; | ||
192 | kmem_cache_destroy(revoke_table_cache); | ||
193 | revoke_table_cache = NULL; | ||
194 | } | ||
195 | |||
196 | /* Initialise the revoke table for a given journal to a given size. */ | ||
197 | |||
198 | int journal_init_revoke(journal_t *journal, int hash_size) | ||
199 | { | ||
200 | int shift, tmp; | ||
201 | |||
202 | J_ASSERT (journal->j_revoke_table[0] == NULL); | ||
203 | |||
204 | shift = 0; | ||
205 | tmp = hash_size; | ||
206 | while((tmp >>= 1UL) != 0UL) | ||
207 | shift++; | ||
208 | |||
209 | journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | ||
210 | if (!journal->j_revoke_table[0]) | ||
211 | return -ENOMEM; | ||
212 | journal->j_revoke = journal->j_revoke_table[0]; | ||
213 | |||
214 | /* Check that the hash_size is a power of two */ | ||
215 | J_ASSERT ((hash_size & (hash_size-1)) == 0); | ||
216 | |||
217 | journal->j_revoke->hash_size = hash_size; | ||
218 | |||
219 | journal->j_revoke->hash_shift = shift; | ||
220 | |||
221 | journal->j_revoke->hash_table = | ||
222 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | ||
223 | if (!journal->j_revoke->hash_table) { | ||
224 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
225 | journal->j_revoke = NULL; | ||
226 | return -ENOMEM; | ||
227 | } | ||
228 | |||
229 | for (tmp = 0; tmp < hash_size; tmp++) | ||
230 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); | ||
231 | |||
232 | journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | ||
233 | if (!journal->j_revoke_table[1]) { | ||
234 | kfree(journal->j_revoke_table[0]->hash_table); | ||
235 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
236 | return -ENOMEM; | ||
237 | } | ||
238 | |||
239 | journal->j_revoke = journal->j_revoke_table[1]; | ||
240 | |||
241 | /* Check that the hash_size is a power of two */ | ||
242 | J_ASSERT ((hash_size & (hash_size-1)) == 0); | ||
243 | |||
244 | journal->j_revoke->hash_size = hash_size; | ||
245 | |||
246 | journal->j_revoke->hash_shift = shift; | ||
247 | |||
248 | journal->j_revoke->hash_table = | ||
249 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | ||
250 | if (!journal->j_revoke->hash_table) { | ||
251 | kfree(journal->j_revoke_table[0]->hash_table); | ||
252 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
253 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); | ||
254 | journal->j_revoke = NULL; | ||
255 | return -ENOMEM; | ||
256 | } | ||
257 | |||
258 | for (tmp = 0; tmp < hash_size; tmp++) | ||
259 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); | ||
260 | |||
261 | spin_lock_init(&journal->j_revoke_lock); | ||
262 | |||
263 | return 0; | ||
264 | } | ||
265 | |||
266 | /* Destoy a journal's revoke table. The table must already be empty! */ | ||
267 | |||
268 | void journal_destroy_revoke(journal_t *journal) | ||
269 | { | ||
270 | struct jbd_revoke_table_s *table; | ||
271 | struct list_head *hash_list; | ||
272 | int i; | ||
273 | |||
274 | table = journal->j_revoke_table[0]; | ||
275 | if (!table) | ||
276 | return; | ||
277 | |||
278 | for (i=0; i<table->hash_size; i++) { | ||
279 | hash_list = &table->hash_table[i]; | ||
280 | J_ASSERT (list_empty(hash_list)); | ||
281 | } | ||
282 | |||
283 | kfree(table->hash_table); | ||
284 | kmem_cache_free(revoke_table_cache, table); | ||
285 | journal->j_revoke = NULL; | ||
286 | |||
287 | table = journal->j_revoke_table[1]; | ||
288 | if (!table) | ||
289 | return; | ||
290 | |||
291 | for (i=0; i<table->hash_size; i++) { | ||
292 | hash_list = &table->hash_table[i]; | ||
293 | J_ASSERT (list_empty(hash_list)); | ||
294 | } | ||
295 | |||
296 | kfree(table->hash_table); | ||
297 | kmem_cache_free(revoke_table_cache, table); | ||
298 | journal->j_revoke = NULL; | ||
299 | } | ||
300 | |||
301 | |||
302 | #ifdef __KERNEL__ | ||
303 | |||
304 | /* | ||
305 | * journal_revoke: revoke a given buffer_head from the journal. This | ||
306 | * prevents the block from being replayed during recovery if we take a | ||
307 | * crash after this current transaction commits. Any subsequent | ||
308 | * metadata writes of the buffer in this transaction cancel the | ||
309 | * revoke. | ||
310 | * | ||
311 | * Note that this call may block --- it is up to the caller to make | ||
312 | * sure that there are no further calls to journal_write_metadata | ||
313 | * before the revoke is complete. In ext3, this implies calling the | ||
314 | * revoke before clearing the block bitmap when we are deleting | ||
315 | * metadata. | ||
316 | * | ||
317 | * Revoke performs a journal_forget on any buffer_head passed in as a | ||
318 | * parameter, but does _not_ forget the buffer_head if the bh was only | ||
319 | * found implicitly. | ||
320 | * | ||
321 | * bh_in may not be a journalled buffer - it may have come off | ||
322 | * the hash tables without an attached journal_head. | ||
323 | * | ||
324 | * If bh_in is non-zero, journal_revoke() will decrement its b_count | ||
325 | * by one. | ||
326 | */ | ||
327 | |||
328 | int journal_revoke(handle_t *handle, unsigned long blocknr, | ||
329 | struct buffer_head *bh_in) | ||
330 | { | ||
331 | struct buffer_head *bh = NULL; | ||
332 | journal_t *journal; | ||
333 | struct block_device *bdev; | ||
334 | int err; | ||
335 | |||
336 | might_sleep(); | ||
337 | if (bh_in) | ||
338 | BUFFER_TRACE(bh_in, "enter"); | ||
339 | |||
340 | journal = handle->h_transaction->t_journal; | ||
341 | if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ | ||
342 | J_ASSERT (!"Cannot set revoke feature!"); | ||
343 | return -EINVAL; | ||
344 | } | ||
345 | |||
346 | bdev = journal->j_fs_dev; | ||
347 | bh = bh_in; | ||
348 | |||
349 | if (!bh) { | ||
350 | bh = __find_get_block(bdev, blocknr, journal->j_blocksize); | ||
351 | if (bh) | ||
352 | BUFFER_TRACE(bh, "found on hash"); | ||
353 | } | ||
354 | #ifdef JBD_EXPENSIVE_CHECKING | ||
355 | else { | ||
356 | struct buffer_head *bh2; | ||
357 | |||
358 | /* If there is a different buffer_head lying around in | ||
359 | * memory anywhere... */ | ||
360 | bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); | ||
361 | if (bh2) { | ||
362 | /* ... and it has RevokeValid status... */ | ||
363 | if (bh2 != bh && buffer_revokevalid(bh2)) | ||
364 | /* ...then it better be revoked too, | ||
365 | * since it's illegal to create a revoke | ||
366 | * record against a buffer_head which is | ||
367 | * not marked revoked --- that would | ||
368 | * risk missing a subsequent revoke | ||
369 | * cancel. */ | ||
370 | J_ASSERT_BH(bh2, buffer_revoked(bh2)); | ||
371 | put_bh(bh2); | ||
372 | } | ||
373 | } | ||
374 | #endif | ||
375 | |||
376 | /* We really ought not ever to revoke twice in a row without | ||
377 | first having the revoke cancelled: it's illegal to free a | ||
378 | block twice without allocating it in between! */ | ||
379 | if (bh) { | ||
380 | if (!J_EXPECT_BH(bh, !buffer_revoked(bh), | ||
381 | "inconsistent data on disk")) { | ||
382 | if (!bh_in) | ||
383 | brelse(bh); | ||
384 | return -EIO; | ||
385 | } | ||
386 | set_buffer_revoked(bh); | ||
387 | set_buffer_revokevalid(bh); | ||
388 | if (bh_in) { | ||
389 | BUFFER_TRACE(bh_in, "call journal_forget"); | ||
390 | journal_forget(handle, bh_in); | ||
391 | } else { | ||
392 | BUFFER_TRACE(bh, "call brelse"); | ||
393 | __brelse(bh); | ||
394 | } | ||
395 | } | ||
396 | |||
397 | jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); | ||
398 | err = insert_revoke_hash(journal, blocknr, | ||
399 | handle->h_transaction->t_tid); | ||
400 | BUFFER_TRACE(bh_in, "exit"); | ||
401 | return err; | ||
402 | } | ||
403 | |||
404 | /* | ||
405 | * Cancel an outstanding revoke. For use only internally by the | ||
406 | * journaling code (called from journal_get_write_access). | ||
407 | * | ||
408 | * We trust buffer_revoked() on the buffer if the buffer is already | ||
409 | * being journaled: if there is no revoke pending on the buffer, then we | ||
410 | * don't do anything here. | ||
411 | * | ||
412 | * This would break if it were possible for a buffer to be revoked and | ||
413 | * discarded, and then reallocated within the same transaction. In such | ||
414 | * a case we would have lost the revoked bit, but when we arrived here | ||
415 | * the second time we would still have a pending revoke to cancel. So, | ||
416 | * do not trust the Revoked bit on buffers unless RevokeValid is also | ||
417 | * set. | ||
418 | * | ||
419 | * The caller must have the journal locked. | ||
420 | */ | ||
421 | int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) | ||
422 | { | ||
423 | struct jbd_revoke_record_s *record; | ||
424 | journal_t *journal = handle->h_transaction->t_journal; | ||
425 | int need_cancel; | ||
426 | int did_revoke = 0; /* akpm: debug */ | ||
427 | struct buffer_head *bh = jh2bh(jh); | ||
428 | |||
429 | jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); | ||
430 | |||
431 | /* Is the existing Revoke bit valid? If so, we trust it, and | ||
432 | * only perform the full cancel if the revoke bit is set. If | ||
433 | * not, we can't trust the revoke bit, and we need to do the | ||
434 | * full search for a revoke record. */ | ||
435 | if (test_set_buffer_revokevalid(bh)) { | ||
436 | need_cancel = test_clear_buffer_revoked(bh); | ||
437 | } else { | ||
438 | need_cancel = 1; | ||
439 | clear_buffer_revoked(bh); | ||
440 | } | ||
441 | |||
442 | if (need_cancel) { | ||
443 | record = find_revoke_record(journal, bh->b_blocknr); | ||
444 | if (record) { | ||
445 | jbd_debug(4, "cancelled existing revoke on " | ||
446 | "blocknr %llu\n", (unsigned long long)bh->b_blocknr); | ||
447 | spin_lock(&journal->j_revoke_lock); | ||
448 | list_del(&record->hash); | ||
449 | spin_unlock(&journal->j_revoke_lock); | ||
450 | kmem_cache_free(revoke_record_cache, record); | ||
451 | did_revoke = 1; | ||
452 | } | ||
453 | } | ||
454 | |||
455 | #ifdef JBD_EXPENSIVE_CHECKING | ||
456 | /* There better not be one left behind by now! */ | ||
457 | record = find_revoke_record(journal, bh->b_blocknr); | ||
458 | J_ASSERT_JH(jh, record == NULL); | ||
459 | #endif | ||
460 | |||
461 | /* Finally, have we just cleared revoke on an unhashed | ||
462 | * buffer_head? If so, we'd better make sure we clear the | ||
463 | * revoked status on any hashed alias too, otherwise the revoke | ||
464 | * state machine will get very upset later on. */ | ||
465 | if (need_cancel) { | ||
466 | struct buffer_head *bh2; | ||
467 | bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); | ||
468 | if (bh2) { | ||
469 | if (bh2 != bh) | ||
470 | clear_buffer_revoked(bh2); | ||
471 | __brelse(bh2); | ||
472 | } | ||
473 | } | ||
474 | return did_revoke; | ||
475 | } | ||
476 | |||
477 | /* journal_switch_revoke table select j_revoke for next transaction | ||
478 | * we do not want to suspend any processing until all revokes are | ||
479 | * written -bzzz | ||
480 | */ | ||
481 | void journal_switch_revoke_table(journal_t *journal) | ||
482 | { | ||
483 | int i; | ||
484 | |||
485 | if (journal->j_revoke == journal->j_revoke_table[0]) | ||
486 | journal->j_revoke = journal->j_revoke_table[1]; | ||
487 | else | ||
488 | journal->j_revoke = journal->j_revoke_table[0]; | ||
489 | |||
490 | for (i = 0; i < journal->j_revoke->hash_size; i++) | ||
491 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Write revoke records to the journal for all entries in the current | ||
496 | * revoke hash, deleting the entries as we go. | ||
497 | * | ||
498 | * Called with the journal lock held. | ||
499 | */ | ||
500 | |||
501 | void journal_write_revoke_records(journal_t *journal, | ||
502 | transaction_t *transaction) | ||
503 | { | ||
504 | struct journal_head *descriptor; | ||
505 | struct jbd_revoke_record_s *record; | ||
506 | struct jbd_revoke_table_s *revoke; | ||
507 | struct list_head *hash_list; | ||
508 | int i, offset, count; | ||
509 | |||
510 | descriptor = NULL; | ||
511 | offset = 0; | ||
512 | count = 0; | ||
513 | |||
514 | /* select revoke table for committing transaction */ | ||
515 | revoke = journal->j_revoke == journal->j_revoke_table[0] ? | ||
516 | journal->j_revoke_table[1] : journal->j_revoke_table[0]; | ||
517 | |||
518 | for (i = 0; i < revoke->hash_size; i++) { | ||
519 | hash_list = &revoke->hash_table[i]; | ||
520 | |||
521 | while (!list_empty(hash_list)) { | ||
522 | record = (struct jbd_revoke_record_s *) | ||
523 | hash_list->next; | ||
524 | write_one_revoke_record(journal, transaction, | ||
525 | &descriptor, &offset, | ||
526 | record); | ||
527 | count++; | ||
528 | list_del(&record->hash); | ||
529 | kmem_cache_free(revoke_record_cache, record); | ||
530 | } | ||
531 | } | ||
532 | if (descriptor) | ||
533 | flush_descriptor(journal, descriptor, offset); | ||
534 | jbd_debug(1, "Wrote %d revoke records\n", count); | ||
535 | } | ||
536 | |||
537 | /* | ||
538 | * Write out one revoke record. We need to create a new descriptor | ||
539 | * block if the old one is full or if we have not already created one. | ||
540 | */ | ||
541 | |||
542 | static void write_one_revoke_record(journal_t *journal, | ||
543 | transaction_t *transaction, | ||
544 | struct journal_head **descriptorp, | ||
545 | int *offsetp, | ||
546 | struct jbd_revoke_record_s *record) | ||
547 | { | ||
548 | struct journal_head *descriptor; | ||
549 | int offset; | ||
550 | journal_header_t *header; | ||
551 | |||
552 | /* If we are already aborting, this all becomes a noop. We | ||
553 | still need to go round the loop in | ||
554 | journal_write_revoke_records in order to free all of the | ||
555 | revoke records: only the IO to the journal is omitted. */ | ||
556 | if (is_journal_aborted(journal)) | ||
557 | return; | ||
558 | |||
559 | descriptor = *descriptorp; | ||
560 | offset = *offsetp; | ||
561 | |||
562 | /* Make sure we have a descriptor with space left for the record */ | ||
563 | if (descriptor) { | ||
564 | if (offset == journal->j_blocksize) { | ||
565 | flush_descriptor(journal, descriptor, offset); | ||
566 | descriptor = NULL; | ||
567 | } | ||
568 | } | ||
569 | |||
570 | if (!descriptor) { | ||
571 | descriptor = journal_get_descriptor_buffer(journal); | ||
572 | if (!descriptor) | ||
573 | return; | ||
574 | header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; | ||
575 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
576 | header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); | ||
577 | header->h_sequence = cpu_to_be32(transaction->t_tid); | ||
578 | |||
579 | /* Record it so that we can wait for IO completion later */ | ||
580 | JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); | ||
581 | journal_file_buffer(descriptor, transaction, BJ_LogCtl); | ||
582 | |||
583 | offset = sizeof(journal_revoke_header_t); | ||
584 | *descriptorp = descriptor; | ||
585 | } | ||
586 | |||
587 | * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = | ||
588 | cpu_to_be32(record->blocknr); | ||
589 | offset += 4; | ||
590 | *offsetp = offset; | ||
591 | } | ||
592 | |||
593 | /* | ||
594 | * Flush a revoke descriptor out to the journal. If we are aborting, | ||
595 | * this is a noop; otherwise we are generating a buffer which needs to | ||
596 | * be waited for during commit, so it has to go onto the appropriate | ||
597 | * journal buffer list. | ||
598 | */ | ||
599 | |||
600 | static void flush_descriptor(journal_t *journal, | ||
601 | struct journal_head *descriptor, | ||
602 | int offset) | ||
603 | { | ||
604 | journal_revoke_header_t *header; | ||
605 | struct buffer_head *bh = jh2bh(descriptor); | ||
606 | |||
607 | if (is_journal_aborted(journal)) { | ||
608 | put_bh(bh); | ||
609 | return; | ||
610 | } | ||
611 | |||
612 | header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; | ||
613 | header->r_count = cpu_to_be32(offset); | ||
614 | set_buffer_jwrite(bh); | ||
615 | BUFFER_TRACE(bh, "write"); | ||
616 | set_buffer_dirty(bh); | ||
617 | ll_rw_block(SWRITE, 1, &bh); | ||
618 | } | ||
619 | #endif | ||
620 | |||
621 | /* | ||
622 | * Revoke support for recovery. | ||
623 | * | ||
624 | * Recovery needs to be able to: | ||
625 | * | ||
626 | * record all revoke records, including the tid of the latest instance | ||
627 | * of each revoke in the journal | ||
628 | * | ||
629 | * check whether a given block in a given transaction should be replayed | ||
630 | * (ie. has not been revoked by a revoke record in that or a subsequent | ||
631 | * transaction) | ||
632 | * | ||
633 | * empty the revoke table after recovery. | ||
634 | */ | ||
635 | |||
636 | /* | ||
637 | * First, setting revoke records. We create a new revoke record for | ||
638 | * every block ever revoked in the log as we scan it for recovery, and | ||
639 | * we update the existing records if we find multiple revokes for a | ||
640 | * single block. | ||
641 | */ | ||
642 | |||
643 | int journal_set_revoke(journal_t *journal, | ||
644 | unsigned long blocknr, | ||
645 | tid_t sequence) | ||
646 | { | ||
647 | struct jbd_revoke_record_s *record; | ||
648 | |||
649 | record = find_revoke_record(journal, blocknr); | ||
650 | if (record) { | ||
651 | /* If we have multiple occurrences, only record the | ||
652 | * latest sequence number in the hashed record */ | ||
653 | if (tid_gt(sequence, record->sequence)) | ||
654 | record->sequence = sequence; | ||
655 | return 0; | ||
656 | } | ||
657 | return insert_revoke_hash(journal, blocknr, sequence); | ||
658 | } | ||
659 | |||
660 | /* | ||
661 | * Test revoke records. For a given block referenced in the log, has | ||
662 | * that block been revoked? A revoke record with a given transaction | ||
663 | * sequence number revokes all blocks in that transaction and earlier | ||
664 | * ones, but later transactions still need replayed. | ||
665 | */ | ||
666 | |||
667 | int journal_test_revoke(journal_t *journal, | ||
668 | unsigned long blocknr, | ||
669 | tid_t sequence) | ||
670 | { | ||
671 | struct jbd_revoke_record_s *record; | ||
672 | |||
673 | record = find_revoke_record(journal, blocknr); | ||
674 | if (!record) | ||
675 | return 0; | ||
676 | if (tid_gt(sequence, record->sequence)) | ||
677 | return 0; | ||
678 | return 1; | ||
679 | } | ||
680 | |||
681 | /* | ||
682 | * Finally, once recovery is over, we need to clear the revoke table so | ||
683 | * that it can be reused by the running filesystem. | ||
684 | */ | ||
685 | |||
686 | void journal_clear_revoke(journal_t *journal) | ||
687 | { | ||
688 | int i; | ||
689 | struct list_head *hash_list; | ||
690 | struct jbd_revoke_record_s *record; | ||
691 | struct jbd_revoke_table_s *revoke; | ||
692 | |||
693 | revoke = journal->j_revoke; | ||
694 | |||
695 | for (i = 0; i < revoke->hash_size; i++) { | ||
696 | hash_list = &revoke->hash_table[i]; | ||
697 | while (!list_empty(hash_list)) { | ||
698 | record = (struct jbd_revoke_record_s*) hash_list->next; | ||
699 | list_del(&record->hash); | ||
700 | kmem_cache_free(revoke_record_cache, record); | ||
701 | } | ||
702 | } | ||
703 | } | ||
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c new file mode 100644 index 000000000000..e1b3c8af4d17 --- /dev/null +++ b/fs/jbd2/transaction.c | |||
@@ -0,0 +1,2080 @@ | |||
1 | /* | ||
2 | * linux/fs/transaction.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Generic filesystem transaction handling code; part of the ext2fs | ||
13 | * journaling system. | ||
14 | * | ||
15 | * This file manages transactions (compound commits managed by the | ||
16 | * journaling code) and handles (individual atomic operations by the | ||
17 | * filesystem). | ||
18 | */ | ||
19 | |||
20 | #include <linux/time.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/jbd.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/smp_lock.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/highmem.h> | ||
29 | |||
30 | /* | ||
31 | * get_transaction: set up a caller-supplied transaction_t as the journal's running transaction. | ||
32 | * | ||
33 | * Initialise the transaction passed in by the caller (it is not allocated here). Create it in | ||
34 | * RUNNING state and add it to the current journal (which should not | ||
35 | * have an existing running transaction: we only make a new transaction | ||
36 | * once we have started to commit the old one). | ||
37 | * | ||
38 | * Preconditions: | ||
39 | * The journal MUST be locked. No allocation happens in this function: the | ||
40 | * caller hands in the transaction_t storage (start_this_handle() allocates and zeroes it first), and only the fields assigned below are set here. | ||
41 | * The commit timer is armed to fire at t_expires, and the same transaction pointer is returned. | ||
42 | * | ||
43 | * Called under j_state_lock | ||
44 | */ | ||
45 | |||
46 | static transaction_t * | ||
47 | get_transaction(journal_t *journal, transaction_t *transaction) | ||
48 | { | ||
49 | transaction->t_journal = journal; | ||
50 | transaction->t_state = T_RUNNING; | ||
51 | transaction->t_tid = journal->j_transaction_sequence++; | ||
52 | transaction->t_expires = jiffies + journal->j_commit_interval; | ||
53 | spin_lock_init(&transaction->t_handle_lock); | ||
54 | |||
55 | /* Set up the commit timer for the new transaction. */ | ||
56 | journal->j_commit_timer.expires = transaction->t_expires; | ||
57 | add_timer(&journal->j_commit_timer); | ||
58 | |||
59 | J_ASSERT(journal->j_running_transaction == NULL); | ||
60 | journal->j_running_transaction = transaction; | ||
61 | |||
62 | return transaction; | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * Handle management. | ||
67 | * | ||
68 | * A handle_t is an object which represents a single atomic update to a | ||
69 | * filesystem, and which tracks all of the modifications which form part | ||
70 | * of that one update. | ||
71 | */ | ||
72 | |||
73 | /* | ||
74 | * start_this_handle: Given a handle, deal with any locking or stalling | ||
75 | * needed to make sure that there is enough journal space for the handle | ||
76 | * to begin. Attach the handle to a transaction and set up the | ||
77 | * transaction's buffer credits. Returns 0 on success, -ENOSPC if the handle asks for more than j_max_transaction_buffers credits, -ENOMEM if a new transaction cannot be allocated, or -EROFS if the journal is aborted or has an error that has not been acknowledged (JFS_ACK_ERR clear). | ||
78 | */ | ||
79 | |||
80 | static int start_this_handle(journal_t *journal, handle_t *handle) | ||
81 | { | ||
82 | transaction_t *transaction; | ||
83 | int needed; | ||
84 | int nblocks = handle->h_buffer_credits; | ||
85 | transaction_t *new_transaction = NULL; | ||
86 | int ret = 0; | ||
87 | |||
88 | if (nblocks > journal->j_max_transaction_buffers) { | ||
89 | printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", | ||
90 | current->comm, nblocks, | ||
91 | journal->j_max_transaction_buffers); | ||
92 | ret = -ENOSPC; | ||
93 | goto out; | ||
94 | } | ||
95 | |||
96 | alloc_transaction: | ||
97 | if (!journal->j_running_transaction) { | ||
98 | new_transaction = jbd_kmalloc(sizeof(*new_transaction), | ||
99 | GFP_NOFS); | ||
100 | if (!new_transaction) { | ||
101 | ret = -ENOMEM; | ||
102 | goto out; | ||
103 | } | ||
104 | memset(new_transaction, 0, sizeof(*new_transaction)); | ||
105 | } | ||
106 | |||
107 | jbd_debug(3, "New handle %p going live.\n", handle); | ||
108 | |||
109 | repeat: | ||
110 | |||
111 | /* | ||
112 | * We need to hold j_state_lock until t_updates has been incremented, | ||
113 | * for proper journal barrier handling | ||
114 | */ | ||
115 | spin_lock(&journal->j_state_lock); | ||
116 | repeat_locked: | ||
117 | if (is_journal_aborted(journal) || | ||
118 | (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { | ||
119 | spin_unlock(&journal->j_state_lock); | ||
120 | ret = -EROFS; | ||
121 | goto out; | ||
122 | } | ||
123 | |||
124 | /* Wait on the journal's transaction barrier if necessary */ | ||
125 | if (journal->j_barrier_count) { | ||
126 | spin_unlock(&journal->j_state_lock); | ||
127 | wait_event(journal->j_wait_transaction_locked, | ||
128 | journal->j_barrier_count == 0); | ||
129 | goto repeat; | ||
130 | } | ||
131 | |||
132 | if (!journal->j_running_transaction) { | ||
133 | if (!new_transaction) { | ||
134 | spin_unlock(&journal->j_state_lock); | ||
135 | goto alloc_transaction; | ||
136 | } | ||
137 | get_transaction(journal, new_transaction); | ||
138 | new_transaction = NULL; | ||
139 | } | ||
140 | |||
141 | transaction = journal->j_running_transaction; | ||
142 | |||
143 | /* | ||
144 | * If the current transaction is locked down for commit, wait for the | ||
145 | * lock to be released. | ||
146 | */ | ||
147 | if (transaction->t_state == T_LOCKED) { | ||
148 | DEFINE_WAIT(wait); | ||
149 | |||
150 | prepare_to_wait(&journal->j_wait_transaction_locked, | ||
151 | &wait, TASK_UNINTERRUPTIBLE); | ||
152 | spin_unlock(&journal->j_state_lock); | ||
153 | schedule(); | ||
154 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
155 | goto repeat; | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * If there is not enough space left in the log to write all potential | ||
160 | * buffers requested by this operation, we need to stall pending a log | ||
161 | * checkpoint to free some more log space. | ||
162 | */ | ||
163 | spin_lock(&transaction->t_handle_lock); | ||
164 | needed = transaction->t_outstanding_credits + nblocks; | ||
165 | |||
166 | if (needed > journal->j_max_transaction_buffers) { | ||
167 | /* | ||
168 | * If the current transaction is already too large, then start | ||
169 | * to commit it: we can then go back and attach this handle to | ||
170 | * a new transaction. | ||
171 | */ | ||
172 | DEFINE_WAIT(wait); | ||
173 | |||
174 | jbd_debug(2, "Handle %p starting new commit...\n", handle); | ||
175 | spin_unlock(&transaction->t_handle_lock); | ||
176 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
177 | TASK_UNINTERRUPTIBLE); | ||
178 | __log_start_commit(journal, transaction->t_tid); | ||
179 | spin_unlock(&journal->j_state_lock); | ||
180 | schedule(); | ||
181 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
182 | goto repeat; | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * The commit code assumes that it can get enough log space | ||
187 | * without forcing a checkpoint. This is *critical* for | ||
188 | * correctness: a checkpoint of a buffer which is also | ||
189 | * associated with a committing transaction creates a deadlock, | ||
190 | * so commit simply cannot force through checkpoints. | ||
191 | * | ||
192 | * We must therefore ensure the necessary space in the journal | ||
193 | * *before* starting to dirty potentially checkpointed buffers | ||
194 | * in the new transaction. | ||
195 | * | ||
196 | * The worst part is, any transaction currently committing can | ||
197 | * reduce the free space arbitrarily. Be careful to account for | ||
198 | * those buffers when checkpointing. | ||
199 | */ | ||
200 | |||
201 | /* | ||
202 | * @@@ AKPM: This seems rather over-defensive. We're giving commit | ||
203 | * a _lot_ of headroom: 1/4 of the journal plus the size of | ||
204 | * the committing transaction. Really, we only need to give it | ||
205 | * committing_transaction->t_outstanding_credits plus "enough" for | ||
206 | * the log control blocks. | ||
207 | * Also, this test is inconsistent with the matching one in | ||
208 | * journal_extend(). | ||
209 | */ | ||
210 | if (__log_space_left(journal) < jbd_space_needed(journal)) { | ||
211 | jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); | ||
212 | spin_unlock(&transaction->t_handle_lock); | ||
213 | __log_wait_for_space(journal); | ||
214 | goto repeat_locked; | ||
215 | } | ||
216 | |||
217 | /* OK, account for the buffers that this operation expects to | ||
218 | * use and add the handle to the running transaction. */ | ||
219 | |||
220 | handle->h_transaction = transaction; | ||
221 | transaction->t_outstanding_credits += nblocks; | ||
222 | transaction->t_updates++; | ||
223 | transaction->t_handle_count++; | ||
224 | jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", | ||
225 | handle, nblocks, transaction->t_outstanding_credits, | ||
226 | __log_space_left(journal)); | ||
227 | spin_unlock(&transaction->t_handle_lock); | ||
228 | spin_unlock(&journal->j_state_lock); | ||
229 | out: | ||
230 | if (unlikely(new_transaction)) /* It's usually NULL */ | ||
231 | kfree(new_transaction); | ||
232 | return ret; | ||
233 | } | ||
234 | |||
235 | /* Allocate a zeroed handle carrying nblocks buffer credits and a reference count of 1; returns NULL if the allocation fails. This should probably be in a slab... */ | ||
236 | static handle_t *new_handle(int nblocks) | ||
237 | { | ||
238 | handle_t *handle = jbd_alloc_handle(GFP_NOFS); | ||
239 | if (!handle) | ||
240 | return NULL; | ||
241 | memset(handle, 0, sizeof(*handle)); | ||
242 | handle->h_buffer_credits = nblocks; | ||
243 | handle->h_ref = 1; | ||
244 | |||
245 | return handle; | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * handle_t *journal_start() - Obtain a new handle. | ||
250 | * @journal: Journal to start transaction on. | ||
251 | * @nblocks: number of block buffer we might modify | ||
252 | * | ||
253 | * We make sure that the transaction can guarantee at least nblocks of | ||
254 | * modified buffers in the log. We block until the log can guarantee | ||
255 | * that much space. | ||
256 | * | ||
257 | * This function is visible to journal users (like ext3fs), so is not | ||
258 | * called with the journal already locked. | ||
259 | * If the calling task already has a handle (journal_current_handle()), that handle is returned with h_ref incremented and nblocks is not used. | ||
260 | * Return a pointer to the handle, or an ERR_PTR() value on failure: -EROFS if journal is NULL, -ENOMEM if the handle cannot be allocated, or the error returned by start_this_handle(). NULL is never returned. | ||
261 | */ | ||
262 | handle_t *journal_start(journal_t *journal, int nblocks) | ||
263 | { | ||
264 | handle_t *handle = journal_current_handle(); | ||
265 | int err; | ||
266 | |||
267 | if (!journal) | ||
268 | return ERR_PTR(-EROFS); | ||
269 | |||
270 | if (handle) { | ||
271 | J_ASSERT(handle->h_transaction->t_journal == journal); | ||
272 | handle->h_ref++; | ||
273 | return handle; | ||
274 | } | ||
275 | |||
276 | handle = new_handle(nblocks); | ||
277 | if (!handle) | ||
278 | return ERR_PTR(-ENOMEM); | ||
279 | |||
280 | current->journal_info = handle; | ||
281 | |||
282 | err = start_this_handle(journal, handle); | ||
283 | if (err < 0) { | ||
284 | jbd_free_handle(handle); | ||
285 | current->journal_info = NULL; | ||
286 | handle = ERR_PTR(err); | ||
287 | } | ||
288 | return handle; | ||
289 | } | ||
290 | |||
291 | /** | ||
292 | * int journal_extend() - extend buffer credits. | ||
293 | * @handle: handle to 'extend' | ||
294 | * @nblocks: nr blocks to try to extend by. | ||
295 | * | ||
296 | * Some transactions, such as large extends and truncates, can be done | ||
297 | * atomically all at once or in several stages. The operation requests | ||
298 | * a credit for a number of buffer modifications in advance, but can | ||
299 | * extend its credit if it needs more. | ||
300 | * | ||
301 | * journal_extend tries to give the running handle more buffer credits. | ||
302 | * It does not guarantee that allocation - this is a best-effort only. | ||
303 | * The calling process MUST be able to deal cleanly with a failure to | ||
304 | * extend here. | ||
305 | * | ||
306 | * Return 0 on success, non-zero on failure. | ||
307 | * | ||
308 | * return code < 0 implies an error (-EIO when the handle is aborted) | ||
309 | * return code > 0 implies normal transaction-full status (transaction not running, transaction would become too large, or insufficient log space). | ||
310 | */ | ||
311 | int journal_extend(handle_t *handle, int nblocks) | ||
312 | { | ||
313 | transaction_t *transaction = handle->h_transaction; | ||
314 | journal_t *journal = transaction->t_journal; | ||
315 | int result; | ||
316 | int wanted; | ||
317 | |||
318 | result = -EIO; | ||
319 | if (is_handle_aborted(handle)) | ||
320 | goto out; | ||
321 | |||
322 | result = 1; | ||
323 | |||
324 | spin_lock(&journal->j_state_lock); | ||
325 | |||
326 | /* Don't extend a locked-down transaction! */ | ||
327 | if (handle->h_transaction->t_state != T_RUNNING) { | ||
328 | jbd_debug(3, "denied handle %p %d blocks: " | ||
329 | "transaction not running\n", handle, nblocks); | ||
330 | goto error_out; | ||
331 | } | ||
332 | |||
333 | spin_lock(&transaction->t_handle_lock); | ||
334 | wanted = transaction->t_outstanding_credits + nblocks; | ||
335 | |||
336 | if (wanted > journal->j_max_transaction_buffers) { | ||
337 | jbd_debug(3, "denied handle %p %d blocks: " | ||
338 | "transaction too large\n", handle, nblocks); | ||
339 | goto unlock; | ||
340 | } | ||
341 | |||
342 | if (wanted > __log_space_left(journal)) { | ||
343 | jbd_debug(3, "denied handle %p %d blocks: " | ||
344 | "insufficient log space\n", handle, nblocks); | ||
345 | goto unlock; | ||
346 | } | ||
347 | |||
348 | handle->h_buffer_credits += nblocks; | ||
349 | transaction->t_outstanding_credits += nblocks; | ||
350 | result = 0; | ||
351 | |||
352 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); | ||
353 | unlock: | ||
354 | spin_unlock(&transaction->t_handle_lock); | ||
355 | error_out: | ||
356 | spin_unlock(&journal->j_state_lock); | ||
357 | out: | ||
358 | return result; | ||
359 | } | ||
360 | |||
361 | |||
362 | /** | ||
363 | * int journal_restart() - restart a handle . | ||
364 | * @handle: handle to restart | ||
365 | * @nblocks: nr credits requested | ||
366 | * | ||
367 | * Restart a handle for a multi-transaction filesystem | ||
368 | * operation. | ||
369 | * | ||
370 | * If the journal_extend() call above fails to grant new buffer credits | ||
371 | * to a running handle, a call to journal_restart will commit the | ||
372 | * handle's transaction so far and reattach the handle to a new | ||
373 | * transaction capable of guaranteeing the requested number of | ||
374 | * credits. Returns 0 without doing anything if the handle is aborted, otherwise the result of start_this_handle(). | ||
375 | */ | ||
376 | |||
377 | int journal_restart(handle_t *handle, int nblocks) | ||
378 | { | ||
379 | transaction_t *transaction = handle->h_transaction; | ||
380 | journal_t *journal = transaction->t_journal; | ||
381 | int ret; | ||
382 | |||
383 | /* If we've had an abort of any type, don't even think about | ||
384 | * actually doing the restart! */ | ||
385 | if (is_handle_aborted(handle)) | ||
386 | return 0; | ||
387 | |||
388 | /* | ||
389 | * First unlink the handle from its current transaction, and start the | ||
390 | * commit on that. | ||
391 | */ | ||
392 | J_ASSERT(transaction->t_updates > 0); | ||
393 | J_ASSERT(journal_current_handle() == handle); | ||
394 | |||
395 | spin_lock(&journal->j_state_lock); | ||
396 | spin_lock(&transaction->t_handle_lock); | ||
397 | transaction->t_outstanding_credits -= handle->h_buffer_credits; | ||
398 | transaction->t_updates--; | ||
399 | |||
400 | if (!transaction->t_updates) | ||
401 | wake_up(&journal->j_wait_updates); | ||
402 | spin_unlock(&transaction->t_handle_lock); | ||
403 | |||
404 | jbd_debug(2, "restarting handle %p\n", handle); | ||
405 | __log_start_commit(journal, transaction->t_tid); | ||
406 | spin_unlock(&journal->j_state_lock); | ||
407 | |||
408 | handle->h_buffer_credits = nblocks; | ||
409 | ret = start_this_handle(journal, handle); | ||
410 | return ret; | ||
411 | } | ||
412 | |||
413 | |||
414 | /** | ||
415 | * void journal_lock_updates () - establish a transaction barrier. | ||
416 | * @journal: Journal to establish a barrier on. | ||
417 | * | ||
418 | * This locks out any further updates from being started, and blocks | ||
419 | * until all existing updates have completed, returning only once the | ||
420 | * journal is in a quiescent state with no updates running. | ||
421 | * | ||
422 | * The journal lock should not be held on entry. On return j_barrier_count has been incremented and the j_barrier mutex is held. | ||
423 | */ | ||
424 | void journal_lock_updates(journal_t *journal) | ||
425 | { | ||
426 | DEFINE_WAIT(wait); | ||
427 | |||
428 | spin_lock(&journal->j_state_lock); | ||
429 | ++journal->j_barrier_count; | ||
430 | |||
431 | /* Wait until there are no running updates */ | ||
432 | while (1) { | ||
433 | transaction_t *transaction = journal->j_running_transaction; | ||
434 | |||
435 | if (!transaction) | ||
436 | break; | ||
437 | |||
438 | spin_lock(&transaction->t_handle_lock); | ||
439 | if (!transaction->t_updates) { | ||
440 | spin_unlock(&transaction->t_handle_lock); | ||
441 | break; | ||
442 | } | ||
443 | prepare_to_wait(&journal->j_wait_updates, &wait, | ||
444 | TASK_UNINTERRUPTIBLE); | ||
445 | spin_unlock(&transaction->t_handle_lock); | ||
446 | spin_unlock(&journal->j_state_lock); | ||
447 | schedule(); | ||
448 | finish_wait(&journal->j_wait_updates, &wait); | ||
449 | spin_lock(&journal->j_state_lock); | ||
450 | } | ||
451 | spin_unlock(&journal->j_state_lock); | ||
452 | |||
453 | /* | ||
454 | * We have now established a barrier against other normal updates, but | ||
455 | * we also need to barrier against other journal_lock_updates() calls | ||
456 | * to make sure that we serialise special journal-locked operations | ||
457 | * too. | ||
458 | */ | ||
459 | mutex_lock(&journal->j_barrier); | ||
460 | } | ||
461 | |||
462 | /** | ||
463 | * void journal_unlock_updates (journal_t* journal) - release barrier | ||
464 | * @journal: Journal to release the barrier on. | ||
465 | * | ||
466 | * Release a transaction barrier obtained with journal_lock_updates(): drop the j_barrier mutex, decrement j_barrier_count under j_state_lock, then wake waiters on j_wait_transaction_locked. | ||
467 | * | ||
468 | * Should be called without the journal lock held. | ||
469 | */ | ||
470 | void journal_unlock_updates (journal_t *journal) | ||
471 | { | ||
472 | J_ASSERT(journal->j_barrier_count != 0); | ||
473 | |||
474 | mutex_unlock(&journal->j_barrier); | ||
475 | spin_lock(&journal->j_state_lock); | ||
476 | --journal->j_barrier_count; | ||
477 | spin_unlock(&journal->j_state_lock); | ||
478 | wake_up(&journal->j_wait_transaction_locked); | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * Report any unexpected dirty buffers which turn up. Normally those | ||
483 | * indicate an error, but they can occur if the user is running (say) | ||
484 | * tune2fs to modify the live filesystem, so we need the option of | ||
485 | * continuing as gracefully as possible. For a buffer on the Metadata, Reserved, Shadow or Forget list, a set dirty bit is cleared and replaced by the jbddirty bit; buffers on any other list are left untouched. | ||
486 | * | ||
487 | * The caller should already hold the journal lock and | ||
488 | * j_list_lock spinlock: most callers will need those anyway | ||
489 | * in order to probe the buffer's journaling state safely. | ||
490 | */ | ||
491 | static void jbd_unexpected_dirty_buffer(struct journal_head *jh) | ||
492 | { | ||
493 | int jlist; | ||
494 | |||
495 | /* If this buffer is one which might reasonably be dirty | ||
496 | * --- ie. data, or not part of this journal --- then | ||
497 | * we're OK to leave it alone, but otherwise we need to | ||
498 | * move the dirty bit to the journal's own internal | ||
499 | * JBDDirty bit. */ | ||
500 | jlist = jh->b_jlist; | ||
501 | |||
502 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | ||
503 | jlist == BJ_Shadow || jlist == BJ_Forget) { | ||
504 | struct buffer_head *bh = jh2bh(jh); | ||
505 | |||
506 | if (test_clear_buffer_dirty(bh)) | ||
507 | set_buffer_jbddirty(bh); | ||
508 | } | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * If the buffer is already part of the current transaction, then there | ||
513 | * is nothing we need to do. If it is already part of a prior | ||
514 | * transaction which we are still committing to disk, then we need to | ||
515 | * make sure that we do not overwrite the old copy: we do copy-out to | ||
516 | * preserve the copy going to disk. We also account the buffer against | ||
517 | * the handle's metadata buffer credits (unless the buffer is already | ||
518 | * part of the transaction, that is). NOTE(review): no credit accounting is visible in this function body; confirm where it is done. | ||
519 | * A non-zero force_copy forces the frozen copy even if the buffer is on the committing transaction's Forget list. Returns 0 on success, -EROFS if the handle is aborted, or -ENOMEM if the frozen buffer cannot be allocated. | ||
520 | */ | ||
521 | static int | ||
522 | do_get_write_access(handle_t *handle, struct journal_head *jh, | ||
523 | int force_copy) | ||
524 | { | ||
525 | struct buffer_head *bh; | ||
526 | transaction_t *transaction; | ||
527 | journal_t *journal; | ||
528 | int error; | ||
529 | char *frozen_buffer = NULL; | ||
530 | int need_copy = 0; | ||
531 | |||
532 | if (is_handle_aborted(handle)) | ||
533 | return -EROFS; | ||
534 | |||
535 | transaction = handle->h_transaction; | ||
536 | journal = transaction->t_journal; | ||
537 | |||
538 | jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); | ||
539 | |||
540 | JBUFFER_TRACE(jh, "entry"); | ||
541 | repeat: | ||
542 | bh = jh2bh(jh); | ||
543 | |||
544 | /* @@@ Need to check for errors here at some point. */ | ||
545 | |||
546 | lock_buffer(bh); | ||
547 | jbd_lock_bh_state(bh); | ||
548 | |||
549 | /* We now hold the buffer lock so it is safe to query the buffer | ||
550 | * state. Is the buffer dirty? | ||
551 | * | ||
552 | * If so, there are two possibilities. The buffer may be | ||
553 | * non-journaled, and undergoing a quite legitimate writeback. | ||
554 | * Otherwise, it is journaled, and we don't expect dirty buffers | ||
555 | * in that state (the buffers should be marked JBD_Dirty | ||
556 | * instead.) So either the IO is being done under our own | ||
557 | * control and this is a bug, or it's a third party IO such as | ||
558 | * dump(8) (which may leave the buffer scheduled for read --- | ||
559 | * ie. locked but not dirty) or tune2fs (which may actually have | ||
560 | * the buffer dirtied, ugh.) */ | ||
561 | |||
562 | if (buffer_dirty(bh)) { | ||
563 | /* | ||
564 | * First question: is this buffer already part of the current | ||
565 | * transaction or the existing committing transaction? | ||
566 | */ | ||
567 | if (jh->b_transaction) { | ||
568 | J_ASSERT_JH(jh, | ||
569 | jh->b_transaction == transaction || | ||
570 | jh->b_transaction == | ||
571 | journal->j_committing_transaction); | ||
572 | if (jh->b_next_transaction) | ||
573 | J_ASSERT_JH(jh, jh->b_next_transaction == | ||
574 | transaction); | ||
575 | } | ||
576 | /* | ||
577 | * In any case we need to clean the dirty flag and we must | ||
578 | * do it under the buffer lock to be sure we don't race | ||
579 | * with running write-out. | ||
580 | */ | ||
581 | JBUFFER_TRACE(jh, "Unexpected dirty buffer"); | ||
582 | jbd_unexpected_dirty_buffer(jh); | ||
583 | } | ||
584 | |||
585 | unlock_buffer(bh); | ||
586 | |||
587 | error = -EROFS; | ||
588 | if (is_handle_aborted(handle)) { | ||
589 | jbd_unlock_bh_state(bh); | ||
590 | goto out; | ||
591 | } | ||
592 | error = 0; | ||
593 | |||
594 | /* | ||
595 | * The buffer is already part of this transaction if b_transaction or | ||
596 | * b_next_transaction points to it | ||
597 | */ | ||
598 | if (jh->b_transaction == transaction || | ||
599 | jh->b_next_transaction == transaction) | ||
600 | goto done; | ||
601 | |||
602 | /* | ||
603 | * If there is already a copy-out version of this buffer, then we don't | ||
604 | * need to make another one | ||
605 | */ | ||
606 | if (jh->b_frozen_data) { | ||
607 | JBUFFER_TRACE(jh, "has frozen data"); | ||
608 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
609 | jh->b_next_transaction = transaction; | ||
610 | goto done; | ||
611 | } | ||
612 | |||
613 | /* Is there data here we need to preserve? */ | ||
614 | |||
615 | if (jh->b_transaction && jh->b_transaction != transaction) { | ||
616 | JBUFFER_TRACE(jh, "owned by older transaction"); | ||
617 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
618 | J_ASSERT_JH(jh, jh->b_transaction == | ||
619 | journal->j_committing_transaction); | ||
620 | |||
621 | /* There is one case we have to be very careful about. | ||
622 | * If the committing transaction is currently writing | ||
623 | * this buffer out to disk and has NOT made a copy-out, | ||
624 | * then we cannot modify the buffer contents at all | ||
625 | * right now. The essence of copy-out is that it is the | ||
626 | * extra copy, not the primary copy, which gets | ||
627 | * journaled. If the primary copy is already going to | ||
628 | * disk then we cannot do copy-out here. */ | ||
629 | |||
630 | if (jh->b_jlist == BJ_Shadow) { | ||
631 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
632 | wait_queue_head_t *wqh; | ||
633 | |||
634 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
635 | |||
636 | JBUFFER_TRACE(jh, "on shadow: sleep"); | ||
637 | jbd_unlock_bh_state(bh); | ||
638 | /* commit wakes up all shadow buffers after IO */ | ||
639 | for ( ; ; ) { | ||
640 | prepare_to_wait(wqh, &wait.wait, | ||
641 | TASK_UNINTERRUPTIBLE); | ||
642 | if (jh->b_jlist != BJ_Shadow) | ||
643 | break; | ||
644 | schedule(); | ||
645 | } | ||
646 | finish_wait(wqh, &wait.wait); | ||
647 | goto repeat; | ||
648 | } | ||
649 | |||
650 | /* Only do the copy if the currently-owning transaction | ||
651 | * still needs it. If it is on the Forget list, the | ||
652 | * committing transaction is past that stage. The | ||
653 | * buffer had better remain locked during the kmalloc, | ||
654 | * but that should be true --- we hold the journal lock | ||
655 | * still and the buffer is already on the BUF_JOURNAL | ||
656 | * list so won't be flushed. | ||
657 | * | ||
658 | * Subtle point, though: if this is a get_undo_access, | ||
659 | * then we will be relying on the frozen_data to contain | ||
660 | * the new value of the committed_data record after the | ||
661 | * transaction, so we HAVE to force the frozen_data copy | ||
662 | * in that case. */ | ||
663 | |||
664 | if (jh->b_jlist != BJ_Forget || force_copy) { | ||
665 | JBUFFER_TRACE(jh, "generate frozen data"); | ||
666 | if (!frozen_buffer) { | ||
667 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | ||
668 | jbd_unlock_bh_state(bh); | ||
669 | frozen_buffer = | ||
670 | jbd_slab_alloc(jh2bh(jh)->b_size, | ||
671 | GFP_NOFS); | ||
672 | if (!frozen_buffer) { | ||
673 | printk(KERN_EMERG | ||
674 | "%s: OOM for frozen_buffer\n", | ||
675 | __FUNCTION__); | ||
676 | JBUFFER_TRACE(jh, "oom!"); | ||
677 | error = -ENOMEM; | ||
678 | jbd_lock_bh_state(bh); | ||
679 | goto done; | ||
680 | } | ||
681 | goto repeat; | ||
682 | } | ||
683 | jh->b_frozen_data = frozen_buffer; | ||
684 | frozen_buffer = NULL; | ||
685 | need_copy = 1; | ||
686 | } | ||
687 | jh->b_next_transaction = transaction; | ||
688 | } | ||
689 | |||
690 | |||
691 | /* | ||
692 | * Finally, if the buffer is not journaled right now, we need to make | ||
693 | * sure it doesn't get written to disk before the caller actually | ||
694 | * commits the new data | ||
695 | */ | ||
696 | if (!jh->b_transaction) { | ||
697 | JBUFFER_TRACE(jh, "no transaction"); | ||
698 | J_ASSERT_JH(jh, !jh->b_next_transaction); | ||
699 | jh->b_transaction = transaction; | ||
700 | JBUFFER_TRACE(jh, "file as BJ_Reserved"); | ||
701 | spin_lock(&journal->j_list_lock); | ||
702 | __journal_file_buffer(jh, transaction, BJ_Reserved); | ||
703 | spin_unlock(&journal->j_list_lock); | ||
704 | } | ||
705 | |||
706 | done: | ||
707 | if (need_copy) { | ||
708 | struct page *page; | ||
709 | int offset; | ||
710 | char *source; | ||
711 | |||
712 | J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), | ||
713 | "Possible IO failure.\n"); | ||
714 | page = jh2bh(jh)->b_page; | ||
715 | offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; | ||
716 | source = kmap_atomic(page, KM_USER0); | ||
717 | memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); | ||
718 | kunmap_atomic(source, KM_USER0); | ||
719 | } | ||
720 | jbd_unlock_bh_state(bh); | ||
721 | |||
722 | /* | ||
723 | * If we are about to journal a buffer, then any revoke pending on it is | ||
724 | * no longer valid | ||
725 | */ | ||
726 | journal_cancel_revoke(handle, jh); | ||
727 | |||
728 | out: | ||
729 | if (unlikely(frozen_buffer)) /* It's usually NULL */ | ||
730 | jbd_slab_free(frozen_buffer, bh->b_size); | ||
731 | |||
732 | JBUFFER_TRACE(jh, "exit"); | ||
733 | return error; | ||
734 | } | ||
735 | |||
736 | /** | ||
737 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. | ||
738 | * @handle: transaction to add buffer modifications to | ||
739 | * @bh: bh to be used for metadata writes | ||
740 | * | ||
741 | * Takes a journal_head reference on bh for the duration of the call and invokes do_get_write_access() with force_copy = 0. | ||
742 | * Returns an error code or 0 on success. | ||
743 | * | ||
744 | * In full data journalling mode the buffer may be of type BJ_AsyncData, | ||
745 | * because we're write()ing a buffer which is also part of a shared mapping. | ||
746 | */ | ||
747 | |||
748 | int journal_get_write_access(handle_t *handle, struct buffer_head *bh) | ||
749 | { | ||
750 | struct journal_head *jh = journal_add_journal_head(bh); | ||
751 | int rc; | ||
752 | |||
753 | /* We do not want to get caught playing with fields which the | ||
754 | * log thread also manipulates. Make sure that the buffer | ||
755 | * completes any outstanding IO before proceeding. */ | ||
756 | rc = do_get_write_access(handle, jh, 0); | ||
757 | journal_put_journal_head(jh); | ||
758 | return rc; | ||
759 | } | ||
760 | |||
761 | |||
762 | /* | ||
763 | * When the user wants to journal a newly created buffer_head | ||
764 | * (ie. getblk() returned a new buffer and we are going to populate it | ||
765 | * manually rather than reading off disk), then we need to keep the | ||
766 | * buffer_head locked until it has been completely filled with new | ||
767 | * data. In this case, we should be able to make the assertion that | ||
768 | * the bh is not already part of an existing transaction. | ||
769 | * | ||
770 | * The buffer should already be locked by the caller by this point. | ||
771 | * There is no lock ranking violation: it was a newly created, | ||
772 | * unlocked buffer beforehand. */ | ||
773 | |||
774 | /** | ||
775 | * int journal_get_create_access () - notify intent to use newly created bh | ||
776 | * @handle: transaction to add the new buffer to | ||
777 | * @bh: new buffer. | ||
778 | * | ||
779 | * Call this if you create a new bh. Returns 0 on success or -EROFS if the handle is aborted; the journal_head reference taken on entry is dropped on both paths. | ||
780 | */ | ||
781 | int journal_get_create_access(handle_t *handle, struct buffer_head *bh) | ||
782 | { | ||
783 | transaction_t *transaction = handle->h_transaction; | ||
784 | journal_t *journal = transaction->t_journal; | ||
785 | struct journal_head *jh = journal_add_journal_head(bh); | ||
786 | int err; | ||
787 | |||
788 | jbd_debug(5, "journal_head %p\n", jh); | ||
789 | err = -EROFS; | ||
790 | if (is_handle_aborted(handle)) | ||
791 | goto out; | ||
792 | err = 0; | ||
793 | |||
794 | JBUFFER_TRACE(jh, "entry"); | ||
795 | /* | ||
796 | * The buffer may already belong to this transaction due to pre-zeroing | ||
797 | * in the filesystem's new_block code. It may also be on the previous, | ||
798 | * committing transaction's lists, but it HAS to be in Forget state in | ||
799 | * that case: the transaction must have deleted the buffer for it to be | ||
800 | * reused here. | ||
801 | */ | ||
802 | jbd_lock_bh_state(bh); | ||
803 | spin_lock(&journal->j_list_lock); | ||
804 | J_ASSERT_JH(jh, (jh->b_transaction == transaction || | ||
805 | jh->b_transaction == NULL || | ||
806 | (jh->b_transaction == journal->j_committing_transaction && | ||
807 | jh->b_jlist == BJ_Forget))); | ||
808 | |||
809 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
810 | J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); | ||
811 | |||
812 | if (jh->b_transaction == NULL) { | ||
813 | jh->b_transaction = transaction; | ||
814 | JBUFFER_TRACE(jh, "file as BJ_Reserved"); | ||
815 | __journal_file_buffer(jh, transaction, BJ_Reserved); | ||
816 | } else if (jh->b_transaction == journal->j_committing_transaction) { | ||
817 | JBUFFER_TRACE(jh, "set next transaction"); | ||
818 | jh->b_next_transaction = transaction; | ||
819 | } | ||
820 | spin_unlock(&journal->j_list_lock); | ||
821 | jbd_unlock_bh_state(bh); | ||
822 | |||
823 | /* | ||
824 | * akpm: I added this. ext3_alloc_branch can pick up new indirect | ||
825 | * blocks which contain freed but then revoked metadata. We need | ||
826 | * to cancel the revoke in case we end up freeing it yet again | ||
827 | * and the reallocating as data - this would cause a second revoke, | ||
828 | * which hits an assertion error. | ||
829 | */ | ||
830 | JBUFFER_TRACE(jh, "cancelling revoke"); | ||
831 | journal_cancel_revoke(handle, jh); | ||
832 | out: | ||
833 | journal_put_journal_head(jh); /* also reached on the abort path, so the reference from journal_add_journal_head() is never leaked */ | ||
834 | return err; | ||
835 | } | ||
836 | |||
837 | /** | ||
838 | * int journal_get_undo_access() - Notify intent to modify metadata with | ||
839 | * non-rewindable consequences | ||
840 | * @handle: transaction | ||
841 | * @bh: buffer to undo | ||
842 | * @credits: store the number of taken credits here (if not NULL) | ||
843 | * | ||
844 | * Sometimes there is a need to distinguish between metadata which has | ||
845 | * been committed to disk and that which has not. The ext3fs code uses | ||
846 | * this for freeing and allocating space, we have to make sure that we | ||
847 | * do not reuse freed space until the deallocation has been committed, | ||
848 | * since if we overwrote that space we would make the delete | ||
849 | * un-rewindable in case of a crash. | ||
850 | * | ||
851 | * To deal with that, journal_get_undo_access requests write access to a | ||
852 | * buffer for parts of non-rewindable operations such as delete | ||
853 | * operations on the bitmaps. The journaling code must keep a copy of | ||
854 | * the buffer's contents prior to the undo_access call until such time | ||
855 | * as we know that the buffer has definitely been committed to disk. | ||
856 | * | ||
857 | * We never need to know which transaction the committed data is part | ||
858 | * of, buffers touched here are guaranteed to be dirtied later and so | ||
859 | * will be committed to a new transaction in due course, at which point | ||
860 | * we can discard the old committed data pointer. | ||
861 | * | ||
862 | * Returns error number or 0 on success. | ||
863 | */ | ||
864 | int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) | ||
865 | { | ||
866 | int err; | ||
867 | struct journal_head *jh = journal_add_journal_head(bh); | ||
868 | char *committed_data = NULL; | ||
869 | |||
870 | JBUFFER_TRACE(jh, "entry"); | ||
871 | |||
872 | /* | ||
873 | * Do this first --- it can drop the journal lock, so we want to | ||
874 | * make sure that obtaining the committed_data is done | ||
875 | * atomically wrt. completion of any outstanding commits. | ||
876 | */ | ||
877 | err = do_get_write_access(handle, jh, 1); | ||
878 | if (err) | ||
879 | goto out; | ||
880 | |||
881 | repeat: | ||
882 | if (!jh->b_committed_data) { | ||
883 | committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS); | ||
884 | if (!committed_data) { | ||
885 | printk(KERN_EMERG "%s: No memory for committed data\n", | ||
886 | __FUNCTION__); | ||
887 | err = -ENOMEM; | ||
888 | goto out; | ||
889 | } | ||
890 | } | ||
891 | |||
892 | jbd_lock_bh_state(bh); | ||
893 | if (!jh->b_committed_data) { | ||
894 | /* Copy out the current buffer contents into the | ||
895 | * preserved, committed copy. */ | ||
896 | JBUFFER_TRACE(jh, "generate b_committed data"); | ||
897 | if (!committed_data) { | ||
898 | jbd_unlock_bh_state(bh); | ||
899 | goto repeat; | ||
900 | } | ||
901 | |||
902 | jh->b_committed_data = committed_data; | ||
903 | committed_data = NULL; | ||
904 | memcpy(jh->b_committed_data, bh->b_data, bh->b_size); | ||
905 | } | ||
906 | jbd_unlock_bh_state(bh); | ||
907 | out: | ||
908 | journal_put_journal_head(jh); | ||
909 | if (unlikely(committed_data)) | ||
910 | jbd_slab_free(committed_data, bh->b_size); | ||
911 | return err; | ||
912 | } | ||
913 | |||
914 | /** | ||
915 | * int journal_dirty_data() - mark a buffer as containing dirty data which | ||
916 | * needs to be flushed before we can commit the | ||
917 | * current transaction. | ||
918 | * @handle: transaction | ||
919 | * @bh: bufferhead to mark | ||
920 | * | ||
921 | * The buffer is placed on the transaction's data list and is marked as | ||
922 | * belonging to the transaction. | ||
923 | * | ||
924 | * Returns error number or 0 on success. | ||
925 | * | ||
926 | * journal_dirty_data() can be called via page_launder->ext3_writepage | ||
927 | * by kswapd. | ||
928 | */ | ||
929 | int journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
930 | { | ||
931 | journal_t *journal = handle->h_transaction->t_journal; | ||
932 | int need_brelse = 0; | ||
933 | struct journal_head *jh; | ||
934 | |||
935 | if (is_handle_aborted(handle)) | ||
936 | return 0; | ||
937 | |||
938 | jh = journal_add_journal_head(bh); | ||
939 | JBUFFER_TRACE(jh, "entry"); | ||
940 | |||
941 | /* | ||
942 | * The buffer could *already* be dirty. Writeout can start | ||
943 | * at any time. | ||
944 | */ | ||
945 | jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | ||
946 | |||
947 | /* | ||
948 | * What if the buffer is already part of a running transaction? | ||
949 | * | ||
950 | * There are two cases: | ||
951 | * 1) It is part of the current running transaction. Refile it, | ||
952 | * just in case we have allocated it as metadata, deallocated | ||
953 | * it, then reallocated it as data. | ||
954 | * 2) It is part of the previous, still-committing transaction. | ||
955 | * If all we want to do is to guarantee that the buffer will be | ||
956 | * written to disk before this new transaction commits, then | ||
957 | * being sure that the *previous* transaction has this same | ||
958 | * property is sufficient for us! Just leave it on its old | ||
959 | * transaction. | ||
960 | * | ||
961 | * In case (2), the buffer must not already exist as metadata | ||
962 | * --- that would violate write ordering (a transaction is free | ||
963 | * to write its data at any point, even before the previous | ||
964 | * committing transaction has committed). The caller must | ||
965 | * never, ever allow this to happen: there's nothing we can do | ||
966 | * about it in this layer. | ||
967 | */ | ||
968 | jbd_lock_bh_state(bh); | ||
969 | spin_lock(&journal->j_list_lock); | ||
970 | if (jh->b_transaction) { | ||
971 | JBUFFER_TRACE(jh, "has transaction"); | ||
972 | if (jh->b_transaction != handle->h_transaction) { | ||
973 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
974 | J_ASSERT_JH(jh, jh->b_transaction == | ||
975 | journal->j_committing_transaction); | ||
976 | |||
977 | /* @@@ IS THIS TRUE ? */ | ||
978 | /* | ||
979 | * Not any more. Scenario: someone does a write() | ||
980 | * in data=journal mode. The buffer's transaction has | ||
981 | * moved into commit. Then someone does another | ||
982 | * write() to the file. We do the frozen data copyout | ||
983 | * and set b_next_transaction to point to j_running_t. | ||
984 | * And while we're in that state, someone does a | ||
985 | * writepage() in an attempt to pageout the same area | ||
986 | * of the file via a shared mapping. At present that | ||
987 | * calls journal_dirty_data(), and we get right here. | ||
988 | * It may be too late to journal the data. Simply | ||
989 | * falling through to the next test will suffice: the | ||
990 | * data will be dirty and wil be checkpointed. The | ||
991 | * ordering comments in the next comment block still | ||
992 | * apply. | ||
993 | */ | ||
994 | //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
995 | |||
996 | /* | ||
997 | * If we're journalling data, and this buffer was | ||
998 | * subject to a write(), it could be metadata, forget | ||
999 | * or shadow against the committing transaction. Now, | ||
1000 | * someone has dirtied the same darn page via a mapping | ||
1001 | * and it is being writepage()'d. | ||
1002 | * We *could* just steal the page from commit, with some | ||
1003 | * fancy locking there. Instead, we just skip it - | ||
1004 | * don't tie the page's buffers to the new transaction | ||
1005 | * at all. | ||
1006 | * Implication: if we crash before the writepage() data | ||
1007 | * is written into the filesystem, recovery will replay | ||
1008 | * the write() data. | ||
1009 | */ | ||
1010 | if (jh->b_jlist != BJ_None && | ||
1011 | jh->b_jlist != BJ_SyncData && | ||
1012 | jh->b_jlist != BJ_Locked) { | ||
1013 | JBUFFER_TRACE(jh, "Not stealing"); | ||
1014 | goto no_journal; | ||
1015 | } | ||
1016 | |||
1017 | /* | ||
1018 | * This buffer may be undergoing writeout in commit. We | ||
1019 | * can't return from here and let the caller dirty it | ||
1020 | * again because that can cause the write-out loop in | ||
1021 | * commit to never terminate. | ||
1022 | */ | ||
1023 | if (buffer_dirty(bh)) { | ||
1024 | get_bh(bh); | ||
1025 | spin_unlock(&journal->j_list_lock); | ||
1026 | jbd_unlock_bh_state(bh); | ||
1027 | need_brelse = 1; | ||
1028 | sync_dirty_buffer(bh); | ||
1029 | jbd_lock_bh_state(bh); | ||
1030 | spin_lock(&journal->j_list_lock); | ||
1031 | /* The buffer may become locked again at any | ||
1032 | time if it is redirtied */ | ||
1033 | } | ||
1034 | |||
1035 | /* journal_clean_data_list() may have got there first */ | ||
1036 | if (jh->b_transaction != NULL) { | ||
1037 | JBUFFER_TRACE(jh, "unfile from commit"); | ||
1038 | __journal_temp_unlink_buffer(jh); | ||
1039 | /* It still points to the committing | ||
1040 | * transaction; move it to this one so | ||
1041 | * that the refile assert checks are | ||
1042 | * happy. */ | ||
1043 | jh->b_transaction = handle->h_transaction; | ||
1044 | } | ||
1045 | /* The buffer will be refiled below */ | ||
1046 | |||
1047 | } | ||
1048 | /* | ||
1049 | * Special case --- the buffer might actually have been | ||
1050 | * allocated and then immediately deallocated in the previous, | ||
1051 | * committing transaction, so might still be left on that | ||
1052 | * transaction's metadata lists. | ||
1053 | */ | ||
1054 | if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | ||
1055 | JBUFFER_TRACE(jh, "not on correct data list: unfile"); | ||
1056 | J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | ||
1057 | __journal_temp_unlink_buffer(jh); | ||
1058 | jh->b_transaction = handle->h_transaction; | ||
1059 | JBUFFER_TRACE(jh, "file as data"); | ||
1060 | __journal_file_buffer(jh, handle->h_transaction, | ||
1061 | BJ_SyncData); | ||
1062 | } | ||
1063 | } else { | ||
1064 | JBUFFER_TRACE(jh, "not on a transaction"); | ||
1065 | __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | ||
1066 | } | ||
1067 | no_journal: | ||
1068 | spin_unlock(&journal->j_list_lock); | ||
1069 | jbd_unlock_bh_state(bh); | ||
1070 | if (need_brelse) { | ||
1071 | BUFFER_TRACE(bh, "brelse"); | ||
1072 | __brelse(bh); | ||
1073 | } | ||
1074 | JBUFFER_TRACE(jh, "exit"); | ||
1075 | journal_put_journal_head(jh); | ||
1076 | return 0; | ||
1077 | } | ||
1078 | |||
1079 | /** | ||
1080 | * int journal_dirty_metadata() - mark a buffer as containing dirty metadata | ||
1081 | * @handle: transaction to add buffer to. | ||
1082 | * @bh: buffer to mark | ||
1083 | * | ||
1084 | * mark dirty metadata which needs to be journaled as part of the current | ||
1085 | * transaction. | ||
1086 | * | ||
1087 | * The buffer is placed on the transaction's metadata list and is marked | ||
1088 | * as belonging to the transaction. | ||
1089 | * | ||
1090 | * Returns error number or 0 on success. | ||
1091 | * | ||
1092 | * Special care needs to be taken if the buffer already belongs to the | ||
1093 | * current committing transaction (in which case we should have frozen | ||
1094 | * data present for that commit). In that case, we don't relink the | ||
1095 | * buffer: that only gets done when the old transaction finally | ||
1096 | * completes its commit. | ||
1097 | */ | ||
1098 | int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | ||
1099 | { | ||
1100 | transaction_t *transaction = handle->h_transaction; | ||
1101 | journal_t *journal = transaction->t_journal; | ||
1102 | struct journal_head *jh = bh2jh(bh); | ||
1103 | |||
1104 | jbd_debug(5, "journal_head %p\n", jh); | ||
1105 | JBUFFER_TRACE(jh, "entry"); | ||
1106 | if (is_handle_aborted(handle)) | ||
1107 | goto out; | ||
1108 | |||
1109 | jbd_lock_bh_state(bh); | ||
1110 | |||
1111 | if (jh->b_modified == 0) { | ||
1112 | /* | ||
1113 | * This buffer's got modified and becoming part | ||
1114 | * of the transaction. This needs to be done | ||
1115 | * once a transaction -bzzz | ||
1116 | */ | ||
1117 | jh->b_modified = 1; | ||
1118 | J_ASSERT_JH(jh, handle->h_buffer_credits > 0); | ||
1119 | handle->h_buffer_credits--; | ||
1120 | } | ||
1121 | |||
1122 | /* | ||
1123 | * fastpath, to avoid expensive locking. If this buffer is already | ||
1124 | * on the running transaction's metadata list there is nothing to do. | ||
1125 | * Nobody can take it off again because there is a handle open. | ||
1126 | * I _think_ we're OK here with SMP barriers - a mistaken decision will | ||
1127 | * result in this test being false, so we go in and take the locks. | ||
1128 | */ | ||
1129 | if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { | ||
1130 | JBUFFER_TRACE(jh, "fastpath"); | ||
1131 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1132 | journal->j_running_transaction); | ||
1133 | goto out_unlock_bh; | ||
1134 | } | ||
1135 | |||
1136 | set_buffer_jbddirty(bh); | ||
1137 | |||
1138 | /* | ||
1139 | * Metadata already on the current transaction list doesn't | ||
1140 | * need to be filed. Metadata on another transaction's list must | ||
1141 | * be committing, and will be refiled once the commit completes: | ||
1142 | * leave it alone for now. | ||
1143 | */ | ||
1144 | if (jh->b_transaction != transaction) { | ||
1145 | JBUFFER_TRACE(jh, "already on other transaction"); | ||
1146 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1147 | journal->j_committing_transaction); | ||
1148 | J_ASSERT_JH(jh, jh->b_next_transaction == transaction); | ||
1149 | /* And this case is illegal: we can't reuse another | ||
1150 | * transaction's data buffer, ever. */ | ||
1151 | goto out_unlock_bh; | ||
1152 | } | ||
1153 | |||
1154 | /* That test should have eliminated the following case: */ | ||
1155 | J_ASSERT_JH(jh, jh->b_frozen_data == 0); | ||
1156 | |||
1157 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); | ||
1158 | spin_lock(&journal->j_list_lock); | ||
1159 | __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); | ||
1160 | spin_unlock(&journal->j_list_lock); | ||
1161 | out_unlock_bh: | ||
1162 | jbd_unlock_bh_state(bh); | ||
1163 | out: | ||
1164 | JBUFFER_TRACE(jh, "exit"); | ||
1165 | return 0; | ||
1166 | } | ||
1167 | |||
1168 | /* | ||
1169 | * journal_release_buffer: undo a get_write_access without any buffer | ||
1170 | * updates, if the update decided in the end that it didn't need access. | ||
1171 | * | ||
1172 | */ | ||
1173 | void | ||
1174 | journal_release_buffer(handle_t *handle, struct buffer_head *bh) | ||
1175 | { | ||
1176 | BUFFER_TRACE(bh, "entry"); | ||
1177 | } | ||
1178 | |||
1179 | /** | ||
1180 | * void journal_forget() - bforget() for potentially-journaled buffers. | ||
1181 | * @handle: transaction handle | ||
1182 | * @bh: bh to 'forget' | ||
1183 | * | ||
1184 | * We can only do the bforget if there are no commits pending against the | ||
1185 | * buffer. If the buffer is dirty in the current running transaction we | ||
1186 | * can safely unlink it. | ||
1187 | * | ||
1188 | * bh may not be a journalled buffer at all - it may be a non-JBD | ||
1189 | * buffer which came off the hashtable. Check for this. | ||
1190 | * | ||
1191 | * Decrements bh->b_count by one. | ||
1192 | * | ||
1193 | * Allow this call even if the handle has aborted --- it may be part of | ||
1194 | * the caller's cleanup after an abort. | ||
1195 | */ | ||
1196 | int journal_forget (handle_t *handle, struct buffer_head *bh) | ||
1197 | { | ||
1198 | transaction_t *transaction = handle->h_transaction; | ||
1199 | journal_t *journal = transaction->t_journal; | ||
1200 | struct journal_head *jh; | ||
1201 | int drop_reserve = 0; | ||
1202 | int err = 0; | ||
1203 | |||
1204 | BUFFER_TRACE(bh, "entry"); | ||
1205 | |||
1206 | jbd_lock_bh_state(bh); | ||
1207 | spin_lock(&journal->j_list_lock); | ||
1208 | |||
1209 | if (!buffer_jbd(bh)) | ||
1210 | goto not_jbd; | ||
1211 | jh = bh2jh(bh); | ||
1212 | |||
1213 | /* Critical error: attempting to delete a bitmap buffer, maybe? | ||
1214 | * Don't do any jbd operations, and return an error. */ | ||
1215 | if (!J_EXPECT_JH(jh, !jh->b_committed_data, | ||
1216 | "inconsistent data on disk")) { | ||
1217 | err = -EIO; | ||
1218 | goto not_jbd; | ||
1219 | } | ||
1220 | |||
1221 | /* | ||
1222 | * The buffer's going from the transaction, we must drop | ||
1223 | * all references -bzzz | ||
1224 | */ | ||
1225 | jh->b_modified = 0; | ||
1226 | |||
1227 | if (jh->b_transaction == handle->h_transaction) { | ||
1228 | J_ASSERT_JH(jh, !jh->b_frozen_data); | ||
1229 | |||
1230 | /* If we are forgetting a buffer which is already part | ||
1231 | * of this transaction, then we can just drop it from | ||
1232 | * the transaction immediately. */ | ||
1233 | clear_buffer_dirty(bh); | ||
1234 | clear_buffer_jbddirty(bh); | ||
1235 | |||
1236 | JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); | ||
1237 | |||
1238 | drop_reserve = 1; | ||
1239 | |||
1240 | /* | ||
1241 | * We are no longer going to journal this buffer. | ||
1242 | * However, the commit of this transaction is still | ||
1243 | * important to the buffer: the delete that we are now | ||
1244 | * processing might obsolete an old log entry, so by | ||
1245 | * committing, we can satisfy the buffer's checkpoint. | ||
1246 | * | ||
1247 | * So, if we have a checkpoint on the buffer, we should | ||
1248 | * now refile the buffer on our BJ_Forget list so that | ||
1249 | * we know to remove the checkpoint after we commit. | ||
1250 | */ | ||
1251 | |||
1252 | if (jh->b_cp_transaction) { | ||
1253 | __journal_temp_unlink_buffer(jh); | ||
1254 | __journal_file_buffer(jh, transaction, BJ_Forget); | ||
1255 | } else { | ||
1256 | __journal_unfile_buffer(jh); | ||
1257 | journal_remove_journal_head(bh); | ||
1258 | __brelse(bh); | ||
1259 | if (!buffer_jbd(bh)) { | ||
1260 | spin_unlock(&journal->j_list_lock); | ||
1261 | jbd_unlock_bh_state(bh); | ||
1262 | __bforget(bh); | ||
1263 | goto drop; | ||
1264 | } | ||
1265 | } | ||
1266 | } else if (jh->b_transaction) { | ||
1267 | J_ASSERT_JH(jh, (jh->b_transaction == | ||
1268 | journal->j_committing_transaction)); | ||
1269 | /* However, if the buffer is still owned by a prior | ||
1270 | * (committing) transaction, we can't drop it yet... */ | ||
1271 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
1272 | /* ... but we CAN drop it from the new transaction if we | ||
1273 | * have also modified it since the original commit. */ | ||
1274 | |||
1275 | if (jh->b_next_transaction) { | ||
1276 | J_ASSERT(jh->b_next_transaction == transaction); | ||
1277 | jh->b_next_transaction = NULL; | ||
1278 | drop_reserve = 1; | ||
1279 | } | ||
1280 | } | ||
1281 | |||
1282 | not_jbd: | ||
1283 | spin_unlock(&journal->j_list_lock); | ||
1284 | jbd_unlock_bh_state(bh); | ||
1285 | __brelse(bh); | ||
1286 | drop: | ||
1287 | if (drop_reserve) { | ||
1288 | /* no need to reserve log space for this block -bzzz */ | ||
1289 | handle->h_buffer_credits++; | ||
1290 | } | ||
1291 | return err; | ||
1292 | } | ||
1293 | |||
1294 | /** | ||
1295 | * int journal_stop() - complete a transaction | ||
1296 | * @handle: tranaction to complete. | ||
1297 | * | ||
1298 | * All done for a particular handle. | ||
1299 | * | ||
1300 | * There is not much action needed here. We just return any remaining | ||
1301 | * buffer credits to the transaction and remove the handle. The only | ||
1302 | * complication is that we need to start a commit operation if the | ||
1303 | * filesystem is marked for synchronous update. | ||
1304 | * | ||
1305 | * journal_stop itself will not usually return an error, but it may | ||
1306 | * do so in unusual circumstances. In particular, expect it to | ||
1307 | * return -EIO if a journal_abort has been executed since the | ||
1308 | * transaction began. | ||
1309 | */ | ||
1310 | int journal_stop(handle_t *handle) | ||
1311 | { | ||
1312 | transaction_t *transaction = handle->h_transaction; | ||
1313 | journal_t *journal = transaction->t_journal; | ||
1314 | int old_handle_count, err; | ||
1315 | pid_t pid; | ||
1316 | |||
1317 | J_ASSERT(transaction->t_updates > 0); | ||
1318 | J_ASSERT(journal_current_handle() == handle); | ||
1319 | |||
1320 | if (is_handle_aborted(handle)) | ||
1321 | err = -EIO; | ||
1322 | else | ||
1323 | err = 0; | ||
1324 | |||
1325 | if (--handle->h_ref > 0) { | ||
1326 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, | ||
1327 | handle->h_ref); | ||
1328 | return err; | ||
1329 | } | ||
1330 | |||
1331 | jbd_debug(4, "Handle %p going down\n", handle); | ||
1332 | |||
1333 | /* | ||
1334 | * Implement synchronous transaction batching. If the handle | ||
1335 | * was synchronous, don't force a commit immediately. Let's | ||
1336 | * yield and let another thread piggyback onto this transaction. | ||
1337 | * Keep doing that while new threads continue to arrive. | ||
1338 | * It doesn't cost much - we're about to run a commit and sleep | ||
1339 | * on IO anyway. Speeds up many-threaded, many-dir operations | ||
1340 | * by 30x or more... | ||
1341 | * | ||
1342 | * But don't do this if this process was the most recent one to | ||
1343 | * perform a synchronous write. We do this to detect the case where a | ||
1344 | * single process is doing a stream of sync writes. No point in waiting | ||
1345 | * for joiners in that case. | ||
1346 | */ | ||
1347 | pid = current->pid; | ||
1348 | if (handle->h_sync && journal->j_last_sync_writer != pid) { | ||
1349 | journal->j_last_sync_writer = pid; | ||
1350 | do { | ||
1351 | old_handle_count = transaction->t_handle_count; | ||
1352 | schedule_timeout_uninterruptible(1); | ||
1353 | } while (old_handle_count != transaction->t_handle_count); | ||
1354 | } | ||
1355 | |||
1356 | current->journal_info = NULL; | ||
1357 | spin_lock(&journal->j_state_lock); | ||
1358 | spin_lock(&transaction->t_handle_lock); | ||
1359 | transaction->t_outstanding_credits -= handle->h_buffer_credits; | ||
1360 | transaction->t_updates--; | ||
1361 | if (!transaction->t_updates) { | ||
1362 | wake_up(&journal->j_wait_updates); | ||
1363 | if (journal->j_barrier_count) | ||
1364 | wake_up(&journal->j_wait_transaction_locked); | ||
1365 | } | ||
1366 | |||
1367 | /* | ||
1368 | * If the handle is marked SYNC, we need to set another commit | ||
1369 | * going! We also want to force a commit if the current | ||
1370 | * transaction is occupying too much of the log, or if the | ||
1371 | * transaction is too old now. | ||
1372 | */ | ||
1373 | if (handle->h_sync || | ||
1374 | transaction->t_outstanding_credits > | ||
1375 | journal->j_max_transaction_buffers || | ||
1376 | time_after_eq(jiffies, transaction->t_expires)) { | ||
1377 | /* Do this even for aborted journals: an abort still | ||
1378 | * completes the commit thread, it just doesn't write | ||
1379 | * anything to disk. */ | ||
1380 | tid_t tid = transaction->t_tid; | ||
1381 | |||
1382 | spin_unlock(&transaction->t_handle_lock); | ||
1383 | jbd_debug(2, "transaction too old, requesting commit for " | ||
1384 | "handle %p\n", handle); | ||
1385 | /* This is non-blocking */ | ||
1386 | __log_start_commit(journal, transaction->t_tid); | ||
1387 | spin_unlock(&journal->j_state_lock); | ||
1388 | |||
1389 | /* | ||
1390 | * Special case: JFS_SYNC synchronous updates require us | ||
1391 | * to wait for the commit to complete. | ||
1392 | */ | ||
1393 | if (handle->h_sync && !(current->flags & PF_MEMALLOC)) | ||
1394 | err = log_wait_commit(journal, tid); | ||
1395 | } else { | ||
1396 | spin_unlock(&transaction->t_handle_lock); | ||
1397 | spin_unlock(&journal->j_state_lock); | ||
1398 | } | ||
1399 | |||
1400 | jbd_free_handle(handle); | ||
1401 | return err; | ||
1402 | } | ||
1403 | |||
1404 | /**int journal_force_commit() - force any uncommitted transactions | ||
1405 | * @journal: journal to force | ||
1406 | * | ||
1407 | * For synchronous operations: force any uncommitted transactions | ||
1408 | * to disk. May seem kludgy, but it reuses all the handle batching | ||
1409 | * code in a very simple manner. | ||
1410 | */ | ||
1411 | int journal_force_commit(journal_t *journal) | ||
1412 | { | ||
1413 | handle_t *handle; | ||
1414 | int ret; | ||
1415 | |||
1416 | handle = journal_start(journal, 1); | ||
1417 | if (IS_ERR(handle)) { | ||
1418 | ret = PTR_ERR(handle); | ||
1419 | } else { | ||
1420 | handle->h_sync = 1; | ||
1421 | ret = journal_stop(handle); | ||
1422 | } | ||
1423 | return ret; | ||
1424 | } | ||
1425 | |||
1426 | /* | ||
1427 | * | ||
1428 | * List management code snippets: various functions for manipulating the | ||
1429 | * transaction buffer lists. | ||
1430 | * | ||
1431 | */ | ||
1432 | |||
1433 | /* | ||
1434 | * Append a buffer to a transaction list, given the transaction's list head | ||
1435 | * pointer. | ||
1436 | * | ||
1437 | * j_list_lock is held. | ||
1438 | * | ||
1439 | * jbd_lock_bh_state(jh2bh(jh)) is held. | ||
1440 | */ | ||
1441 | |||
1442 | static inline void | ||
1443 | __blist_add_buffer(struct journal_head **list, struct journal_head *jh) | ||
1444 | { | ||
1445 | if (!*list) { | ||
1446 | jh->b_tnext = jh->b_tprev = jh; | ||
1447 | *list = jh; | ||
1448 | } else { | ||
1449 | /* Insert at the tail of the list to preserve order */ | ||
1450 | struct journal_head *first = *list, *last = first->b_tprev; | ||
1451 | jh->b_tprev = last; | ||
1452 | jh->b_tnext = first; | ||
1453 | last->b_tnext = first->b_tprev = jh; | ||
1454 | } | ||
1455 | } | ||
1456 | |||
1457 | /* | ||
1458 | * Remove a buffer from a transaction list, given the transaction's list | ||
1459 | * head pointer. | ||
1460 | * | ||
1461 | * Called with j_list_lock held, and the journal may not be locked. | ||
1462 | * | ||
1463 | * jbd_lock_bh_state(jh2bh(jh)) is held. | ||
1464 | */ | ||
1465 | |||
1466 | static inline void | ||
1467 | __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | ||
1468 | { | ||
1469 | if (*list == jh) { | ||
1470 | *list = jh->b_tnext; | ||
1471 | if (*list == jh) | ||
1472 | *list = NULL; | ||
1473 | } | ||
1474 | jh->b_tprev->b_tnext = jh->b_tnext; | ||
1475 | jh->b_tnext->b_tprev = jh->b_tprev; | ||
1476 | } | ||
1477 | |||
1478 | /* | ||
1479 | * Remove a buffer from the appropriate transaction list. | ||
1480 | * | ||
1481 | * Note that this function can *change* the value of | ||
1482 | * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | ||
1483 | * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | ||
1484 | * is holding onto a copy of one of thee pointers, it could go bad. | ||
1485 | * Generally the caller needs to re-read the pointer from the transaction_t. | ||
1486 | * | ||
1487 | * Called under j_list_lock. The journal may not be locked. | ||
1488 | */ | ||
1489 | void __journal_temp_unlink_buffer(struct journal_head *jh) | ||
1490 | { | ||
1491 | struct journal_head **list = NULL; | ||
1492 | transaction_t *transaction; | ||
1493 | struct buffer_head *bh = jh2bh(jh); | ||
1494 | |||
1495 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
1496 | transaction = jh->b_transaction; | ||
1497 | if (transaction) | ||
1498 | assert_spin_locked(&transaction->t_journal->j_list_lock); | ||
1499 | |||
1500 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | ||
1501 | if (jh->b_jlist != BJ_None) | ||
1502 | J_ASSERT_JH(jh, transaction != 0); | ||
1503 | |||
1504 | switch (jh->b_jlist) { | ||
1505 | case BJ_None: | ||
1506 | return; | ||
1507 | case BJ_SyncData: | ||
1508 | list = &transaction->t_sync_datalist; | ||
1509 | break; | ||
1510 | case BJ_Metadata: | ||
1511 | transaction->t_nr_buffers--; | ||
1512 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | ||
1513 | list = &transaction->t_buffers; | ||
1514 | break; | ||
1515 | case BJ_Forget: | ||
1516 | list = &transaction->t_forget; | ||
1517 | break; | ||
1518 | case BJ_IO: | ||
1519 | list = &transaction->t_iobuf_list; | ||
1520 | break; | ||
1521 | case BJ_Shadow: | ||
1522 | list = &transaction->t_shadow_list; | ||
1523 | break; | ||
1524 | case BJ_LogCtl: | ||
1525 | list = &transaction->t_log_list; | ||
1526 | break; | ||
1527 | case BJ_Reserved: | ||
1528 | list = &transaction->t_reserved_list; | ||
1529 | break; | ||
1530 | case BJ_Locked: | ||
1531 | list = &transaction->t_locked_list; | ||
1532 | break; | ||
1533 | } | ||
1534 | |||
1535 | __blist_del_buffer(list, jh); | ||
1536 | jh->b_jlist = BJ_None; | ||
1537 | if (test_clear_buffer_jbddirty(bh)) | ||
1538 | mark_buffer_dirty(bh); /* Expose it to the VM */ | ||
1539 | } | ||
1540 | |||
1541 | void __journal_unfile_buffer(struct journal_head *jh) | ||
1542 | { | ||
1543 | __journal_temp_unlink_buffer(jh); | ||
1544 | jh->b_transaction = NULL; | ||
1545 | } | ||
1546 | |||
1547 | void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) | ||
1548 | { | ||
1549 | jbd_lock_bh_state(jh2bh(jh)); | ||
1550 | spin_lock(&journal->j_list_lock); | ||
1551 | __journal_unfile_buffer(jh); | ||
1552 | spin_unlock(&journal->j_list_lock); | ||
1553 | jbd_unlock_bh_state(jh2bh(jh)); | ||
1554 | } | ||
1555 | |||
1556 | /* | ||
1557 | * Called from journal_try_to_free_buffers(). | ||
1558 | * | ||
1559 | * Called under jbd_lock_bh_state(bh) | ||
1560 | */ | ||
1561 | static void | ||
1562 | __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | ||
1563 | { | ||
1564 | struct journal_head *jh; | ||
1565 | |||
1566 | jh = bh2jh(bh); | ||
1567 | |||
1568 | if (buffer_locked(bh) || buffer_dirty(bh)) | ||
1569 | goto out; | ||
1570 | |||
1571 | if (jh->b_next_transaction != 0) | ||
1572 | goto out; | ||
1573 | |||
1574 | spin_lock(&journal->j_list_lock); | ||
1575 | if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { | ||
1576 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | ||
1577 | /* A written-back ordered data buffer */ | ||
1578 | JBUFFER_TRACE(jh, "release data"); | ||
1579 | __journal_unfile_buffer(jh); | ||
1580 | journal_remove_journal_head(bh); | ||
1581 | __brelse(bh); | ||
1582 | } | ||
1583 | } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { | ||
1584 | /* written-back checkpointed metadata buffer */ | ||
1585 | if (jh->b_jlist == BJ_None) { | ||
1586 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | ||
1587 | __journal_remove_checkpoint(jh); | ||
1588 | journal_remove_journal_head(bh); | ||
1589 | __brelse(bh); | ||
1590 | } | ||
1591 | } | ||
1592 | spin_unlock(&journal->j_list_lock); | ||
1593 | out: | ||
1594 | return; | ||
1595 | } | ||
1596 | |||
1597 | |||
1598 | /** | ||
1599 | * int journal_try_to_free_buffers() - try to free page buffers. | ||
1600 | * @journal: journal for operation | ||
1601 | * @page: to try and free | ||
1602 | * @unused_gfp_mask: unused | ||
1603 | * | ||
1604 | * | ||
1605 | * For all the buffers on this page, | ||
1606 | * if they are fully written out ordered data, move them onto BUF_CLEAN | ||
1607 | * so try_to_free_buffers() can reap them. | ||
1608 | * | ||
1609 | * This function returns non-zero if we wish try_to_free_buffers() | ||
1610 | * to be called. We do this if the page is releasable by try_to_free_buffers(). | ||
1611 | * We also do it if the page has locked or dirty buffers and the caller wants | ||
1612 | * us to perform sync or async writeout. | ||
1613 | * | ||
1614 | * This complicates JBD locking somewhat. We aren't protected by the | ||
1615 | * BKL here. We wish to remove the buffer from its committing or | ||
1616 | * running transaction's ->t_datalist via __journal_unfile_buffer. | ||
1617 | * | ||
1618 | * This may *change* the value of transaction_t->t_datalist, so anyone | ||
1619 | * who looks at t_datalist needs to lock against this function. | ||
1620 | * | ||
1621 | * Even worse, someone may be doing a journal_dirty_data on this | ||
1622 | * buffer. So we need to lock against that. journal_dirty_data() | ||
1623 | * will come out of the lock with the buffer dirty, which makes it | ||
1624 | * ineligible for release here. | ||
1625 | * | ||
1626 | * Who else is affected by this? hmm... Really the only contender | ||
1627 | * is do_get_write_access() - it could be looking at the buffer while | ||
1628 | * journal_try_to_free_buffer() is changing its state. But that | ||
1629 | * cannot happen because we never reallocate freed data as metadata | ||
1630 | * while the data is part of a transaction. Yes? | ||
1631 | */ | ||
1632 | int journal_try_to_free_buffers(journal_t *journal, | ||
1633 | struct page *page, gfp_t unused_gfp_mask) | ||
1634 | { | ||
1635 | struct buffer_head *head; | ||
1636 | struct buffer_head *bh; | ||
1637 | int ret = 0; | ||
1638 | |||
1639 | J_ASSERT(PageLocked(page)); | ||
1640 | |||
1641 | head = page_buffers(page); | ||
1642 | bh = head; | ||
1643 | do { | ||
1644 | struct journal_head *jh; | ||
1645 | |||
1646 | /* | ||
1647 | * We take our own ref against the journal_head here to avoid | ||
1648 | * having to add tons of locking around each instance of | ||
1649 | * journal_remove_journal_head() and journal_put_journal_head(). | ||
1650 | */ | ||
1651 | jh = journal_grab_journal_head(bh); | ||
1652 | if (!jh) | ||
1653 | continue; | ||
1654 | |||
1655 | jbd_lock_bh_state(bh); | ||
1656 | __journal_try_to_free_buffer(journal, bh); | ||
1657 | journal_put_journal_head(jh); | ||
1658 | jbd_unlock_bh_state(bh); | ||
1659 | if (buffer_jbd(bh)) | ||
1660 | goto busy; | ||
1661 | } while ((bh = bh->b_this_page) != head); | ||
1662 | ret = try_to_free_buffers(page); | ||
1663 | busy: | ||
1664 | return ret; | ||
1665 | } | ||
1666 | |||
1667 | /* | ||
1668 | * This buffer is no longer needed. If it is on an older transaction's | ||
1669 | * checkpoint list we need to record it on this transaction's forget list | ||
1670 | * to pin this buffer (and hence its checkpointing transaction) down until | ||
1671 | * this transaction commits. If the buffer isn't on a checkpoint list, we | ||
1672 | * release it. | ||
1673 | * Returns non-zero if JBD no longer has an interest in the buffer. | ||
1674 | * | ||
1675 | * Called under j_list_lock. | ||
1676 | * | ||
1677 | * Called under jbd_lock_bh_state(bh). | ||
1678 | */ | ||
1679 | static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) | ||
1680 | { | ||
1681 | int may_free = 1; | ||
1682 | struct buffer_head *bh = jh2bh(jh); | ||
1683 | |||
1684 | __journal_unfile_buffer(jh); | ||
1685 | |||
1686 | if (jh->b_cp_transaction) { | ||
1687 | JBUFFER_TRACE(jh, "on running+cp transaction"); | ||
1688 | __journal_file_buffer(jh, transaction, BJ_Forget); | ||
1689 | clear_buffer_jbddirty(bh); | ||
1690 | may_free = 0; | ||
1691 | } else { | ||
1692 | JBUFFER_TRACE(jh, "on running transaction"); | ||
1693 | journal_remove_journal_head(bh); | ||
1694 | __brelse(bh); | ||
1695 | } | ||
1696 | return may_free; | ||
1697 | } | ||
1698 | |||
1699 | /* | ||
1700 | * journal_invalidatepage | ||
1701 | * | ||
1702 | * This code is tricky. It has a number of cases to deal with. | ||
1703 | * | ||
1704 | * There are two invariants which this code relies on: | ||
1705 | * | ||
1706 | * i_size must be updated on disk before we start calling invalidatepage on the | ||
1707 | * data. | ||
1708 | * | ||
1709 | * This is done in ext3 by defining an ext3_setattr method which | ||
1710 | * updates i_size before truncate gets going. By maintaining this | ||
1711 | * invariant, we can be sure that it is safe to throw away any buffers | ||
1712 | * attached to the current transaction: once the transaction commits, | ||
1713 | * we know that the data will not be needed. | ||
1714 | * | ||
1715 | * Note however that we can *not* throw away data belonging to the | ||
1716 | * previous, committing transaction! | ||
1717 | * | ||
1718 | * Any disk blocks which *are* part of the previous, committing | ||
1719 | * transaction (and which therefore cannot be discarded immediately) are | ||
1720 | * not going to be reused in the new running transaction | ||
1721 | * | ||
1722 | * The bitmap committed_data images guarantee this: any block which is | ||
1723 | * allocated in one transaction and removed in the next will be marked | ||
1724 | * as in-use in the committed_data bitmap, so cannot be reused until | ||
1725 | * the next transaction to delete the block commits. This means that | ||
1726 | * leaving committing buffers dirty is quite safe: the disk blocks | ||
1727 | * cannot be reallocated to a different file and so buffer aliasing is | ||
1728 | * not possible. | ||
1729 | * | ||
1730 | * | ||
1731 | * The above applies mainly to ordered data mode. In writeback mode we | ||
1732 | * don't make guarantees about the order in which data hits disk --- in | ||
1733 | * particular we don't guarantee that new dirty data is flushed before | ||
1734 | * transaction commit --- so it is always safe just to discard data | ||
1735 | * immediately in that mode. --sct | ||
1736 | */ | ||
1737 | |||
1738 | /* | ||
1739 | * The journal_unmap_buffer helper function returns zero if the buffer | ||
1740 | * concerned remains pinned as an anonymous buffer belonging to an older | ||
1741 | * transaction. | ||
1742 | * | ||
1743 | * We're outside-transaction here. Either or both of j_running_transaction | ||
1744 | * and j_committing_transaction may be NULL. | ||
1745 | */ | ||
1746 | static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | ||
1747 | { | ||
1748 | transaction_t *transaction; | ||
1749 | struct journal_head *jh; | ||
1750 | int may_free = 1; | ||
1751 | int ret; | ||
1752 | |||
1753 | BUFFER_TRACE(bh, "entry"); | ||
1754 | |||
1755 | /* | ||
1756 | * It is safe to proceed here without the j_list_lock because the | ||
1757 | * buffers cannot be stolen by try_to_free_buffers as long as we are | ||
1758 | * holding the page lock. --sct | ||
1759 | */ | ||
1760 | |||
1761 | if (!buffer_jbd(bh)) | ||
1762 | goto zap_buffer_unlocked; | ||
1763 | |||
1764 | spin_lock(&journal->j_state_lock); | ||
1765 | jbd_lock_bh_state(bh); | ||
1766 | spin_lock(&journal->j_list_lock); | ||
1767 | |||
1768 | jh = journal_grab_journal_head(bh); | ||
1769 | if (!jh) | ||
1770 | goto zap_buffer_no_jh; | ||
1771 | |||
1772 | transaction = jh->b_transaction; | ||
1773 | if (transaction == NULL) { | ||
1774 | /* First case: not on any transaction. If it | ||
1775 | * has no checkpoint link, then we can zap it: | ||
1776 | * it's a writeback-mode buffer so we don't care | ||
1777 | * if it hits disk safely. */ | ||
1778 | if (!jh->b_cp_transaction) { | ||
1779 | JBUFFER_TRACE(jh, "not on any transaction: zap"); | ||
1780 | goto zap_buffer; | ||
1781 | } | ||
1782 | |||
1783 | if (!buffer_dirty(bh)) { | ||
1784 | /* bdflush has written it. We can drop it now */ | ||
1785 | goto zap_buffer; | ||
1786 | } | ||
1787 | |||
1788 | /* OK, it must be in the journal but still not | ||
1789 | * written fully to disk: it's metadata or | ||
1790 | * journaled data... */ | ||
1791 | |||
1792 | if (journal->j_running_transaction) { | ||
1793 | /* ... and once the current transaction has | ||
1794 | * committed, the buffer won't be needed any | ||
1795 | * longer. */ | ||
1796 | JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); | ||
1797 | ret = __dispose_buffer(jh, | ||
1798 | journal->j_running_transaction); | ||
1799 | journal_put_journal_head(jh); | ||
1800 | spin_unlock(&journal->j_list_lock); | ||
1801 | jbd_unlock_bh_state(bh); | ||
1802 | spin_unlock(&journal->j_state_lock); | ||
1803 | return ret; | ||
1804 | } else { | ||
1805 | /* There is no currently-running transaction. So the | ||
1806 | * orphan record which we wrote for this file must have | ||
1807 | * passed into commit. We must attach this buffer to | ||
1808 | * the committing transaction, if it exists. */ | ||
1809 | if (journal->j_committing_transaction) { | ||
1810 | JBUFFER_TRACE(jh, "give to committing trans"); | ||
1811 | ret = __dispose_buffer(jh, | ||
1812 | journal->j_committing_transaction); | ||
1813 | journal_put_journal_head(jh); | ||
1814 | spin_unlock(&journal->j_list_lock); | ||
1815 | jbd_unlock_bh_state(bh); | ||
1816 | spin_unlock(&journal->j_state_lock); | ||
1817 | return ret; | ||
1818 | } else { | ||
1819 | /* The orphan record's transaction has | ||
1820 | * committed. We can cleanse this buffer */ | ||
1821 | clear_buffer_jbddirty(bh); | ||
1822 | goto zap_buffer; | ||
1823 | } | ||
1824 | } | ||
1825 | } else if (transaction == journal->j_committing_transaction) { | ||
1826 | if (jh->b_jlist == BJ_Locked) { | ||
1827 | /* | ||
1828 | * The buffer is on the committing transaction's locked | ||
1829 | * list. We have the buffer locked, so I/O has | ||
1830 | * completed. So we can nail the buffer now. | ||
1831 | */ | ||
1832 | may_free = __dispose_buffer(jh, transaction); | ||
1833 | goto zap_buffer; | ||
1834 | } | ||
1835 | /* | ||
1836 | * If it is committing, we simply cannot touch it. We | ||
1837 | * can remove it's next_transaction pointer from the | ||
1838 | * running transaction if that is set, but nothing | ||
1839 | * else. */ | ||
1840 | JBUFFER_TRACE(jh, "on committing transaction"); | ||
1841 | set_buffer_freed(bh); | ||
1842 | if (jh->b_next_transaction) { | ||
1843 | J_ASSERT(jh->b_next_transaction == | ||
1844 | journal->j_running_transaction); | ||
1845 | jh->b_next_transaction = NULL; | ||
1846 | } | ||
1847 | journal_put_journal_head(jh); | ||
1848 | spin_unlock(&journal->j_list_lock); | ||
1849 | jbd_unlock_bh_state(bh); | ||
1850 | spin_unlock(&journal->j_state_lock); | ||
1851 | return 0; | ||
1852 | } else { | ||
1853 | /* Good, the buffer belongs to the running transaction. | ||
1854 | * We are writing our own transaction's data, not any | ||
1855 | * previous one's, so it is safe to throw it away | ||
1856 | * (remember that we expect the filesystem to have set | ||
1857 | * i_size already for this truncate so recovery will not | ||
1858 | * expose the disk blocks we are discarding here.) */ | ||
1859 | J_ASSERT_JH(jh, transaction == journal->j_running_transaction); | ||
1860 | may_free = __dispose_buffer(jh, transaction); | ||
1861 | } | ||
1862 | |||
1863 | zap_buffer: | ||
1864 | journal_put_journal_head(jh); | ||
1865 | zap_buffer_no_jh: | ||
1866 | spin_unlock(&journal->j_list_lock); | ||
1867 | jbd_unlock_bh_state(bh); | ||
1868 | spin_unlock(&journal->j_state_lock); | ||
1869 | zap_buffer_unlocked: | ||
1870 | clear_buffer_dirty(bh); | ||
1871 | J_ASSERT_BH(bh, !buffer_jbddirty(bh)); | ||
1872 | clear_buffer_mapped(bh); | ||
1873 | clear_buffer_req(bh); | ||
1874 | clear_buffer_new(bh); | ||
1875 | bh->b_bdev = NULL; | ||
1876 | return may_free; | ||
1877 | } | ||
1878 | |||
1879 | /** | ||
1880 | * void journal_invalidatepage() | ||
1881 | * @journal: journal to use for flush... | ||
1882 | * @page: page to flush | ||
1883 | * @offset: length of page to invalidate. | ||
1884 | * | ||
1885 | * Reap page buffers containing data after offset in page. | ||
1886 | * | ||
1887 | */ | ||
1888 | void journal_invalidatepage(journal_t *journal, | ||
1889 | struct page *page, | ||
1890 | unsigned long offset) | ||
1891 | { | ||
1892 | struct buffer_head *head, *bh, *next; | ||
1893 | unsigned int curr_off = 0; | ||
1894 | int may_free = 1; | ||
1895 | |||
1896 | if (!PageLocked(page)) | ||
1897 | BUG(); | ||
1898 | if (!page_has_buffers(page)) | ||
1899 | return; | ||
1900 | |||
1901 | /* We will potentially be playing with lists other than just the | ||
1902 | * data lists (especially for journaled data mode), so be | ||
1903 | * cautious in our locking. */ | ||
1904 | |||
1905 | head = bh = page_buffers(page); | ||
1906 | do { | ||
1907 | unsigned int next_off = curr_off + bh->b_size; | ||
1908 | next = bh->b_this_page; | ||
1909 | |||
1910 | if (offset <= curr_off) { | ||
1911 | /* This block is wholly outside the truncation point */ | ||
1912 | lock_buffer(bh); | ||
1913 | may_free &= journal_unmap_buffer(journal, bh); | ||
1914 | unlock_buffer(bh); | ||
1915 | } | ||
1916 | curr_off = next_off; | ||
1917 | bh = next; | ||
1918 | |||
1919 | } while (bh != head); | ||
1920 | |||
1921 | if (!offset) { | ||
1922 | if (may_free && try_to_free_buffers(page)) | ||
1923 | J_ASSERT(!page_has_buffers(page)); | ||
1924 | } | ||
1925 | } | ||
1926 | |||
1927 | /* | ||
1928 | * File a buffer on the given transaction list. | ||
1929 | */ | ||
1930 | void __journal_file_buffer(struct journal_head *jh, | ||
1931 | transaction_t *transaction, int jlist) | ||
1932 | { | ||
1933 | struct journal_head **list = NULL; | ||
1934 | int was_dirty = 0; | ||
1935 | struct buffer_head *bh = jh2bh(jh); | ||
1936 | |||
1937 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
1938 | assert_spin_locked(&transaction->t_journal->j_list_lock); | ||
1939 | |||
1940 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | ||
1941 | J_ASSERT_JH(jh, jh->b_transaction == transaction || | ||
1942 | jh->b_transaction == 0); | ||
1943 | |||
1944 | if (jh->b_transaction && jh->b_jlist == jlist) | ||
1945 | return; | ||
1946 | |||
1947 | /* The following list of buffer states needs to be consistent | ||
1948 | * with __jbd_unexpected_dirty_buffer()'s handling of dirty | ||
1949 | * state. */ | ||
1950 | |||
1951 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | ||
1952 | jlist == BJ_Shadow || jlist == BJ_Forget) { | ||
1953 | if (test_clear_buffer_dirty(bh) || | ||
1954 | test_clear_buffer_jbddirty(bh)) | ||
1955 | was_dirty = 1; | ||
1956 | } | ||
1957 | |||
1958 | if (jh->b_transaction) | ||
1959 | __journal_temp_unlink_buffer(jh); | ||
1960 | jh->b_transaction = transaction; | ||
1961 | |||
1962 | switch (jlist) { | ||
1963 | case BJ_None: | ||
1964 | J_ASSERT_JH(jh, !jh->b_committed_data); | ||
1965 | J_ASSERT_JH(jh, !jh->b_frozen_data); | ||
1966 | return; | ||
1967 | case BJ_SyncData: | ||
1968 | list = &transaction->t_sync_datalist; | ||
1969 | break; | ||
1970 | case BJ_Metadata: | ||
1971 | transaction->t_nr_buffers++; | ||
1972 | list = &transaction->t_buffers; | ||
1973 | break; | ||
1974 | case BJ_Forget: | ||
1975 | list = &transaction->t_forget; | ||
1976 | break; | ||
1977 | case BJ_IO: | ||
1978 | list = &transaction->t_iobuf_list; | ||
1979 | break; | ||
1980 | case BJ_Shadow: | ||
1981 | list = &transaction->t_shadow_list; | ||
1982 | break; | ||
1983 | case BJ_LogCtl: | ||
1984 | list = &transaction->t_log_list; | ||
1985 | break; | ||
1986 | case BJ_Reserved: | ||
1987 | list = &transaction->t_reserved_list; | ||
1988 | break; | ||
1989 | case BJ_Locked: | ||
1990 | list = &transaction->t_locked_list; | ||
1991 | break; | ||
1992 | } | ||
1993 | |||
1994 | __blist_add_buffer(list, jh); | ||
1995 | jh->b_jlist = jlist; | ||
1996 | |||
1997 | if (was_dirty) | ||
1998 | set_buffer_jbddirty(bh); | ||
1999 | } | ||
2000 | |||
2001 | void journal_file_buffer(struct journal_head *jh, | ||
2002 | transaction_t *transaction, int jlist) | ||
2003 | { | ||
2004 | jbd_lock_bh_state(jh2bh(jh)); | ||
2005 | spin_lock(&transaction->t_journal->j_list_lock); | ||
2006 | __journal_file_buffer(jh, transaction, jlist); | ||
2007 | spin_unlock(&transaction->t_journal->j_list_lock); | ||
2008 | jbd_unlock_bh_state(jh2bh(jh)); | ||
2009 | } | ||
2010 | |||
2011 | /* | ||
2012 | * Remove a buffer from its current buffer list in preparation for | ||
2013 | * dropping it from its current transaction entirely. If the buffer has | ||
2014 | * already started to be used by a subsequent transaction, refile the | ||
2015 | * buffer on that transaction's metadata list. | ||
2016 | * | ||
2017 | * Called under journal->j_list_lock | ||
2018 | * | ||
2019 | * Called under jbd_lock_bh_state(jh2bh(jh)) | ||
2020 | */ | ||
2021 | void __journal_refile_buffer(struct journal_head *jh) | ||
2022 | { | ||
2023 | int was_dirty; | ||
2024 | struct buffer_head *bh = jh2bh(jh); | ||
2025 | |||
2026 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
2027 | if (jh->b_transaction) | ||
2028 | assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); | ||
2029 | |||
2030 | /* If the buffer is now unused, just drop it. */ | ||
2031 | if (jh->b_next_transaction == NULL) { | ||
2032 | __journal_unfile_buffer(jh); | ||
2033 | return; | ||
2034 | } | ||
2035 | |||
2036 | /* | ||
2037 | * It has been modified by a later transaction: add it to the new | ||
2038 | * transaction's metadata list. | ||
2039 | */ | ||
2040 | |||
2041 | was_dirty = test_clear_buffer_jbddirty(bh); | ||
2042 | __journal_temp_unlink_buffer(jh); | ||
2043 | jh->b_transaction = jh->b_next_transaction; | ||
2044 | jh->b_next_transaction = NULL; | ||
2045 | __journal_file_buffer(jh, jh->b_transaction, | ||
2046 | was_dirty ? BJ_Metadata : BJ_Reserved); | ||
2047 | J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); | ||
2048 | |||
2049 | if (was_dirty) | ||
2050 | set_buffer_jbddirty(bh); | ||
2051 | } | ||
2052 | |||
2053 | /* | ||
2054 | * For the unlocked version of this call, also make sure that any | ||
2055 | * hanging journal_head is cleaned up if necessary. | ||
2056 | * | ||
2057 | * __journal_refile_buffer is usually called as part of a single locked | ||
2058 | * operation on a buffer_head, in which the caller is probably going to | ||
2059 | * be hooking the journal_head onto other lists. In that case it is up | ||
2060 | * to the caller to remove the journal_head if necessary. For the | ||
2061 | * unlocked journal_refile_buffer call, the caller isn't going to be | ||
2062 | * doing anything else to the buffer so we need to do the cleanup | ||
2063 | * ourselves to avoid a jh leak. | ||
2064 | * | ||
2065 | * *** The journal_head may be freed by this call! *** | ||
2066 | */ | ||
2067 | void journal_refile_buffer(journal_t *journal, struct journal_head *jh) | ||
2068 | { | ||
2069 | struct buffer_head *bh = jh2bh(jh); | ||
2070 | |||
2071 | jbd_lock_bh_state(bh); | ||
2072 | spin_lock(&journal->j_list_lock); | ||
2073 | |||
2074 | __journal_refile_buffer(jh); | ||
2075 | jbd_unlock_bh_state(bh); | ||
2076 | journal_remove_journal_head(bh); | ||
2077 | |||
2078 | spin_unlock(&journal->j_list_lock); | ||
2079 | __brelse(bh); | ||
2080 | } | ||