Diffstat (limited to 'fs/jbd')
-rw-r--r--  fs/jbd/Kconfig         30
-rw-r--r--  fs/jbd/Makefile         7
-rw-r--r--  fs/jbd/checkpoint.c   782
-rw-r--r--  fs/jbd/commit.c      1021
-rw-r--r--  fs/jbd/journal.c     2145
-rw-r--r--  fs/jbd/recovery.c     594
-rw-r--r--  fs/jbd/revoke.c       733
-rw-r--r--  fs/jbd/transaction.c 2237
8 files changed, 0 insertions, 7549 deletions
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
deleted file mode 100644
index 4e28beeed157..000000000000
--- a/fs/jbd/Kconfig
+++ /dev/null
@@ -1,30 +0,0 @@
1 | config JBD | ||
2 | tristate | ||
3 | help | ||
4 | This is a generic journalling layer for block devices. It is | ||
5 | currently used by the ext3 file system, but it could also be | ||
6 | used to add journal support to other file systems or block | ||
7 | devices such as RAID or LVM. | ||
8 | |||
9 | If you are using the ext3 file system, you need to say Y here. | ||
10 | If you are not using ext3 then you will probably want to say N. | ||
11 | |||
12 | To compile this device as a module, choose M here: the module will be | ||
13 | called jbd. If you are compiling ext3 into the kernel, you | ||
14 | cannot compile this code as a module. | ||
15 | |||
16 | config JBD_DEBUG | ||
17 | bool "JBD (ext3) debugging support" | ||
18 | depends on JBD && DEBUG_FS | ||
19 | help | ||
20 | If you are using the ext3 journaled file system (or potentially any | ||
21 | other file system/device using JBD), this option allows you to | ||
22 | enable debugging output while the system is running, in order to | ||
23 | help track down any problems you are having. By default the | ||
24 | debugging output will be turned off. | ||
25 | |||
26 | If you select Y here, then you will be able to turn on debugging | ||
27 | with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a | ||
28 | number between 1 and 5, the higher the number, the more debugging | ||
29 | output is generated. To turn debugging off again, do | ||
30 | "echo 0 > /sys/kernel/debug/jbd/jbd-debug". | ||
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile
deleted file mode 100644
index 54aca4868a36..000000000000
--- a/fs/jbd/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
1 | # | ||
2 | # Makefile for the linux journaling routines. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_JBD) += jbd.o | ||
6 | |||
7 | jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o | ||
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
deleted file mode 100644
index 08c03044abdd..000000000000
--- a/fs/jbd/checkpoint.c
+++ /dev/null
@@ -1,782 +0,0 @@
1 | /* | ||
2 | * linux/fs/jbd/checkpoint.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | ||
5 | * | ||
6 | * Copyright 1999 Red Hat Software --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Checkpoint routines for the generic filesystem journaling code. | ||
13 | * Part of the ext2fs journaling system. | ||
14 | * | ||
15 | * Checkpointing is the process of ensuring that a section of the log is | ||
16 | * committed fully to disk, so that that portion of the log can be | ||
17 | * reused. | ||
18 | */ | ||
19 | |||
20 | #include <linux/time.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/jbd.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/blkdev.h> | ||
26 | #include <trace/events/jbd.h> | ||
27 | |||
28 | /* | ||
29 | * Unlink a buffer from a transaction checkpoint list. | ||
30 | * | ||
31 | * Called with j_list_lock held. | ||
32 | */ | ||
33 | static inline void __buffer_unlink_first(struct journal_head *jh) | ||
34 | { | ||
35 | transaction_t *transaction = jh->b_cp_transaction; | ||
36 | |||
37 | jh->b_cpnext->b_cpprev = jh->b_cpprev; | ||
38 | jh->b_cpprev->b_cpnext = jh->b_cpnext; | ||
39 | if (transaction->t_checkpoint_list == jh) { | ||
40 | transaction->t_checkpoint_list = jh->b_cpnext; | ||
41 | if (transaction->t_checkpoint_list == jh) | ||
42 | transaction->t_checkpoint_list = NULL; | ||
43 | } | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Unlink a buffer from a transaction checkpoint(io) list. | ||
48 | * | ||
49 | * Called with j_list_lock held. | ||
50 | */ | ||
51 | static inline void __buffer_unlink(struct journal_head *jh) | ||
52 | { | ||
53 | transaction_t *transaction = jh->b_cp_transaction; | ||
54 | |||
55 | __buffer_unlink_first(jh); | ||
56 | if (transaction->t_checkpoint_io_list == jh) { | ||
57 | transaction->t_checkpoint_io_list = jh->b_cpnext; | ||
58 | if (transaction->t_checkpoint_io_list == jh) | ||
59 | transaction->t_checkpoint_io_list = NULL; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Move a buffer from the checkpoint list to the checkpoint io list | ||
65 | * | ||
66 | * Called with j_list_lock held | ||
67 | */ | ||
68 | static inline void __buffer_relink_io(struct journal_head *jh) | ||
69 | { | ||
70 | transaction_t *transaction = jh->b_cp_transaction; | ||
71 | |||
72 | __buffer_unlink_first(jh); | ||
73 | |||
74 | if (!transaction->t_checkpoint_io_list) { | ||
75 | jh->b_cpnext = jh->b_cpprev = jh; | ||
76 | } else { | ||
77 | jh->b_cpnext = transaction->t_checkpoint_io_list; | ||
78 | jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; | ||
79 | jh->b_cpprev->b_cpnext = jh; | ||
80 | jh->b_cpnext->b_cpprev = jh; | ||
81 | } | ||
82 | transaction->t_checkpoint_io_list = jh; | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Try to release a checkpointed buffer from its transaction. | ||
87 | * Returns 1 if we released it and 2 if we also released the | ||
88 | * whole transaction. | ||
89 | * | ||
90 | * Requires j_list_lock | ||
91 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
92 | */ | ||
93 | static int __try_to_free_cp_buf(struct journal_head *jh) | ||
94 | { | ||
95 | int ret = 0; | ||
96 | struct buffer_head *bh = jh2bh(jh); | ||
97 | |||
98 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && | ||
99 | !buffer_dirty(bh) && !buffer_write_io_error(bh)) { | ||
100 | /* | ||
101 | * Get our reference so that bh cannot be freed before | ||
102 | * we unlock it | ||
103 | */ | ||
104 | get_bh(bh); | ||
105 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | ||
106 | ret = __journal_remove_checkpoint(jh) + 1; | ||
107 | jbd_unlock_bh_state(bh); | ||
108 | BUFFER_TRACE(bh, "release"); | ||
109 | __brelse(bh); | ||
110 | } else { | ||
111 | jbd_unlock_bh_state(bh); | ||
112 | } | ||
113 | return ret; | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * __log_wait_for_space: wait until there is space in the journal. | ||
118 | * | ||
119 | * Called under j_state_lock *only*. It will be unlocked if we have to wait | ||
120 | * for a checkpoint to free up some space in the log. | ||
121 | */ | ||
122 | void __log_wait_for_space(journal_t *journal) | ||
123 | { | ||
124 | int nblocks, space_left; | ||
125 | assert_spin_locked(&journal->j_state_lock); | ||
126 | |||
127 | nblocks = jbd_space_needed(journal); | ||
128 | while (__log_space_left(journal) < nblocks) { | ||
129 | if (journal->j_flags & JFS_ABORT) | ||
130 | return; | ||
131 | spin_unlock(&journal->j_state_lock); | ||
132 | mutex_lock(&journal->j_checkpoint_mutex); | ||
133 | |||
134 | /* | ||
135 | * Test again, another process may have checkpointed while we | ||
136 | * were waiting for the checkpoint lock. If there are no | ||
137 | * transactions ready to be checkpointed, try to recover | ||
138 | * journal space by calling cleanup_journal_tail(), and if | ||
139 | * that doesn't work, by waiting for the currently committing | ||
140 | * transaction to complete. If there is absolutely no way | ||
141 | * to make progress, this is either a BUG or corrupted | ||
142 | * filesystem, so abort the journal and leave a stack | ||
143 | * trace for forensic evidence. | ||
144 | */ | ||
145 | spin_lock(&journal->j_state_lock); | ||
146 | spin_lock(&journal->j_list_lock); | ||
147 | nblocks = jbd_space_needed(journal); | ||
148 | space_left = __log_space_left(journal); | ||
149 | if (space_left < nblocks) { | ||
150 | int chkpt = journal->j_checkpoint_transactions != NULL; | ||
151 | tid_t tid = 0; | ||
152 | |||
153 | if (journal->j_committing_transaction) | ||
154 | tid = journal->j_committing_transaction->t_tid; | ||
155 | spin_unlock(&journal->j_list_lock); | ||
156 | spin_unlock(&journal->j_state_lock); | ||
157 | if (chkpt) { | ||
158 | log_do_checkpoint(journal); | ||
159 | } else if (cleanup_journal_tail(journal) == 0) { | ||
160 | /* We were able to recover space; yay! */ | ||
161 | ; | ||
162 | } else if (tid) { | ||
163 | log_wait_commit(journal, tid); | ||
164 | } else { | ||
165 | printk(KERN_ERR "%s: needed %d blocks and " | ||
166 | "only had %d space available\n", | ||
167 | __func__, nblocks, space_left); | ||
168 | printk(KERN_ERR "%s: no way to get more " | ||
169 | "journal space\n", __func__); | ||
170 | WARN_ON(1); | ||
171 | journal_abort(journal, 0); | ||
172 | } | ||
173 | spin_lock(&journal->j_state_lock); | ||
174 | } else { | ||
175 | spin_unlock(&journal->j_list_lock); | ||
176 | } | ||
177 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
178 | } | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. | ||
183 | * The caller must restart a list walk. Wait for someone else to run | ||
184 | * jbd_unlock_bh_state(). | ||
185 | */ | ||
186 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) | ||
187 | __releases(journal->j_list_lock) | ||
188 | { | ||
189 | get_bh(bh); | ||
190 | spin_unlock(&journal->j_list_lock); | ||
191 | jbd_lock_bh_state(bh); | ||
192 | jbd_unlock_bh_state(bh); | ||
193 | put_bh(bh); | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Clean up transaction's list of buffers submitted for io. | ||
198 | * We wait for any pending IO to complete and remove any clean | ||
199 | * buffers. Note that we take the buffers in the opposite ordering | ||
200 | * from the one in which they were submitted for IO. | ||
201 | * | ||
202 | * Return 0 on success, and return <0 if some buffers have failed | ||
203 | * to be written out. | ||
204 | * | ||
205 | * Called with j_list_lock held. | ||
206 | */ | ||
207 | static int __wait_cp_io(journal_t *journal, transaction_t *transaction) | ||
208 | { | ||
209 | struct journal_head *jh; | ||
210 | struct buffer_head *bh; | ||
211 | tid_t this_tid; | ||
212 | int released = 0; | ||
213 | int ret = 0; | ||
214 | |||
215 | this_tid = transaction->t_tid; | ||
216 | restart: | ||
217 | /* Did somebody clean up the transaction in the meanwhile? */ | ||
218 | if (journal->j_checkpoint_transactions != transaction || | ||
219 | transaction->t_tid != this_tid) | ||
220 | return ret; | ||
221 | while (!released && transaction->t_checkpoint_io_list) { | ||
222 | jh = transaction->t_checkpoint_io_list; | ||
223 | bh = jh2bh(jh); | ||
224 | if (!jbd_trylock_bh_state(bh)) { | ||
225 | jbd_sync_bh(journal, bh); | ||
226 | spin_lock(&journal->j_list_lock); | ||
227 | goto restart; | ||
228 | } | ||
229 | get_bh(bh); | ||
230 | if (buffer_locked(bh)) { | ||
231 | spin_unlock(&journal->j_list_lock); | ||
232 | jbd_unlock_bh_state(bh); | ||
233 | wait_on_buffer(bh); | ||
234 | /* the journal_head may have gone by now */ | ||
235 | BUFFER_TRACE(bh, "brelse"); | ||
236 | __brelse(bh); | ||
237 | spin_lock(&journal->j_list_lock); | ||
238 | goto restart; | ||
239 | } | ||
240 | if (unlikely(buffer_write_io_error(bh))) | ||
241 | ret = -EIO; | ||
242 | |||
243 | /* | ||
244 | * Now in whatever state the buffer currently is, we know that | ||
245 | * it has been written out and so we can drop it from the list | ||
246 | */ | ||
247 | released = __journal_remove_checkpoint(jh); | ||
248 | jbd_unlock_bh_state(bh); | ||
249 | __brelse(bh); | ||
250 | } | ||
251 | |||
252 | return ret; | ||
253 | } | ||
254 | |||
255 | #define NR_BATCH 64 | ||
256 | |||
257 | static void | ||
258 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | ||
259 | { | ||
260 | int i; | ||
261 | struct blk_plug plug; | ||
262 | |||
263 | blk_start_plug(&plug); | ||
264 | for (i = 0; i < *batch_count; i++) | ||
265 | write_dirty_buffer(bhs[i], WRITE_SYNC); | ||
266 | blk_finish_plug(&plug); | ||
267 | |||
268 | for (i = 0; i < *batch_count; i++) { | ||
269 | struct buffer_head *bh = bhs[i]; | ||
270 | clear_buffer_jwrite(bh); | ||
271 | BUFFER_TRACE(bh, "brelse"); | ||
272 | __brelse(bh); | ||
273 | } | ||
274 | *batch_count = 0; | ||
275 | } | ||
276 | |||
277 | /* | ||
278 | * Try to flush one buffer from the checkpoint list to disk. | ||
279 | * | ||
280 | * Return 1 if something happened which requires us to abort the current | ||
281 | * scan of the checkpoint list. Return <0 if the buffer has failed to | ||
282 | * be written out. | ||
283 | * | ||
284 | * Called with j_list_lock held and drops it if 1 is returned | ||
285 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
286 | */ | ||
287 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | ||
288 | struct buffer_head **bhs, int *batch_count) | ||
289 | { | ||
290 | struct buffer_head *bh = jh2bh(jh); | ||
291 | int ret = 0; | ||
292 | |||
293 | if (buffer_locked(bh)) { | ||
294 | get_bh(bh); | ||
295 | spin_unlock(&journal->j_list_lock); | ||
296 | jbd_unlock_bh_state(bh); | ||
297 | wait_on_buffer(bh); | ||
298 | /* the journal_head may have gone by now */ | ||
299 | BUFFER_TRACE(bh, "brelse"); | ||
300 | __brelse(bh); | ||
301 | ret = 1; | ||
302 | } else if (jh->b_transaction != NULL) { | ||
303 | transaction_t *t = jh->b_transaction; | ||
304 | tid_t tid = t->t_tid; | ||
305 | |||
306 | spin_unlock(&journal->j_list_lock); | ||
307 | jbd_unlock_bh_state(bh); | ||
308 | log_start_commit(journal, tid); | ||
309 | log_wait_commit(journal, tid); | ||
310 | ret = 1; | ||
311 | } else if (!buffer_dirty(bh)) { | ||
312 | ret = 1; | ||
313 | if (unlikely(buffer_write_io_error(bh))) | ||
314 | ret = -EIO; | ||
315 | get_bh(bh); | ||
316 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); | ||
317 | BUFFER_TRACE(bh, "remove from checkpoint"); | ||
318 | __journal_remove_checkpoint(jh); | ||
319 | spin_unlock(&journal->j_list_lock); | ||
320 | jbd_unlock_bh_state(bh); | ||
321 | __brelse(bh); | ||
322 | } else { | ||
323 | /* | ||
324 | * Important: we are about to write the buffer, and | ||
325 | * possibly block, while still holding the journal lock. | ||
326 | * We cannot afford to let the transaction logic start | ||
327 | * messing around with this buffer before we write it to | ||
328 | * disk, as that would break recoverability. | ||
329 | */ | ||
330 | BUFFER_TRACE(bh, "queue"); | ||
331 | get_bh(bh); | ||
332 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | ||
333 | set_buffer_jwrite(bh); | ||
334 | bhs[*batch_count] = bh; | ||
335 | __buffer_relink_io(jh); | ||
336 | jbd_unlock_bh_state(bh); | ||
337 | (*batch_count)++; | ||
338 | if (*batch_count == NR_BATCH) { | ||
339 | spin_unlock(&journal->j_list_lock); | ||
340 | __flush_batch(journal, bhs, batch_count); | ||
341 | ret = 1; | ||
342 | } | ||
343 | } | ||
344 | return ret; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * Perform an actual checkpoint. We take the first transaction on the | ||
349 | * list of transactions to be checkpointed and send all its buffers | ||
350 | * to disk. We submit larger chunks of data at once. | ||
351 | * | ||
352 | * The journal should be locked before calling this function. | ||
353 | * Called with j_checkpoint_mutex held. | ||
354 | */ | ||
355 | int log_do_checkpoint(journal_t *journal) | ||
356 | { | ||
357 | transaction_t *transaction; | ||
358 | tid_t this_tid; | ||
359 | int result; | ||
360 | |||
361 | jbd_debug(1, "Start checkpoint\n"); | ||
362 | |||
363 | /* | ||
364 | * First thing: if there are any transactions in the log which | ||
365 | * don't need checkpointing, just eliminate them from the | ||
366 | * journal straight away. | ||
367 | */ | ||
368 | result = cleanup_journal_tail(journal); | ||
369 | trace_jbd_checkpoint(journal, result); | ||
370 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); | ||
371 | if (result <= 0) | ||
372 | return result; | ||
373 | |||
374 | /* | ||
375 | * OK, we need to start writing disk blocks. Take one transaction | ||
376 | * and write it. | ||
377 | */ | ||
378 | result = 0; | ||
379 | spin_lock(&journal->j_list_lock); | ||
380 | if (!journal->j_checkpoint_transactions) | ||
381 | goto out; | ||
382 | transaction = journal->j_checkpoint_transactions; | ||
383 | this_tid = transaction->t_tid; | ||
384 | restart: | ||
385 | /* | ||
386 | * If someone cleaned up this transaction while we slept, we're | ||
387 | * done (maybe it's a new transaction, but it fell at the same | ||
388 | * address). | ||
389 | */ | ||
390 | if (journal->j_checkpoint_transactions == transaction && | ||
391 | transaction->t_tid == this_tid) { | ||
392 | int batch_count = 0; | ||
393 | struct buffer_head *bhs[NR_BATCH]; | ||
394 | struct journal_head *jh; | ||
395 | int retry = 0, err; | ||
396 | |||
397 | while (!retry && transaction->t_checkpoint_list) { | ||
398 | struct buffer_head *bh; | ||
399 | |||
400 | jh = transaction->t_checkpoint_list; | ||
401 | bh = jh2bh(jh); | ||
402 | if (!jbd_trylock_bh_state(bh)) { | ||
403 | jbd_sync_bh(journal, bh); | ||
404 | retry = 1; | ||
405 | break; | ||
406 | } | ||
407 | retry = __process_buffer(journal, jh, bhs,&batch_count); | ||
408 | if (retry < 0 && !result) | ||
409 | result = retry; | ||
410 | if (!retry && (need_resched() || | ||
411 | spin_needbreak(&journal->j_list_lock))) { | ||
412 | spin_unlock(&journal->j_list_lock); | ||
413 | retry = 1; | ||
414 | break; | ||
415 | } | ||
416 | } | ||
417 | |||
418 | if (batch_count) { | ||
419 | if (!retry) { | ||
420 | spin_unlock(&journal->j_list_lock); | ||
421 | retry = 1; | ||
422 | } | ||
423 | __flush_batch(journal, bhs, &batch_count); | ||
424 | } | ||
425 | |||
426 | if (retry) { | ||
427 | spin_lock(&journal->j_list_lock); | ||
428 | goto restart; | ||
429 | } | ||
430 | /* | ||
431 | * Now we have cleaned up the first transaction's checkpoint | ||
432 | * list. Let's clean up the second one | ||
433 | */ | ||
434 | err = __wait_cp_io(journal, transaction); | ||
435 | if (!result) | ||
436 | result = err; | ||
437 | } | ||
438 | out: | ||
439 | spin_unlock(&journal->j_list_lock); | ||
440 | if (result < 0) | ||
441 | journal_abort(journal, result); | ||
442 | else | ||
443 | result = cleanup_journal_tail(journal); | ||
444 | |||
445 | return (result < 0) ? result : 0; | ||
446 | } | ||
447 | |||
448 | /* | ||
449 | * Check the list of checkpoint transactions for the journal to see if | ||
450 | * we have already got rid of any since the last update of the log tail | ||
451 | * in the journal superblock. If so, we can instantly roll the | ||
452 | * superblock forward to remove those transactions from the log. | ||
453 | * | ||
454 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. | ||
455 | * | ||
456 | * This is the only part of the journaling code which really needs to be | ||
457 | * aware of transaction aborts. Checkpointing involves writing to the | ||
458 | * main filesystem area rather than to the journal, so it can proceed | ||
459 | * even in abort state, but we must not update the super block if | ||
460 | * checkpointing may have failed. Otherwise, we would lose some metadata | ||
461 | * buffers which should be written-back to the filesystem. | ||
462 | */ | ||
463 | |||
464 | int cleanup_journal_tail(journal_t *journal) | ||
465 | { | ||
466 | transaction_t * transaction; | ||
467 | tid_t first_tid; | ||
468 | unsigned int blocknr, freed; | ||
469 | |||
470 | if (is_journal_aborted(journal)) | ||
471 | return 1; | ||
472 | |||
473 | /* | ||
474 | * OK, work out the oldest transaction remaining in the log, and | ||
475 | * the log block it starts at. | ||
476 | * | ||
477 | * If the log is now empty, we need to work out which is the | ||
478 | * next transaction ID we will write, and where it will | ||
479 | * start. | ||
480 | */ | ||
481 | spin_lock(&journal->j_state_lock); | ||
482 | spin_lock(&journal->j_list_lock); | ||
483 | transaction = journal->j_checkpoint_transactions; | ||
484 | if (transaction) { | ||
485 | first_tid = transaction->t_tid; | ||
486 | blocknr = transaction->t_log_start; | ||
487 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | ||
488 | first_tid = transaction->t_tid; | ||
489 | blocknr = transaction->t_log_start; | ||
490 | } else if ((transaction = journal->j_running_transaction) != NULL) { | ||
491 | first_tid = transaction->t_tid; | ||
492 | blocknr = journal->j_head; | ||
493 | } else { | ||
494 | first_tid = journal->j_transaction_sequence; | ||
495 | blocknr = journal->j_head; | ||
496 | } | ||
497 | spin_unlock(&journal->j_list_lock); | ||
498 | J_ASSERT(blocknr != 0); | ||
499 | |||
500 | /* If the oldest pinned transaction is at the tail of the log | ||
501 | already then there's not much we can do right now. */ | ||
502 | if (journal->j_tail_sequence == first_tid) { | ||
503 | spin_unlock(&journal->j_state_lock); | ||
504 | return 1; | ||
505 | } | ||
506 | spin_unlock(&journal->j_state_lock); | ||
507 | |||
508 | /* | ||
509 | * We need to make sure that any blocks that were recently written out | ||
510 | * --- perhaps by log_do_checkpoint() --- are flushed out before we | ||
511 | * drop the transactions from the journal. Similarly we need to be sure | ||
512 | * the superblock makes it to disk before the next transaction starts reusing | ||
513 | * freed space (otherwise we could replay some blocks of the new | ||
514 | * transaction thinking they belong to the old one). So we use | ||
515 | * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially | ||
516 | * with an appropriately sized journal, but we need this to guarantee | ||
517 | * correctness. Fortunately cleanup_journal_tail() doesn't get called | ||
518 | * all that often. | ||
519 | */ | ||
520 | journal_update_sb_log_tail(journal, first_tid, blocknr, | ||
521 | WRITE_FLUSH_FUA); | ||
522 | |||
523 | spin_lock(&journal->j_state_lock); | ||
524 | /* OK, update the superblock to recover the freed space. | ||
525 | * Physical blocks come first: have we wrapped beyond the end of | ||
526 | * the log? */ | ||
527 | freed = blocknr - journal->j_tail; | ||
528 | if (blocknr < journal->j_tail) | ||
529 | freed = freed + journal->j_last - journal->j_first; | ||
530 | |||
531 | trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); | ||
532 | jbd_debug(1, | ||
533 | "Cleaning journal tail from %d to %d (offset %u), " | ||
534 | "freeing %u\n", | ||
535 | journal->j_tail_sequence, first_tid, blocknr, freed); | ||
536 | |||
537 | journal->j_free += freed; | ||
538 | journal->j_tail_sequence = first_tid; | ||
539 | journal->j_tail = blocknr; | ||
540 | spin_unlock(&journal->j_state_lock); | ||
541 | return 0; | ||
542 | } | ||
543 | |||
544 | |||
545 | /* Checkpoint list management */ | ||
546 | |||
547 | /* | ||
548 | * journal_clean_one_cp_list | ||
549 | * | ||
550 | * Find all the written-back checkpoint buffers in the given list and release | ||
551 | * them. | ||
552 | * | ||
553 | * Called with j_list_lock held. | ||
554 | * Returns number of buffers reaped (for debug) | ||
555 | */ | ||
556 | |||
557 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | ||
558 | { | ||
559 | struct journal_head *last_jh; | ||
560 | struct journal_head *next_jh = jh; | ||
561 | int ret, freed = 0; | ||
562 | |||
563 | *released = 0; | ||
564 | if (!jh) | ||
565 | return 0; | ||
566 | |||
567 | last_jh = jh->b_cpprev; | ||
568 | do { | ||
569 | jh = next_jh; | ||
570 | next_jh = jh->b_cpnext; | ||
571 | /* Use trylock because of the ranking */ | ||
572 | if (jbd_trylock_bh_state(jh2bh(jh))) { | ||
573 | ret = __try_to_free_cp_buf(jh); | ||
574 | if (ret) { | ||
575 | freed++; | ||
576 | if (ret == 2) { | ||
577 | *released = 1; | ||
578 | return freed; | ||
579 | } | ||
580 | } | ||
581 | } | ||
582 | /* | ||
583 | * This function only frees up some memory | ||
584 | * if possible so we don't have an obligation | ||
585 | * to finish processing. Bail out if preemption | ||
586 | * requested: | ||
587 | */ | ||
588 | if (need_resched()) | ||
589 | return freed; | ||
590 | } while (jh != last_jh); | ||
591 | |||
592 | return freed; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * journal_clean_checkpoint_list | ||
597 | * | ||
598 | * Find all the written-back checkpoint buffers in the journal and release them. | ||
599 | * | ||
600 | * Called with the journal locked. | ||
601 | * Called with j_list_lock held. | ||
602 | * Returns number of buffers reaped (for debug) | ||
603 | */ | ||
604 | |||
605 | int __journal_clean_checkpoint_list(journal_t *journal) | ||
606 | { | ||
607 | transaction_t *transaction, *last_transaction, *next_transaction; | ||
608 | int ret = 0; | ||
609 | int released; | ||
610 | |||
611 | transaction = journal->j_checkpoint_transactions; | ||
612 | if (!transaction) | ||
613 | goto out; | ||
614 | |||
615 | last_transaction = transaction->t_cpprev; | ||
616 | next_transaction = transaction; | ||
617 | do { | ||
618 | transaction = next_transaction; | ||
619 | next_transaction = transaction->t_cpnext; | ||
620 | ret += journal_clean_one_cp_list(transaction-> | ||
621 | t_checkpoint_list, &released); | ||
622 | /* | ||
623 | * This function only frees up some memory if possible so we | ||
624 | * don't have an obligation to finish processing. Bail out if | ||
625 | * preemption requested: | ||
626 | */ | ||
627 | if (need_resched()) | ||
628 | goto out; | ||
629 | if (released) | ||
630 | continue; | ||
631 | /* | ||
632 | * It is essential that we are as careful as in the case of | ||
633 | * t_checkpoint_list with removing the buffer from the list as | ||
634 | * we can possibly see not yet submitted buffers on io_list | ||
635 | */ | ||
636 | ret += journal_clean_one_cp_list(transaction-> | ||
637 | t_checkpoint_io_list, &released); | ||
638 | if (need_resched()) | ||
639 | goto out; | ||
640 | } while (transaction != last_transaction); | ||
641 | out: | ||
642 | return ret; | ||
643 | } | ||
644 | |||
645 | /* | ||
646 | * journal_remove_checkpoint: called after a buffer has been committed | ||
647 | * to disk (either by being write-back flushed to disk, or being | ||
648 | * committed to the log). | ||
649 | * | ||
650 | * We cannot safely clean a transaction out of the log until all of the | ||
651 | * buffer updates committed in that transaction have safely been stored | ||
652 | * elsewhere on disk. To achieve this, all of the buffers in a | ||
653 | * transaction need to be maintained on the transaction's checkpoint | ||
654 | * lists until they have been rewritten, at which point this function is | ||
655 | * called to remove the buffer from the existing transaction's | ||
656 | * checkpoint lists. | ||
657 | * | ||
658 | * The function returns 1 if it frees the transaction, 0 otherwise. | ||
659 | * The function can free jh and bh. | ||
660 | * | ||
661 | * This function is called with j_list_lock held. | ||
662 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) | ||
663 | */ | ||
664 | |||
665 | int __journal_remove_checkpoint(struct journal_head *jh) | ||
666 | { | ||
667 | transaction_t *transaction; | ||
668 | journal_t *journal; | ||
669 | int ret = 0; | ||
670 | |||
671 | JBUFFER_TRACE(jh, "entry"); | ||
672 | |||
673 | if ((transaction = jh->b_cp_transaction) == NULL) { | ||
674 | JBUFFER_TRACE(jh, "not on transaction"); | ||
675 | goto out; | ||
676 | } | ||
677 | journal = transaction->t_journal; | ||
678 | |||
679 | JBUFFER_TRACE(jh, "removing from transaction"); | ||
680 | __buffer_unlink(jh); | ||
681 | jh->b_cp_transaction = NULL; | ||
682 | journal_put_journal_head(jh); | ||
683 | |||
684 | if (transaction->t_checkpoint_list != NULL || | ||
685 | transaction->t_checkpoint_io_list != NULL) | ||
686 | goto out; | ||
687 | |||
688 | /* | ||
689 | * There is one special case to worry about: if we have just pulled the | ||
690 | * buffer off a running or committing transaction's checkpoint list, | ||
691 | * then even if the checkpoint list is empty, the transaction obviously | ||
692 | * cannot be dropped! | ||
693 | * | ||
694 | * The locking here around t_state is a bit sleazy. | ||
695 | * See the comment at the end of journal_commit_transaction(). | ||
696 | */ | ||
697 | if (transaction->t_state != T_FINISHED) | ||
698 | goto out; | ||
699 | |||
700 | /* OK, that was the last buffer for the transaction: we can now | ||
701 | safely remove this transaction from the log */ | ||
702 | |||
703 | __journal_drop_transaction(journal, transaction); | ||
704 | |||
705 | /* Just in case anybody was waiting for more transactions to be | ||
706 | checkpointed... */ | ||
707 | wake_up(&journal->j_wait_logspace); | ||
708 | ret = 1; | ||
709 | out: | ||
710 | return ret; | ||
711 | } | ||
712 | |||
713 | /* | ||
714 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint | ||
715 | * list so that we know when it is safe to clean the transaction out of | ||
716 | * the log. | ||
717 | * | ||
718 | * Called with the journal locked. | ||
719 | * Called with j_list_lock held. | ||
720 | */ | ||
721 | void __journal_insert_checkpoint(struct journal_head *jh, | ||
722 | transaction_t *transaction) | ||
723 | { | ||
724 | JBUFFER_TRACE(jh, "entry"); | ||
725 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); | ||
726 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); | ||
727 | |||
728 | /* Get reference for checkpointing transaction */ | ||
729 | journal_grab_journal_head(jh2bh(jh)); | ||
730 | jh->b_cp_transaction = transaction; | ||
731 | |||
732 | if (!transaction->t_checkpoint_list) { | ||
733 | jh->b_cpnext = jh->b_cpprev = jh; | ||
734 | } else { | ||
735 | jh->b_cpnext = transaction->t_checkpoint_list; | ||
736 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; | ||
737 | jh->b_cpprev->b_cpnext = jh; | ||
738 | jh->b_cpnext->b_cpprev = jh; | ||
739 | } | ||
740 | transaction->t_checkpoint_list = jh; | ||
741 | } | ||
742 | |||
743 | /* | ||
744 | * We've finished with this transaction structure: adios... | ||
745 | * | ||
746 | * The transaction must have no links except for the checkpoint by this | ||
747 | * point. | ||
748 | * | ||
749 | * Called with the journal locked. | ||
750 | * Called with j_list_lock held. | ||
751 | */ | ||
752 | |||
753 | void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) | ||
754 | { | ||
755 | assert_spin_locked(&journal->j_list_lock); | ||
756 | if (transaction->t_cpnext) { | ||
757 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; | ||
758 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; | ||
759 | if (journal->j_checkpoint_transactions == transaction) | ||
760 | journal->j_checkpoint_transactions = | ||
761 | transaction->t_cpnext; | ||
762 | if (journal->j_checkpoint_transactions == transaction) | ||
763 | journal->j_checkpoint_transactions = NULL; | ||
764 | } | ||
765 | |||
766 | J_ASSERT(transaction->t_state == T_FINISHED); | ||
767 | J_ASSERT(transaction->t_buffers == NULL); | ||
768 | J_ASSERT(transaction->t_sync_datalist == NULL); | ||
769 | J_ASSERT(transaction->t_forget == NULL); | ||
770 | J_ASSERT(transaction->t_iobuf_list == NULL); | ||
771 | J_ASSERT(transaction->t_shadow_list == NULL); | ||
772 | J_ASSERT(transaction->t_log_list == NULL); | ||
773 | J_ASSERT(transaction->t_checkpoint_list == NULL); | ||
774 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); | ||
775 | J_ASSERT(transaction->t_updates == 0); | ||
776 | J_ASSERT(journal->j_committing_transaction != transaction); | ||
777 | J_ASSERT(journal->j_running_transaction != transaction); | ||
778 | |||
779 | trace_jbd_drop_transaction(journal, transaction); | ||
780 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | ||
781 | kfree(transaction); | ||
782 | } | ||
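
The checkpoint list primitives in the file above (__journal_insert_checkpoint(), __buffer_unlink_first(), __buffer_relink_io()) all manipulate the same structure: a circular, doubly linked list of journal_heads threaded through b_cpnext/b_cpprev, with t_checkpoint_list pointing at the most recently inserted element. Below is a minimal, self-contained user-space sketch of that insert/unlink logic; the cut-down journal_head and transaction types and the main() driver are illustrative stand-ins, not the kernel definitions.

/*
 * User-space sketch (not kernel code) of the circular checkpoint list
 * used by fs/jbd/checkpoint.c.  Names mirror the file above; types are
 * simplified for illustration only.
 */
#include <stdio.h>

struct journal_head {
	int blocknr;                    /* stand-in for the real buffer state */
	struct journal_head *b_cpnext;  /* next buffer on the checkpoint list */
	struct journal_head *b_cpprev;  /* previous buffer on the list */
};

struct transaction {
	struct journal_head *t_checkpoint_list; /* head of the circular list */
};

/* Mirrors __journal_insert_checkpoint(): link jh in as the new head. */
static void insert_checkpoint(struct transaction *t, struct journal_head *jh)
{
	if (!t->t_checkpoint_list) {
		jh->b_cpnext = jh->b_cpprev = jh;   /* single-element ring */
	} else {
		jh->b_cpnext = t->t_checkpoint_list;
		jh->b_cpprev = t->t_checkpoint_list->b_cpprev;
		jh->b_cpprev->b_cpnext = jh;
		jh->b_cpnext->b_cpprev = jh;
	}
	t->t_checkpoint_list = jh;
}

/* Mirrors __buffer_unlink_first(): unlink jh, advancing the head if needed. */
static void unlink_checkpoint(struct transaction *t, struct journal_head *jh)
{
	jh->b_cpnext->b_cpprev = jh->b_cpprev;
	jh->b_cpprev->b_cpnext = jh->b_cpnext;
	if (t->t_checkpoint_list == jh) {
		t->t_checkpoint_list = jh->b_cpnext;
		if (t->t_checkpoint_list == jh)     /* jh was the only element */
			t->t_checkpoint_list = NULL;
	}
}

int main(void)
{
	struct transaction t = { NULL };
	struct journal_head a = { 1 }, b = { 2 }, c = { 3 };

	insert_checkpoint(&t, &a);
	insert_checkpoint(&t, &b);
	insert_checkpoint(&t, &c);
	unlink_checkpoint(&t, &b);

	/* Walk the ring once, starting from the head: prints blocks 3, 1. */
	struct journal_head *jh = t.t_checkpoint_list;
	do {
		printf("block %d\n", jh->blocknr);
		jh = jh->b_cpnext;
	} while (jh != t.t_checkpoint_list);
	return 0;
}

Note how unlink_checkpoint() advances the head and then re-checks it: if the head still points at the element being removed, the ring had only one member and the list becomes empty, which is exactly the test __buffer_unlink_first() performs.
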
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
deleted file mode 100644
index bb217dcb41af..000000000000
--- a/fs/jbd/commit.c
+++ /dev/null
@@ -1,1021 +0,0 @@
1 | /* | ||
2 | * linux/fs/jbd/commit.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal commit routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | */ | ||
15 | |||
16 | #include <linux/time.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/jbd.h> | ||
19 | #include <linux/errno.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/pagemap.h> | ||
22 | #include <linux/bio.h> | ||
23 | #include <linux/blkdev.h> | ||
24 | #include <trace/events/jbd.h> | ||
25 | |||
26 | /* | ||
27 | * Default IO end handler for temporary BJ_IO buffer_heads. | ||
28 | */ | ||
29 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | ||
30 | { | ||
31 | BUFFER_TRACE(bh, ""); | ||
32 | if (uptodate) | ||
33 | set_buffer_uptodate(bh); | ||
34 | else | ||
35 | clear_buffer_uptodate(bh); | ||
36 | unlock_buffer(bh); | ||
37 | } | ||
38 | |||
39 | /* | ||
40 | * When an ext3-ordered file is truncated, it is possible that many pages are | ||
41 | * not successfully freed, because they are attached to a committing transaction. | ||
42 | * After the transaction commits, these pages are left on the LRU, with no | ||
43 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | ||
44 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | ||
45 | * the numbers in /proc/meminfo look odd. | ||
46 | * | ||
47 | * So here, we have a buffer which has just come off the forget list. Look to | ||
48 | * see if we can strip all buffers from the backing page. | ||
49 | * | ||
50 | * Called under journal->j_list_lock. The caller provided us with a ref | ||
51 | * against the buffer, and we drop that here. | ||
52 | */ | ||
53 | static void release_buffer_page(struct buffer_head *bh) | ||
54 | { | ||
55 | struct page *page; | ||
56 | |||
57 | if (buffer_dirty(bh)) | ||
58 | goto nope; | ||
59 | if (atomic_read(&bh->b_count) != 1) | ||
60 | goto nope; | ||
61 | page = bh->b_page; | ||
62 | if (!page) | ||
63 | goto nope; | ||
64 | if (page->mapping) | ||
65 | goto nope; | ||
66 | |||
67 | /* OK, it's a truncated page */ | ||
68 | if (!trylock_page(page)) | ||
69 | goto nope; | ||
70 | |||
71 | page_cache_get(page); | ||
72 | __brelse(bh); | ||
73 | try_to_free_buffers(page); | ||
74 | unlock_page(page); | ||
75 | page_cache_release(page); | ||
76 | return; | ||
77 | |||
78 | nope: | ||
79 | __brelse(bh); | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Decrement reference counter for data buffer. If it has been marked | ||
84 | * 'BH_Freed', release it and the page to which it belongs if possible. | ||
85 | */ | ||
86 | static void release_data_buffer(struct buffer_head *bh) | ||
87 | { | ||
88 | if (buffer_freed(bh)) { | ||
89 | WARN_ON_ONCE(buffer_dirty(bh)); | ||
90 | clear_buffer_freed(bh); | ||
91 | clear_buffer_mapped(bh); | ||
92 | clear_buffer_new(bh); | ||
93 | clear_buffer_req(bh); | ||
94 | bh->b_bdev = NULL; | ||
95 | release_buffer_page(bh); | ||
96 | } else | ||
97 | put_bh(bh); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | ||
102 | * held. For ranking reasons we must trylock. If we lose, schedule away and | ||
103 | * return 0. j_list_lock is dropped in this case. | ||
104 | */ | ||
105 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | ||
106 | { | ||
107 | if (!jbd_trylock_bh_state(bh)) { | ||
108 | spin_unlock(&journal->j_list_lock); | ||
109 | schedule(); | ||
110 | return 0; | ||
111 | } | ||
112 | return 1; | ||
113 | } | ||
114 | |||
115 | /* Done it all: now write the commit record. We should have | ||
116 | * cleaned up our previous buffers by now, so if we are in abort | ||
117 | * mode we can now just skip the rest of the journal write | ||
118 | * entirely. | ||
119 | * | ||
120 | * Returns 1 if the journal needs to be aborted or 0 on success | ||
121 | */ | ||
122 | static int journal_write_commit_record(journal_t *journal, | ||
123 | transaction_t *commit_transaction) | ||
124 | { | ||
125 | struct journal_head *descriptor; | ||
126 | struct buffer_head *bh; | ||
127 | journal_header_t *header; | ||
128 | int ret; | ||
129 | |||
130 | if (is_journal_aborted(journal)) | ||
131 | return 0; | ||
132 | |||
133 | descriptor = journal_get_descriptor_buffer(journal); | ||
134 | if (!descriptor) | ||
135 | return 1; | ||
136 | |||
137 | bh = jh2bh(descriptor); | ||
138 | |||
139 | header = (journal_header_t *)(bh->b_data); | ||
140 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
141 | header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); | ||
142 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
143 | |||
144 | JBUFFER_TRACE(descriptor, "write commit block"); | ||
145 | set_buffer_dirty(bh); | ||
146 | |||
147 | if (journal->j_flags & JFS_BARRIER) | ||
148 | ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); | ||
149 | else | ||
150 | ret = sync_dirty_buffer(bh); | ||
151 | |||
152 | put_bh(bh); /* One for getblk() */ | ||
153 | journal_put_journal_head(descriptor); | ||
154 | |||
155 | return (ret == -EIO); | ||
156 | } | ||
157 | |||
158 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, | ||
159 | int write_op) | ||
160 | { | ||
161 | int i; | ||
162 | |||
163 | for (i = 0; i < bufs; i++) { | ||
164 | wbuf[i]->b_end_io = end_buffer_write_sync; | ||
165 | /* | ||
166 | * Here we write back pagecache data that may be mmaped. Since | ||
167 | * we cannot afford to clean the page and set PageWriteback | ||
168 | * here due to lock ordering (page lock ranks above transaction | ||
169 | * start), the data can change while IO is in flight. Tell the | ||
170 | * block layer it should bounce the bio pages if stable data | ||
171 | * during write is required. | ||
172 | * | ||
173 | * We use up our safety reference in submit_bh(). | ||
174 | */ | ||
175 | _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Submit all the data buffers to disk | ||
181 | */ | ||
182 | static int journal_submit_data_buffers(journal_t *journal, | ||
183 | transaction_t *commit_transaction, | ||
184 | int write_op) | ||
185 | { | ||
186 | struct journal_head *jh; | ||
187 | struct buffer_head *bh; | ||
188 | int locked; | ||
189 | int bufs = 0; | ||
190 | struct buffer_head **wbuf = journal->j_wbuf; | ||
191 | int err = 0; | ||
192 | |||
193 | /* | ||
194 | * Whenever we unlock the journal and sleep, things can get added | ||
195 | * onto ->t_sync_datalist, so we have to keep looping back to | ||
196 | * write_out_data until we *know* that the list is empty. | ||
197 | * | ||
198 | * Cleanup any flushed data buffers from the data list. Even in | ||
199 | * abort mode, we want to flush this out as soon as possible. | ||
200 | */ | ||
201 | write_out_data: | ||
202 | cond_resched(); | ||
203 | spin_lock(&journal->j_list_lock); | ||
204 | |||
205 | while (commit_transaction->t_sync_datalist) { | ||
206 | jh = commit_transaction->t_sync_datalist; | ||
207 | bh = jh2bh(jh); | ||
208 | locked = 0; | ||
209 | |||
210 | /* Get reference just to make sure buffer does not disappear | ||
211 | * when we are forced to drop various locks */ | ||
212 | get_bh(bh); | ||
213 | /* If the buffer is dirty, we need to submit IO and hence | ||
214 | * we need the buffer lock. We try to lock the buffer without | ||
215 | * blocking. If we fail, we need to drop j_list_lock and do | ||
216 | * blocking lock_buffer(). | ||
217 | */ | ||
218 | if (buffer_dirty(bh)) { | ||
219 | if (!trylock_buffer(bh)) { | ||
220 | BUFFER_TRACE(bh, "needs blocking lock"); | ||
221 | spin_unlock(&journal->j_list_lock); | ||
222 | trace_jbd_do_submit_data(journal, | ||
223 | commit_transaction); | ||
224 | /* Write out all data to prevent deadlocks */ | ||
225 | journal_do_submit_data(wbuf, bufs, write_op); | ||
226 | bufs = 0; | ||
227 | lock_buffer(bh); | ||
228 | spin_lock(&journal->j_list_lock); | ||
229 | } | ||
230 | locked = 1; | ||
231 | } | ||
232 | /* We have to get bh_state lock. Again out of order, sigh. */ | ||
233 | if (!inverted_lock(journal, bh)) { | ||
234 | jbd_lock_bh_state(bh); | ||
235 | spin_lock(&journal->j_list_lock); | ||
236 | } | ||
237 | /* Someone already cleaned up the buffer? */ | ||
238 | if (!buffer_jbd(bh) || bh2jh(bh) != jh | ||
239 | || jh->b_transaction != commit_transaction | ||
240 | || jh->b_jlist != BJ_SyncData) { | ||
241 | jbd_unlock_bh_state(bh); | ||
242 | if (locked) | ||
243 | unlock_buffer(bh); | ||
244 | BUFFER_TRACE(bh, "already cleaned up"); | ||
245 | release_data_buffer(bh); | ||
246 | continue; | ||
247 | } | ||
248 | if (locked && test_clear_buffer_dirty(bh)) { | ||
249 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | ||
250 | wbuf[bufs++] = bh; | ||
251 | __journal_file_buffer(jh, commit_transaction, | ||
252 | BJ_Locked); | ||
253 | jbd_unlock_bh_state(bh); | ||
254 | if (bufs == journal->j_wbufsize) { | ||
255 | spin_unlock(&journal->j_list_lock); | ||
256 | trace_jbd_do_submit_data(journal, | ||
257 | commit_transaction); | ||
258 | journal_do_submit_data(wbuf, bufs, write_op); | ||
259 | bufs = 0; | ||
260 | goto write_out_data; | ||
261 | } | ||
262 | } else if (!locked && buffer_locked(bh)) { | ||
263 | __journal_file_buffer(jh, commit_transaction, | ||
264 | BJ_Locked); | ||
265 | jbd_unlock_bh_state(bh); | ||
266 | put_bh(bh); | ||
267 | } else { | ||
268 | BUFFER_TRACE(bh, "writeout complete: unfile"); | ||
269 | if (unlikely(!buffer_uptodate(bh))) | ||
270 | err = -EIO; | ||
271 | __journal_unfile_buffer(jh); | ||
272 | jbd_unlock_bh_state(bh); | ||
273 | if (locked) | ||
274 | unlock_buffer(bh); | ||
275 | release_data_buffer(bh); | ||
276 | } | ||
277 | |||
278 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | ||
279 | spin_unlock(&journal->j_list_lock); | ||
280 | goto write_out_data; | ||
281 | } | ||
282 | } | ||
283 | spin_unlock(&journal->j_list_lock); | ||
284 | trace_jbd_do_submit_data(journal, commit_transaction); | ||
285 | journal_do_submit_data(wbuf, bufs, write_op); | ||
286 | |||
287 | return err; | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * journal_commit_transaction | ||
292 | * | ||
293 | * The primary function for committing a transaction to the log. This | ||
294 | * function is called by the journal thread to begin a complete commit. | ||
295 | */ | ||
296 | void journal_commit_transaction(journal_t *journal) | ||
297 | { | ||
298 | transaction_t *commit_transaction; | ||
299 | struct journal_head *jh, *new_jh, *descriptor; | ||
300 | struct buffer_head **wbuf = journal->j_wbuf; | ||
301 | int bufs; | ||
302 | int flags; | ||
303 | int err; | ||
304 | unsigned int blocknr; | ||
305 | ktime_t start_time; | ||
306 | u64 commit_time; | ||
307 | char *tagp = NULL; | ||
308 | journal_header_t *header; | ||
309 | journal_block_tag_t *tag = NULL; | ||
310 | int space_left = 0; | ||
311 | int first_tag = 0; | ||
312 | int tag_flag; | ||
313 | int i; | ||
314 | struct blk_plug plug; | ||
315 | int write_op = WRITE; | ||
316 | |||
317 | /* | ||
318 | * First job: lock down the current transaction and wait for | ||
319 | * all outstanding updates to complete. | ||
320 | */ | ||
321 | |||
322 | /* Do we need to erase the effects of a prior journal_flush? */ | ||
323 | if (journal->j_flags & JFS_FLUSHED) { | ||
324 | jbd_debug(3, "super block updated\n"); | ||
325 | mutex_lock(&journal->j_checkpoint_mutex); | ||
326 | /* | ||
327 | * We hold j_checkpoint_mutex so tail cannot change under us. | ||
328 | * We don't need any special data guarantees for writing sb | ||
329 | * since journal is empty and it is ok for write to be | ||
330 | * flushed only with transaction commit. | ||
331 | */ | ||
332 | journal_update_sb_log_tail(journal, journal->j_tail_sequence, | ||
333 | journal->j_tail, WRITE_SYNC); | ||
334 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
335 | } else { | ||
336 | jbd_debug(3, "superblock not updated\n"); | ||
337 | } | ||
338 | |||
339 | J_ASSERT(journal->j_running_transaction != NULL); | ||
340 | J_ASSERT(journal->j_committing_transaction == NULL); | ||
341 | |||
342 | commit_transaction = journal->j_running_transaction; | ||
343 | |||
344 | trace_jbd_start_commit(journal, commit_transaction); | ||
345 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | ||
346 | commit_transaction->t_tid); | ||
347 | |||
348 | spin_lock(&journal->j_state_lock); | ||
349 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
350 | commit_transaction->t_state = T_LOCKED; | ||
351 | |||
352 | trace_jbd_commit_locking(journal, commit_transaction); | ||
353 | spin_lock(&commit_transaction->t_handle_lock); | ||
354 | while (commit_transaction->t_updates) { | ||
355 | DEFINE_WAIT(wait); | ||
356 | |||
357 | prepare_to_wait(&journal->j_wait_updates, &wait, | ||
358 | TASK_UNINTERRUPTIBLE); | ||
359 | if (commit_transaction->t_updates) { | ||
360 | spin_unlock(&commit_transaction->t_handle_lock); | ||
361 | spin_unlock(&journal->j_state_lock); | ||
362 | schedule(); | ||
363 | spin_lock(&journal->j_state_lock); | ||
364 | spin_lock(&commit_transaction->t_handle_lock); | ||
365 | } | ||
366 | finish_wait(&journal->j_wait_updates, &wait); | ||
367 | } | ||
368 | spin_unlock(&commit_transaction->t_handle_lock); | ||
369 | |||
370 | J_ASSERT (commit_transaction->t_outstanding_credits <= | ||
371 | journal->j_max_transaction_buffers); | ||
372 | |||
373 | /* | ||
374 | * First thing we are allowed to do is to discard any remaining | ||
375 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | ||
376 | * that there are no such buffers: if a large filesystem | ||
377 | * operation like a truncate needs to split itself over multiple | ||
378 | * transactions, then it may try to do a journal_restart() while | ||
379 | * there are still BJ_Reserved buffers outstanding. These must | ||
380 | * be released cleanly from the current transaction. | ||
381 | * | ||
382 | * In this case, the filesystem must still reserve write access | ||
383 | * again before modifying the buffer in the new transaction, but | ||
384 | * we do not require it to remember exactly which old buffers it | ||
385 | * has reserved. This is consistent with the existing behaviour | ||
386 | * that multiple journal_get_write_access() calls to the same | ||
387 | * buffer are perfectly permissible. | ||
388 | */ | ||
389 | while (commit_transaction->t_reserved_list) { | ||
390 | jh = commit_transaction->t_reserved_list; | ||
391 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | ||
392 | /* | ||
393 | * A journal_get_undo_access()+journal_release_buffer() may | ||
394 | * leave undo-committed data. | ||
395 | */ | ||
396 | if (jh->b_committed_data) { | ||
397 | struct buffer_head *bh = jh2bh(jh); | ||
398 | |||
399 | jbd_lock_bh_state(bh); | ||
400 | jbd_free(jh->b_committed_data, bh->b_size); | ||
401 | jh->b_committed_data = NULL; | ||
402 | jbd_unlock_bh_state(bh); | ||
403 | } | ||
404 | journal_refile_buffer(journal, jh); | ||
405 | } | ||
406 | |||
407 | /* | ||
408 | * Now try to drop any written-back buffers from the journal's | ||
409 | * checkpoint lists. We do this *before* commit because it potentially | ||
410 | * frees some memory | ||
411 | */ | ||
412 | spin_lock(&journal->j_list_lock); | ||
413 | __journal_clean_checkpoint_list(journal); | ||
414 | spin_unlock(&journal->j_list_lock); | ||
415 | |||
416 | jbd_debug (3, "JBD: commit phase 1\n"); | ||
417 | |||
418 | /* | ||
419 | * Clear revoked flag to reflect there are no revoked buffers | ||
420 | * in the next transaction which is going to be started. | ||
421 | */ | ||
422 | journal_clear_buffer_revoked_flags(journal); | ||
423 | |||
424 | /* | ||
425 | * Switch to a new revoke table. | ||
426 | */ | ||
427 | journal_switch_revoke_table(journal); | ||
428 | |||
429 | trace_jbd_commit_flushing(journal, commit_transaction); | ||
430 | commit_transaction->t_state = T_FLUSH; | ||
431 | journal->j_committing_transaction = commit_transaction; | ||
432 | journal->j_running_transaction = NULL; | ||
433 | start_time = ktime_get(); | ||
434 | commit_transaction->t_log_start = journal->j_head; | ||
435 | wake_up(&journal->j_wait_transaction_locked); | ||
436 | spin_unlock(&journal->j_state_lock); | ||
437 | |||
438 | jbd_debug (3, "JBD: commit phase 2\n"); | ||
439 | |||
440 | if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) | ||
441 | write_op = WRITE_SYNC; | ||
442 | |||
443 | /* | ||
444 | * Now start flushing things to disk, in the order they appear | ||
445 | * on the transaction lists. Data blocks go first. | ||
446 | */ | ||
447 | blk_start_plug(&plug); | ||
448 | err = journal_submit_data_buffers(journal, commit_transaction, | ||
449 | write_op); | ||
450 | blk_finish_plug(&plug); | ||
451 | |||
452 | /* | ||
453 | * Wait for all previously submitted IO to complete. | ||
454 | */ | ||
455 | spin_lock(&journal->j_list_lock); | ||
456 | while (commit_transaction->t_locked_list) { | ||
457 | struct buffer_head *bh; | ||
458 | |||
459 | jh = commit_transaction->t_locked_list->b_tprev; | ||
460 | bh = jh2bh(jh); | ||
461 | get_bh(bh); | ||
462 | if (buffer_locked(bh)) { | ||
463 | spin_unlock(&journal->j_list_lock); | ||
464 | wait_on_buffer(bh); | ||
465 | spin_lock(&journal->j_list_lock); | ||
466 | } | ||
467 | if (unlikely(!buffer_uptodate(bh))) { | ||
468 | if (!trylock_page(bh->b_page)) { | ||
469 | spin_unlock(&journal->j_list_lock); | ||
470 | lock_page(bh->b_page); | ||
471 | spin_lock(&journal->j_list_lock); | ||
472 | } | ||
473 | if (bh->b_page->mapping) | ||
474 | set_bit(AS_EIO, &bh->b_page->mapping->flags); | ||
475 | |||
476 | unlock_page(bh->b_page); | ||
477 | SetPageError(bh->b_page); | ||
478 | err = -EIO; | ||
479 | } | ||
480 | if (!inverted_lock(journal, bh)) { | ||
481 | put_bh(bh); | ||
482 | spin_lock(&journal->j_list_lock); | ||
483 | continue; | ||
484 | } | ||
485 | if (buffer_jbd(bh) && bh2jh(bh) == jh && | ||
486 | jh->b_transaction == commit_transaction && | ||
487 | jh->b_jlist == BJ_Locked) | ||
488 | __journal_unfile_buffer(jh); | ||
489 | jbd_unlock_bh_state(bh); | ||
490 | release_data_buffer(bh); | ||
491 | cond_resched_lock(&journal->j_list_lock); | ||
492 | } | ||
493 | spin_unlock(&journal->j_list_lock); | ||
494 | |||
495 | if (err) { | ||
496 | char b[BDEVNAME_SIZE]; | ||
497 | |||
498 | printk(KERN_WARNING | ||
499 | "JBD: Detected IO errors while flushing file data " | ||
500 | "on %s\n", bdevname(journal->j_fs_dev, b)); | ||
501 | if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) | ||
502 | journal_abort(journal, err); | ||
503 | err = 0; | ||
504 | } | ||
505 | |||
506 | blk_start_plug(&plug); | ||
507 | |||
508 | journal_write_revoke_records(journal, commit_transaction, write_op); | ||
509 | |||
510 | /* | ||
511 | * If we found any dirty or locked buffers, then we should have | ||
512 | * looped back up to the write_out_data label. If there weren't | ||
513 | * any then journal_clean_data_list should have wiped the list | ||
514 | * clean by now, so check that it is in fact empty. | ||
515 | */ | ||
516 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | ||
517 | |||
518 | jbd_debug (3, "JBD: commit phase 3\n"); | ||
519 | |||
520 | /* | ||
521 | * Way to go: we have now written out all of the data for a | ||
522 | * transaction! Now comes the tricky part: we need to write out | ||
523 | * metadata. Loop over the transaction's entire buffer list: | ||
524 | */ | ||
525 | spin_lock(&journal->j_state_lock); | ||
526 | commit_transaction->t_state = T_COMMIT; | ||
527 | spin_unlock(&journal->j_state_lock); | ||
528 | |||
529 | trace_jbd_commit_logging(journal, commit_transaction); | ||
530 | J_ASSERT(commit_transaction->t_nr_buffers <= | ||
531 | commit_transaction->t_outstanding_credits); | ||
532 | |||
533 | descriptor = NULL; | ||
534 | bufs = 0; | ||
535 | while (commit_transaction->t_buffers) { | ||
536 | |||
537 | /* Find the next buffer to be journaled... */ | ||
538 | |||
539 | jh = commit_transaction->t_buffers; | ||
540 | |||
541 | /* If we're in abort mode, we just un-journal the buffer and | ||
542 | release it. */ | ||
543 | |||
544 | if (is_journal_aborted(journal)) { | ||
545 | clear_buffer_jbddirty(jh2bh(jh)); | ||
546 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | ||
547 | journal_refile_buffer(journal, jh); | ||
548 | /* If that was the last one, we need to clean up | ||
549 | * any descriptor buffers which may have been | ||
550 | * already allocated, even if we are now | ||
551 | * aborting. */ | ||
552 | if (!commit_transaction->t_buffers) | ||
553 | goto start_journal_io; | ||
554 | continue; | ||
555 | } | ||
556 | |||
557 | /* Make sure we have a descriptor block in which to | ||
558 | record the metadata buffer. */ | ||
559 | |||
560 | if (!descriptor) { | ||
561 | struct buffer_head *bh; | ||
562 | |||
563 | J_ASSERT (bufs == 0); | ||
564 | |||
565 | jbd_debug(4, "JBD: get descriptor\n"); | ||
566 | |||
567 | descriptor = journal_get_descriptor_buffer(journal); | ||
568 | if (!descriptor) { | ||
569 | journal_abort(journal, -EIO); | ||
570 | continue; | ||
571 | } | ||
572 | |||
573 | bh = jh2bh(descriptor); | ||
574 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | ||
575 | (unsigned long long)bh->b_blocknr, bh->b_data); | ||
576 | header = (journal_header_t *)&bh->b_data[0]; | ||
577 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
578 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); | ||
579 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
580 | |||
581 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
582 | space_left = bh->b_size - sizeof(journal_header_t); | ||
583 | first_tag = 1; | ||
584 | set_buffer_jwrite(bh); | ||
585 | set_buffer_dirty(bh); | ||
586 | wbuf[bufs++] = bh; | ||
587 | |||
588 | /* Record it so that we can wait for IO | ||
589 | completion later */ | ||
590 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | ||
591 | journal_file_buffer(descriptor, commit_transaction, | ||
592 | BJ_LogCtl); | ||
593 | } | ||
594 | |||
595 | /* Where is the buffer to be written? */ | ||
596 | |||
597 | err = journal_next_log_block(journal, &blocknr); | ||
598 | /* If the block mapping failed, just abandon the buffer | ||
599 | and repeat this loop: we'll fall into the | ||
600 | refile-on-abort condition above. */ | ||
601 | if (err) { | ||
602 | journal_abort(journal, err); | ||
603 | continue; | ||
604 | } | ||
605 | |||
606 | /* | ||
607 | * start_this_handle() uses t_outstanding_credits to determine | ||
608 | * the free space in the log, but this counter is changed | ||
609 | * by journal_next_log_block() also. | ||
610 | */ | ||
611 | commit_transaction->t_outstanding_credits--; | ||
612 | |||
613 | /* Bump b_count to prevent truncate from stumbling over | ||
614 | the shadowed buffer! @@@ This can go if we ever get | ||
615 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | ||
616 | get_bh(jh2bh(jh)); | ||
617 | |||
618 | /* Make a temporary IO buffer with which to write it out | ||
619 | (this will requeue both the metadata buffer and the | ||
620 | temporary IO buffer). new_bh goes on BJ_IO*/ | ||
621 | |||
622 | set_buffer_jwrite(jh2bh(jh)); | ||
623 | /* | ||
624 | * akpm: journal_write_metadata_buffer() sets | ||
625 | * new_bh->b_transaction to commit_transaction. | ||
626 | * We need to clean this up before we release new_bh | ||
627 | * (which is of type BJ_IO) | ||
628 | */ | ||
629 | JBUFFER_TRACE(jh, "ph3: write metadata"); | ||
630 | flags = journal_write_metadata_buffer(commit_transaction, | ||
631 | jh, &new_jh, blocknr); | ||
632 | set_buffer_jwrite(jh2bh(new_jh)); | ||
633 | wbuf[bufs++] = jh2bh(new_jh); | ||
634 | |||
635 | /* Record the new block's tag in the current descriptor | ||
636 | buffer */ | ||
637 | |||
638 | tag_flag = 0; | ||
639 | if (flags & 1) | ||
640 | tag_flag |= JFS_FLAG_ESCAPE; | ||
641 | if (!first_tag) | ||
642 | tag_flag |= JFS_FLAG_SAME_UUID; | ||
643 | |||
644 | tag = (journal_block_tag_t *) tagp; | ||
645 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); | ||
646 | tag->t_flags = cpu_to_be32(tag_flag); | ||
647 | tagp += sizeof(journal_block_tag_t); | ||
648 | space_left -= sizeof(journal_block_tag_t); | ||
649 | |||
650 | if (first_tag) { | ||
651 | memcpy (tagp, journal->j_uuid, 16); | ||
652 | tagp += 16; | ||
653 | space_left -= 16; | ||
654 | first_tag = 0; | ||
655 | } | ||
656 | |||
657 | /* If there's no more to do, or if the descriptor is full, | ||
658 | let the IO rip! */ | ||
659 | |||
660 | if (bufs == journal->j_wbufsize || | ||
661 | commit_transaction->t_buffers == NULL || | ||
662 | space_left < sizeof(journal_block_tag_t) + 16) { | ||
663 | |||
664 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | ||
665 | |||
666 | /* Write an end-of-descriptor marker before | ||
667 | submitting the IOs. "tag" still points to | ||
668 | the last tag we set up. */ | ||
669 | |||
670 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); | ||
671 | |||
672 | start_journal_io: | ||
673 | for (i = 0; i < bufs; i++) { | ||
674 | struct buffer_head *bh = wbuf[i]; | ||
675 | lock_buffer(bh); | ||
676 | clear_buffer_dirty(bh); | ||
677 | set_buffer_uptodate(bh); | ||
678 | bh->b_end_io = journal_end_buffer_io_sync; | ||
679 | /* | ||
680 | * In data=journal mode, here we can end up | ||
681 | * writing pagecache data that might be | ||
682 | * mmapped. Since we can't afford to clean the | ||
683 | * page and set PageWriteback (see the comment | ||
684 | * near the other use of _submit_bh()), the | ||
685 | * data can change while the write is in | ||
686 | * flight. Tell the block layer to bounce the | ||
687 | * bio pages if stable pages are required. | ||
688 | */ | ||
689 | _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); | ||
690 | } | ||
691 | cond_resched(); | ||
692 | |||
693 | /* Force a new descriptor to be generated next | ||
694 | time round the loop. */ | ||
695 | descriptor = NULL; | ||
696 | bufs = 0; | ||
697 | } | ||
698 | } | ||
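
For orientation, the descriptor block assembled by the loop above has a simple layout: a journal header, one tag per journaled block, a 16-byte UUID following the first tag, and JFS_FLAG_LAST_TAG set on the final tag. The sketch below is an illustrative userspace rendering of that layout (the constants mirror the JBD on-disk format, but the fill helper is hypothetical and, unlike the real loop, neither handles running out of space nor sets JFS_FLAG_ESCAPE):

	#include <stdint.h>
	#include <string.h>
	#include <endian.h>

	/* On-disk layouts used by the commit loop above (all fields big-endian). */
	typedef struct { uint32_t h_magic, h_blocktype, h_sequence; } journal_header_t;
	typedef struct { uint32_t t_blocknr, t_flags; } journal_block_tag_t;

	#define JFS_MAGIC_NUMBER	0xc03b3998U
	#define JFS_DESCRIPTOR_BLOCK	1
	#define JFS_FLAG_SAME_UUID	2
	#define JFS_FLAG_LAST_TAG	8

	/* Hypothetical helper: lay out one descriptor block describing n data blocks. */
	static void fill_descriptor(char *buf, uint32_t tid,
				    const uint32_t *blocknrs, int n,
				    const unsigned char uuid[16])
	{
		journal_header_t *hdr = (journal_header_t *)buf;
		char *tagp = buf + sizeof(*hdr);

		hdr->h_magic     = htobe32(JFS_MAGIC_NUMBER);
		hdr->h_blocktype = htobe32(JFS_DESCRIPTOR_BLOCK);
		hdr->h_sequence  = htobe32(tid);

		for (int i = 0; i < n; i++) {
			journal_block_tag_t *tag = (journal_block_tag_t *)tagp;
			uint32_t flags = i ? JFS_FLAG_SAME_UUID : 0;

			if (i == n - 1)
				flags |= JFS_FLAG_LAST_TAG;
			tag->t_blocknr = htobe32(blocknrs[i]);
			tag->t_flags   = htobe32(flags);
			tagp += sizeof(*tag);
			if (i == 0) {		/* only the first tag carries the UUID */
				memcpy(tagp, uuid, 16);
				tagp += 16;
			}
		}
	}
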
699 | |||
700 | blk_finish_plug(&plug); | ||
701 | |||
702 | /* Lo and behold: we have just managed to send a transaction to | ||
703 | the log. Before we can commit it, wait for the IO so far to | ||
704 | complete. Control buffers being written are on the | ||
705 | transaction's t_log_list queue, and metadata buffers are on | ||
706 | the t_iobuf_list queue. | ||
707 | |||
708 | Wait for the buffers in reverse order. That way we are | ||
709 | less likely to be woken up until all IOs have completed, and | ||
710 | so we incur less scheduling load. | ||
711 | */ | ||
712 | |||
713 | jbd_debug(3, "JBD: commit phase 4\n"); | ||
714 | |||
715 | /* | ||
716 | * akpm: these are BJ_IO, and j_list_lock is not needed. | ||
717 | * See __journal_try_to_free_buffer. | ||
718 | */ | ||
719 | wait_for_iobuf: | ||
720 | while (commit_transaction->t_iobuf_list != NULL) { | ||
721 | struct buffer_head *bh; | ||
722 | |||
723 | jh = commit_transaction->t_iobuf_list->b_tprev; | ||
724 | bh = jh2bh(jh); | ||
725 | if (buffer_locked(bh)) { | ||
726 | wait_on_buffer(bh); | ||
727 | goto wait_for_iobuf; | ||
728 | } | ||
729 | if (cond_resched()) | ||
730 | goto wait_for_iobuf; | ||
731 | |||
732 | if (unlikely(!buffer_uptodate(bh))) | ||
733 | err = -EIO; | ||
734 | |||
735 | clear_buffer_jwrite(bh); | ||
736 | |||
737 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | ||
738 | journal_unfile_buffer(journal, jh); | ||
739 | |||
740 | /* | ||
741 | * ->t_iobuf_list should contain only dummy buffer_heads | ||
742 | * which were created by journal_write_metadata_buffer(). | ||
743 | */ | ||
744 | BUFFER_TRACE(bh, "dumping temporary bh"); | ||
745 | journal_put_journal_head(jh); | ||
746 | __brelse(bh); | ||
747 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | ||
748 | free_buffer_head(bh); | ||
749 | |||
750 | /* We also have to unlock and free the corresponding | ||
751 | shadowed buffer */ | ||
752 | jh = commit_transaction->t_shadow_list->b_tprev; | ||
753 | bh = jh2bh(jh); | ||
754 | clear_buffer_jwrite(bh); | ||
755 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | ||
756 | |||
757 | /* The metadata is now released for reuse, but we need | ||
758 | to remember it against this transaction so that when | ||
759 | we finally commit, we can do any checkpointing | ||
760 | required. */ | ||
761 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | ||
762 | journal_file_buffer(jh, commit_transaction, BJ_Forget); | ||
763 | /* | ||
764 | * Wake up any transactions which were waiting for this | ||
765 | * IO to complete. The barrier must be here so that changes | ||
766 | * by journal_file_buffer() take effect before wake_up_bit() | ||
767 | * does the waitqueue check. | ||
768 | */ | ||
769 | smp_mb(); | ||
770 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
771 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | ||
772 | __brelse(bh); | ||
773 | } | ||
774 | |||
775 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | ||
776 | |||
777 | jbd_debug(3, "JBD: commit phase 5\n"); | ||
778 | |||
779 | /* Here we wait for the revoke record and descriptor record buffers */ | ||
780 | wait_for_ctlbuf: | ||
781 | while (commit_transaction->t_log_list != NULL) { | ||
782 | struct buffer_head *bh; | ||
783 | |||
784 | jh = commit_transaction->t_log_list->b_tprev; | ||
785 | bh = jh2bh(jh); | ||
786 | if (buffer_locked(bh)) { | ||
787 | wait_on_buffer(bh); | ||
788 | goto wait_for_ctlbuf; | ||
789 | } | ||
790 | if (cond_resched()) | ||
791 | goto wait_for_ctlbuf; | ||
792 | |||
793 | if (unlikely(!buffer_uptodate(bh))) | ||
794 | err = -EIO; | ||
795 | |||
796 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | ||
797 | clear_buffer_jwrite(bh); | ||
798 | journal_unfile_buffer(journal, jh); | ||
799 | journal_put_journal_head(jh); | ||
800 | __brelse(bh); /* One for getblk */ | ||
801 | /* AKPM: bforget here */ | ||
802 | } | ||
803 | |||
804 | if (err) | ||
805 | journal_abort(journal, err); | ||
806 | |||
807 | jbd_debug(3, "JBD: commit phase 6\n"); | ||
808 | |||
809 | /* All metadata is written, now write commit record and do cleanup */ | ||
810 | spin_lock(&journal->j_state_lock); | ||
811 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | ||
812 | commit_transaction->t_state = T_COMMIT_RECORD; | ||
813 | spin_unlock(&journal->j_state_lock); | ||
814 | |||
815 | if (journal_write_commit_record(journal, commit_transaction)) | ||
816 | err = -EIO; | ||
817 | |||
818 | if (err) | ||
819 | journal_abort(journal, err); | ||
820 | |||
821 | /* End of a transaction! Finally, we can do checkpoint | ||
822 | processing: any buffers committed as a result of this | ||
823 | transaction can be removed from any checkpoint list it was on | ||
824 | before. */ | ||
825 | |||
826 | jbd_debug(3, "JBD: commit phase 7\n"); | ||
827 | |||
828 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | ||
829 | J_ASSERT(commit_transaction->t_buffers == NULL); | ||
830 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | ||
831 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | ||
832 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | ||
833 | J_ASSERT(commit_transaction->t_log_list == NULL); | ||
834 | |||
835 | restart_loop: | ||
836 | /* | ||
837 | * As there are other places (journal_unmap_buffer()) adding buffers | ||
838 | * to this list we have to be careful and hold the j_list_lock. | ||
839 | */ | ||
840 | spin_lock(&journal->j_list_lock); | ||
841 | while (commit_transaction->t_forget) { | ||
842 | transaction_t *cp_transaction; | ||
843 | struct buffer_head *bh; | ||
844 | int try_to_free = 0; | ||
845 | |||
846 | jh = commit_transaction->t_forget; | ||
847 | spin_unlock(&journal->j_list_lock); | ||
848 | bh = jh2bh(jh); | ||
849 | /* | ||
850 | * Get a reference so that bh cannot be freed before we are | ||
851 | * done with it. | ||
852 | */ | ||
853 | get_bh(bh); | ||
854 | jbd_lock_bh_state(bh); | ||
855 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || | ||
856 | jh->b_transaction == journal->j_running_transaction); | ||
857 | |||
858 | /* | ||
859 | * If there is undo-protected committed data against | ||
860 | * this buffer, then we can remove it now. If it is a | ||
861 | * buffer needing such protection, the old frozen_data | ||
862 | * field now points to a committed version of the | ||
863 | * buffer, so rotate that field to the new committed | ||
864 | * data. | ||
865 | * | ||
866 | * Otherwise, we can just throw away the frozen data now. | ||
867 | */ | ||
868 | if (jh->b_committed_data) { | ||
869 | jbd_free(jh->b_committed_data, bh->b_size); | ||
870 | jh->b_committed_data = NULL; | ||
871 | if (jh->b_frozen_data) { | ||
872 | jh->b_committed_data = jh->b_frozen_data; | ||
873 | jh->b_frozen_data = NULL; | ||
874 | } | ||
875 | } else if (jh->b_frozen_data) { | ||
876 | jbd_free(jh->b_frozen_data, bh->b_size); | ||
877 | jh->b_frozen_data = NULL; | ||
878 | } | ||
879 | |||
880 | spin_lock(&journal->j_list_lock); | ||
881 | cp_transaction = jh->b_cp_transaction; | ||
882 | if (cp_transaction) { | ||
883 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | ||
884 | __journal_remove_checkpoint(jh); | ||
885 | } | ||
886 | |||
887 | /* Only re-checkpoint the buffer_head if it is marked | ||
888 | * dirty. If the buffer was added to the BJ_Forget list | ||
889 | * by journal_forget, it may no longer be dirty and | ||
890 | * there's no point in keeping a checkpoint record for | ||
891 | * it. */ | ||
892 | |||
893 | /* | ||
894 | * A buffer which has been freed while still being journaled by | ||
895 | * a previous transaction. | ||
896 | */ | ||
897 | if (buffer_freed(bh)) { | ||
898 | /* | ||
899 | * If the running transaction is the one containing | ||
900 | * "add to orphan" operation (b_next_transaction != | ||
901 | * NULL), we have to wait for that transaction to | ||
902 | * commit before we can really get rid of the buffer. | ||
903 | * So just clear b_modified to not confuse transaction | ||
904 | * credit accounting and refile the buffer to | ||
905 | * BJ_Forget of the running transaction. If the just | ||
906 | * committed transaction contains "add to orphan" | ||
907 | * operation, we can completely invalidate the buffer | ||
908 | 			 * now. We are rather thorough in that since the | ||
909 | 			 * buffer may still be accessible when blocksize < | ||
910 | * pagesize and it is attached to the last partial | ||
911 | * page. | ||
912 | */ | ||
913 | jh->b_modified = 0; | ||
914 | if (!jh->b_next_transaction) { | ||
915 | clear_buffer_freed(bh); | ||
916 | clear_buffer_jbddirty(bh); | ||
917 | clear_buffer_mapped(bh); | ||
918 | clear_buffer_new(bh); | ||
919 | clear_buffer_req(bh); | ||
920 | bh->b_bdev = NULL; | ||
921 | } | ||
922 | } | ||
923 | |||
924 | if (buffer_jbddirty(bh)) { | ||
925 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | ||
926 | __journal_insert_checkpoint(jh, commit_transaction); | ||
927 | if (is_journal_aborted(journal)) | ||
928 | clear_buffer_jbddirty(bh); | ||
929 | } else { | ||
930 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | ||
931 | /* | ||
932 | 			 * A buffer on the BJ_Forget list that is not jbddirty means | ||
933 | * it has been freed by this transaction and hence it | ||
934 | * could not have been reallocated until this | ||
935 | * transaction has committed. *BUT* it could be | ||
936 | * reallocated once we have written all the data to | ||
937 | * disk and before we process the buffer on BJ_Forget | ||
938 | * list. | ||
939 | */ | ||
940 | if (!jh->b_next_transaction) | ||
941 | try_to_free = 1; | ||
942 | } | ||
943 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); | ||
944 | __journal_refile_buffer(jh); | ||
945 | jbd_unlock_bh_state(bh); | ||
946 | if (try_to_free) | ||
947 | release_buffer_page(bh); | ||
948 | else | ||
949 | __brelse(bh); | ||
950 | cond_resched_lock(&journal->j_list_lock); | ||
951 | } | ||
952 | spin_unlock(&journal->j_list_lock); | ||
953 | /* | ||
954 | * This is a bit sleazy. We use j_list_lock to protect transition | ||
955 | * of a transaction into T_FINISHED state and calling | ||
956 | * __journal_drop_transaction(). Otherwise we could race with | ||
957 | * other checkpointing code processing the transaction... | ||
958 | */ | ||
959 | spin_lock(&journal->j_state_lock); | ||
960 | spin_lock(&journal->j_list_lock); | ||
961 | /* | ||
962 | * Now recheck if some buffers did not get attached to the transaction | ||
963 | * while the lock was dropped... | ||
964 | */ | ||
965 | if (commit_transaction->t_forget) { | ||
966 | spin_unlock(&journal->j_list_lock); | ||
967 | spin_unlock(&journal->j_state_lock); | ||
968 | goto restart_loop; | ||
969 | } | ||
970 | |||
971 | /* Done with this transaction! */ | ||
972 | |||
973 | jbd_debug(3, "JBD: commit phase 8\n"); | ||
974 | |||
975 | J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); | ||
976 | |||
977 | commit_transaction->t_state = T_FINISHED; | ||
978 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | ||
979 | journal->j_commit_sequence = commit_transaction->t_tid; | ||
980 | journal->j_committing_transaction = NULL; | ||
981 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | ||
982 | |||
983 | /* | ||
984 | * weight the commit time higher than the average time so we don't | ||
985 | * react too strongly to vast changes in commit time | ||
986 | */ | ||
987 | if (likely(journal->j_average_commit_time)) | ||
988 | journal->j_average_commit_time = (commit_time*3 + | ||
989 | journal->j_average_commit_time) / 4; | ||
990 | else | ||
991 | journal->j_average_commit_time = commit_time; | ||
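
As a worked example of the 3:1 weighting above (values in milliseconds for readability): if j_average_commit_time is currently 8 and the commit just measured took 16, the new average is (3*16 + 8) / 4 = 14, i.e. the latest sample carries three quarters of the weight.
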
992 | |||
993 | spin_unlock(&journal->j_state_lock); | ||
994 | |||
995 | if (commit_transaction->t_checkpoint_list == NULL && | ||
996 | commit_transaction->t_checkpoint_io_list == NULL) { | ||
997 | __journal_drop_transaction(journal, commit_transaction); | ||
998 | } else { | ||
999 | if (journal->j_checkpoint_transactions == NULL) { | ||
1000 | journal->j_checkpoint_transactions = commit_transaction; | ||
1001 | commit_transaction->t_cpnext = commit_transaction; | ||
1002 | commit_transaction->t_cpprev = commit_transaction; | ||
1003 | } else { | ||
1004 | commit_transaction->t_cpnext = | ||
1005 | journal->j_checkpoint_transactions; | ||
1006 | commit_transaction->t_cpprev = | ||
1007 | commit_transaction->t_cpnext->t_cpprev; | ||
1008 | commit_transaction->t_cpnext->t_cpprev = | ||
1009 | commit_transaction; | ||
1010 | commit_transaction->t_cpprev->t_cpnext = | ||
1011 | commit_transaction; | ||
1012 | } | ||
1013 | } | ||
1014 | spin_unlock(&journal->j_list_lock); | ||
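
The pointer manipulation above is a standard insertion into a circular doubly-linked list: the freshly committed transaction is linked in as the predecessor of the current head of j_checkpoint_transactions. A minimal sketch with hypothetical types:

	struct txn {
		struct txn *cpnext, *cpprev;
	};

	/* Insert t just before *head in the checkpoint ring (or create the ring). */
	static void checkpoint_ring_add(struct txn **head, struct txn *t)
	{
		if (*head == NULL) {
			*head = t;
			t->cpnext = t->cpprev = t;	/* singleton ring */
		} else {
			t->cpnext = *head;
			t->cpprev = (*head)->cpprev;
			t->cpnext->cpprev = t;
			t->cpprev->cpnext = t;
		}
	}
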
1015 | |||
1016 | trace_jbd_end_commit(journal, commit_transaction); | ||
1017 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | ||
1018 | journal->j_commit_sequence, journal->j_tail_sequence); | ||
1019 | |||
1020 | wake_up(&journal->j_wait_done_commit); | ||
1021 | } | ||
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c deleted file mode 100644 index c46a79adb6ad..000000000000 --- a/fs/jbd/journal.c +++ /dev/null | |||
@@ -1,2145 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/journal.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Generic filesystem journal-writing code; part of the ext2fs | ||
13 | * journaling system. | ||
14 | * | ||
15 | * This file manages journals: areas of disk reserved for logging | ||
16 | * transactional updates. This includes the kernel journaling thread | ||
17 | * which is responsible for scheduling updates to the log. | ||
18 | * | ||
19 | * We do not actually manage the physical storage of the journal in this | ||
20 | * file: that is left to a per-journal policy function, which allows us | ||
21 | * to store the journal within a filesystem-specified area for ext2 | ||
22 | * journaling (ext2 can use a reserved inode for storing the log). | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/jbd.h> | ||
29 | #include <linux/errno.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/freezer.h> | ||
34 | #include <linux/pagemap.h> | ||
35 | #include <linux/kthread.h> | ||
36 | #include <linux/poison.h> | ||
37 | #include <linux/proc_fs.h> | ||
38 | #include <linux/debugfs.h> | ||
39 | #include <linux/ratelimit.h> | ||
40 | |||
41 | #define CREATE_TRACE_POINTS | ||
42 | #include <trace/events/jbd.h> | ||
43 | |||
44 | #include <asm/uaccess.h> | ||
45 | #include <asm/page.h> | ||
46 | |||
47 | EXPORT_SYMBOL(journal_start); | ||
48 | EXPORT_SYMBOL(journal_restart); | ||
49 | EXPORT_SYMBOL(journal_extend); | ||
50 | EXPORT_SYMBOL(journal_stop); | ||
51 | EXPORT_SYMBOL(journal_lock_updates); | ||
52 | EXPORT_SYMBOL(journal_unlock_updates); | ||
53 | EXPORT_SYMBOL(journal_get_write_access); | ||
54 | EXPORT_SYMBOL(journal_get_create_access); | ||
55 | EXPORT_SYMBOL(journal_get_undo_access); | ||
56 | EXPORT_SYMBOL(journal_dirty_data); | ||
57 | EXPORT_SYMBOL(journal_dirty_metadata); | ||
58 | EXPORT_SYMBOL(journal_release_buffer); | ||
59 | EXPORT_SYMBOL(journal_forget); | ||
60 | #if 0 | ||
61 | EXPORT_SYMBOL(journal_sync_buffer); | ||
62 | #endif | ||
63 | EXPORT_SYMBOL(journal_flush); | ||
64 | EXPORT_SYMBOL(journal_revoke); | ||
65 | |||
66 | EXPORT_SYMBOL(journal_init_dev); | ||
67 | EXPORT_SYMBOL(journal_init_inode); | ||
68 | EXPORT_SYMBOL(journal_update_format); | ||
69 | EXPORT_SYMBOL(journal_check_used_features); | ||
70 | EXPORT_SYMBOL(journal_check_available_features); | ||
71 | EXPORT_SYMBOL(journal_set_features); | ||
72 | EXPORT_SYMBOL(journal_create); | ||
73 | EXPORT_SYMBOL(journal_load); | ||
74 | EXPORT_SYMBOL(journal_destroy); | ||
75 | EXPORT_SYMBOL(journal_abort); | ||
76 | EXPORT_SYMBOL(journal_errno); | ||
77 | EXPORT_SYMBOL(journal_ack_err); | ||
78 | EXPORT_SYMBOL(journal_clear_err); | ||
79 | EXPORT_SYMBOL(log_wait_commit); | ||
80 | EXPORT_SYMBOL(log_start_commit); | ||
81 | EXPORT_SYMBOL(journal_start_commit); | ||
82 | EXPORT_SYMBOL(journal_force_commit_nested); | ||
83 | EXPORT_SYMBOL(journal_wipe); | ||
84 | EXPORT_SYMBOL(journal_blocks_per_page); | ||
85 | EXPORT_SYMBOL(journal_invalidatepage); | ||
86 | EXPORT_SYMBOL(journal_try_to_free_buffers); | ||
87 | EXPORT_SYMBOL(journal_force_commit); | ||
88 | |||
89 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | ||
90 | static void __journal_abort_soft (journal_t *journal, int errno); | ||
91 | static const char *journal_dev_name(journal_t *journal, char *buffer); | ||
92 | |||
93 | #ifdef CONFIG_JBD_DEBUG | ||
94 | void __jbd_debug(int level, const char *file, const char *func, | ||
95 | unsigned int line, const char *fmt, ...) | ||
96 | { | ||
97 | struct va_format vaf; | ||
98 | va_list args; | ||
99 | |||
100 | if (level > journal_enable_debug) | ||
101 | return; | ||
102 | va_start(args, fmt); | ||
103 | vaf.fmt = fmt; | ||
104 | vaf.va = &args; | ||
105 | printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); | ||
106 | va_end(args); | ||
107 | } | ||
108 | EXPORT_SYMBOL(__jbd_debug); | ||
109 | #endif | ||
110 | |||
111 | /* | ||
112 | * Helper function used to manage commit timeouts | ||
113 | */ | ||
114 | |||
115 | static void commit_timeout(unsigned long __data) | ||
116 | { | ||
117 | struct task_struct * p = (struct task_struct *) __data; | ||
118 | |||
119 | wake_up_process(p); | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * kjournald: The main thread function used to manage a logging device | ||
124 | * journal. | ||
125 | * | ||
126 | * This kernel thread is responsible for two things: | ||
127 | * | ||
128 | * 1) COMMIT: Every so often we need to commit the current state of the | ||
129 | * filesystem to disk. The journal thread is responsible for writing | ||
130 | * all of the metadata buffers to disk. | ||
131 | * | ||
132 | * 2) CHECKPOINT: We cannot reuse a used section of the log file until all | ||
133 | * of the data in that part of the log has been rewritten elsewhere on | ||
134 | * the disk. Flushing these old buffers to reclaim space in the log is | ||
135 | * known as checkpointing, and this thread is responsible for that job. | ||
136 | */ | ||
137 | |||
138 | static int kjournald(void *arg) | ||
139 | { | ||
140 | journal_t *journal = arg; | ||
141 | transaction_t *transaction; | ||
142 | |||
143 | /* | ||
144 | * Set up an interval timer which can be used to trigger a commit wakeup | ||
145 | * after the commit interval expires | ||
146 | */ | ||
147 | setup_timer(&journal->j_commit_timer, commit_timeout, | ||
148 | (unsigned long)current); | ||
149 | |||
150 | set_freezable(); | ||
151 | |||
152 | /* Record that the journal thread is running */ | ||
153 | journal->j_task = current; | ||
154 | wake_up(&journal->j_wait_done_commit); | ||
155 | |||
156 | printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", | ||
157 | journal->j_commit_interval / HZ); | ||
158 | |||
159 | /* | ||
160 | * And now, wait forever for commit wakeup events. | ||
161 | */ | ||
162 | spin_lock(&journal->j_state_lock); | ||
163 | |||
164 | loop: | ||
165 | if (journal->j_flags & JFS_UNMOUNT) | ||
166 | goto end_loop; | ||
167 | |||
168 | jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", | ||
169 | journal->j_commit_sequence, journal->j_commit_request); | ||
170 | |||
171 | if (journal->j_commit_sequence != journal->j_commit_request) { | ||
172 | jbd_debug(1, "OK, requests differ\n"); | ||
173 | spin_unlock(&journal->j_state_lock); | ||
174 | del_timer_sync(&journal->j_commit_timer); | ||
175 | journal_commit_transaction(journal); | ||
176 | spin_lock(&journal->j_state_lock); | ||
177 | goto loop; | ||
178 | } | ||
179 | |||
180 | wake_up(&journal->j_wait_done_commit); | ||
181 | if (freezing(current)) { | ||
182 | /* | ||
183 | * The simpler the better. Flushing journal isn't a | ||
184 | * good idea, because that depends on threads that may | ||
185 | * be already stopped. | ||
186 | */ | ||
187 | jbd_debug(1, "Now suspending kjournald\n"); | ||
188 | spin_unlock(&journal->j_state_lock); | ||
189 | try_to_freeze(); | ||
190 | spin_lock(&journal->j_state_lock); | ||
191 | } else { | ||
192 | /* | ||
193 | * We assume on resume that commits are already there, | ||
194 | * so we don't sleep | ||
195 | */ | ||
196 | DEFINE_WAIT(wait); | ||
197 | int should_sleep = 1; | ||
198 | |||
199 | prepare_to_wait(&journal->j_wait_commit, &wait, | ||
200 | TASK_INTERRUPTIBLE); | ||
201 | if (journal->j_commit_sequence != journal->j_commit_request) | ||
202 | should_sleep = 0; | ||
203 | transaction = journal->j_running_transaction; | ||
204 | if (transaction && time_after_eq(jiffies, | ||
205 | transaction->t_expires)) | ||
206 | should_sleep = 0; | ||
207 | if (journal->j_flags & JFS_UNMOUNT) | ||
208 | should_sleep = 0; | ||
209 | if (should_sleep) { | ||
210 | spin_unlock(&journal->j_state_lock); | ||
211 | schedule(); | ||
212 | spin_lock(&journal->j_state_lock); | ||
213 | } | ||
214 | finish_wait(&journal->j_wait_commit, &wait); | ||
215 | } | ||
216 | |||
217 | jbd_debug(1, "kjournald wakes\n"); | ||
218 | |||
219 | /* | ||
220 | * Were we woken up by a commit wakeup event? | ||
221 | */ | ||
222 | transaction = journal->j_running_transaction; | ||
223 | if (transaction && time_after_eq(jiffies, transaction->t_expires)) { | ||
224 | journal->j_commit_request = transaction->t_tid; | ||
225 | jbd_debug(1, "woke because of timeout\n"); | ||
226 | } | ||
227 | goto loop; | ||
228 | |||
229 | end_loop: | ||
230 | spin_unlock(&journal->j_state_lock); | ||
231 | del_timer_sync(&journal->j_commit_timer); | ||
232 | journal->j_task = NULL; | ||
233 | wake_up(&journal->j_wait_done_commit); | ||
234 | jbd_debug(1, "Journal thread exiting.\n"); | ||
235 | return 0; | ||
236 | } | ||
237 | |||
238 | static int journal_start_thread(journal_t *journal) | ||
239 | { | ||
240 | struct task_struct *t; | ||
241 | |||
242 | t = kthread_run(kjournald, journal, "kjournald"); | ||
243 | if (IS_ERR(t)) | ||
244 | return PTR_ERR(t); | ||
245 | |||
246 | wait_event(journal->j_wait_done_commit, journal->j_task != NULL); | ||
247 | return 0; | ||
248 | } | ||
249 | |||
250 | static void journal_kill_thread(journal_t *journal) | ||
251 | { | ||
252 | spin_lock(&journal->j_state_lock); | ||
253 | journal->j_flags |= JFS_UNMOUNT; | ||
254 | |||
255 | while (journal->j_task) { | ||
256 | wake_up(&journal->j_wait_commit); | ||
257 | spin_unlock(&journal->j_state_lock); | ||
258 | wait_event(journal->j_wait_done_commit, | ||
259 | journal->j_task == NULL); | ||
260 | spin_lock(&journal->j_state_lock); | ||
261 | } | ||
262 | spin_unlock(&journal->j_state_lock); | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * journal_write_metadata_buffer: write a metadata buffer to the journal. | ||
267 | * | ||
268 | * Writes a metadata buffer to a given disk block. The actual IO is not | ||
269 | * performed but a new buffer_head is constructed which labels the data | ||
270 | * to be written with the correct destination disk block. | ||
271 | * | ||
272 | * Any magic-number escaping which needs to be done will cause a | ||
273 | * copy-out here. If the buffer happens to start with the | ||
274 | * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the | ||
275 |  * magic number is only written to the log for descriptor blocks.  In | ||
276 | * this case, we copy the data and replace the first word with 0, and we | ||
277 | * return a result code which indicates that this buffer needs to be | ||
278 | * marked as an escaped buffer in the corresponding log descriptor | ||
279 | * block. The missing word can then be restored when the block is read | ||
280 | * during recovery. | ||
281 | * | ||
282 | * If the source buffer has already been modified by a new transaction | ||
283 | * since we took the last commit snapshot, we use the frozen copy of | ||
284 | * that data for IO. If we end up using the existing buffer_head's data | ||
285 | * for the write, then we *have* to lock the buffer to prevent anyone | ||
286 | * else from using and possibly modifying it while the IO is in | ||
287 | * progress. | ||
288 | * | ||
289 | * The function returns a pointer to the buffer_heads to be used for IO. | ||
290 | * | ||
291 | * We assume that the journal has already been locked in this function. | ||
292 | * | ||
293 | * Return value: | ||
294 | * <0: Error | ||
295 | * >=0: Finished OK | ||
296 | * | ||
297 | * On success: | ||
298 | * Bit 0 set == escape performed on the data | ||
299 | * Bit 1 set == buffer copy-out performed (kfree the data after IO) | ||
300 | */ | ||
301 | |||
302 | int journal_write_metadata_buffer(transaction_t *transaction, | ||
303 | struct journal_head *jh_in, | ||
304 | struct journal_head **jh_out, | ||
305 | unsigned int blocknr) | ||
306 | { | ||
307 | int need_copy_out = 0; | ||
308 | int done_copy_out = 0; | ||
309 | int do_escape = 0; | ||
310 | char *mapped_data; | ||
311 | struct buffer_head *new_bh; | ||
312 | struct journal_head *new_jh; | ||
313 | struct page *new_page; | ||
314 | unsigned int new_offset; | ||
315 | struct buffer_head *bh_in = jh2bh(jh_in); | ||
316 | journal_t *journal = transaction->t_journal; | ||
317 | |||
318 | /* | ||
319 | * The buffer really shouldn't be locked: only the current committing | ||
320 | * transaction is allowed to write it, so nobody else is allowed | ||
321 | * to do any IO. | ||
322 | * | ||
323 | * akpm: except if we're journalling data, and write() output is | ||
324 | * also part of a shared mapping, and another thread has | ||
325 | * decided to launch a writepage() against this buffer. | ||
326 | */ | ||
327 | J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); | ||
328 | |||
329 | new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); | ||
330 | /* keep subsequent assertions sane */ | ||
331 | atomic_set(&new_bh->b_count, 1); | ||
332 | new_jh = journal_add_journal_head(new_bh); /* This sleeps */ | ||
333 | |||
334 | /* | ||
335 | * If a new transaction has already done a buffer copy-out, then | ||
336 | * we use that version of the data for the commit. | ||
337 | */ | ||
338 | jbd_lock_bh_state(bh_in); | ||
339 | repeat: | ||
340 | if (jh_in->b_frozen_data) { | ||
341 | done_copy_out = 1; | ||
342 | new_page = virt_to_page(jh_in->b_frozen_data); | ||
343 | new_offset = offset_in_page(jh_in->b_frozen_data); | ||
344 | } else { | ||
345 | new_page = jh2bh(jh_in)->b_page; | ||
346 | new_offset = offset_in_page(jh2bh(jh_in)->b_data); | ||
347 | } | ||
348 | |||
349 | mapped_data = kmap_atomic(new_page); | ||
350 | /* | ||
351 | * Check for escaping | ||
352 | */ | ||
353 | if (*((__be32 *)(mapped_data + new_offset)) == | ||
354 | cpu_to_be32(JFS_MAGIC_NUMBER)) { | ||
355 | need_copy_out = 1; | ||
356 | do_escape = 1; | ||
357 | } | ||
358 | kunmap_atomic(mapped_data); | ||
359 | |||
360 | /* | ||
361 | * Do we need to do a data copy? | ||
362 | */ | ||
363 | if (need_copy_out && !done_copy_out) { | ||
364 | char *tmp; | ||
365 | |||
366 | jbd_unlock_bh_state(bh_in); | ||
367 | tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); | ||
368 | jbd_lock_bh_state(bh_in); | ||
369 | if (jh_in->b_frozen_data) { | ||
370 | jbd_free(tmp, bh_in->b_size); | ||
371 | goto repeat; | ||
372 | } | ||
373 | |||
374 | jh_in->b_frozen_data = tmp; | ||
375 | mapped_data = kmap_atomic(new_page); | ||
376 | memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); | ||
377 | kunmap_atomic(mapped_data); | ||
378 | |||
379 | new_page = virt_to_page(tmp); | ||
380 | new_offset = offset_in_page(tmp); | ||
381 | done_copy_out = 1; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | 	 * Did we need to escape the block?  Now we've done all the | ||
386 | * copying, we can finally do so. | ||
387 | */ | ||
388 | if (do_escape) { | ||
389 | mapped_data = kmap_atomic(new_page); | ||
390 | *((unsigned int *)(mapped_data + new_offset)) = 0; | ||
391 | kunmap_atomic(mapped_data); | ||
392 | } | ||
393 | |||
394 | set_bh_page(new_bh, new_page, new_offset); | ||
395 | new_jh->b_transaction = NULL; | ||
396 | new_bh->b_size = jh2bh(jh_in)->b_size; | ||
397 | new_bh->b_bdev = transaction->t_journal->j_dev; | ||
398 | new_bh->b_blocknr = blocknr; | ||
399 | set_buffer_mapped(new_bh); | ||
400 | set_buffer_dirty(new_bh); | ||
401 | |||
402 | *jh_out = new_jh; | ||
403 | |||
404 | /* | ||
405 | * The to-be-written buffer needs to get moved to the io queue, | ||
406 | * and the original buffer whose contents we are shadowing or | ||
407 | * copying is moved to the transaction's shadow queue. | ||
408 | */ | ||
409 | JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); | ||
410 | spin_lock(&journal->j_list_lock); | ||
411 | __journal_file_buffer(jh_in, transaction, BJ_Shadow); | ||
412 | spin_unlock(&journal->j_list_lock); | ||
413 | jbd_unlock_bh_state(bh_in); | ||
414 | |||
415 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); | ||
416 | journal_file_buffer(new_jh, transaction, BJ_IO); | ||
417 | |||
418 | return do_escape | (done_copy_out << 1); | ||
419 | } | ||
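
The escaping rule described in the comment above can be made concrete with a small userspace-style sketch (illustrative only; the constant matches the JBD magic, the helpers are hypothetical). A data block whose first 32 bits happen to equal JFS_MAGIC_NUMBER would look like a journal control block during recovery, so commit zeroes that word and flags the tag with JFS_FLAG_ESCAPE; recovery restores the word when it sees the flag:

	#include <stdint.h>
	#include <endian.h>

	#define JFS_MAGIC_NUMBER 0xc03b3998U

	/* Commit side: returns 1 if the block had to be escaped. */
	static int escape_for_log(uint32_t *block_start)
	{
		if (*block_start == htobe32(JFS_MAGIC_NUMBER)) {
			*block_start = 0;
			return 1;	/* caller sets JFS_FLAG_ESCAPE in the tag */
		}
		return 0;
	}

	/* Recovery side: undo the escape before writing the block home. */
	static void unescape_on_replay(uint32_t *block_start, int tag_had_escape_flag)
	{
		if (tag_had_escape_flag)
			*block_start = htobe32(JFS_MAGIC_NUMBER);
	}
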
420 | |||
421 | /* | ||
422 | * Allocation code for the journal file. Manage the space left in the | ||
423 | * journal, so that we can begin checkpointing when appropriate. | ||
424 | */ | ||
425 | |||
426 | /* | ||
427 | * __log_space_left: Return the number of free blocks left in the journal. | ||
428 | * | ||
429 | * Called with the journal already locked. | ||
430 | * | ||
431 | * Called under j_state_lock | ||
432 | */ | ||
433 | |||
434 | int __log_space_left(journal_t *journal) | ||
435 | { | ||
436 | int left = journal->j_free; | ||
437 | |||
438 | assert_spin_locked(&journal->j_state_lock); | ||
439 | |||
440 | /* | ||
441 | 	 * Be pessimistic here about the number of free blocks which | ||
442 | * might be required for log descriptor control blocks. | ||
443 | */ | ||
444 | |||
445 | #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ | ||
446 | |||
447 | left -= MIN_LOG_RESERVED_BLOCKS; | ||
448 | |||
449 | if (left <= 0) | ||
450 | return 0; | ||
451 | left -= (left >> 3); | ||
452 | return left; | ||
453 | } | ||
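
Worked example: with j_free = 1024 blocks, __log_space_left() reports 1024 - 32 = 992, minus a further 992 >> 3 = 124 blocks held back for descriptor and control blocks, i.e. 868 blocks of usable space.
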
454 | |||
455 | /* | ||
456 | * Called under j_state_lock. Returns true if a transaction commit was started. | ||
457 | */ | ||
458 | int __log_start_commit(journal_t *journal, tid_t target) | ||
459 | { | ||
460 | /* | ||
461 | * The only transaction we can possibly wait upon is the | ||
462 | * currently running transaction (if it exists). Otherwise, | ||
463 | * the target tid must be an old one. | ||
464 | */ | ||
465 | if (journal->j_commit_request != target && | ||
466 | journal->j_running_transaction && | ||
467 | journal->j_running_transaction->t_tid == target) { | ||
468 | /* | ||
469 | * We want a new commit: OK, mark the request and wakeup the | ||
470 | * commit thread. We do _not_ do the commit ourselves. | ||
471 | */ | ||
472 | |||
473 | journal->j_commit_request = target; | ||
474 | jbd_debug(1, "JBD: requesting commit %d/%d\n", | ||
475 | journal->j_commit_request, | ||
476 | journal->j_commit_sequence); | ||
477 | wake_up(&journal->j_wait_commit); | ||
478 | return 1; | ||
479 | } else if (!tid_geq(journal->j_commit_request, target)) | ||
480 | /* This should never happen, but if it does, preserve | ||
481 | the evidence before kjournald goes into a loop and | ||
482 | increments j_commit_sequence beyond all recognition. */ | ||
483 | WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", | ||
484 | journal->j_commit_request, journal->j_commit_sequence, | ||
485 | target, journal->j_running_transaction ? | ||
486 | journal->j_running_transaction->t_tid : 0); | ||
487 | return 0; | ||
488 | } | ||
489 | |||
490 | int log_start_commit(journal_t *journal, tid_t tid) | ||
491 | { | ||
492 | int ret; | ||
493 | |||
494 | spin_lock(&journal->j_state_lock); | ||
495 | ret = __log_start_commit(journal, tid); | ||
496 | spin_unlock(&journal->j_state_lock); | ||
497 | return ret; | ||
498 | } | ||
499 | |||
500 | /* | ||
501 | * Force and wait upon a commit if the calling process is not within | ||
502 |  * a transaction. This is used for forcing out undo-protected data which contains | ||
503 | * bitmaps, when the fs is running out of space. | ||
504 | * | ||
505 | * We can only force the running transaction if we don't have an active handle; | ||
506 | * otherwise, we will deadlock. | ||
507 | * | ||
508 | * Returns true if a transaction was started. | ||
509 | */ | ||
510 | int journal_force_commit_nested(journal_t *journal) | ||
511 | { | ||
512 | transaction_t *transaction = NULL; | ||
513 | tid_t tid; | ||
514 | |||
515 | spin_lock(&journal->j_state_lock); | ||
516 | if (journal->j_running_transaction && !current->journal_info) { | ||
517 | transaction = journal->j_running_transaction; | ||
518 | __log_start_commit(journal, transaction->t_tid); | ||
519 | } else if (journal->j_committing_transaction) | ||
520 | transaction = journal->j_committing_transaction; | ||
521 | |||
522 | if (!transaction) { | ||
523 | spin_unlock(&journal->j_state_lock); | ||
524 | return 0; /* Nothing to retry */ | ||
525 | } | ||
526 | |||
527 | tid = transaction->t_tid; | ||
528 | spin_unlock(&journal->j_state_lock); | ||
529 | log_wait_commit(journal, tid); | ||
530 | return 1; | ||
531 | } | ||
532 | |||
533 | /* | ||
534 | * Start a commit of the current running transaction (if any). Returns true | ||
535 | * if a transaction is going to be committed (or is currently already | ||
536 | * committing), and fills its tid in at *ptid | ||
537 | */ | ||
538 | int journal_start_commit(journal_t *journal, tid_t *ptid) | ||
539 | { | ||
540 | int ret = 0; | ||
541 | |||
542 | spin_lock(&journal->j_state_lock); | ||
543 | if (journal->j_running_transaction) { | ||
544 | tid_t tid = journal->j_running_transaction->t_tid; | ||
545 | |||
546 | __log_start_commit(journal, tid); | ||
547 | /* There's a running transaction and we've just made sure | ||
548 | 		 * its commit has been scheduled. */ | ||
549 | if (ptid) | ||
550 | *ptid = tid; | ||
551 | ret = 1; | ||
552 | } else if (journal->j_committing_transaction) { | ||
553 | /* | ||
554 | * If commit has been started, then we have to wait for | ||
555 | * completion of that transaction. | ||
556 | */ | ||
557 | if (ptid) | ||
558 | *ptid = journal->j_committing_transaction->t_tid; | ||
559 | ret = 1; | ||
560 | } | ||
561 | spin_unlock(&journal->j_state_lock); | ||
562 | return ret; | ||
563 | } | ||
564 | |||
565 | /* | ||
566 | * Wait for a specified commit to complete. | ||
567 | * The caller may not hold the journal lock. | ||
568 | */ | ||
569 | int log_wait_commit(journal_t *journal, tid_t tid) | ||
570 | { | ||
571 | int err = 0; | ||
572 | |||
573 | #ifdef CONFIG_JBD_DEBUG | ||
574 | spin_lock(&journal->j_state_lock); | ||
575 | if (!tid_geq(journal->j_commit_request, tid)) { | ||
576 | printk(KERN_ERR | ||
577 | "%s: error: j_commit_request=%d, tid=%d\n", | ||
578 | __func__, journal->j_commit_request, tid); | ||
579 | } | ||
580 | spin_unlock(&journal->j_state_lock); | ||
581 | #endif | ||
582 | spin_lock(&journal->j_state_lock); | ||
583 | /* | ||
584 | * Not running or committing trans? Must be already committed. This | ||
585 | * saves us from waiting for a *long* time when tid overflows. | ||
586 | */ | ||
587 | if (!((journal->j_running_transaction && | ||
588 | journal->j_running_transaction->t_tid == tid) || | ||
589 | (journal->j_committing_transaction && | ||
590 | journal->j_committing_transaction->t_tid == tid))) | ||
591 | goto out_unlock; | ||
592 | |||
593 | if (!tid_geq(journal->j_commit_waited, tid)) | ||
594 | journal->j_commit_waited = tid; | ||
595 | while (tid_gt(tid, journal->j_commit_sequence)) { | ||
596 | jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", | ||
597 | tid, journal->j_commit_sequence); | ||
598 | wake_up(&journal->j_wait_commit); | ||
599 | spin_unlock(&journal->j_state_lock); | ||
600 | wait_event(journal->j_wait_done_commit, | ||
601 | !tid_gt(tid, journal->j_commit_sequence)); | ||
602 | spin_lock(&journal->j_state_lock); | ||
603 | } | ||
604 | out_unlock: | ||
605 | spin_unlock(&journal->j_state_lock); | ||
606 | |||
607 | if (unlikely(is_journal_aborted(journal))) | ||
608 | err = -EIO; | ||
609 | return err; | ||
610 | } | ||
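
The tid_gt()/tid_geq() comparisons used throughout this function are wraparound-safe; the real helpers live in the jbd header, but their behaviour matters for the overflow comment above, so a sketch of the idea follows (ordering by signed difference of the 32-bit sequence numbers):

	typedef unsigned int tid_t;

	static inline int tid_gt(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);	/* wraps safely across tid overflow */
		return difference > 0;
	}

	static inline int tid_geq(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);
		return difference >= 0;
	}
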
611 | |||
612 | /* | ||
613 |  * Return 1 if a given transaction has not yet sent the barrier request | ||
614 |  * connected with a transaction commit. If 0 is returned, the transaction | ||
615 | * may or may not have sent the barrier. Used to avoid sending barrier | ||
616 | * twice in common cases. | ||
617 | */ | ||
618 | int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) | ||
619 | { | ||
620 | int ret = 0; | ||
621 | transaction_t *commit_trans; | ||
622 | |||
623 | if (!(journal->j_flags & JFS_BARRIER)) | ||
624 | return 0; | ||
625 | spin_lock(&journal->j_state_lock); | ||
626 | /* Transaction already committed? */ | ||
627 | if (tid_geq(journal->j_commit_sequence, tid)) | ||
628 | goto out; | ||
629 | /* | ||
630 | * Transaction is being committed and we already proceeded to | ||
631 | * writing commit record? | ||
632 | */ | ||
633 | commit_trans = journal->j_committing_transaction; | ||
634 | if (commit_trans && commit_trans->t_tid == tid && | ||
635 | commit_trans->t_state >= T_COMMIT_RECORD) | ||
636 | goto out; | ||
637 | ret = 1; | ||
638 | out: | ||
639 | spin_unlock(&journal->j_state_lock); | ||
640 | return ret; | ||
641 | } | ||
642 | EXPORT_SYMBOL(journal_trans_will_send_data_barrier); | ||
643 | |||
644 | /* | ||
645 | * Log buffer allocation routines: | ||
646 | */ | ||
647 | |||
648 | int journal_next_log_block(journal_t *journal, unsigned int *retp) | ||
649 | { | ||
650 | unsigned int blocknr; | ||
651 | |||
652 | spin_lock(&journal->j_state_lock); | ||
653 | J_ASSERT(journal->j_free > 1); | ||
654 | |||
655 | blocknr = journal->j_head; | ||
656 | journal->j_head++; | ||
657 | journal->j_free--; | ||
658 | if (journal->j_head == journal->j_last) | ||
659 | journal->j_head = journal->j_first; | ||
660 | spin_unlock(&journal->j_state_lock); | ||
661 | return journal_bmap(journal, blocknr, retp); | ||
662 | } | ||
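
Worked example of the wraparound: with j_first = 1, j_last = 32768 and j_head = 32767, the call hands out logical block 32767, decrements j_free, and since the incremented head now equals j_last it wraps j_head back to 1.
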
663 | |||
664 | /* | ||
665 | * Conversion of logical to physical block numbers for the journal | ||
666 | * | ||
667 | * On external journals the journal blocks are identity-mapped, so | ||
668 | * this is a no-op. If needed, we can use j_blk_offset - everything is | ||
669 | * ready. | ||
670 | */ | ||
671 | int journal_bmap(journal_t *journal, unsigned int blocknr, | ||
672 | unsigned int *retp) | ||
673 | { | ||
674 | int err = 0; | ||
675 | unsigned int ret; | ||
676 | |||
677 | if (journal->j_inode) { | ||
678 | ret = bmap(journal->j_inode, blocknr); | ||
679 | if (ret) | ||
680 | *retp = ret; | ||
681 | else { | ||
682 | char b[BDEVNAME_SIZE]; | ||
683 | |||
684 | printk(KERN_ALERT "%s: journal block not found " | ||
685 | "at offset %u on %s\n", | ||
686 | __func__, | ||
687 | blocknr, | ||
688 | bdevname(journal->j_dev, b)); | ||
689 | err = -EIO; | ||
690 | __journal_abort_soft(journal, err); | ||
691 | } | ||
692 | } else { | ||
693 | *retp = blocknr; /* +journal->j_blk_offset */ | ||
694 | } | ||
695 | return err; | ||
696 | } | ||
697 | |||
698 | /* | ||
699 | * We play buffer_head aliasing tricks to write data/metadata blocks to | ||
700 | * the journal without copying their contents, but for journal | ||
701 | * descriptor blocks we do need to generate bona fide buffers. | ||
702 | * | ||
703 | * After the caller of journal_get_descriptor_buffer() has finished modifying | ||
704 | * the buffer's contents they really should run flush_dcache_page(bh->b_page). | ||
705 | * But we don't bother doing that, so there will be coherency problems with | ||
706 | * mmaps of blockdevs which hold live JBD-controlled filesystems. | ||
707 | */ | ||
708 | struct journal_head *journal_get_descriptor_buffer(journal_t *journal) | ||
709 | { | ||
710 | struct buffer_head *bh; | ||
711 | unsigned int blocknr; | ||
712 | int err; | ||
713 | |||
714 | err = journal_next_log_block(journal, &blocknr); | ||
715 | |||
716 | if (err) | ||
717 | return NULL; | ||
718 | |||
719 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
720 | if (!bh) | ||
721 | return NULL; | ||
722 | lock_buffer(bh); | ||
723 | memset(bh->b_data, 0, journal->j_blocksize); | ||
724 | set_buffer_uptodate(bh); | ||
725 | unlock_buffer(bh); | ||
726 | BUFFER_TRACE(bh, "return this buffer"); | ||
727 | return journal_add_journal_head(bh); | ||
728 | } | ||
729 | |||
730 | /* | ||
731 | * Management for journal control blocks: functions to create and | ||
732 | * destroy journal_t structures, and to initialise and read existing | ||
733 | * journal blocks from disk. */ | ||
734 | |||
735 | /* First: create and setup a journal_t object in memory. We initialise | ||
736 | * very few fields yet: that has to wait until we have created the | ||
737 |  * journal structures from scratch, or loaded them from disk. */ | ||
738 | |||
739 | static journal_t * journal_init_common (void) | ||
740 | { | ||
741 | journal_t *journal; | ||
742 | int err; | ||
743 | |||
744 | journal = kzalloc(sizeof(*journal), GFP_KERNEL); | ||
745 | if (!journal) | ||
746 | goto fail; | ||
747 | |||
748 | init_waitqueue_head(&journal->j_wait_transaction_locked); | ||
749 | init_waitqueue_head(&journal->j_wait_logspace); | ||
750 | init_waitqueue_head(&journal->j_wait_done_commit); | ||
751 | init_waitqueue_head(&journal->j_wait_checkpoint); | ||
752 | init_waitqueue_head(&journal->j_wait_commit); | ||
753 | init_waitqueue_head(&journal->j_wait_updates); | ||
754 | mutex_init(&journal->j_checkpoint_mutex); | ||
755 | spin_lock_init(&journal->j_revoke_lock); | ||
756 | spin_lock_init(&journal->j_list_lock); | ||
757 | spin_lock_init(&journal->j_state_lock); | ||
758 | |||
759 | journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); | ||
760 | |||
761 | /* The journal is marked for error until we succeed with recovery! */ | ||
762 | journal->j_flags = JFS_ABORT; | ||
763 | |||
764 | /* Set up a default-sized revoke table for the new mount. */ | ||
765 | err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); | ||
766 | if (err) { | ||
767 | kfree(journal); | ||
768 | goto fail; | ||
769 | } | ||
770 | return journal; | ||
771 | fail: | ||
772 | return NULL; | ||
773 | } | ||
774 | |||
775 | /* journal_init_dev and journal_init_inode: | ||
776 | * | ||
777 |  * Create a journal structure and assign a fixed set of disk blocks to | ||
778 |  * it.  We don't actually touch those disk blocks yet, but we | ||
779 | * need to set up all of the mapping information to tell the journaling | ||
780 | * system where the journal blocks are. | ||
781 | * | ||
782 | */ | ||
783 | |||
784 | /** | ||
785 | * journal_t * journal_init_dev() - creates and initialises a journal structure | ||
786 | * @bdev: Block device on which to create the journal | ||
787 |  * @fs_dev: Device which holds the journalled filesystem for this journal. | ||
788 |  * @start: Block nr at which the journal starts. | ||
789 | * @len: Length of the journal in blocks. | ||
790 | * @blocksize: blocksize of journalling device | ||
791 | * | ||
792 | * Returns: a newly created journal_t * | ||
793 | * | ||
794 | * journal_init_dev creates a journal which maps a fixed contiguous | ||
795 | * range of blocks on an arbitrary block device. | ||
796 | * | ||
797 | */ | ||
798 | journal_t * journal_init_dev(struct block_device *bdev, | ||
799 | struct block_device *fs_dev, | ||
800 | int start, int len, int blocksize) | ||
801 | { | ||
802 | journal_t *journal = journal_init_common(); | ||
803 | struct buffer_head *bh; | ||
804 | int n; | ||
805 | |||
806 | if (!journal) | ||
807 | return NULL; | ||
808 | |||
809 | /* journal descriptor can store up to n blocks -bzzz */ | ||
810 | journal->j_blocksize = blocksize; | ||
811 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | ||
812 | journal->j_wbufsize = n; | ||
813 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | ||
814 | if (!journal->j_wbuf) { | ||
815 | printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", | ||
816 | __func__); | ||
817 | goto out_err; | ||
818 | } | ||
819 | journal->j_dev = bdev; | ||
820 | journal->j_fs_dev = fs_dev; | ||
821 | journal->j_blk_offset = start; | ||
822 | journal->j_maxlen = len; | ||
823 | |||
824 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); | ||
825 | if (!bh) { | ||
826 | printk(KERN_ERR | ||
827 | "%s: Cannot get buffer for journal superblock\n", | ||
828 | __func__); | ||
829 | goto out_err; | ||
830 | } | ||
831 | journal->j_sb_buffer = bh; | ||
832 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | ||
833 | |||
834 | return journal; | ||
835 | out_err: | ||
836 | kfree(journal->j_wbuf); | ||
837 | kfree(journal); | ||
838 | return NULL; | ||
839 | } | ||
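
For instance, with a 4096-byte block size and an 8-byte journal_block_tag_t this gives j_wbufsize = 512, so up to 512 buffer_heads can be batched in j_wbuf per descriptor round by the commit code.
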
840 | |||
841 | /** | ||
842 |  * journal_t * journal_init_inode () - creates a journal which maps to an inode. | ||
843 | * @inode: An inode to create the journal in | ||
844 | * | ||
845 | * journal_init_inode creates a journal which maps an on-disk inode as | ||
846 | * the journal. The inode must exist already, must support bmap() and | ||
847 | * must have all data blocks preallocated. | ||
848 | */ | ||
849 | journal_t * journal_init_inode (struct inode *inode) | ||
850 | { | ||
851 | struct buffer_head *bh; | ||
852 | journal_t *journal = journal_init_common(); | ||
853 | int err; | ||
854 | int n; | ||
855 | unsigned int blocknr; | ||
856 | |||
857 | if (!journal) | ||
858 | return NULL; | ||
859 | |||
860 | journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; | ||
861 | journal->j_inode = inode; | ||
862 | jbd_debug(1, | ||
863 | "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", | ||
864 | journal, inode->i_sb->s_id, inode->i_ino, | ||
865 | (long long) inode->i_size, | ||
866 | inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); | ||
867 | |||
868 | journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; | ||
869 | journal->j_blocksize = inode->i_sb->s_blocksize; | ||
870 | |||
871 | /* journal descriptor can store up to n blocks -bzzz */ | ||
872 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | ||
873 | journal->j_wbufsize = n; | ||
874 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | ||
875 | if (!journal->j_wbuf) { | ||
876 | printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", | ||
877 | __func__); | ||
878 | goto out_err; | ||
879 | } | ||
880 | |||
881 | err = journal_bmap(journal, 0, &blocknr); | ||
882 | /* If that failed, give up */ | ||
883 | if (err) { | ||
884 | printk(KERN_ERR "%s: Cannot locate journal superblock\n", | ||
885 | __func__); | ||
886 | goto out_err; | ||
887 | } | ||
888 | |||
889 | bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); | ||
890 | if (!bh) { | ||
891 | printk(KERN_ERR | ||
892 | "%s: Cannot get buffer for journal superblock\n", | ||
893 | __func__); | ||
894 | goto out_err; | ||
895 | } | ||
896 | journal->j_sb_buffer = bh; | ||
897 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | ||
898 | |||
899 | return journal; | ||
900 | out_err: | ||
901 | kfree(journal->j_wbuf); | ||
902 | kfree(journal); | ||
903 | return NULL; | ||
904 | } | ||
905 | |||
906 | /* | ||
907 | * If the journal init or create aborts, we need to mark the journal | ||
908 | * superblock as being NULL to prevent the journal destroy from writing | ||
909 | * back a bogus superblock. | ||
910 | */ | ||
911 | static void journal_fail_superblock (journal_t *journal) | ||
912 | { | ||
913 | struct buffer_head *bh = journal->j_sb_buffer; | ||
914 | brelse(bh); | ||
915 | journal->j_sb_buffer = NULL; | ||
916 | } | ||
917 | |||
918 | /* | ||
919 | * Given a journal_t structure, initialise the various fields for | ||
920 | * startup of a new journaling session. We use this both when creating | ||
921 | * a journal, and after recovering an old journal to reset it for | ||
922 | * subsequent use. | ||
923 | */ | ||
924 | |||
925 | static int journal_reset(journal_t *journal) | ||
926 | { | ||
927 | journal_superblock_t *sb = journal->j_superblock; | ||
928 | unsigned int first, last; | ||
929 | |||
930 | first = be32_to_cpu(sb->s_first); | ||
931 | last = be32_to_cpu(sb->s_maxlen); | ||
932 | if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { | ||
933 | printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", | ||
934 | first, last); | ||
935 | journal_fail_superblock(journal); | ||
936 | return -EINVAL; | ||
937 | } | ||
938 | |||
939 | journal->j_first = first; | ||
940 | journal->j_last = last; | ||
941 | |||
942 | journal->j_head = first; | ||
943 | journal->j_tail = first; | ||
944 | journal->j_free = last - first; | ||
945 | |||
946 | journal->j_tail_sequence = journal->j_transaction_sequence; | ||
947 | journal->j_commit_sequence = journal->j_transaction_sequence - 1; | ||
948 | journal->j_commit_request = journal->j_commit_sequence; | ||
949 | |||
950 | journal->j_max_transaction_buffers = journal->j_maxlen / 4; | ||
951 | |||
952 | /* | ||
953 | * As a special case, if the on-disk copy is already marked as needing | ||
954 | * no recovery (s_start == 0), then we can safely defer the superblock | ||
955 | * update until the next commit by setting JFS_FLUSHED. This avoids | ||
956 | * attempting a write to a potential-readonly device. | ||
957 | */ | ||
958 | if (sb->s_start == 0) { | ||
959 | jbd_debug(1,"JBD: Skipping superblock update on recovered sb " | ||
960 | "(start %u, seq %d, errno %d)\n", | ||
961 | journal->j_tail, journal->j_tail_sequence, | ||
962 | journal->j_errno); | ||
963 | journal->j_flags |= JFS_FLUSHED; | ||
964 | } else { | ||
965 | /* Lock here to make assertions happy... */ | ||
966 | mutex_lock(&journal->j_checkpoint_mutex); | ||
967 | /* | ||
968 | * Update log tail information. We use WRITE_FUA since new | ||
969 | * transaction will start reusing journal space and so we | ||
970 | * must make sure information about current log tail is on | ||
971 | * disk before that. | ||
972 | */ | ||
973 | journal_update_sb_log_tail(journal, | ||
974 | journal->j_tail_sequence, | ||
975 | journal->j_tail, | ||
976 | WRITE_FUA); | ||
977 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
978 | } | ||
979 | return journal_start_thread(journal); | ||
980 | } | ||
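
Concretely (illustrative numbers): for a journal with s_first = 1 and s_maxlen = 32768, journal_reset() sets j_first = 1 and j_last = 32768, leaves j_free = 32767 blocks, and, assuming j_maxlen is likewise 32768, caps a single transaction at j_maxlen / 4 = 8192 buffers.
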
981 | |||
982 | /** | ||
983 | * int journal_create() - Initialise the new journal file | ||
984 | * @journal: Journal to create. This structure must have been initialised | ||
985 | * | ||
986 | * Given a journal_t structure which tells us which disk blocks we can | ||
987 | * use, create a new journal superblock and initialise all of the | ||
988 | * journal fields from scratch. | ||
989 | **/ | ||
990 | int journal_create(journal_t *journal) | ||
991 | { | ||
992 | unsigned int blocknr; | ||
993 | struct buffer_head *bh; | ||
994 | journal_superblock_t *sb; | ||
995 | int i, err; | ||
996 | |||
997 | if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { | ||
998 | printk (KERN_ERR "Journal length (%d blocks) too short.\n", | ||
999 | journal->j_maxlen); | ||
1000 | journal_fail_superblock(journal); | ||
1001 | return -EINVAL; | ||
1002 | } | ||
1003 | |||
1004 | if (journal->j_inode == NULL) { | ||
1005 | /* | ||
1006 | * We don't know what block to start at! | ||
1007 | */ | ||
1008 | printk(KERN_EMERG | ||
1009 | "%s: creation of journal on external device!\n", | ||
1010 | __func__); | ||
1011 | BUG(); | ||
1012 | } | ||
1013 | |||
1014 | /* Zero out the entire journal on disk. We cannot afford to | ||
1015 | have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ | ||
1016 | jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); | ||
1017 | for (i = 0; i < journal->j_maxlen; i++) { | ||
1018 | err = journal_bmap(journal, i, &blocknr); | ||
1019 | if (err) | ||
1020 | return err; | ||
1021 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
1022 | if (unlikely(!bh)) | ||
1023 | return -ENOMEM; | ||
1024 | lock_buffer(bh); | ||
1025 | memset (bh->b_data, 0, journal->j_blocksize); | ||
1026 | BUFFER_TRACE(bh, "marking dirty"); | ||
1027 | mark_buffer_dirty(bh); | ||
1028 | BUFFER_TRACE(bh, "marking uptodate"); | ||
1029 | set_buffer_uptodate(bh); | ||
1030 | unlock_buffer(bh); | ||
1031 | __brelse(bh); | ||
1032 | } | ||
1033 | |||
1034 | sync_blockdev(journal->j_dev); | ||
1035 | jbd_debug(1, "JBD: journal cleared.\n"); | ||
1036 | |||
1037 | /* OK, fill in the initial static fields in the new superblock */ | ||
1038 | sb = journal->j_superblock; | ||
1039 | |||
1040 | sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
1041 | sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); | ||
1042 | |||
1043 | sb->s_blocksize = cpu_to_be32(journal->j_blocksize); | ||
1044 | sb->s_maxlen = cpu_to_be32(journal->j_maxlen); | ||
1045 | sb->s_first = cpu_to_be32(1); | ||
1046 | |||
1047 | journal->j_transaction_sequence = 1; | ||
1048 | |||
1049 | journal->j_flags &= ~JFS_ABORT; | ||
1050 | journal->j_format_version = 2; | ||
1051 | |||
1052 | return journal_reset(journal); | ||
1053 | } | ||
1054 | |||
1055 | static void journal_write_superblock(journal_t *journal, int write_op) | ||
1056 | { | ||
1057 | struct buffer_head *bh = journal->j_sb_buffer; | ||
1058 | int ret; | ||
1059 | |||
1060 | trace_journal_write_superblock(journal, write_op); | ||
1061 | if (!(journal->j_flags & JFS_BARRIER)) | ||
1062 | write_op &= ~(REQ_FUA | REQ_FLUSH); | ||
1063 | lock_buffer(bh); | ||
1064 | if (buffer_write_io_error(bh)) { | ||
1065 | char b[BDEVNAME_SIZE]; | ||
1066 | /* | ||
1067 | * Oh, dear. A previous attempt to write the journal | ||
1068 | * superblock failed. This could happen because the | ||
1069 | * USB device was yanked out. Or it could happen to | ||
1070 | * be a transient write error and maybe the block will | ||
1071 | * be remapped. Nothing we can do but to retry the | ||
1072 | * write and hope for the best. | ||
1073 | */ | ||
1074 | printk(KERN_ERR "JBD: previous I/O error detected " | ||
1075 | "for journal superblock update for %s.\n", | ||
1076 | journal_dev_name(journal, b)); | ||
1077 | clear_buffer_write_io_error(bh); | ||
1078 | set_buffer_uptodate(bh); | ||
1079 | } | ||
1080 | |||
1081 | get_bh(bh); | ||
1082 | bh->b_end_io = end_buffer_write_sync; | ||
1083 | ret = submit_bh(write_op, bh); | ||
1084 | wait_on_buffer(bh); | ||
1085 | if (buffer_write_io_error(bh)) { | ||
1086 | clear_buffer_write_io_error(bh); | ||
1087 | set_buffer_uptodate(bh); | ||
1088 | ret = -EIO; | ||
1089 | } | ||
1090 | if (ret) { | ||
1091 | char b[BDEVNAME_SIZE]; | ||
1092 | printk(KERN_ERR "JBD: Error %d detected " | ||
1093 | "when updating journal superblock for %s.\n", | ||
1094 | ret, journal_dev_name(journal, b)); | ||
1095 | } | ||
1096 | } | ||
1097 | |||
1098 | /** | ||
1099 | * journal_update_sb_log_tail() - Update log tail in journal sb on disk. | ||
1100 | * @journal: The journal to update. | ||
1101 | * @tail_tid: TID of the new transaction at the tail of the log | ||
1102 | * @tail_block: The first block of the transaction at the tail of the log | ||
1103 | * @write_op: With which operation should we write the journal sb | ||
1104 | * | ||
1105 | * Update a journal's superblock information about log tail and write it to | ||
1106 | * disk, waiting for the IO to complete. | ||
1107 | */ | ||
1108 | void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, | ||
1109 | unsigned int tail_block, int write_op) | ||
1110 | { | ||
1111 | journal_superblock_t *sb = journal->j_superblock; | ||
1112 | |||
1113 | BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); | ||
1114 | jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", | ||
1115 | tail_block, tail_tid); | ||
1116 | |||
1117 | sb->s_sequence = cpu_to_be32(tail_tid); | ||
1118 | sb->s_start = cpu_to_be32(tail_block); | ||
1119 | |||
1120 | journal_write_superblock(journal, write_op); | ||
1121 | |||
1122 | /* Log is no longer empty */ | ||
1123 | spin_lock(&journal->j_state_lock); | ||
1124 | WARN_ON(!sb->s_sequence); | ||
1125 | journal->j_flags &= ~JFS_FLUSHED; | ||
1126 | spin_unlock(&journal->j_state_lock); | ||
1127 | } | ||
1128 | |||
1129 | /** | ||
1130 | * mark_journal_empty() - Mark the on-disk journal as empty. | ||
1131 | * @journal: The journal to update. | ||
1132 | * | ||
1133 | * Update a journal's dynamic superblock fields to show that journal is empty. | ||
1134 | * Write updated superblock to disk waiting for IO to complete. | ||
1135 | */ | ||
1136 | static void mark_journal_empty(journal_t *journal) | ||
1137 | { | ||
1138 | journal_superblock_t *sb = journal->j_superblock; | ||
1139 | |||
1140 | BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); | ||
1141 | spin_lock(&journal->j_state_lock); | ||
1142 | /* Is it already empty? */ | ||
1143 | if (sb->s_start == 0) { | ||
1144 | spin_unlock(&journal->j_state_lock); | ||
1145 | return; | ||
1146 | } | ||
1147 | jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", | ||
1148 | journal->j_tail_sequence); | ||
1149 | |||
1150 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); | ||
1151 | sb->s_start = cpu_to_be32(0); | ||
1152 | spin_unlock(&journal->j_state_lock); | ||
1153 | |||
1154 | journal_write_superblock(journal, WRITE_FUA); | ||
1155 | |||
1156 | spin_lock(&journal->j_state_lock); | ||
1157 | /* Log is empty */ | ||
1158 | journal->j_flags |= JFS_FLUSHED; | ||
1159 | spin_unlock(&journal->j_state_lock); | ||
1160 | } | ||
1161 | |||
1162 | /** | ||
1163 | * journal_update_sb_errno() - Update error in the journal. | ||
1164 | * @journal: The journal to update. | ||
1165 | * | ||
1166 | * Update a journal's errno. Write updated superblock to disk waiting for IO | ||
1167 | * to complete. | ||
1168 | */ | ||
1169 | static void journal_update_sb_errno(journal_t *journal) | ||
1170 | { | ||
1171 | journal_superblock_t *sb = journal->j_superblock; | ||
1172 | |||
1173 | spin_lock(&journal->j_state_lock); | ||
1174 | jbd_debug(1, "JBD: updating superblock error (errno %d)\n", | ||
1175 | journal->j_errno); | ||
1176 | sb->s_errno = cpu_to_be32(journal->j_errno); | ||
1177 | spin_unlock(&journal->j_state_lock); | ||
1178 | |||
1179 | journal_write_superblock(journal, WRITE_SYNC); | ||
1180 | } | ||
1181 | |||
1182 | /* | ||
1183 | * Read the superblock for a given journal, performing initial | ||
1184 | * validation of the format. | ||
1185 | */ | ||
1186 | |||
1187 | static int journal_get_superblock(journal_t *journal) | ||
1188 | { | ||
1189 | struct buffer_head *bh; | ||
1190 | journal_superblock_t *sb; | ||
1191 | int err = -EIO; | ||
1192 | |||
1193 | bh = journal->j_sb_buffer; | ||
1194 | |||
1195 | J_ASSERT(bh != NULL); | ||
1196 | if (!buffer_uptodate(bh)) { | ||
1197 | ll_rw_block(READ, 1, &bh); | ||
1198 | wait_on_buffer(bh); | ||
1199 | if (!buffer_uptodate(bh)) { | ||
1200 | printk (KERN_ERR | ||
1201 | "JBD: IO error reading journal superblock\n"); | ||
1202 | goto out; | ||
1203 | } | ||
1204 | } | ||
1205 | |||
1206 | sb = journal->j_superblock; | ||
1207 | |||
1208 | err = -EINVAL; | ||
1209 | |||
1210 | if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || | ||
1211 | sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { | ||
1212 | printk(KERN_WARNING "JBD: no valid journal superblock found\n"); | ||
1213 | goto out; | ||
1214 | } | ||
1215 | |||
1216 | switch(be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1217 | case JFS_SUPERBLOCK_V1: | ||
1218 | journal->j_format_version = 1; | ||
1219 | break; | ||
1220 | case JFS_SUPERBLOCK_V2: | ||
1221 | journal->j_format_version = 2; | ||
1222 | break; | ||
1223 | default: | ||
1224 | printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); | ||
1225 | goto out; | ||
1226 | } | ||
1227 | |||
1228 | if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) | ||
1229 | journal->j_maxlen = be32_to_cpu(sb->s_maxlen); | ||
1230 | else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { | ||
1231 | printk (KERN_WARNING "JBD: journal file too short\n"); | ||
1232 | goto out; | ||
1233 | } | ||
1234 | |||
1235 | if (be32_to_cpu(sb->s_first) == 0 || | ||
1236 | be32_to_cpu(sb->s_first) >= journal->j_maxlen) { | ||
1237 | printk(KERN_WARNING | ||
1238 | "JBD: Invalid start block of journal: %u\n", | ||
1239 | be32_to_cpu(sb->s_first)); | ||
1240 | goto out; | ||
1241 | } | ||
1242 | |||
1243 | return 0; | ||
1244 | |||
1245 | out: | ||
1246 | journal_fail_superblock(journal); | ||
1247 | return err; | ||
1248 | } | ||
1249 | |||
1250 | /* | ||
1251 | * Load the on-disk journal superblock and read the key fields into the | ||
1252 | * journal_t. | ||
1253 | */ | ||
1254 | |||
1255 | static int load_superblock(journal_t *journal) | ||
1256 | { | ||
1257 | int err; | ||
1258 | journal_superblock_t *sb; | ||
1259 | |||
1260 | err = journal_get_superblock(journal); | ||
1261 | if (err) | ||
1262 | return err; | ||
1263 | |||
1264 | sb = journal->j_superblock; | ||
1265 | |||
1266 | journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); | ||
1267 | journal->j_tail = be32_to_cpu(sb->s_start); | ||
1268 | journal->j_first = be32_to_cpu(sb->s_first); | ||
1269 | journal->j_last = be32_to_cpu(sb->s_maxlen); | ||
1270 | journal->j_errno = be32_to_cpu(sb->s_errno); | ||
1271 | |||
1272 | return 0; | ||
1273 | } | ||
1274 | |||
1275 | |||
1276 | /** | ||
1277 | * int journal_load() - Read journal from disk. | ||
1278 | * @journal: Journal to act on. | ||
1279 | * | ||
1280 | * Given a journal_t structure which tells us which disk blocks contain | ||
1281 | * a journal, read the journal from disk to initialise the in-memory | ||
1282 | * structures. | ||
1283 | */ | ||
1284 | int journal_load(journal_t *journal) | ||
1285 | { | ||
1286 | int err; | ||
1287 | journal_superblock_t *sb; | ||
1288 | |||
1289 | err = load_superblock(journal); | ||
1290 | if (err) | ||
1291 | return err; | ||
1292 | |||
1293 | sb = journal->j_superblock; | ||
1294 | /* If this is a V2 superblock, then we have to check the | ||
1295 | * features flags on it. */ | ||
1296 | |||
1297 | if (journal->j_format_version >= 2) { | ||
1298 | if ((sb->s_feature_ro_compat & | ||
1299 | ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || | ||
1300 | (sb->s_feature_incompat & | ||
1301 | ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { | ||
1302 | printk (KERN_WARNING | ||
1303 | "JBD: Unrecognised features on journal\n"); | ||
1304 | return -EINVAL; | ||
1305 | } | ||
1306 | } | ||
1307 | |||
1308 | /* Let the recovery code check whether it needs to recover any | ||
1309 | * data from the journal. */ | ||
1310 | if (journal_recover(journal)) | ||
1311 | goto recovery_error; | ||
1312 | |||
1313 | /* OK, we've finished with the dynamic journal bits: | ||
1314 | * reinitialise the dynamic contents of the superblock in memory | ||
1315 | * and reset them on disk. */ | ||
1316 | if (journal_reset(journal)) | ||
1317 | goto recovery_error; | ||
1318 | |||
1319 | journal->j_flags &= ~JFS_ABORT; | ||
1320 | journal->j_flags |= JFS_LOADED; | ||
1321 | return 0; | ||
1322 | |||
1323 | recovery_error: | ||
1324 | printk (KERN_WARNING "JBD: recovery failed\n"); | ||
1325 | return -EIO; | ||
1326 | } | ||
1327 | |||
1328 | /** | ||
1329 | * void journal_destroy() - Release a journal_t structure. | ||
1330 | * @journal: Journal to act on. | ||
1331 | * | ||
1332 | * Release a journal_t structure once it is no longer in use by the | ||
1333 | * journaled object. | ||
1334 | * Return <0 if we couldn't clean up the journal. | ||
1335 | */ | ||
1336 | int journal_destroy(journal_t *journal) | ||
1337 | { | ||
1338 | int err = 0; | ||
1339 | |||
1340 | |||
1341 | /* Wait for the commit thread to wake up and die. */ | ||
1342 | journal_kill_thread(journal); | ||
1343 | |||
1344 | /* Force a final log commit */ | ||
1345 | if (journal->j_running_transaction) | ||
1346 | journal_commit_transaction(journal); | ||
1347 | |||
1348 | /* Force any old transactions to disk */ | ||
1349 | |||
1350 | /* We cannot race with anybody but must keep assertions happy */ | ||
1351 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1352 | /* Totally anal locking here... */ | ||
1353 | spin_lock(&journal->j_list_lock); | ||
1354 | while (journal->j_checkpoint_transactions != NULL) { | ||
1355 | spin_unlock(&journal->j_list_lock); | ||
1356 | log_do_checkpoint(journal); | ||
1357 | spin_lock(&journal->j_list_lock); | ||
1358 | } | ||
1359 | |||
1360 | J_ASSERT(journal->j_running_transaction == NULL); | ||
1361 | J_ASSERT(journal->j_committing_transaction == NULL); | ||
1362 | J_ASSERT(journal->j_checkpoint_transactions == NULL); | ||
1363 | spin_unlock(&journal->j_list_lock); | ||
1364 | |||
1365 | if (journal->j_sb_buffer) { | ||
1366 | if (!is_journal_aborted(journal)) { | ||
1367 | journal->j_tail_sequence = | ||
1368 | ++journal->j_transaction_sequence; | ||
1369 | mark_journal_empty(journal); | ||
1370 | } else | ||
1371 | err = -EIO; | ||
1372 | brelse(journal->j_sb_buffer); | ||
1373 | } | ||
1374 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1375 | |||
1376 | iput(journal->j_inode); | ||
1377 | if (journal->j_revoke) | ||
1378 | journal_destroy_revoke(journal); | ||
1379 | kfree(journal->j_wbuf); | ||
1380 | kfree(journal); | ||
1381 | |||
1382 | return err; | ||
1383 | } | ||
1384 | |||
1385 | |||
1386 | /** | ||
1387 | * int journal_check_used_features() - Check if features specified are used. | ||
1388 | * @journal: Journal to check. | ||
1389 | * @compat: bitmask of compatible features | ||
1390 | * @ro: bitmask of features that force read-only mount | ||
1391 | * @incompat: bitmask of incompatible features | ||
1392 | * | ||
1393 | * Check whether the journal uses all of a given set of | ||
1394 | * features. Return true (non-zero) if it does. | ||
1395 | **/ | ||
1396 | |||
1397 | int journal_check_used_features (journal_t *journal, unsigned long compat, | ||
1398 | unsigned long ro, unsigned long incompat) | ||
1399 | { | ||
1400 | journal_superblock_t *sb; | ||
1401 | |||
1402 | if (!compat && !ro && !incompat) | ||
1403 | return 1; | ||
1404 | if (journal->j_format_version == 1) | ||
1405 | return 0; | ||
1406 | |||
1407 | sb = journal->j_superblock; | ||
1408 | |||
1409 | if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && | ||
1410 | ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && | ||
1411 | ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) | ||
1412 | return 1; | ||
1413 | |||
1414 | return 0; | ||
1415 | } | ||
1416 | |||
1417 | /** | ||
1418 | * int journal_check_available_features() - Check feature set in journalling layer | ||
1419 | * @journal: Journal to check. | ||
1420 | * @compat: bitmask of compatible features | ||
1421 | * @ro: bitmask of features that force read-only mount | ||
1422 | * @incompat: bitmask of incompatible features | ||
1423 | * | ||
1424 | * Check whether the journaling code supports the use of | ||
1425 | * all of a given set of features on this journal. Return true | ||
1426 | * (non-zero) if it can. */ | ||
1427 | |||
1428 | int journal_check_available_features (journal_t *journal, unsigned long compat, | ||
1429 | unsigned long ro, unsigned long incompat) | ||
1430 | { | ||
1431 | if (!compat && !ro && !incompat) | ||
1432 | return 1; | ||
1433 | |||
1434 | /* We can support any known requested features iff the | ||
1435 | * superblock is in version 2. Otherwise we fail to support any | ||
1436 | * extended sb features. */ | ||
1437 | |||
1438 | if (journal->j_format_version != 2) | ||
1439 | return 0; | ||
1440 | |||
1441 | if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && | ||
1442 | (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && | ||
1443 | (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) | ||
1444 | return 1; | ||
1445 | |||
1446 | return 0; | ||
1447 | } | ||
1448 | |||
1449 | /** | ||
1450 | * int journal_set_features () - Mark a given journal feature in the superblock | ||
1451 | * @journal: Journal to act on. | ||
1452 | * @compat: bitmask of compatible features | ||
1453 | * @ro: bitmask of features that force read-only mount | ||
1454 | * @incompat: bitmask of incompatible features | ||
1455 | * | ||
1456 | * Mark a given journal feature as present on the | ||
1457 | * superblock. Returns true if the requested features could be set. | ||
1458 | * | ||
1459 | */ | ||
1460 | |||
1461 | int journal_set_features (journal_t *journal, unsigned long compat, | ||
1462 | unsigned long ro, unsigned long incompat) | ||
1463 | { | ||
1464 | journal_superblock_t *sb; | ||
1465 | |||
1466 | if (journal_check_used_features(journal, compat, ro, incompat)) | ||
1467 | return 1; | ||
1468 | |||
1469 | if (!journal_check_available_features(journal, compat, ro, incompat)) | ||
1470 | return 0; | ||
1471 | |||
1472 | jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", | ||
1473 | compat, ro, incompat); | ||
1474 | |||
1475 | sb = journal->j_superblock; | ||
1476 | |||
1477 | sb->s_feature_compat |= cpu_to_be32(compat); | ||
1478 | sb->s_feature_ro_compat |= cpu_to_be32(ro); | ||
1479 | sb->s_feature_incompat |= cpu_to_be32(incompat); | ||
1480 | |||
1481 | return 1; | ||
1482 | } | ||
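
[Editor's illustration, not part of the deleted file: a hedged sketch of how a client filesystem might turn on a journal feature with this helper. The JFS_FEATURE_INCOMPAT_REVOKE bit is assumed to be the revoke feature flag from <linux/jbd.h>; whether a given filesystem actually sets it this way is not shown by this patch.]

	/* Illustrative only: request the revoke feature.
	 * journal_set_features() returns 1 on success, 0 if the
	 * journalling layer cannot support the requested bits. */
	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
		printk(KERN_WARNING
		       "example: journal does not support the revoke feature\n");
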
1483 | |||
1484 | |||
1485 | /** | ||
1486 | * int journal_update_format () - Update on-disk journal structure. | ||
1487 | * @journal: Journal to act on. | ||
1488 | * | ||
1489 | * Given an initialised but unloaded journal struct, poke about in the | ||
1490 | * on-disk structure to update it to the most recent supported version. | ||
1491 | */ | ||
1492 | int journal_update_format (journal_t *journal) | ||
1493 | { | ||
1494 | journal_superblock_t *sb; | ||
1495 | int err; | ||
1496 | |||
1497 | err = journal_get_superblock(journal); | ||
1498 | if (err) | ||
1499 | return err; | ||
1500 | |||
1501 | sb = journal->j_superblock; | ||
1502 | |||
1503 | switch (be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1504 | case JFS_SUPERBLOCK_V2: | ||
1505 | return 0; | ||
1506 | case JFS_SUPERBLOCK_V1: | ||
1507 | return journal_convert_superblock_v1(journal, sb); | ||
1508 | default: | ||
1509 | break; | ||
1510 | } | ||
1511 | return -EINVAL; | ||
1512 | } | ||
1513 | |||
1514 | static int journal_convert_superblock_v1(journal_t *journal, | ||
1515 | journal_superblock_t *sb) | ||
1516 | { | ||
1517 | int offset, blocksize; | ||
1518 | struct buffer_head *bh; | ||
1519 | |||
1520 | printk(KERN_WARNING | ||
1521 | "JBD: Converting superblock from version 1 to 2.\n"); | ||
1522 | |||
1523 | /* Pre-initialise new fields to zero */ | ||
1524 | offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); | ||
1525 | blocksize = be32_to_cpu(sb->s_blocksize); | ||
1526 | memset(&sb->s_feature_compat, 0, blocksize-offset); | ||
1527 | |||
1528 | sb->s_nr_users = cpu_to_be32(1); | ||
1529 | sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); | ||
1530 | journal->j_format_version = 2; | ||
1531 | |||
1532 | bh = journal->j_sb_buffer; | ||
1533 | BUFFER_TRACE(bh, "marking dirty"); | ||
1534 | mark_buffer_dirty(bh); | ||
1535 | sync_dirty_buffer(bh); | ||
1536 | return 0; | ||
1537 | } | ||
1538 | |||
1539 | |||
1540 | /** | ||
1541 | * int journal_flush () - Flush journal | ||
1542 | * @journal: Journal to act on. | ||
1543 | * | ||
1544 | * Flush all data for a given journal to disk and empty the journal. | ||
1545 | * Filesystems can use this when remounting readonly to ensure that | ||
1546 | * recovery does not need to happen on remount. | ||
1547 | */ | ||
1548 | |||
1549 | int journal_flush(journal_t *journal) | ||
1550 | { | ||
1551 | int err = 0; | ||
1552 | transaction_t *transaction = NULL; | ||
1553 | |||
1554 | spin_lock(&journal->j_state_lock); | ||
1555 | |||
1556 | /* Force everything buffered to the log... */ | ||
1557 | if (journal->j_running_transaction) { | ||
1558 | transaction = journal->j_running_transaction; | ||
1559 | __log_start_commit(journal, transaction->t_tid); | ||
1560 | } else if (journal->j_committing_transaction) | ||
1561 | transaction = journal->j_committing_transaction; | ||
1562 | |||
1563 | /* Wait for the log commit to complete... */ | ||
1564 | if (transaction) { | ||
1565 | tid_t tid = transaction->t_tid; | ||
1566 | |||
1567 | spin_unlock(&journal->j_state_lock); | ||
1568 | log_wait_commit(journal, tid); | ||
1569 | } else { | ||
1570 | spin_unlock(&journal->j_state_lock); | ||
1571 | } | ||
1572 | |||
1573 | /* ...and flush everything in the log out to disk. */ | ||
1574 | spin_lock(&journal->j_list_lock); | ||
1575 | while (!err && journal->j_checkpoint_transactions != NULL) { | ||
1576 | spin_unlock(&journal->j_list_lock); | ||
1577 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1578 | err = log_do_checkpoint(journal); | ||
1579 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1580 | spin_lock(&journal->j_list_lock); | ||
1581 | } | ||
1582 | spin_unlock(&journal->j_list_lock); | ||
1583 | |||
1584 | if (is_journal_aborted(journal)) | ||
1585 | return -EIO; | ||
1586 | |||
1587 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1588 | cleanup_journal_tail(journal); | ||
1589 | |||
1590 | /* Finally, mark the journal as really needing no recovery. | ||
1591 | * This sets s_start==0 in the underlying superblock, which is | ||
1592 | * the magic code for a fully-recovered superblock. Any future | ||
1593 | * commits of data to the journal will restore the current | ||
1594 | * s_start value. */ | ||
1595 | mark_journal_empty(journal); | ||
1596 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1597 | spin_lock(&journal->j_state_lock); | ||
1598 | J_ASSERT(!journal->j_running_transaction); | ||
1599 | J_ASSERT(!journal->j_committing_transaction); | ||
1600 | J_ASSERT(!journal->j_checkpoint_transactions); | ||
1601 | J_ASSERT(journal->j_head == journal->j_tail); | ||
1602 | J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); | ||
1603 | spin_unlock(&journal->j_state_lock); | ||
1604 | return 0; | ||
1605 | } | ||
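
[Editor's illustration, not part of the deleted file: a sketch only, not the actual ext3 remount path. A filesystem remounting read-only could flush and empty the journal as below so that no recovery is needed on the next mount; <linux/jbd.h> is assumed to be included.]

	static int example_remount_readonly(journal_t *journal)
	{
		int err;

		/* Push all committed data out and leave the log empty. */
		err = journal_flush(journal);
		if (err < 0)
			printk(KERN_ERR "example: journal flush failed (%d)\n", err);
		return err;
	}
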
1606 | |||
1607 | /** | ||
1608 | * int journal_wipe() - Wipe journal contents | ||
1609 | * @journal: Journal to act on. | ||
1610 | * @write: flag (see below) | ||
1611 | * | ||
1612 | * Wipe out all of the contents of a journal, safely. This will produce | ||
1613 | * a warning if the journal contains any valid recovery information. | ||
1614 | * Must be called between journal_init_*() and journal_load(). | ||
1615 | * | ||
1616 | * If 'write' is non-zero, then we wipe out the journal on disk; otherwise | ||
1617 | * we merely suppress recovery. | ||
1618 | */ | ||
1619 | |||
1620 | int journal_wipe(journal_t *journal, int write) | ||
1621 | { | ||
1622 | int err = 0; | ||
1623 | |||
1624 | J_ASSERT (!(journal->j_flags & JFS_LOADED)); | ||
1625 | |||
1626 | err = load_superblock(journal); | ||
1627 | if (err) | ||
1628 | return err; | ||
1629 | |||
1630 | if (!journal->j_tail) | ||
1631 | goto no_recovery; | ||
1632 | |||
1633 | printk (KERN_WARNING "JBD: %s recovery information on journal\n", | ||
1634 | write ? "Clearing" : "Ignoring"); | ||
1635 | |||
1636 | err = journal_skip_recovery(journal); | ||
1637 | if (write) { | ||
1638 | /* Lock to make assertions happy... */ | ||
1639 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1640 | mark_journal_empty(journal); | ||
1641 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1642 | } | ||
1643 | |||
1644 | no_recovery: | ||
1645 | return err; | ||
1646 | } | ||
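
[Editor's illustration, not part of the deleted file: a hedged sketch of the call ordering the comment above requires, since journal_wipe() is only valid between journal_init_*() and journal_load(). The journal_init_inode() initialiser and the journal_inode variable are assumed here for illustration.]

	/* Illustrative only: discard stale recovery data, then load. */
	journal_t *journal = journal_init_inode(journal_inode);

	if (journal) {
		journal_wipe(journal, 1);	/* 1 => also clear the log on disk */
		if (journal_load(journal))
			printk(KERN_ERR "example: journal load failed\n");
	}
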
1647 | |||
1648 | /* | ||
1649 | * journal_dev_name: format a character string to describe the | ||
1650 | * device on which this journal resides. | ||
1651 | */ | ||
1652 | |||
1653 | static const char *journal_dev_name(journal_t *journal, char *buffer) | ||
1654 | { | ||
1655 | struct block_device *bdev; | ||
1656 | |||
1657 | if (journal->j_inode) | ||
1658 | bdev = journal->j_inode->i_sb->s_bdev; | ||
1659 | else | ||
1660 | bdev = journal->j_dev; | ||
1661 | |||
1662 | return bdevname(bdev, buffer); | ||
1663 | } | ||
1664 | |||
1665 | /* | ||
1666 | * Journal abort has very specific semantics, which are described | ||
1667 | * below with journal_abort(). | ||
1668 | * | ||
1669 | * Two internal functions, which provide abort to the jbd layer | ||
1670 | * itself, are here. | ||
1671 | */ | ||
1672 | |||
1673 | /* | ||
1674 | * Quick version for internal journal use (doesn't lock the journal). | ||
1675 | * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, | ||
1676 | * and don't attempt to make any other journal updates. | ||
1677 | */ | ||
1678 | static void __journal_abort_hard(journal_t *journal) | ||
1679 | { | ||
1680 | transaction_t *transaction; | ||
1681 | char b[BDEVNAME_SIZE]; | ||
1682 | |||
1683 | if (journal->j_flags & JFS_ABORT) | ||
1684 | return; | ||
1685 | |||
1686 | printk(KERN_ERR "Aborting journal on device %s.\n", | ||
1687 | journal_dev_name(journal, b)); | ||
1688 | |||
1689 | spin_lock(&journal->j_state_lock); | ||
1690 | journal->j_flags |= JFS_ABORT; | ||
1691 | transaction = journal->j_running_transaction; | ||
1692 | if (transaction) | ||
1693 | __log_start_commit(journal, transaction->t_tid); | ||
1694 | spin_unlock(&journal->j_state_lock); | ||
1695 | } | ||
1696 | |||
1697 | /* Soft abort: record the abort error status in the journal superblock, | ||
1698 | * but don't do any other IO. */ | ||
1699 | static void __journal_abort_soft (journal_t *journal, int errno) | ||
1700 | { | ||
1701 | if (journal->j_flags & JFS_ABORT) | ||
1702 | return; | ||
1703 | |||
1704 | if (!journal->j_errno) | ||
1705 | journal->j_errno = errno; | ||
1706 | |||
1707 | __journal_abort_hard(journal); | ||
1708 | |||
1709 | if (errno) | ||
1710 | journal_update_sb_errno(journal); | ||
1711 | } | ||
1712 | |||
1713 | /** | ||
1714 | * void journal_abort () - Shutdown the journal immediately. | ||
1715 | * @journal: the journal to shutdown. | ||
1716 | * @errno: an error number to record in the journal indicating | ||
1717 | * the reason for the shutdown. | ||
1718 | * | ||
1719 | * Perform a complete, immediate shutdown of the ENTIRE | ||
1720 | * journal (not of a single transaction). This operation cannot be | ||
1721 | * undone without closing and reopening the journal. | ||
1722 | * | ||
1723 | * The journal_abort function is intended to support higher level error | ||
1724 | * recovery mechanisms such as the ext2/ext3 remount-readonly error | ||
1725 | * mode. | ||
1726 | * | ||
1727 | * Journal abort has very specific semantics. Any existing dirty, | ||
1728 | * unjournaled buffers in the main filesystem will still be written to | ||
1729 | * disk by bdflush, but the journaling mechanism will be suspended | ||
1730 | * immediately and no further transaction commits will be honoured. | ||
1731 | * | ||
1732 | * Any dirty, journaled buffers will be written back to disk without | ||
1733 | * hitting the journal. Atomicity cannot be guaranteed on an aborted | ||
1734 | * filesystem, but we _do_ attempt to leave as much data as possible | ||
1735 | * behind for fsck to use for cleanup. | ||
1736 | * | ||
1737 | * Any attempt to get a new transaction handle on a journal which is in | ||
1738 | * ABORT state will just result in an -EROFS error return. A | ||
1739 | * journal_stop on an existing handle will return -EIO if we have | ||
1740 | * entered abort state during the update. | ||
1741 | * | ||
1742 | * Recursive transactions are not disturbed by journal abort until the | ||
1743 | * final journal_stop, which will receive the -EIO error. | ||
1744 | * | ||
1745 | * Finally, the journal_abort call allows the caller to supply an errno | ||
1746 | * which will be recorded (if possible) in the journal superblock. This | ||
1747 | * allows a client to record failure conditions in the middle of a | ||
1748 | * transaction without having to complete the transaction to record the | ||
1749 | * failure to disk. ext3_error, for example, now uses this | ||
1750 | * functionality. | ||
1751 | * | ||
1752 | * Errors which originate from within the journaling layer will NOT | ||
1753 | * supply an errno; a null errno implies that absolutely no further | ||
1754 | * writes are done to the journal (unless there are any already in | ||
1755 | * progress). | ||
1756 | * | ||
1757 | */ | ||
1758 | |||
1759 | void journal_abort(journal_t *journal, int errno) | ||
1760 | { | ||
1761 | __journal_abort_soft(journal, errno); | ||
1762 | } | ||
1763 | |||
1764 | /** | ||
1765 | * int journal_errno () - returns the journal's error state. | ||
1766 | * @journal: journal to examine. | ||
1767 | * | ||
1768 | * This is the errno number set with journal_abort(), the last | ||
1769 | * time the journal was mounted - if the journal was stopped | ||
1770 | * without calling abort this will be 0. | ||
1771 | * | ||
1772 | * If the journal has been aborted during this mount, -EROFS will | ||
1773 | * be returned. | ||
1774 | */ | ||
1775 | int journal_errno(journal_t *journal) | ||
1776 | { | ||
1777 | int err; | ||
1778 | |||
1779 | spin_lock(&journal->j_state_lock); | ||
1780 | if (journal->j_flags & JFS_ABORT) | ||
1781 | err = -EROFS; | ||
1782 | else | ||
1783 | err = journal->j_errno; | ||
1784 | spin_unlock(&journal->j_state_lock); | ||
1785 | return err; | ||
1786 | } | ||
1787 | |||
1788 | /** | ||
1789 | * int journal_clear_err () - clears the journal's error state | ||
1790 | * @journal: journal to act on. | ||
1791 | * | ||
1792 | * An error must be cleared or Acked to take a FS out of readonly | ||
1793 | * mode. | ||
1794 | */ | ||
1795 | int journal_clear_err(journal_t *journal) | ||
1796 | { | ||
1797 | int err = 0; | ||
1798 | |||
1799 | spin_lock(&journal->j_state_lock); | ||
1800 | if (journal->j_flags & JFS_ABORT) | ||
1801 | err = -EROFS; | ||
1802 | else | ||
1803 | journal->j_errno = 0; | ||
1804 | spin_unlock(&journal->j_state_lock); | ||
1805 | return err; | ||
1806 | } | ||
1807 | |||
1808 | /** | ||
1809 | * void journal_ack_err() - Ack journal err. | ||
1810 | * @journal: journal to act on. | ||
1811 | * | ||
1812 | * An error must be cleared or Acked to take a FS out of readonly | ||
1813 | * mode. | ||
1814 | */ | ||
1815 | void journal_ack_err(journal_t *journal) | ||
1816 | { | ||
1817 | spin_lock(&journal->j_state_lock); | ||
1818 | if (journal->j_errno) | ||
1819 | journal->j_flags |= JFS_ACK_ERR; | ||
1820 | spin_unlock(&journal->j_state_lock); | ||
1821 | } | ||
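
[Editor's illustration, not part of the deleted file: a hedged sketch (not the ext3 code) of how a client filesystem's error handler might tie together the journal_abort()/journal_errno()/journal_ack_err() trio above; <linux/jbd.h> is assumed.]

	static void example_handle_fs_error(journal_t *journal, int errval)
	{
		/* Record the failure and stop further commits. */
		journal_abort(journal, errval);

		/* Once aborted, journal_errno() reports -EROFS. */
		if (journal_errno(journal) == -EROFS)
			printk(KERN_CRIT
			       "example: journal aborted, remounting read-only\n");
	}
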
1822 | |||
1823 | int journal_blocks_per_page(struct inode *inode) | ||
1824 | { | ||
1825 | return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | ||
1826 | } | ||
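
[Editor's note: as a worked example, with 4 KiB pages (PAGE_CACHE_SHIFT == 12) and a 1 KiB filesystem block size (s_blocksize_bits == 10), journal_blocks_per_page() evaluates to 1 << 2 = 4 blocks per page.]
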
1827 | |||
1828 | /* | ||
1829 | * Journal_head storage management | ||
1830 | */ | ||
1831 | static struct kmem_cache *journal_head_cache; | ||
1832 | #ifdef CONFIG_JBD_DEBUG | ||
1833 | static atomic_t nr_journal_heads = ATOMIC_INIT(0); | ||
1834 | #endif | ||
1835 | |||
1836 | static int journal_init_journal_head_cache(void) | ||
1837 | { | ||
1838 | int retval; | ||
1839 | |||
1840 | J_ASSERT(journal_head_cache == NULL); | ||
1841 | journal_head_cache = kmem_cache_create("journal_head", | ||
1842 | sizeof(struct journal_head), | ||
1843 | 0, /* offset */ | ||
1844 | SLAB_TEMPORARY, /* flags */ | ||
1845 | NULL); /* ctor */ | ||
1846 | retval = 0; | ||
1847 | if (!journal_head_cache) { | ||
1848 | retval = -ENOMEM; | ||
1849 | printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); | ||
1850 | } | ||
1851 | return retval; | ||
1852 | } | ||
1853 | |||
1854 | static void journal_destroy_journal_head_cache(void) | ||
1855 | { | ||
1856 | if (journal_head_cache) { | ||
1857 | kmem_cache_destroy(journal_head_cache); | ||
1858 | journal_head_cache = NULL; | ||
1859 | } | ||
1860 | } | ||
1861 | |||
1862 | /* | ||
1863 | * journal_head splicing and dicing | ||
1864 | */ | ||
1865 | static struct journal_head *journal_alloc_journal_head(void) | ||
1866 | { | ||
1867 | struct journal_head *ret; | ||
1868 | |||
1869 | #ifdef CONFIG_JBD_DEBUG | ||
1870 | atomic_inc(&nr_journal_heads); | ||
1871 | #endif | ||
1872 | ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); | ||
1873 | if (ret == NULL) { | ||
1874 | jbd_debug(1, "out of memory for journal_head\n"); | ||
1875 | printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", | ||
1876 | __func__); | ||
1877 | |||
1878 | while (ret == NULL) { | ||
1879 | yield(); | ||
1880 | ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); | ||
1881 | } | ||
1882 | } | ||
1883 | return ret; | ||
1884 | } | ||
1885 | |||
1886 | static void journal_free_journal_head(struct journal_head *jh) | ||
1887 | { | ||
1888 | #ifdef CONFIG_JBD_DEBUG | ||
1889 | atomic_dec(&nr_journal_heads); | ||
1890 | memset(jh, JBD_POISON_FREE, sizeof(*jh)); | ||
1891 | #endif | ||
1892 | kmem_cache_free(journal_head_cache, jh); | ||
1893 | } | ||
1894 | |||
1895 | /* | ||
1896 | * A journal_head is attached to a buffer_head whenever JBD has an | ||
1897 | * interest in the buffer. | ||
1898 | * | ||
1899 | * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit | ||
1900 | * is set. This bit is tested in core kernel code where we need to take | ||
1901 | * JBD-specific actions. Testing the zeroness of ->b_private is not reliable | ||
1902 | * there. | ||
1903 | * | ||
1904 | * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. | ||
1905 | * | ||
1906 | * When a buffer has its BH_JBD bit set it is immune from being released by | ||
1907 | * core kernel code, mainly via ->b_count. | ||
1908 | * | ||
1909 | * A journal_head is detached from its buffer_head when the journal_head's | ||
1910 | * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint | ||
1911 | * transaction (b_cp_transaction) hold their references to b_jcount. | ||
1912 | * | ||
1913 | * Various places in the kernel want to attach a journal_head to a buffer_head | ||
1914 | * _before_ attaching the journal_head to a transaction. To protect the | ||
1915 | * journal_head in this situation, journal_add_journal_head elevates the | ||
1916 | * journal_head's b_jcount refcount by one. The caller must call | ||
1917 | * journal_put_journal_head() to undo this. | ||
1918 | * | ||
1919 | * So the typical usage would be: | ||
1920 | * | ||
1921 | * (Attach a journal_head if needed. Increments b_jcount) | ||
1922 | * struct journal_head *jh = journal_add_journal_head(bh); | ||
1923 | * ... | ||
1924 | * (Get another reference for transaction) | ||
1925 | * journal_grab_journal_head(bh); | ||
1926 | * jh->b_transaction = xxx; | ||
1927 | * (Put original reference) | ||
1928 | * journal_put_journal_head(jh); | ||
1929 | */ | ||
1930 | |||
1931 | /* | ||
1932 | * Give a buffer_head a journal_head. | ||
1933 | * | ||
1934 | * May sleep. | ||
1935 | */ | ||
1936 | struct journal_head *journal_add_journal_head(struct buffer_head *bh) | ||
1937 | { | ||
1938 | struct journal_head *jh; | ||
1939 | struct journal_head *new_jh = NULL; | ||
1940 | |||
1941 | repeat: | ||
1942 | if (!buffer_jbd(bh)) | ||
1943 | new_jh = journal_alloc_journal_head(); | ||
1944 | |||
1945 | jbd_lock_bh_journal_head(bh); | ||
1946 | if (buffer_jbd(bh)) { | ||
1947 | jh = bh2jh(bh); | ||
1948 | } else { | ||
1949 | J_ASSERT_BH(bh, | ||
1950 | (atomic_read(&bh->b_count) > 0) || | ||
1951 | (bh->b_page && bh->b_page->mapping)); | ||
1952 | |||
1953 | if (!new_jh) { | ||
1954 | jbd_unlock_bh_journal_head(bh); | ||
1955 | goto repeat; | ||
1956 | } | ||
1957 | |||
1958 | jh = new_jh; | ||
1959 | new_jh = NULL; /* We consumed it */ | ||
1960 | set_buffer_jbd(bh); | ||
1961 | bh->b_private = jh; | ||
1962 | jh->b_bh = bh; | ||
1963 | get_bh(bh); | ||
1964 | BUFFER_TRACE(bh, "added journal_head"); | ||
1965 | } | ||
1966 | jh->b_jcount++; | ||
1967 | jbd_unlock_bh_journal_head(bh); | ||
1968 | if (new_jh) | ||
1969 | journal_free_journal_head(new_jh); | ||
1970 | return bh->b_private; | ||
1971 | } | ||
1972 | |||
1973 | /* | ||
1974 | * Grab a ref against this buffer_head's journal_head. If it ended up not | ||
1975 | * having a journal_head, return NULL | ||
1976 | */ | ||
1977 | struct journal_head *journal_grab_journal_head(struct buffer_head *bh) | ||
1978 | { | ||
1979 | struct journal_head *jh = NULL; | ||
1980 | |||
1981 | jbd_lock_bh_journal_head(bh); | ||
1982 | if (buffer_jbd(bh)) { | ||
1983 | jh = bh2jh(bh); | ||
1984 | jh->b_jcount++; | ||
1985 | } | ||
1986 | jbd_unlock_bh_journal_head(bh); | ||
1987 | return jh; | ||
1988 | } | ||
1989 | |||
1990 | static void __journal_remove_journal_head(struct buffer_head *bh) | ||
1991 | { | ||
1992 | struct journal_head *jh = bh2jh(bh); | ||
1993 | |||
1994 | J_ASSERT_JH(jh, jh->b_jcount >= 0); | ||
1995 | J_ASSERT_JH(jh, jh->b_transaction == NULL); | ||
1996 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
1997 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); | ||
1998 | J_ASSERT_JH(jh, jh->b_jlist == BJ_None); | ||
1999 | J_ASSERT_BH(bh, buffer_jbd(bh)); | ||
2000 | J_ASSERT_BH(bh, jh2bh(jh) == bh); | ||
2001 | BUFFER_TRACE(bh, "remove journal_head"); | ||
2002 | if (jh->b_frozen_data) { | ||
2003 | printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); | ||
2004 | jbd_free(jh->b_frozen_data, bh->b_size); | ||
2005 | } | ||
2006 | if (jh->b_committed_data) { | ||
2007 | printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); | ||
2008 | jbd_free(jh->b_committed_data, bh->b_size); | ||
2009 | } | ||
2010 | bh->b_private = NULL; | ||
2011 | jh->b_bh = NULL; /* debug, really */ | ||
2012 | clear_buffer_jbd(bh); | ||
2013 | journal_free_journal_head(jh); | ||
2014 | } | ||
2015 | |||
2016 | /* | ||
2017 | * Drop a reference on the passed journal_head. If it fell to zero then | ||
2018 | * release the journal_head from the buffer_head. | ||
2019 | */ | ||
2020 | void journal_put_journal_head(struct journal_head *jh) | ||
2021 | { | ||
2022 | struct buffer_head *bh = jh2bh(jh); | ||
2023 | |||
2024 | jbd_lock_bh_journal_head(bh); | ||
2025 | J_ASSERT_JH(jh, jh->b_jcount > 0); | ||
2026 | --jh->b_jcount; | ||
2027 | if (!jh->b_jcount) { | ||
2028 | __journal_remove_journal_head(bh); | ||
2029 | jbd_unlock_bh_journal_head(bh); | ||
2030 | __brelse(bh); | ||
2031 | } else | ||
2032 | jbd_unlock_bh_journal_head(bh); | ||
2033 | } | ||
2034 | |||
2035 | /* | ||
2036 | * debugfs tunables | ||
2037 | */ | ||
2038 | #ifdef CONFIG_JBD_DEBUG | ||
2039 | |||
2040 | u8 journal_enable_debug __read_mostly; | ||
2041 | EXPORT_SYMBOL(journal_enable_debug); | ||
2042 | |||
2043 | static struct dentry *jbd_debugfs_dir; | ||
2044 | static struct dentry *jbd_debug; | ||
2045 | |||
2046 | static void __init jbd_create_debugfs_entry(void) | ||
2047 | { | ||
2048 | jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); | ||
2049 | if (jbd_debugfs_dir) | ||
2050 | jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, | ||
2051 | jbd_debugfs_dir, | ||
2052 | &journal_enable_debug); | ||
2053 | } | ||
2054 | |||
2055 | static void __exit jbd_remove_debugfs_entry(void) | ||
2056 | { | ||
2057 | debugfs_remove(jbd_debug); | ||
2058 | debugfs_remove(jbd_debugfs_dir); | ||
2059 | } | ||
2060 | |||
2061 | #else | ||
2062 | |||
2063 | static inline void jbd_create_debugfs_entry(void) | ||
2064 | { | ||
2065 | } | ||
2066 | |||
2067 | static inline void jbd_remove_debugfs_entry(void) | ||
2068 | { | ||
2069 | } | ||
2070 | |||
2071 | #endif | ||
2072 | |||
2073 | struct kmem_cache *jbd_handle_cache; | ||
2074 | |||
2075 | static int __init journal_init_handle_cache(void) | ||
2076 | { | ||
2077 | jbd_handle_cache = kmem_cache_create("journal_handle", | ||
2078 | sizeof(handle_t), | ||
2079 | 0, /* offset */ | ||
2080 | SLAB_TEMPORARY, /* flags */ | ||
2081 | NULL); /* ctor */ | ||
2082 | if (jbd_handle_cache == NULL) { | ||
2083 | printk(KERN_EMERG "JBD: failed to create handle cache\n"); | ||
2084 | return -ENOMEM; | ||
2085 | } | ||
2086 | return 0; | ||
2087 | } | ||
2088 | |||
2089 | static void journal_destroy_handle_cache(void) | ||
2090 | { | ||
2091 | if (jbd_handle_cache) | ||
2092 | kmem_cache_destroy(jbd_handle_cache); | ||
2093 | } | ||
2094 | |||
2095 | /* | ||
2096 | * Module startup and shutdown | ||
2097 | */ | ||
2098 | |||
2099 | static int __init journal_init_caches(void) | ||
2100 | { | ||
2101 | int ret; | ||
2102 | |||
2103 | ret = journal_init_revoke_caches(); | ||
2104 | if (ret == 0) | ||
2105 | ret = journal_init_journal_head_cache(); | ||
2106 | if (ret == 0) | ||
2107 | ret = journal_init_handle_cache(); | ||
2108 | return ret; | ||
2109 | } | ||
2110 | |||
2111 | static void journal_destroy_caches(void) | ||
2112 | { | ||
2113 | journal_destroy_revoke_caches(); | ||
2114 | journal_destroy_journal_head_cache(); | ||
2115 | journal_destroy_handle_cache(); | ||
2116 | } | ||
2117 | |||
2118 | static int __init journal_init(void) | ||
2119 | { | ||
2120 | int ret; | ||
2121 | |||
2122 | BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); | ||
2123 | |||
2124 | ret = journal_init_caches(); | ||
2125 | if (ret != 0) | ||
2126 | journal_destroy_caches(); | ||
2127 | jbd_create_debugfs_entry(); | ||
2128 | return ret; | ||
2129 | } | ||
2130 | |||
2131 | static void __exit journal_exit(void) | ||
2132 | { | ||
2133 | #ifdef CONFIG_JBD_DEBUG | ||
2134 | int n = atomic_read(&nr_journal_heads); | ||
2135 | if (n) | ||
2136 | printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); | ||
2137 | #endif | ||
2138 | jbd_remove_debugfs_entry(); | ||
2139 | journal_destroy_caches(); | ||
2140 | } | ||
2141 | |||
2142 | MODULE_LICENSE("GPL"); | ||
2143 | module_init(journal_init); | ||
2144 | module_exit(journal_exit); | ||
2145 | |||
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c deleted file mode 100644 index a748fe21465a..000000000000 --- a/fs/jbd/recovery.c +++ /dev/null | |||
@@ -1,594 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/recovery.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | ||
5 | * | ||
6 | * Copyright 1999-2000 Red Hat Software --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal recovery routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | */ | ||
15 | |||
16 | #ifndef __KERNEL__ | ||
17 | #include "jfs_user.h" | ||
18 | #else | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/jbd.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/blkdev.h> | ||
24 | #endif | ||
25 | |||
26 | /* | ||
27 | * Maintain information about the progress of the recovery job, so that | ||
28 | * the different passes can carry information between them. | ||
29 | */ | ||
30 | struct recovery_info | ||
31 | { | ||
32 | tid_t start_transaction; | ||
33 | tid_t end_transaction; | ||
34 | |||
35 | int nr_replays; | ||
36 | int nr_revokes; | ||
37 | int nr_revoke_hits; | ||
38 | }; | ||
39 | |||
40 | enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; | ||
41 | static int do_one_pass(journal_t *journal, | ||
42 | struct recovery_info *info, enum passtype pass); | ||
43 | static int scan_revoke_records(journal_t *, struct buffer_head *, | ||
44 | tid_t, struct recovery_info *); | ||
45 | |||
46 | #ifdef __KERNEL__ | ||
47 | |||
48 | /* Release readahead buffers after use */ | ||
49 | static void journal_brelse_array(struct buffer_head *b[], int n) | ||
50 | { | ||
51 | while (--n >= 0) | ||
52 | brelse (b[n]); | ||
53 | } | ||
54 | |||
55 | |||
56 | /* | ||
57 | * When reading from the journal, we are going through the block device | ||
58 | * layer directly and so there is no readahead being done for us. We | ||
59 | * need to implement any readahead ourselves if we want it to happen at | ||
60 | * all. Recovery is basically one long sequential read, so make sure we | ||
61 | * do the IO in reasonably large chunks. | ||
62 | * | ||
63 | * This is not so critical that we need to be enormously clever about | ||
64 | * the readahead size, though. 128K is a purely arbitrary, good-enough | ||
65 | * fixed value. | ||
66 | */ | ||
67 | |||
68 | #define MAXBUF 8 | ||
69 | static int do_readahead(journal_t *journal, unsigned int start) | ||
70 | { | ||
71 | int err; | ||
72 | unsigned int max, nbufs, next; | ||
73 | unsigned int blocknr; | ||
74 | struct buffer_head *bh; | ||
75 | |||
76 | struct buffer_head * bufs[MAXBUF]; | ||
77 | |||
78 | /* Do up to 128K of readahead */ | ||
79 | max = start + (128 * 1024 / journal->j_blocksize); | ||
80 | if (max > journal->j_maxlen) | ||
81 | max = journal->j_maxlen; | ||
82 | |||
83 | /* Do the readahead itself. We'll submit MAXBUF buffer_heads at | ||
84 | * a time to the block device IO layer. */ | ||
85 | |||
86 | nbufs = 0; | ||
87 | |||
88 | for (next = start; next < max; next++) { | ||
89 | err = journal_bmap(journal, next, &blocknr); | ||
90 | |||
91 | if (err) { | ||
92 | printk (KERN_ERR "JBD: bad block at offset %u\n", | ||
93 | next); | ||
94 | goto failed; | ||
95 | } | ||
96 | |||
97 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
98 | if (!bh) { | ||
99 | err = -ENOMEM; | ||
100 | goto failed; | ||
101 | } | ||
102 | |||
103 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) { | ||
104 | bufs[nbufs++] = bh; | ||
105 | if (nbufs == MAXBUF) { | ||
106 | ll_rw_block(READ, nbufs, bufs); | ||
107 | journal_brelse_array(bufs, nbufs); | ||
108 | nbufs = 0; | ||
109 | } | ||
110 | } else | ||
111 | brelse(bh); | ||
112 | } | ||
113 | |||
114 | if (nbufs) | ||
115 | ll_rw_block(READ, nbufs, bufs); | ||
116 | err = 0; | ||
117 | |||
118 | failed: | ||
119 | if (nbufs) | ||
120 | journal_brelse_array(bufs, nbufs); | ||
121 | return err; | ||
122 | } | ||
123 | |||
124 | #endif /* __KERNEL__ */ | ||
125 | |||
126 | |||
127 | /* | ||
128 | * Read a block from the journal | ||
129 | */ | ||
130 | |||
131 | static int jread(struct buffer_head **bhp, journal_t *journal, | ||
132 | unsigned int offset) | ||
133 | { | ||
134 | int err; | ||
135 | unsigned int blocknr; | ||
136 | struct buffer_head *bh; | ||
137 | |||
138 | *bhp = NULL; | ||
139 | |||
140 | if (offset >= journal->j_maxlen) { | ||
141 | printk(KERN_ERR "JBD: corrupted journal superblock\n"); | ||
142 | return -EIO; | ||
143 | } | ||
144 | |||
145 | err = journal_bmap(journal, offset, &blocknr); | ||
146 | |||
147 | if (err) { | ||
148 | printk (KERN_ERR "JBD: bad block at offset %u\n", | ||
149 | offset); | ||
150 | return err; | ||
151 | } | ||
152 | |||
153 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
154 | if (!bh) | ||
155 | return -ENOMEM; | ||
156 | |||
157 | if (!buffer_uptodate(bh)) { | ||
158 | /* If this is a brand new buffer, start readahead. | ||
159 | Otherwise, we assume we are already reading it. */ | ||
160 | if (!buffer_req(bh)) | ||
161 | do_readahead(journal, offset); | ||
162 | wait_on_buffer(bh); | ||
163 | } | ||
164 | |||
165 | if (!buffer_uptodate(bh)) { | ||
166 | printk (KERN_ERR "JBD: Failed to read block at offset %u\n", | ||
167 | offset); | ||
168 | brelse(bh); | ||
169 | return -EIO; | ||
170 | } | ||
171 | |||
172 | *bhp = bh; | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | |||
177 | /* | ||
178 | * Count the number of in-use tags in a journal descriptor block. | ||
179 | */ | ||
180 | |||
181 | static int count_tags(struct buffer_head *bh, int size) | ||
182 | { | ||
183 | char * tagp; | ||
184 | journal_block_tag_t * tag; | ||
185 | int nr = 0; | ||
186 | |||
187 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
188 | |||
189 | while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { | ||
190 | tag = (journal_block_tag_t *) tagp; | ||
191 | |||
192 | nr++; | ||
193 | tagp += sizeof(journal_block_tag_t); | ||
194 | if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) | ||
195 | tagp += 16; | ||
196 | |||
197 | if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) | ||
198 | break; | ||
199 | } | ||
200 | |||
201 | return nr; | ||
202 | } | ||
203 | |||
204 | |||
205 | /* Make sure we wrap around the log correctly! */ | ||
206 | #define wrap(journal, var) \ | ||
207 | do { \ | ||
208 | if (var >= (journal)->j_last) \ | ||
209 | var -= ((journal)->j_last - (journal)->j_first); \ | ||
210 | } while (0) | ||
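
[Editor's note: as a worked example of the wrap() macro, with j_first == 1 and j_last == 1024, a log block index that advances to 1024 wraps to 1024 - (1024 - 1) = 1, the first usable log block.]
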
211 | |||
212 | /** | ||
213 | * journal_recover - recovers an on-disk journal | ||
214 | * @journal: the journal to recover | ||
215 | * | ||
216 | * The primary function for recovering the log contents when mounting a | ||
217 | * journaled device. | ||
218 | * | ||
219 | * Recovery is done in three passes. In the first pass, we look for the | ||
220 | * end of the log. In the second, we assemble the list of revoke | ||
221 | * blocks. In the third and final pass, we replay any un-revoked blocks | ||
222 | * in the log. | ||
223 | */ | ||
224 | int journal_recover(journal_t *journal) | ||
225 | { | ||
226 | int err, err2; | ||
227 | journal_superblock_t * sb; | ||
228 | |||
229 | struct recovery_info info; | ||
230 | |||
231 | memset(&info, 0, sizeof(info)); | ||
232 | sb = journal->j_superblock; | ||
233 | |||
234 | /* | ||
235 | * The journal superblock's s_start field (the current log head) | ||
236 | * is always zero if, and only if, the journal was cleanly | ||
237 | * unmounted. | ||
238 | */ | ||
239 | |||
240 | if (!sb->s_start) { | ||
241 | jbd_debug(1, "No recovery required, last transaction %d\n", | ||
242 | be32_to_cpu(sb->s_sequence)); | ||
243 | journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; | ||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | err = do_one_pass(journal, &info, PASS_SCAN); | ||
248 | if (!err) | ||
249 | err = do_one_pass(journal, &info, PASS_REVOKE); | ||
250 | if (!err) | ||
251 | err = do_one_pass(journal, &info, PASS_REPLAY); | ||
252 | |||
253 | jbd_debug(1, "JBD: recovery, exit status %d, " | ||
254 | "recovered transactions %u to %u\n", | ||
255 | err, info.start_transaction, info.end_transaction); | ||
256 | jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", | ||
257 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); | ||
258 | |||
259 | /* Restart the log at the next transaction ID, thus invalidating | ||
260 | * any existing commit records in the log. */ | ||
261 | journal->j_transaction_sequence = ++info.end_transaction; | ||
262 | |||
263 | journal_clear_revoke(journal); | ||
264 | err2 = sync_blockdev(journal->j_fs_dev); | ||
265 | if (!err) | ||
266 | err = err2; | ||
267 | /* Flush disk caches to get replayed data on the permanent storage */ | ||
268 | if (journal->j_flags & JFS_BARRIER) { | ||
269 | err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); | ||
270 | if (!err) | ||
271 | err = err2; | ||
272 | } | ||
273 | |||
274 | return err; | ||
275 | } | ||
276 | |||
277 | /** | ||
278 | * journal_skip_recovery - Start journal and wipe existing records | ||
279 | * @journal: journal to startup | ||
280 | * | ||
281 | * Locate any valid recovery information from the journal and set up the | ||
282 | * journal structures in memory to ignore it (presumably because the | ||
283 | * caller has evidence that it is out of date). | ||
284 | * This function doesn't appear to be exported. | ||
285 | * | ||
286 | * We perform one pass over the journal to allow us to tell the user how | ||
287 | * much recovery information is being erased, and to let us initialise | ||
288 | * the journal transaction sequence numbers to the next unused ID. | ||
289 | */ | ||
290 | int journal_skip_recovery(journal_t *journal) | ||
291 | { | ||
292 | int err; | ||
293 | struct recovery_info info; | ||
294 | |||
295 | memset (&info, 0, sizeof(info)); | ||
296 | |||
297 | err = do_one_pass(journal, &info, PASS_SCAN); | ||
298 | |||
299 | if (err) { | ||
300 | printk(KERN_ERR "JBD: error %d scanning journal\n", err); | ||
301 | ++journal->j_transaction_sequence; | ||
302 | } else { | ||
303 | #ifdef CONFIG_JBD_DEBUG | ||
304 | int dropped = info.end_transaction - | ||
305 | be32_to_cpu(journal->j_superblock->s_sequence); | ||
306 | jbd_debug(1, | ||
307 | "JBD: ignoring %d transaction%s from the journal.\n", | ||
308 | dropped, (dropped == 1) ? "" : "s"); | ||
309 | #endif | ||
310 | journal->j_transaction_sequence = ++info.end_transaction; | ||
311 | } | ||
312 | |||
313 | journal->j_tail = 0; | ||
314 | return err; | ||
315 | } | ||
316 | |||
317 | static int do_one_pass(journal_t *journal, | ||
318 | struct recovery_info *info, enum passtype pass) | ||
319 | { | ||
320 | unsigned int first_commit_ID, next_commit_ID; | ||
321 | unsigned int next_log_block; | ||
322 | int err, success = 0; | ||
323 | journal_superblock_t * sb; | ||
324 | journal_header_t * tmp; | ||
325 | struct buffer_head * bh; | ||
326 | unsigned int sequence; | ||
327 | int blocktype; | ||
328 | |||
329 | /* | ||
330 | * First thing is to establish what we expect to find in the log | ||
331 | * (in terms of transaction IDs), and where (in terms of log | ||
332 | * block offsets): query the superblock. | ||
333 | */ | ||
334 | |||
335 | sb = journal->j_superblock; | ||
336 | next_commit_ID = be32_to_cpu(sb->s_sequence); | ||
337 | next_log_block = be32_to_cpu(sb->s_start); | ||
338 | |||
339 | first_commit_ID = next_commit_ID; | ||
340 | if (pass == PASS_SCAN) | ||
341 | info->start_transaction = first_commit_ID; | ||
342 | |||
343 | jbd_debug(1, "Starting recovery pass %d\n", pass); | ||
344 | |||
345 | /* | ||
346 | * Now we walk through the log, transaction by transaction, | ||
347 | * making sure that each transaction has a commit block in the | ||
348 | * expected place. Each complete transaction gets replayed back | ||
349 | * into the main filesystem. | ||
350 | */ | ||
351 | |||
352 | while (1) { | ||
353 | int flags; | ||
354 | char * tagp; | ||
355 | journal_block_tag_t * tag; | ||
356 | struct buffer_head * obh; | ||
357 | struct buffer_head * nbh; | ||
358 | |||
359 | cond_resched(); | ||
360 | |||
361 | /* If we already know where to stop the log traversal, | ||
362 | * check right now that we haven't gone past the end of | ||
363 | * the log. */ | ||
364 | |||
365 | if (pass != PASS_SCAN) | ||
366 | if (tid_geq(next_commit_ID, info->end_transaction)) | ||
367 | break; | ||
368 | |||
369 | jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", | ||
370 | next_commit_ID, next_log_block, journal->j_last); | ||
371 | |||
372 | /* Skip over each chunk of the transaction looking for | ||
373 | * either the next descriptor block or the final commit | ||
374 | * record. */ | ||
375 | |||
376 | jbd_debug(3, "JBD: checking block %u\n", next_log_block); | ||
377 | err = jread(&bh, journal, next_log_block); | ||
378 | if (err) | ||
379 | goto failed; | ||
380 | |||
381 | next_log_block++; | ||
382 | wrap(journal, next_log_block); | ||
383 | |||
384 | /* What kind of buffer is it? | ||
385 | * | ||
386 | * If it is a descriptor block, check that it has the | ||
387 | * expected sequence number. Otherwise, we're all done | ||
388 | * here. */ | ||
389 | |||
390 | tmp = (journal_header_t *)bh->b_data; | ||
391 | |||
392 | if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { | ||
393 | brelse(bh); | ||
394 | break; | ||
395 | } | ||
396 | |||
397 | blocktype = be32_to_cpu(tmp->h_blocktype); | ||
398 | sequence = be32_to_cpu(tmp->h_sequence); | ||
399 | jbd_debug(3, "Found magic %d, sequence %d\n", | ||
400 | blocktype, sequence); | ||
401 | |||
402 | if (sequence != next_commit_ID) { | ||
403 | brelse(bh); | ||
404 | break; | ||
405 | } | ||
406 | |||
407 | /* OK, we have a valid descriptor block which matches | ||
408 | * all of the sequence number checks. What are we going | ||
409 | * to do with it? That depends on the pass... */ | ||
410 | |||
411 | switch(blocktype) { | ||
412 | case JFS_DESCRIPTOR_BLOCK: | ||
413 | /* If it is a valid descriptor block, replay it | ||
414 | * in pass REPLAY; otherwise, just skip over the | ||
415 | * blocks it describes. */ | ||
416 | if (pass != PASS_REPLAY) { | ||
417 | next_log_block += | ||
418 | count_tags(bh, journal->j_blocksize); | ||
419 | wrap(journal, next_log_block); | ||
420 | brelse(bh); | ||
421 | continue; | ||
422 | } | ||
423 | |||
424 | /* A descriptor block: we can now write all of | ||
425 | * the data blocks. Yay, useful work is finally | ||
426 | * getting done here! */ | ||
427 | |||
428 | tagp = &bh->b_data[sizeof(journal_header_t)]; | ||
429 | while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) | ||
430 | <= journal->j_blocksize) { | ||
431 | unsigned int io_block; | ||
432 | |||
433 | tag = (journal_block_tag_t *) tagp; | ||
434 | flags = be32_to_cpu(tag->t_flags); | ||
435 | |||
436 | io_block = next_log_block++; | ||
437 | wrap(journal, next_log_block); | ||
438 | err = jread(&obh, journal, io_block); | ||
439 | if (err) { | ||
440 | /* Recover what we can, but | ||
441 | * report failure at the end. */ | ||
442 | success = err; | ||
443 | printk (KERN_ERR | ||
444 | "JBD: IO error %d recovering " | ||
445 | "block %u in log\n", | ||
446 | err, io_block); | ||
447 | } else { | ||
448 | unsigned int blocknr; | ||
449 | |||
450 | J_ASSERT(obh != NULL); | ||
451 | blocknr = be32_to_cpu(tag->t_blocknr); | ||
452 | |||
453 | /* If the block has been | ||
454 | * revoked, then we're all done | ||
455 | * here. */ | ||
456 | if (journal_test_revoke | ||
457 | (journal, blocknr, | ||
458 | next_commit_ID)) { | ||
459 | brelse(obh); | ||
460 | ++info->nr_revoke_hits; | ||
461 | goto skip_write; | ||
462 | } | ||
463 | |||
464 | /* Find a buffer for the new | ||
465 | * data being restored */ | ||
466 | nbh = __getblk(journal->j_fs_dev, | ||
467 | blocknr, | ||
468 | journal->j_blocksize); | ||
469 | if (nbh == NULL) { | ||
470 | printk(KERN_ERR | ||
471 | "JBD: Out of memory " | ||
472 | "during recovery.\n"); | ||
473 | err = -ENOMEM; | ||
474 | brelse(bh); | ||
475 | brelse(obh); | ||
476 | goto failed; | ||
477 | } | ||
478 | |||
479 | lock_buffer(nbh); | ||
480 | memcpy(nbh->b_data, obh->b_data, | ||
481 | journal->j_blocksize); | ||
482 | if (flags & JFS_FLAG_ESCAPE) { | ||
483 | *((__be32 *)nbh->b_data) = | ||
484 | cpu_to_be32(JFS_MAGIC_NUMBER); | ||
485 | } | ||
486 | |||
487 | BUFFER_TRACE(nbh, "marking dirty"); | ||
488 | set_buffer_uptodate(nbh); | ||
489 | mark_buffer_dirty(nbh); | ||
490 | BUFFER_TRACE(nbh, "marking uptodate"); | ||
491 | ++info->nr_replays; | ||
492 | /* ll_rw_block(WRITE, 1, &nbh); */ | ||
493 | unlock_buffer(nbh); | ||
494 | brelse(obh); | ||
495 | brelse(nbh); | ||
496 | } | ||
497 | |||
498 | skip_write: | ||
499 | tagp += sizeof(journal_block_tag_t); | ||
500 | if (!(flags & JFS_FLAG_SAME_UUID)) | ||
501 | tagp += 16; | ||
502 | |||
503 | if (flags & JFS_FLAG_LAST_TAG) | ||
504 | break; | ||
505 | } | ||
506 | |||
507 | brelse(bh); | ||
508 | continue; | ||
509 | |||
510 | case JFS_COMMIT_BLOCK: | ||
511 | /* Found an expected commit block: not much to | ||
512 | * do other than move on to the next sequence | ||
513 | * number. */ | ||
514 | brelse(bh); | ||
515 | next_commit_ID++; | ||
516 | continue; | ||
517 | |||
518 | case JFS_REVOKE_BLOCK: | ||
519 | /* If we aren't in the REVOKE pass, then we can | ||
520 | * just skip over this block. */ | ||
521 | if (pass != PASS_REVOKE) { | ||
522 | brelse(bh); | ||
523 | continue; | ||
524 | } | ||
525 | |||
526 | err = scan_revoke_records(journal, bh, | ||
527 | next_commit_ID, info); | ||
528 | brelse(bh); | ||
529 | if (err) | ||
530 | goto failed; | ||
531 | continue; | ||
532 | |||
533 | default: | ||
534 | jbd_debug(3, "Unrecognised magic %d, end of scan.\n", | ||
535 | blocktype); | ||
536 | brelse(bh); | ||
537 | goto done; | ||
538 | } | ||
539 | } | ||
540 | |||
541 | done: | ||
542 | /* | ||
543 | * We broke out of the log scan loop: either we came to the | ||
544 | * known end of the log or we found an unexpected block in the | ||
545 | * log. If the latter happened, then we know that the "current" | ||
546 | * transaction marks the end of the valid log. | ||
547 | */ | ||
548 | |||
549 | if (pass == PASS_SCAN) | ||
550 | info->end_transaction = next_commit_ID; | ||
551 | else { | ||
552 | /* It's really bad news if different passes end up at | ||
553 | * different places (but possible due to IO errors). */ | ||
554 | if (info->end_transaction != next_commit_ID) { | ||
555 | printk (KERN_ERR "JBD: recovery pass %d ended at " | ||
556 | "transaction %u, expected %u\n", | ||
557 | pass, next_commit_ID, info->end_transaction); | ||
558 | if (!success) | ||
559 | success = -EIO; | ||
560 | } | ||
561 | } | ||
562 | |||
563 | return success; | ||
564 | |||
565 | failed: | ||
566 | return err; | ||
567 | } | ||
568 | |||
569 | |||
570 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ | ||
571 | |||
572 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | ||
573 | tid_t sequence, struct recovery_info *info) | ||
574 | { | ||
575 | journal_revoke_header_t *header; | ||
576 | int offset, max; | ||
577 | |||
578 | header = (journal_revoke_header_t *) bh->b_data; | ||
579 | offset = sizeof(journal_revoke_header_t); | ||
580 | max = be32_to_cpu(header->r_count); | ||
581 | |||
582 | while (offset < max) { | ||
583 | unsigned int blocknr; | ||
584 | int err; | ||
585 | |||
586 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); | ||
587 | offset += 4; | ||
588 | err = journal_set_revoke(journal, blocknr, sequence); | ||
589 | if (err) | ||
590 | return err; | ||
591 | ++info->nr_revokes; | ||
592 | } | ||
593 | return 0; | ||
594 | } | ||
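For reference, a hedged sketch of the on-disk layout that scan_revoke_records() walks: a journal_revoke_header_t whose r_count field holds the number of bytes used in the block (header included), followed by packed big-endian block numbers. The helper name below is purely illustrative:

/* Illustrative only: how many revoked block numbers one revoke block holds. */
static inline int revoke_block_entry_count(journal_revoke_header_t *header)
{
	/* r_count is the byte offset of the end of the valid records,
	 * counting the header itself; each record is one __be32. */
	return (be32_to_cpu(header->r_count) -
		sizeof(journal_revoke_header_t)) / sizeof(__be32);
}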
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c deleted file mode 100644 index dcead636c33b..000000000000 --- a/fs/jbd/revoke.c +++ /dev/null | |||
@@ -1,733 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/revoke.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 | ||
5 | * | ||
6 | * Copyright 2000 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Journal revoke routines for the generic filesystem journaling code; | ||
13 | * part of the ext2fs journaling system. | ||
14 | * | ||
15 | * Revoke is the mechanism used to prevent old log records for deleted | ||
16 | * metadata from being replayed on top of newer data using the same | ||
17 | * blocks. The revoke mechanism is used in two separate places: | ||
18 | * | ||
19 | * + Commit: during commit we write the entire list of the current | ||
20 | * transaction's revoked blocks to the journal | ||
21 | * | ||
22 | * + Recovery: during recovery we record the transaction ID of all | ||
23 | * revoked blocks. If there are multiple revoke records in the log | ||
24 | * for a single block, only the last one counts, and if there is a log | ||
25 | * entry for a block beyond the last revoke, then that log entry still | ||
26 | * gets replayed. | ||
27 | * | ||
28 | * We can get interactions between revokes and new log data within a | ||
29 | * single transaction: | ||
30 | * | ||
31 | * Block is revoked and then journaled: | ||
32 | * The desired end result is the journaling of the new block, so we | ||
33 | * cancel the revoke before the transaction commits. | ||
34 | * | ||
35 | * Block is journaled and then revoked: | ||
36 | * The revoke must take precedence over the write of the block, so we | ||
37 | * need either to cancel the journal entry or to write the revoke | ||
38 | * later in the log than the log block. In this case, we choose the | ||
39 | * latter: journaling a block cancels any revoke record for that block | ||
40 | * in the current transaction, so any revoke for that block in the | ||
41 | * transaction must have happened after the block was journaled and so | ||
42 | * the revoke must take precedence. | ||
43 | * | ||
44 | * Block is revoked and then written as data: | ||
45 | * The data write is allowed to succeed, but the revoke is _not_ | ||
46 | * cancelled. We still need to prevent old log records from | ||
47 | * overwriting the new data. We don't even need to clear the revoke | ||
48 | * bit here. | ||
49 | * | ||
50 | * We cache revoke status of a buffer in the current transaction in b_states | ||
51 | * bits. As the name says, revokevalid flag indicates that the cached revoke | ||
52 | * status of a buffer is valid and we can rely on the cached status. | ||
53 | * | ||
54 | * Revoke information on buffers is a tri-state value: | ||
55 | * | ||
56 | * RevokeValid clear: no cached revoke status, need to look it up | ||
57 | * RevokeValid set, Revoked clear: | ||
58 | * buffer has not been revoked, and cancel_revoke | ||
59 | * need do nothing. | ||
60 | * RevokeValid set, Revoked set: | ||
61 | * buffer has been revoked. | ||
62 | * | ||
63 | * Locking rules: | ||
64 | * We keep two hash tables of revoke records. One hashtable belongs to the | ||
65 | * running transaction (is pointed to by journal->j_revoke), the other one | ||
66 | * belongs to the committing transaction. Accesses to the second hash table | ||
67 | * happen only from the kjournald and no other thread touches this table. Also | ||
68 | * journal_switch_revoke_table() which switches which hashtable belongs to the | ||
69 | * running and which to the committing transaction is called only from | ||
70 | * kjournald. Therefore we need no locks when accessing the hashtable belonging | ||
71 | * to the committing transaction. | ||
72 | * | ||
73 | * All users operating on the hash table belonging to the running transaction | ||
74 | * have a handle to the transaction. Therefore they are safe from kjournald | ||
75 | * switching hash tables under them. For operations on the lists of entries in | ||
76 | * the hash table j_revoke_lock is used. | ||
77 | * | ||
78 | * Finally, the replay code also uses the hash tables, but at that point no one | ||
79 | * else can touch them (the filesystem isn't mounted yet) and hence no locking | ||
80 | * is needed. | ||
81 | */ | ||
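In code terms, the tri-state above is consumed roughly as follows; this is an illustrative sketch of the decision that journal_cancel_revoke() makes further down, not a helper that exists in this file:

/* Sketch: does this buffer require a revoke-record lookup at all? */
static inline int example_need_revoke_lookup(struct buffer_head *bh)
{
	if (!buffer_revokevalid(bh))	/* no cached state: must search */
		return 1;
	if (buffer_revoked(bh))		/* cached "revoked": must cancel */
		return 1;
	return 0;			/* cached "not revoked": nothing to do */
}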
82 | |||
83 | #ifndef __KERNEL__ | ||
84 | #include "jfs_user.h" | ||
85 | #else | ||
86 | #include <linux/time.h> | ||
87 | #include <linux/fs.h> | ||
88 | #include <linux/jbd.h> | ||
89 | #include <linux/errno.h> | ||
90 | #include <linux/slab.h> | ||
91 | #include <linux/list.h> | ||
92 | #include <linux/init.h> | ||
93 | #include <linux/bio.h> | ||
94 | #endif | ||
95 | #include <linux/log2.h> | ||
96 | #include <linux/hash.h> | ||
97 | |||
98 | static struct kmem_cache *revoke_record_cache; | ||
99 | static struct kmem_cache *revoke_table_cache; | ||
100 | |||
101 | /* Each revoke record represents one single revoked block. During | ||
102 | journal replay, this involves recording the transaction ID of the | ||
103 | last transaction to revoke this block. */ | ||
104 | |||
105 | struct jbd_revoke_record_s | ||
106 | { | ||
107 | struct list_head hash; | ||
108 | tid_t sequence; /* Used for recovery only */ | ||
109 | unsigned int blocknr; | ||
110 | }; | ||
111 | |||
112 | |||
113 | /* The revoke table is just a simple hash table of revoke records. */ | ||
114 | struct jbd_revoke_table_s | ||
115 | { | ||
116 | /* It is conceivable that we might want a larger hash table | ||
117 | * for recovery. Must be a power of two. */ | ||
118 | int hash_size; | ||
119 | int hash_shift; | ||
120 | struct list_head *hash_table; | ||
121 | }; | ||
122 | |||
123 | |||
124 | #ifdef __KERNEL__ | ||
125 | static void write_one_revoke_record(journal_t *, transaction_t *, | ||
126 | struct journal_head **, int *, | ||
127 | struct jbd_revoke_record_s *, int); | ||
128 | static void flush_descriptor(journal_t *, struct journal_head *, int, int); | ||
129 | #endif | ||
130 | |||
131 | /* Utility functions to maintain the revoke table */ | ||
132 | |||
133 | static inline int hash(journal_t *journal, unsigned int block) | ||
134 | { | ||
135 | struct jbd_revoke_table_s *table = journal->j_revoke; | ||
136 | |||
137 | return hash_32(block, table->hash_shift); | ||
138 | } | ||
139 | |||
140 | static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, | ||
141 | tid_t seq) | ||
142 | { | ||
143 | struct list_head *hash_list; | ||
144 | struct jbd_revoke_record_s *record; | ||
145 | |||
146 | repeat: | ||
147 | record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); | ||
148 | if (!record) | ||
149 | goto oom; | ||
150 | |||
151 | record->sequence = seq; | ||
152 | record->blocknr = blocknr; | ||
153 | hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; | ||
154 | spin_lock(&journal->j_revoke_lock); | ||
155 | list_add(&record->hash, hash_list); | ||
156 | spin_unlock(&journal->j_revoke_lock); | ||
157 | return 0; | ||
158 | |||
159 | oom: | ||
160 | if (!journal_oom_retry) | ||
161 | return -ENOMEM; | ||
162 | jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); | ||
163 | yield(); | ||
164 | goto repeat; | ||
165 | } | ||
166 | |||
167 | /* Find a revoke record in the journal's hash table. */ | ||
168 | |||
169 | static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, | ||
170 | unsigned int blocknr) | ||
171 | { | ||
172 | struct list_head *hash_list; | ||
173 | struct jbd_revoke_record_s *record; | ||
174 | |||
175 | hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; | ||
176 | |||
177 | spin_lock(&journal->j_revoke_lock); | ||
178 | record = (struct jbd_revoke_record_s *) hash_list->next; | ||
179 | while (&(record->hash) != hash_list) { | ||
180 | if (record->blocknr == blocknr) { | ||
181 | spin_unlock(&journal->j_revoke_lock); | ||
182 | return record; | ||
183 | } | ||
184 | record = (struct jbd_revoke_record_s *) record->hash.next; | ||
185 | } | ||
186 | spin_unlock(&journal->j_revoke_lock); | ||
187 | return NULL; | ||
188 | } | ||
189 | |||
190 | void journal_destroy_revoke_caches(void) | ||
191 | { | ||
192 | if (revoke_record_cache) { | ||
193 | kmem_cache_destroy(revoke_record_cache); | ||
194 | revoke_record_cache = NULL; | ||
195 | } | ||
196 | if (revoke_table_cache) { | ||
197 | kmem_cache_destroy(revoke_table_cache); | ||
198 | revoke_table_cache = NULL; | ||
199 | } | ||
200 | } | ||
201 | |||
202 | int __init journal_init_revoke_caches(void) | ||
203 | { | ||
204 | J_ASSERT(!revoke_record_cache); | ||
205 | J_ASSERT(!revoke_table_cache); | ||
206 | |||
207 | revoke_record_cache = kmem_cache_create("revoke_record", | ||
208 | sizeof(struct jbd_revoke_record_s), | ||
209 | 0, | ||
210 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, | ||
211 | NULL); | ||
212 | if (!revoke_record_cache) | ||
213 | goto record_cache_failure; | ||
214 | |||
215 | revoke_table_cache = kmem_cache_create("revoke_table", | ||
216 | sizeof(struct jbd_revoke_table_s), | ||
217 | 0, SLAB_TEMPORARY, NULL); | ||
218 | if (!revoke_table_cache) | ||
219 | goto table_cache_failure; | ||
220 | |||
221 | return 0; | ||
222 | |||
223 | table_cache_failure: | ||
224 | journal_destroy_revoke_caches(); | ||
225 | record_cache_failure: | ||
226 | return -ENOMEM; | ||
227 | } | ||
228 | |||
229 | static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) | ||
230 | { | ||
231 | int i; | ||
232 | struct jbd_revoke_table_s *table; | ||
233 | |||
234 | table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | ||
235 | if (!table) | ||
236 | goto out; | ||
237 | |||
238 | table->hash_size = hash_size; | ||
239 | table->hash_shift = ilog2(hash_size); | ||
240 | table->hash_table = | ||
241 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | ||
242 | if (!table->hash_table) { | ||
243 | kmem_cache_free(revoke_table_cache, table); | ||
244 | table = NULL; | ||
245 | goto out; | ||
246 | } | ||
247 | |||
248 | for (i = 0; i < hash_size; i++) | ||
249 | INIT_LIST_HEAD(&table->hash_table[i]); | ||
250 | |||
251 | out: | ||
252 | return table; | ||
253 | } | ||
254 | |||
255 | static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) | ||
256 | { | ||
257 | int i; | ||
258 | struct list_head *hash_list; | ||
259 | |||
260 | for (i = 0; i < table->hash_size; i++) { | ||
261 | hash_list = &table->hash_table[i]; | ||
262 | J_ASSERT(list_empty(hash_list)); | ||
263 | } | ||
264 | |||
265 | kfree(table->hash_table); | ||
266 | kmem_cache_free(revoke_table_cache, table); | ||
267 | } | ||
268 | |||
269 | /* Initialise the revoke table for a given journal to a given size. */ | ||
270 | int journal_init_revoke(journal_t *journal, int hash_size) | ||
271 | { | ||
272 | J_ASSERT(journal->j_revoke_table[0] == NULL); | ||
273 | J_ASSERT(is_power_of_2(hash_size)); | ||
274 | |||
275 | journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); | ||
276 | if (!journal->j_revoke_table[0]) | ||
277 | goto fail0; | ||
278 | |||
279 | journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); | ||
280 | if (!journal->j_revoke_table[1]) | ||
281 | goto fail1; | ||
282 | |||
283 | journal->j_revoke = journal->j_revoke_table[1]; | ||
284 | |||
285 | spin_lock_init(&journal->j_revoke_lock); | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | fail1: | ||
290 | journal_destroy_revoke_table(journal->j_revoke_table[0]); | ||
291 | fail0: | ||
292 | return -ENOMEM; | ||
293 | } | ||
294 | |||
295 | /* Destroy a journal's revoke table. The table must already be empty! */ | ||
296 | void journal_destroy_revoke(journal_t *journal) | ||
297 | { | ||
298 | journal->j_revoke = NULL; | ||
299 | if (journal->j_revoke_table[0]) | ||
300 | journal_destroy_revoke_table(journal->j_revoke_table[0]); | ||
301 | if (journal->j_revoke_table[1]) | ||
302 | journal_destroy_revoke_table(journal->j_revoke_table[1]); | ||
303 | } | ||
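A minimal sketch of the setup/teardown ordering these routines imply; error handling is trimmed, the example_* names are illustrative, and JOURNAL_REVOKE_DEFAULT_HASH is assumed to be the usual default hash size from <linux/jbd.h>:

static int example_revoke_setup(journal_t *journal)
{
	int err = journal_init_revoke_caches();	/* module-wide slab caches */

	if (err)
		return err;
	err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
	if (err)
		journal_destroy_revoke_caches();
	return err;
}

static void example_revoke_teardown(journal_t *journal)
{
	journal_destroy_revoke(journal);	/* hash tables must be empty */
	journal_destroy_revoke_caches();
}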
304 | |||
305 | |||
306 | #ifdef __KERNEL__ | ||
307 | |||
308 | /* | ||
309 | * journal_revoke: revoke a given buffer_head from the journal. This | ||
310 | * prevents the block from being replayed during recovery if we take a | ||
311 | * crash after this current transaction commits. Any subsequent | ||
312 | * metadata writes of the buffer in this transaction cancel the | ||
313 | * revoke. | ||
314 | * | ||
315 | * Note that this call may block --- it is up to the caller to make | ||
316 | * sure that there are no further calls to journal_write_metadata | ||
317 | * before the revoke is complete. In ext3, this implies calling the | ||
318 | * revoke before clearing the block bitmap when we are deleting | ||
319 | * metadata. | ||
320 | * | ||
321 | * Revoke performs a journal_forget on any buffer_head passed in as a | ||
322 | * parameter, but does _not_ forget the buffer_head if the bh was only | ||
323 | * found implicitly. | ||
324 | * | ||
325 | * bh_in may not be a journalled buffer - it may have come off | ||
326 | * the hash tables without an attached journal_head. | ||
327 | * | ||
328 | * If bh_in is non-NULL, journal_revoke() will decrement its b_count | ||
329 | * by one. | ||
330 | */ | ||
331 | |||
332 | int journal_revoke(handle_t *handle, unsigned int blocknr, | ||
333 | struct buffer_head *bh_in) | ||
334 | { | ||
335 | struct buffer_head *bh = NULL; | ||
336 | journal_t *journal; | ||
337 | struct block_device *bdev; | ||
338 | int err; | ||
339 | |||
340 | might_sleep(); | ||
341 | if (bh_in) | ||
342 | BUFFER_TRACE(bh_in, "enter"); | ||
343 | |||
344 | journal = handle->h_transaction->t_journal; | ||
345 | if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ | ||
346 | J_ASSERT (!"Cannot set revoke feature!"); | ||
347 | return -EINVAL; | ||
348 | } | ||
349 | |||
350 | bdev = journal->j_fs_dev; | ||
351 | bh = bh_in; | ||
352 | |||
353 | if (!bh) { | ||
354 | bh = __find_get_block(bdev, blocknr, journal->j_blocksize); | ||
355 | if (bh) | ||
356 | BUFFER_TRACE(bh, "found on hash"); | ||
357 | } | ||
358 | #ifdef JBD_EXPENSIVE_CHECKING | ||
359 | else { | ||
360 | struct buffer_head *bh2; | ||
361 | |||
362 | /* If there is a different buffer_head lying around in | ||
363 | * memory anywhere... */ | ||
364 | bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); | ||
365 | if (bh2) { | ||
366 | /* ... and it has RevokeValid status... */ | ||
367 | if (bh2 != bh && buffer_revokevalid(bh2)) | ||
368 | /* ...then it better be revoked too, | ||
369 | * since it's illegal to create a revoke | ||
370 | * record against a buffer_head which is | ||
371 | * not marked revoked --- that would | ||
372 | * risk missing a subsequent revoke | ||
373 | * cancel. */ | ||
374 | J_ASSERT_BH(bh2, buffer_revoked(bh2)); | ||
375 | put_bh(bh2); | ||
376 | } | ||
377 | } | ||
378 | #endif | ||
379 | |||
380 | /* We really ought not ever to revoke twice in a row without | ||
381 | first having the revoke cancelled: it's illegal to free a | ||
382 | block twice without allocating it in between! */ | ||
383 | if (bh) { | ||
384 | if (!J_EXPECT_BH(bh, !buffer_revoked(bh), | ||
385 | "inconsistent data on disk")) { | ||
386 | if (!bh_in) | ||
387 | brelse(bh); | ||
388 | return -EIO; | ||
389 | } | ||
390 | set_buffer_revoked(bh); | ||
391 | set_buffer_revokevalid(bh); | ||
392 | if (bh_in) { | ||
393 | BUFFER_TRACE(bh_in, "call journal_forget"); | ||
394 | journal_forget(handle, bh_in); | ||
395 | } else { | ||
396 | BUFFER_TRACE(bh, "call brelse"); | ||
397 | __brelse(bh); | ||
398 | } | ||
399 | } | ||
400 | |||
401 | jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); | ||
402 | err = insert_revoke_hash(journal, blocknr, | ||
403 | handle->h_transaction->t_tid); | ||
404 | BUFFER_TRACE(bh_in, "exit"); | ||
405 | return err; | ||
406 | } | ||
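A hedged sketch of how a client filesystem might use this when freeing a metadata block; the caller and its arguments are hypothetical, only the journal_revoke() call comes from this file:

/* Hypothetical caller: revoke a metadata block that is being freed so
 * that stale journal copies of it are not replayed after a crash. */
static int example_free_metadata_block(handle_t *handle,
				       struct buffer_head *bh,
				       unsigned int blocknr)
{
	/* journal_revoke() also performs journal_forget() on bh for us. */
	int err = journal_revoke(handle, blocknr, bh);

	if (err)
		return err;
	/* ... only now may the block be returned to the free-space bitmap ... */
	return 0;
}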
407 | |||
408 | /* | ||
409 | * Cancel an outstanding revoke. For use only internally by the | ||
410 | * journaling code (called from journal_get_write_access). | ||
411 | * | ||
412 | * We trust buffer_revoked() on the buffer if the buffer is already | ||
413 | * being journaled: if there is no revoke pending on the buffer, then we | ||
414 | * don't do anything here. | ||
415 | * | ||
416 | * This would break if it were possible for a buffer to be revoked and | ||
417 | * discarded, and then reallocated within the same transaction. In such | ||
418 | * a case we would have lost the revoked bit, but when we arrived here | ||
419 | * the second time we would still have a pending revoke to cancel. So, | ||
420 | * do not trust the Revoked bit on buffers unless RevokeValid is also | ||
421 | * set. | ||
422 | */ | ||
423 | int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) | ||
424 | { | ||
425 | struct jbd_revoke_record_s *record; | ||
426 | journal_t *journal = handle->h_transaction->t_journal; | ||
427 | int need_cancel; | ||
428 | int did_revoke = 0; /* akpm: debug */ | ||
429 | struct buffer_head *bh = jh2bh(jh); | ||
430 | |||
431 | jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); | ||
432 | |||
433 | /* Is the existing Revoke bit valid? If so, we trust it, and | ||
434 | * only perform the full cancel if the revoke bit is set. If | ||
435 | * not, we can't trust the revoke bit, and we need to do the | ||
436 | * full search for a revoke record. */ | ||
437 | if (test_set_buffer_revokevalid(bh)) { | ||
438 | need_cancel = test_clear_buffer_revoked(bh); | ||
439 | } else { | ||
440 | need_cancel = 1; | ||
441 | clear_buffer_revoked(bh); | ||
442 | } | ||
443 | |||
444 | if (need_cancel) { | ||
445 | record = find_revoke_record(journal, bh->b_blocknr); | ||
446 | if (record) { | ||
447 | jbd_debug(4, "cancelled existing revoke on " | ||
448 | "blocknr %llu\n", (unsigned long long)bh->b_blocknr); | ||
449 | spin_lock(&journal->j_revoke_lock); | ||
450 | list_del(&record->hash); | ||
451 | spin_unlock(&journal->j_revoke_lock); | ||
452 | kmem_cache_free(revoke_record_cache, record); | ||
453 | did_revoke = 1; | ||
454 | } | ||
455 | } | ||
456 | |||
457 | #ifdef JBD_EXPENSIVE_CHECKING | ||
458 | /* There better not be one left behind by now! */ | ||
459 | record = find_revoke_record(journal, bh->b_blocknr); | ||
460 | J_ASSERT_JH(jh, record == NULL); | ||
461 | #endif | ||
462 | |||
463 | /* Finally, have we just cleared revoke on an unhashed | ||
464 | * buffer_head? If so, we'd better make sure we clear the | ||
465 | * revoked status on any hashed alias too, otherwise the revoke | ||
466 | * state machine will get very upset later on. */ | ||
467 | if (need_cancel) { | ||
468 | struct buffer_head *bh2; | ||
469 | bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); | ||
470 | if (bh2) { | ||
471 | if (bh2 != bh) | ||
472 | clear_buffer_revoked(bh2); | ||
473 | __brelse(bh2); | ||
474 | } | ||
475 | } | ||
476 | return did_revoke; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * journal_clear_buffer_revoked_flags clears the revoked flag on the buffers | ||
481 | * in the revoke table, to reflect that there are no revoked buffers in the | ||
482 | * next transaction which is about to be started. | ||
483 | */ | ||
484 | void journal_clear_buffer_revoked_flags(journal_t *journal) | ||
485 | { | ||
486 | struct jbd_revoke_table_s *revoke = journal->j_revoke; | ||
487 | int i = 0; | ||
488 | |||
489 | for (i = 0; i < revoke->hash_size; i++) { | ||
490 | struct list_head *hash_list; | ||
491 | struct list_head *list_entry; | ||
492 | hash_list = &revoke->hash_table[i]; | ||
493 | |||
494 | list_for_each(list_entry, hash_list) { | ||
495 | struct jbd_revoke_record_s *record; | ||
496 | struct buffer_head *bh; | ||
497 | record = (struct jbd_revoke_record_s *)list_entry; | ||
498 | bh = __find_get_block(journal->j_fs_dev, | ||
499 | record->blocknr, | ||
500 | journal->j_blocksize); | ||
501 | if (bh) { | ||
502 | clear_buffer_revoked(bh); | ||
503 | __brelse(bh); | ||
504 | } | ||
505 | } | ||
506 | } | ||
507 | } | ||
508 | |||
509 | /* journal_switch_revoke_table selects j_revoke for the next transaction; | ||
510 | * we do not want to suspend any processing until all revokes are | ||
511 | * written. -bzzz | ||
512 | */ | ||
513 | void journal_switch_revoke_table(journal_t *journal) | ||
514 | { | ||
515 | int i; | ||
516 | |||
517 | if (journal->j_revoke == journal->j_revoke_table[0]) | ||
518 | journal->j_revoke = journal->j_revoke_table[1]; | ||
519 | else | ||
520 | journal->j_revoke = journal->j_revoke_table[0]; | ||
521 | |||
522 | for (i = 0; i < journal->j_revoke->hash_size; i++) | ||
523 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); | ||
524 | } | ||
525 | |||
526 | /* | ||
527 | * Write revoke records to the journal for all entries in the current | ||
528 | * revoke hash, deleting the entries as we go. | ||
529 | */ | ||
530 | void journal_write_revoke_records(journal_t *journal, | ||
531 | transaction_t *transaction, int write_op) | ||
532 | { | ||
533 | struct journal_head *descriptor; | ||
534 | struct jbd_revoke_record_s *record; | ||
535 | struct jbd_revoke_table_s *revoke; | ||
536 | struct list_head *hash_list; | ||
537 | int i, offset, count; | ||
538 | |||
539 | descriptor = NULL; | ||
540 | offset = 0; | ||
541 | count = 0; | ||
542 | |||
543 | /* select revoke table for committing transaction */ | ||
544 | revoke = journal->j_revoke == journal->j_revoke_table[0] ? | ||
545 | journal->j_revoke_table[1] : journal->j_revoke_table[0]; | ||
546 | |||
547 | for (i = 0; i < revoke->hash_size; i++) { | ||
548 | hash_list = &revoke->hash_table[i]; | ||
549 | |||
550 | while (!list_empty(hash_list)) { | ||
551 | record = (struct jbd_revoke_record_s *) | ||
552 | hash_list->next; | ||
553 | write_one_revoke_record(journal, transaction, | ||
554 | &descriptor, &offset, | ||
555 | record, write_op); | ||
556 | count++; | ||
557 | list_del(&record->hash); | ||
558 | kmem_cache_free(revoke_record_cache, record); | ||
559 | } | ||
560 | } | ||
561 | if (descriptor) | ||
562 | flush_descriptor(journal, descriptor, offset, write_op); | ||
563 | jbd_debug(1, "Wrote %d revoke records\n", count); | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * Write out one revoke record. We need to create a new descriptor | ||
568 | * block if the old one is full or if we have not already created one. | ||
569 | */ | ||
570 | |||
571 | static void write_one_revoke_record(journal_t *journal, | ||
572 | transaction_t *transaction, | ||
573 | struct journal_head **descriptorp, | ||
574 | int *offsetp, | ||
575 | struct jbd_revoke_record_s *record, | ||
576 | int write_op) | ||
577 | { | ||
578 | struct journal_head *descriptor; | ||
579 | int offset; | ||
580 | journal_header_t *header; | ||
581 | |||
582 | /* If we are already aborting, this all becomes a noop. We | ||
583 | still need to go round the loop in | ||
584 | journal_write_revoke_records in order to free all of the | ||
585 | revoke records: only the IO to the journal is omitted. */ | ||
586 | if (is_journal_aborted(journal)) | ||
587 | return; | ||
588 | |||
589 | descriptor = *descriptorp; | ||
590 | offset = *offsetp; | ||
591 | |||
592 | /* Make sure we have a descriptor with space left for the record */ | ||
593 | if (descriptor) { | ||
594 | if (offset == journal->j_blocksize) { | ||
595 | flush_descriptor(journal, descriptor, offset, write_op); | ||
596 | descriptor = NULL; | ||
597 | } | ||
598 | } | ||
599 | |||
600 | if (!descriptor) { | ||
601 | descriptor = journal_get_descriptor_buffer(journal); | ||
602 | if (!descriptor) | ||
603 | return; | ||
604 | header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; | ||
605 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | ||
606 | header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); | ||
607 | header->h_sequence = cpu_to_be32(transaction->t_tid); | ||
608 | |||
609 | /* Record it so that we can wait for IO completion later */ | ||
610 | JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); | ||
611 | journal_file_buffer(descriptor, transaction, BJ_LogCtl); | ||
612 | |||
613 | offset = sizeof(journal_revoke_header_t); | ||
614 | *descriptorp = descriptor; | ||
615 | } | ||
616 | |||
617 | * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = | ||
618 | cpu_to_be32(record->blocknr); | ||
619 | offset += 4; | ||
620 | *offsetp = offset; | ||
621 | } | ||
622 | |||
623 | /* | ||
624 | * Flush a revoke descriptor out to the journal. If we are aborting, | ||
625 | * this is a noop; otherwise we are generating a buffer which needs to | ||
626 | * be waited for during commit, so it has to go onto the appropriate | ||
627 | * journal buffer list. | ||
628 | */ | ||
629 | |||
630 | static void flush_descriptor(journal_t *journal, | ||
631 | struct journal_head *descriptor, | ||
632 | int offset, int write_op) | ||
633 | { | ||
634 | journal_revoke_header_t *header; | ||
635 | struct buffer_head *bh = jh2bh(descriptor); | ||
636 | |||
637 | if (is_journal_aborted(journal)) { | ||
638 | put_bh(bh); | ||
639 | return; | ||
640 | } | ||
641 | |||
642 | header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; | ||
643 | header->r_count = cpu_to_be32(offset); | ||
644 | set_buffer_jwrite(bh); | ||
645 | BUFFER_TRACE(bh, "write"); | ||
646 | set_buffer_dirty(bh); | ||
647 | write_dirty_buffer(bh, write_op); | ||
648 | } | ||
649 | #endif | ||
650 | |||
651 | /* | ||
652 | * Revoke support for recovery. | ||
653 | * | ||
654 | * Recovery needs to be able to: | ||
655 | * | ||
656 | * record all revoke records, including the tid of the latest instance | ||
657 | * of each revoke in the journal | ||
658 | * | ||
659 | * check whether a given block in a given transaction should be replayed | ||
660 | * (ie. has not been revoked by a revoke record in that or a subsequent | ||
661 | * transaction) | ||
662 | * | ||
663 | * empty the revoke table after recovery. | ||
664 | */ | ||
665 | |||
666 | /* | ||
667 | * First, setting revoke records. We create a new revoke record for | ||
668 | * every block ever revoked in the log as we scan it for recovery, and | ||
669 | * we update the existing records if we find multiple revokes for a | ||
670 | * single block. | ||
671 | */ | ||
672 | |||
673 | int journal_set_revoke(journal_t *journal, | ||
674 | unsigned int blocknr, | ||
675 | tid_t sequence) | ||
676 | { | ||
677 | struct jbd_revoke_record_s *record; | ||
678 | |||
679 | record = find_revoke_record(journal, blocknr); | ||
680 | if (record) { | ||
681 | /* If we have multiple occurrences, only record the | ||
682 | * latest sequence number in the hashed record */ | ||
683 | if (tid_gt(sequence, record->sequence)) | ||
684 | record->sequence = sequence; | ||
685 | return 0; | ||
686 | } | ||
687 | return insert_revoke_hash(journal, blocknr, sequence); | ||
688 | } | ||
689 | |||
690 | /* | ||
691 | * Test revoke records. For a given block referenced in the log, has | ||
692 | * that block been revoked? A revoke record with a given transaction | ||
693 | * sequence number revokes all blocks in that transaction and earlier | ||
694 | * ones, but later transactions still need to be replayed. | ||
695 | */ | ||
696 | |||
697 | int journal_test_revoke(journal_t *journal, | ||
698 | unsigned int blocknr, | ||
699 | tid_t sequence) | ||
700 | { | ||
701 | struct jbd_revoke_record_s *record; | ||
702 | |||
703 | record = find_revoke_record(journal, blocknr); | ||
704 | if (!record) | ||
705 | return 0; | ||
706 | if (tid_gt(sequence, record->sequence)) | ||
707 | return 0; | ||
708 | return 1; | ||
709 | } | ||
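A worked example of these two routines as recovery would use them; the block and sequence numbers are arbitrary:

/*
 * After the REVOKE pass records a revoke of block 100 in transaction 12:
 *
 *	journal_set_revoke(journal, 100, 12);
 *
 * the replay pass then sees:
 *
 *	journal_test_revoke(journal, 100, 11)  -> 1  (revoked, do not replay)
 *	journal_test_revoke(journal, 100, 12)  -> 1  (revoked, do not replay)
 *	journal_test_revoke(journal, 100, 13)  -> 0  (tid_gt(13, 12), replay)
 */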
710 | |||
711 | /* | ||
712 | * Finally, once recovery is over, we need to clear the revoke table so | ||
713 | * that it can be reused by the running filesystem. | ||
714 | */ | ||
715 | |||
716 | void journal_clear_revoke(journal_t *journal) | ||
717 | { | ||
718 | int i; | ||
719 | struct list_head *hash_list; | ||
720 | struct jbd_revoke_record_s *record; | ||
721 | struct jbd_revoke_table_s *revoke; | ||
722 | |||
723 | revoke = journal->j_revoke; | ||
724 | |||
725 | for (i = 0; i < revoke->hash_size; i++) { | ||
726 | hash_list = &revoke->hash_table[i]; | ||
727 | while (!list_empty(hash_list)) { | ||
728 | record = (struct jbd_revoke_record_s*) hash_list->next; | ||
729 | list_del(&record->hash); | ||
730 | kmem_cache_free(revoke_record_cache, record); | ||
731 | } | ||
732 | } | ||
733 | } | ||
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c deleted file mode 100644 index 1695ba8334a2..000000000000 --- a/fs/jbd/transaction.c +++ /dev/null | |||
@@ -1,2237 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/jbd/transaction.c | ||
3 | * | ||
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | ||
5 | * | ||
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | ||
7 | * | ||
8 | * This file is part of the Linux kernel and is made available under | ||
9 | * the terms of the GNU General Public License, version 2, or at your | ||
10 | * option, any later version, incorporated herein by reference. | ||
11 | * | ||
12 | * Generic filesystem transaction handling code; part of the ext2fs | ||
13 | * journaling system. | ||
14 | * | ||
15 | * This file manages transactions (compound commits managed by the | ||
16 | * journaling code) and handles (individual atomic operations by the | ||
17 | * filesystem). | ||
18 | */ | ||
19 | |||
20 | #include <linux/time.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/jbd.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/hrtimer.h> | ||
29 | |||
30 | static void __journal_temp_unlink_buffer(struct journal_head *jh); | ||
31 | |||
32 | /* | ||
33 | * get_transaction: obtain a new transaction_t object. | ||
34 | * | ||
35 | * Simply allocate and initialise a new transaction. Create it in | ||
36 | * RUNNING state and add it to the current journal (which should not | ||
37 | * have an existing running transaction: we only make a new transaction | ||
38 | * once we have started to commit the old one). | ||
39 | * | ||
40 | * Preconditions: | ||
41 | * The journal MUST be locked. We don't perform atomic mallocs on the | ||
42 | * new transaction and we can't block without protecting against other | ||
43 | * processes trying to touch the journal while it is in transition. | ||
44 | * | ||
45 | * Called under j_state_lock | ||
46 | */ | ||
47 | |||
48 | static transaction_t * | ||
49 | get_transaction(journal_t *journal, transaction_t *transaction) | ||
50 | { | ||
51 | transaction->t_journal = journal; | ||
52 | transaction->t_state = T_RUNNING; | ||
53 | transaction->t_start_time = ktime_get(); | ||
54 | transaction->t_tid = journal->j_transaction_sequence++; | ||
55 | transaction->t_expires = jiffies + journal->j_commit_interval; | ||
56 | spin_lock_init(&transaction->t_handle_lock); | ||
57 | |||
58 | /* Set up the commit timer for the new transaction. */ | ||
59 | journal->j_commit_timer.expires = | ||
60 | round_jiffies_up(transaction->t_expires); | ||
61 | add_timer(&journal->j_commit_timer); | ||
62 | |||
63 | J_ASSERT(journal->j_running_transaction == NULL); | ||
64 | journal->j_running_transaction = transaction; | ||
65 | |||
66 | return transaction; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * Handle management. | ||
71 | * | ||
72 | * A handle_t is an object which represents a single atomic update to a | ||
73 | * filesystem, and which tracks all of the modifications which form part | ||
74 | * of that one update. | ||
75 | */ | ||
76 | |||
77 | /* | ||
78 | * start_this_handle: Given a handle, deal with any locking or stalling | ||
79 | * needed to make sure that there is enough journal space for the handle | ||
80 | * to begin. Attach the handle to a transaction and set up the | ||
81 | * transaction's buffer credits. | ||
82 | */ | ||
83 | |||
84 | static int start_this_handle(journal_t *journal, handle_t *handle) | ||
85 | { | ||
86 | transaction_t *transaction; | ||
87 | int needed; | ||
88 | int nblocks = handle->h_buffer_credits; | ||
89 | transaction_t *new_transaction = NULL; | ||
90 | int ret = 0; | ||
91 | |||
92 | if (nblocks > journal->j_max_transaction_buffers) { | ||
93 | printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", | ||
94 | current->comm, nblocks, | ||
95 | journal->j_max_transaction_buffers); | ||
96 | ret = -ENOSPC; | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | alloc_transaction: | ||
101 | if (!journal->j_running_transaction) { | ||
102 | new_transaction = kzalloc(sizeof(*new_transaction), | ||
103 | GFP_NOFS|__GFP_NOFAIL); | ||
104 | if (!new_transaction) { | ||
105 | ret = -ENOMEM; | ||
106 | goto out; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | jbd_debug(3, "New handle %p going live.\n", handle); | ||
111 | |||
112 | repeat: | ||
113 | |||
114 | /* | ||
115 | * We need to hold j_state_lock until t_updates has been incremented, | ||
116 | * for proper journal barrier handling | ||
117 | */ | ||
118 | spin_lock(&journal->j_state_lock); | ||
119 | repeat_locked: | ||
120 | if (is_journal_aborted(journal) || | ||
121 | (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { | ||
122 | spin_unlock(&journal->j_state_lock); | ||
123 | ret = -EROFS; | ||
124 | goto out; | ||
125 | } | ||
126 | |||
127 | /* Wait on the journal's transaction barrier if necessary */ | ||
128 | if (journal->j_barrier_count) { | ||
129 | spin_unlock(&journal->j_state_lock); | ||
130 | wait_event(journal->j_wait_transaction_locked, | ||
131 | journal->j_barrier_count == 0); | ||
132 | goto repeat; | ||
133 | } | ||
134 | |||
135 | if (!journal->j_running_transaction) { | ||
136 | if (!new_transaction) { | ||
137 | spin_unlock(&journal->j_state_lock); | ||
138 | goto alloc_transaction; | ||
139 | } | ||
140 | get_transaction(journal, new_transaction); | ||
141 | new_transaction = NULL; | ||
142 | } | ||
143 | |||
144 | transaction = journal->j_running_transaction; | ||
145 | |||
146 | /* | ||
147 | * If the current transaction is locked down for commit, wait for the | ||
148 | * lock to be released. | ||
149 | */ | ||
150 | if (transaction->t_state == T_LOCKED) { | ||
151 | DEFINE_WAIT(wait); | ||
152 | |||
153 | prepare_to_wait(&journal->j_wait_transaction_locked, | ||
154 | &wait, TASK_UNINTERRUPTIBLE); | ||
155 | spin_unlock(&journal->j_state_lock); | ||
156 | schedule(); | ||
157 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
158 | goto repeat; | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * If there is not enough space left in the log to write all potential | ||
163 | * buffers requested by this operation, we need to stall pending a log | ||
164 | * checkpoint to free some more log space. | ||
165 | */ | ||
166 | spin_lock(&transaction->t_handle_lock); | ||
167 | needed = transaction->t_outstanding_credits + nblocks; | ||
168 | |||
169 | if (needed > journal->j_max_transaction_buffers) { | ||
170 | /* | ||
171 | * If the current transaction is already too large, then start | ||
172 | * to commit it: we can then go back and attach this handle to | ||
173 | * a new transaction. | ||
174 | */ | ||
175 | DEFINE_WAIT(wait); | ||
176 | |||
177 | jbd_debug(2, "Handle %p starting new commit...\n", handle); | ||
178 | spin_unlock(&transaction->t_handle_lock); | ||
179 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
180 | TASK_UNINTERRUPTIBLE); | ||
181 | __log_start_commit(journal, transaction->t_tid); | ||
182 | spin_unlock(&journal->j_state_lock); | ||
183 | schedule(); | ||
184 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
185 | goto repeat; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * The commit code assumes that it can get enough log space | ||
190 | * without forcing a checkpoint. This is *critical* for | ||
191 | * correctness: a checkpoint of a buffer which is also | ||
192 | * associated with a committing transaction creates a deadlock, | ||
193 | * so commit simply cannot force through checkpoints. | ||
194 | * | ||
195 | * We must therefore ensure the necessary space in the journal | ||
196 | * *before* starting to dirty potentially checkpointed buffers | ||
197 | * in the new transaction. | ||
198 | * | ||
199 | * The worst part is, any transaction currently committing can | ||
200 | * reduce the free space arbitrarily. Be careful to account for | ||
201 | * those buffers when checkpointing. | ||
202 | */ | ||
203 | |||
204 | /* | ||
205 | * @@@ AKPM: This seems rather over-defensive. We're giving commit | ||
206 | * a _lot_ of headroom: 1/4 of the journal plus the size of | ||
207 | * the committing transaction. Really, we only need to give it | ||
208 | * committing_transaction->t_outstanding_credits plus "enough" for | ||
209 | * the log control blocks. | ||
210 | * Also, this test is inconsistent with the matching one in | ||
211 | * journal_extend(). | ||
212 | */ | ||
213 | if (__log_space_left(journal) < jbd_space_needed(journal)) { | ||
214 | jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); | ||
215 | spin_unlock(&transaction->t_handle_lock); | ||
216 | __log_wait_for_space(journal); | ||
217 | goto repeat_locked; | ||
218 | } | ||
219 | |||
220 | /* OK, account for the buffers that this operation expects to | ||
221 | * use and add the handle to the running transaction. */ | ||
222 | |||
223 | handle->h_transaction = transaction; | ||
224 | transaction->t_outstanding_credits += nblocks; | ||
225 | transaction->t_updates++; | ||
226 | transaction->t_handle_count++; | ||
227 | jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", | ||
228 | handle, nblocks, transaction->t_outstanding_credits, | ||
229 | __log_space_left(journal)); | ||
230 | spin_unlock(&transaction->t_handle_lock); | ||
231 | spin_unlock(&journal->j_state_lock); | ||
232 | |||
233 | lock_map_acquire(&handle->h_lockdep_map); | ||
234 | out: | ||
235 | if (unlikely(new_transaction)) /* It's usually NULL */ | ||
236 | kfree(new_transaction); | ||
237 | return ret; | ||
238 | } | ||
239 | |||
240 | static struct lock_class_key jbd_handle_key; | ||
241 | |||
242 | /* Allocate a new handle. This should probably be in a slab... */ | ||
243 | static handle_t *new_handle(int nblocks) | ||
244 | { | ||
245 | handle_t *handle = jbd_alloc_handle(GFP_NOFS); | ||
246 | if (!handle) | ||
247 | return NULL; | ||
248 | handle->h_buffer_credits = nblocks; | ||
249 | handle->h_ref = 1; | ||
250 | |||
251 | lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); | ||
252 | |||
253 | return handle; | ||
254 | } | ||
255 | |||
256 | /** | ||
257 | * handle_t *journal_start() - Obtain a new handle. | ||
258 | * @journal: Journal to start transaction on. | ||
259 | * @nblocks: number of block buffers we might modify | ||
260 | * | ||
261 | * We make sure that the transaction can guarantee at least nblocks of | ||
262 | * modified buffers in the log. We block until the log can guarantee | ||
263 | * that much space. | ||
264 | * | ||
265 | * This function is visible to journal users (like ext3fs), so is not | ||
266 | * called with the journal already locked. | ||
267 | * | ||
268 | * Return a pointer to a newly allocated handle, or an ERR_PTR() value | ||
269 | * on failure. | ||
270 | */ | ||
271 | handle_t *journal_start(journal_t *journal, int nblocks) | ||
272 | { | ||
273 | handle_t *handle = journal_current_handle(); | ||
274 | int err; | ||
275 | |||
276 | if (!journal) | ||
277 | return ERR_PTR(-EROFS); | ||
278 | |||
279 | if (handle) { | ||
280 | J_ASSERT(handle->h_transaction->t_journal == journal); | ||
281 | handle->h_ref++; | ||
282 | return handle; | ||
283 | } | ||
284 | |||
285 | handle = new_handle(nblocks); | ||
286 | if (!handle) | ||
287 | return ERR_PTR(-ENOMEM); | ||
288 | |||
289 | current->journal_info = handle; | ||
290 | |||
291 | err = start_this_handle(journal, handle); | ||
292 | if (err < 0) { | ||
293 | jbd_free_handle(handle); | ||
294 | current->journal_info = NULL; | ||
295 | handle = ERR_PTR(err); | ||
296 | } | ||
297 | return handle; | ||
298 | } | ||
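A minimal sketch of the usual calling pattern built on this API; journal_dirty_metadata() and journal_stop() are the standard counterparts defined later in this file, and error handling is abbreviated:

/* Illustrative caller: one atomic metadata update. */
static int example_update_one_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle = journal_start(journal, 1);	/* one buffer credit */
	int err;

	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data under the handle ... */
		err = journal_dirty_metadata(handle, bh);
	}
	journal_stop(handle);
	return err;
}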
299 | |||
300 | /** | ||
301 | * int journal_extend() - extend buffer credits. | ||
302 | * @handle: handle to 'extend' | ||
303 | * @nblocks: nr blocks to try to extend by. | ||
304 | * | ||
305 | * Some transactions, such as large extends and truncates, can be done | ||
306 | * atomically all at once or in several stages. The operation requests | ||
307 | * a credit for a number of buffer modifications in advance, but can | ||
308 | * extend its credit if it needs more. | ||
309 | * | ||
310 | * journal_extend tries to give the running handle more buffer credits. | ||
311 | * It does not guarantee the allocation - this is best-effort only. | ||
312 | * The calling process MUST be able to deal cleanly with a failure to | ||
313 | * extend here. | ||
314 | * | ||
315 | * Return 0 on success, non-zero on failure. | ||
316 | * | ||
317 | * return code < 0 implies an error | ||
318 | * return code > 0 implies normal transaction-full status. | ||
319 | */ | ||
320 | int journal_extend(handle_t *handle, int nblocks) | ||
321 | { | ||
322 | transaction_t *transaction = handle->h_transaction; | ||
323 | journal_t *journal = transaction->t_journal; | ||
324 | int result; | ||
325 | int wanted; | ||
326 | |||
327 | result = -EIO; | ||
328 | if (is_handle_aborted(handle)) | ||
329 | goto out; | ||
330 | |||
331 | result = 1; | ||
332 | |||
333 | spin_lock(&journal->j_state_lock); | ||
334 | |||
335 | /* Don't extend a locked-down transaction! */ | ||
336 | if (handle->h_transaction->t_state != T_RUNNING) { | ||
337 | jbd_debug(3, "denied handle %p %d blocks: " | ||
338 | "transaction not running\n", handle, nblocks); | ||
339 | goto error_out; | ||
340 | } | ||
341 | |||
342 | spin_lock(&transaction->t_handle_lock); | ||
343 | wanted = transaction->t_outstanding_credits + nblocks; | ||
344 | |||
345 | if (wanted > journal->j_max_transaction_buffers) { | ||
346 | jbd_debug(3, "denied handle %p %d blocks: " | ||
347 | "transaction too large\n", handle, nblocks); | ||
348 | goto unlock; | ||
349 | } | ||
350 | |||
351 | if (wanted > __log_space_left(journal)) { | ||
352 | jbd_debug(3, "denied handle %p %d blocks: " | ||
353 | "insufficient log space\n", handle, nblocks); | ||
354 | goto unlock; | ||
355 | } | ||
356 | |||
357 | handle->h_buffer_credits += nblocks; | ||
358 | transaction->t_outstanding_credits += nblocks; | ||
359 | result = 0; | ||
360 | |||
361 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); | ||
362 | unlock: | ||
363 | spin_unlock(&transaction->t_handle_lock); | ||
364 | error_out: | ||
365 | spin_unlock(&journal->j_state_lock); | ||
366 | out: | ||
367 | return result; | ||
368 | } | ||
369 | |||
370 | |||
371 | /** | ||
372 | * int journal_restart() - restart a handle. | ||
373 | * @handle: handle to restart | ||
374 | * @nblocks: nr credits requested | ||
375 | * | ||
376 | * Restart a handle for a multi-transaction filesystem | ||
377 | * operation. | ||
378 | * | ||
379 | * If the journal_extend() call above fails to grant new buffer credits | ||
380 | * to a running handle, a call to journal_restart will commit the | ||
381 | * handle's transaction so far and reattach the handle to a new | ||
382 | * transaction capable of guaranteeing the requested number of | ||
383 | * credits. | ||
384 | */ | ||
385 | |||
386 | int journal_restart(handle_t *handle, int nblocks) | ||
387 | { | ||
388 | transaction_t *transaction = handle->h_transaction; | ||
389 | journal_t *journal = transaction->t_journal; | ||
390 | int ret; | ||
391 | |||
392 | /* If we've had an abort of any type, don't even think about | ||
393 | * actually doing the restart! */ | ||
394 | if (is_handle_aborted(handle)) | ||
395 | return 0; | ||
396 | |||
397 | /* | ||
398 | * First unlink the handle from its current transaction, and start the | ||
399 | * commit on that. | ||
400 | */ | ||
401 | J_ASSERT(transaction->t_updates > 0); | ||
402 | J_ASSERT(journal_current_handle() == handle); | ||
403 | |||
404 | spin_lock(&journal->j_state_lock); | ||
405 | spin_lock(&transaction->t_handle_lock); | ||
406 | transaction->t_outstanding_credits -= handle->h_buffer_credits; | ||
407 | transaction->t_updates--; | ||
408 | |||
409 | if (!transaction->t_updates) | ||
410 | wake_up(&journal->j_wait_updates); | ||
411 | spin_unlock(&transaction->t_handle_lock); | ||
412 | |||
413 | jbd_debug(2, "restarting handle %p\n", handle); | ||
414 | __log_start_commit(journal, transaction->t_tid); | ||
415 | spin_unlock(&journal->j_state_lock); | ||
416 | |||
417 | lock_map_release(&handle->h_lockdep_map); | ||
418 | handle->h_buffer_credits = nblocks; | ||
419 | ret = start_this_handle(journal, handle); | ||
420 | return ret; | ||
421 | } | ||
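Putting journal_extend() and journal_restart() together, a caller that needs more credits mid-operation typically does something like this sketch (the wrapper name is illustrative):

/* Grow the current handle, falling back to a restart on a new transaction. */
static int example_get_more_credits(handle_t *handle, int nblocks)
{
	int err = journal_extend(handle, nblocks);

	if (err < 0)		/* real error, e.g. aborted handle */
		return err;
	if (err > 0)		/* transaction full: commit it and reattach */
		err = journal_restart(handle, nblocks);
	return err;
}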
422 | |||
423 | |||
424 | /** | ||
425 | * void journal_lock_updates () - establish a transaction barrier. | ||
426 | * @journal: Journal to establish a barrier on. | ||
427 | * | ||
428 | * This locks out any further updates from being started, and blocks until all | ||
429 | * existing updates have completed, returning only once the journal is in a | ||
430 | * quiescent state with no updates running. | ||
431 | * | ||
432 | * We do not use a simple mutex for synchronization as there are syscalls which | ||
433 | * want to return with the filesystem locked and that trips up lockdep. Also, | ||
434 | * hibernation needs to lock the filesystem but a locked mutex then blocks it. | ||
435 | * Since locking the filesystem is a rare operation, we use a simple counter | ||
436 | * and waitqueue for locking. | ||
437 | */ | ||
438 | void journal_lock_updates(journal_t *journal) | ||
439 | { | ||
440 | DEFINE_WAIT(wait); | ||
441 | |||
442 | wait: | ||
443 | /* Wait for previous locked operation to finish */ | ||
444 | wait_event(journal->j_wait_transaction_locked, | ||
445 | journal->j_barrier_count == 0); | ||
446 | |||
447 | spin_lock(&journal->j_state_lock); | ||
448 | /* | ||
449 | * Check reliably under the lock whether we are the ones winning the race | ||
450 | * and locking the journal | ||
451 | */ | ||
452 | if (journal->j_barrier_count > 0) { | ||
453 | spin_unlock(&journal->j_state_lock); | ||
454 | goto wait; | ||
455 | } | ||
456 | ++journal->j_barrier_count; | ||
457 | |||
458 | /* Wait until there are no running updates */ | ||
459 | while (1) { | ||
460 | transaction_t *transaction = journal->j_running_transaction; | ||
461 | |||
462 | if (!transaction) | ||
463 | break; | ||
464 | |||
465 | spin_lock(&transaction->t_handle_lock); | ||
466 | if (!transaction->t_updates) { | ||
467 | spin_unlock(&transaction->t_handle_lock); | ||
468 | break; | ||
469 | } | ||
470 | prepare_to_wait(&journal->j_wait_updates, &wait, | ||
471 | TASK_UNINTERRUPTIBLE); | ||
472 | spin_unlock(&transaction->t_handle_lock); | ||
473 | spin_unlock(&journal->j_state_lock); | ||
474 | schedule(); | ||
475 | finish_wait(&journal->j_wait_updates, &wait); | ||
476 | spin_lock(&journal->j_state_lock); | ||
477 | } | ||
478 | spin_unlock(&journal->j_state_lock); | ||
479 | } | ||
480 | |||
481 | /** | ||
482 | * void journal_unlock_updates (journal_t* journal) - release barrier | ||
483 | * @journal: Journal to release the barrier on. | ||
484 | * | ||
485 | * Release a transaction barrier obtained with journal_lock_updates(). | ||
486 | */ | ||
487 | void journal_unlock_updates (journal_t *journal) | ||
488 | { | ||
489 | J_ASSERT(journal->j_barrier_count != 0); | ||
490 | |||
491 | spin_lock(&journal->j_state_lock); | ||
492 | --journal->j_barrier_count; | ||
493 | spin_unlock(&journal->j_state_lock); | ||
494 | wake_up(&journal->j_wait_transaction_locked); | ||
495 | } | ||
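A hedged sketch of the barrier pair in use; the operation performed while updates are blocked is hypothetical:

/* Illustrative only: quiesce the journal around some external operation. */
static void example_with_updates_blocked(journal_t *journal)
{
	journal_lock_updates(journal);		/* waits for running handles */
	/* ... no new handle can start while we hold the barrier ... */
	journal_unlock_updates(journal);
}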
496 | |||
497 | static void warn_dirty_buffer(struct buffer_head *bh) | ||
498 | { | ||
499 | char b[BDEVNAME_SIZE]; | ||
500 | |||
501 | printk(KERN_WARNING | ||
502 | "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " | ||
503 | "There's a risk of filesystem corruption in case of system " | ||
504 | "crash.\n", | ||
505 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * If the buffer is already part of the current transaction, then there | ||
510 | * is nothing we need to do. If it is already part of a prior | ||
511 | * transaction which we are still committing to disk, then we need to | ||
512 | * make sure that we do not overwrite the old copy: we do copy-out to | ||
513 | * preserve the copy going to disk. We also account the buffer against | ||
514 | * the handle's metadata buffer credits (unless the buffer is already | ||
515 | * part of the transaction, that is). | ||
516 | * | ||
517 | */ | ||
518 | static int | ||
519 | do_get_write_access(handle_t *handle, struct journal_head *jh, | ||
520 | int force_copy) | ||
521 | { | ||
522 | struct buffer_head *bh; | ||
523 | transaction_t *transaction; | ||
524 | journal_t *journal; | ||
525 | int error; | ||
526 | char *frozen_buffer = NULL; | ||
527 | int need_copy = 0; | ||
528 | |||
529 | if (is_handle_aborted(handle)) | ||
530 | return -EROFS; | ||
531 | |||
532 | transaction = handle->h_transaction; | ||
533 | journal = transaction->t_journal; | ||
534 | |||
535 | jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); | ||
536 | |||
537 | JBUFFER_TRACE(jh, "entry"); | ||
538 | repeat: | ||
539 | bh = jh2bh(jh); | ||
540 | |||
541 | /* @@@ Need to check for errors here at some point. */ | ||
542 | |||
543 | lock_buffer(bh); | ||
544 | jbd_lock_bh_state(bh); | ||
545 | |||
546 | /* We now hold the buffer lock so it is safe to query the buffer | ||
547 | * state. Is the buffer dirty? | ||
548 | * | ||
549 | * If so, there are two possibilities. The buffer may be | ||
550 | * non-journaled, and undergoing a quite legitimate writeback. | ||
551 | * Otherwise, it is journaled, and we don't expect dirty buffers | ||
552 | * in that state (the buffers should be marked JBD_Dirty | ||
553 | * instead.) So either the IO is being done under our own | ||
554 | * control and this is a bug, or it's a third party IO such as | ||
555 | * dump(8) (which may leave the buffer scheduled for read --- | ||
556 | * ie. locked but not dirty) or tune2fs (which may actually have | ||
557 | * the buffer dirtied, ugh.) */ | ||
558 | |||
559 | if (buffer_dirty(bh)) { | ||
560 | /* | ||
561 | * First question: is this buffer already part of the current | ||
562 | * transaction or the existing committing transaction? | ||
563 | */ | ||
564 | if (jh->b_transaction) { | ||
565 | J_ASSERT_JH(jh, | ||
566 | jh->b_transaction == transaction || | ||
567 | jh->b_transaction == | ||
568 | journal->j_committing_transaction); | ||
569 | if (jh->b_next_transaction) | ||
570 | J_ASSERT_JH(jh, jh->b_next_transaction == | ||
571 | transaction); | ||
572 | warn_dirty_buffer(bh); | ||
573 | } | ||
574 | /* | ||
575 | * In any case we need to clean the dirty flag and we must | ||
576 | * do it under the buffer lock to be sure we don't race | ||
577 | * with running write-out. | ||
578 | */ | ||
579 | JBUFFER_TRACE(jh, "Journalling dirty buffer"); | ||
580 | clear_buffer_dirty(bh); | ||
581 | set_buffer_jbddirty(bh); | ||
582 | } | ||
583 | |||
584 | unlock_buffer(bh); | ||
585 | |||
586 | error = -EROFS; | ||
587 | if (is_handle_aborted(handle)) { | ||
588 | jbd_unlock_bh_state(bh); | ||
589 | goto out; | ||
590 | } | ||
591 | error = 0; | ||
592 | |||
593 | /* | ||
594 | * The buffer is already part of this transaction if b_transaction or | ||
595 | * b_next_transaction points to it | ||
596 | */ | ||
597 | if (jh->b_transaction == transaction || | ||
598 | jh->b_next_transaction == transaction) | ||
599 | goto done; | ||
600 | |||
601 | /* | ||
602 | * this is the first time this transaction is touching this buffer, | ||
603 | * reset the modified flag | ||
604 | */ | ||
605 | jh->b_modified = 0; | ||
606 | |||
607 | /* | ||
608 | * If there is already a copy-out version of this buffer, then we don't | ||
609 | * need to make another one | ||
610 | */ | ||
611 | if (jh->b_frozen_data) { | ||
612 | JBUFFER_TRACE(jh, "has frozen data"); | ||
613 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
614 | jh->b_next_transaction = transaction; | ||
615 | goto done; | ||
616 | } | ||
617 | |||
618 | /* Is there data here we need to preserve? */ | ||
619 | |||
620 | if (jh->b_transaction && jh->b_transaction != transaction) { | ||
621 | JBUFFER_TRACE(jh, "owned by older transaction"); | ||
622 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
623 | J_ASSERT_JH(jh, jh->b_transaction == | ||
624 | journal->j_committing_transaction); | ||
625 | |||
626 | /* There is one case we have to be very careful about. | ||
627 | * If the committing transaction is currently writing | ||
628 | * this buffer out to disk and has NOT made a copy-out, | ||
629 | * then we cannot modify the buffer contents at all | ||
630 | * right now. The essence of copy-out is that it is the | ||
631 | * extra copy, not the primary copy, which gets | ||
632 | * journaled. If the primary copy is already going to | ||
633 | * disk then we cannot do copy-out here. */ | ||
634 | |||
635 | if (jh->b_jlist == BJ_Shadow) { | ||
636 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
637 | wait_queue_head_t *wqh; | ||
638 | |||
639 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
640 | |||
641 | JBUFFER_TRACE(jh, "on shadow: sleep"); | ||
642 | jbd_unlock_bh_state(bh); | ||
643 | /* commit wakes up all shadow buffers after IO */ | ||
644 | for ( ; ; ) { | ||
645 | prepare_to_wait(wqh, &wait.wait, | ||
646 | TASK_UNINTERRUPTIBLE); | ||
647 | if (jh->b_jlist != BJ_Shadow) | ||
648 | break; | ||
649 | schedule(); | ||
650 | } | ||
651 | finish_wait(wqh, &wait.wait); | ||
652 | goto repeat; | ||
653 | } | ||
654 | |||
655 | /* Only do the copy if the currently-owning transaction | ||
656 | * still needs it. If it is on the Forget list, the | ||
657 | * committing transaction is past that stage. The | ||
658 | * buffer had better remain locked during the kmalloc, | ||
659 | * but that should be true --- we hold the journal lock | ||
660 | * still and the buffer is already on the BUF_JOURNAL | ||
661 | * list so won't be flushed. | ||
662 | * | ||
663 | * Subtle point, though: if this is a get_undo_access, | ||
664 | * then we will be relying on the frozen_data to contain | ||
665 | * the new value of the committed_data record after the | ||
666 | * transaction, so we HAVE to force the frozen_data copy | ||
667 | * in that case. */ | ||
668 | |||
669 | if (jh->b_jlist != BJ_Forget || force_copy) { | ||
670 | JBUFFER_TRACE(jh, "generate frozen data"); | ||
671 | if (!frozen_buffer) { | ||
672 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | ||
673 | jbd_unlock_bh_state(bh); | ||
674 | frozen_buffer = | ||
675 | jbd_alloc(jh2bh(jh)->b_size, | ||
676 | GFP_NOFS); | ||
677 | if (!frozen_buffer) { | ||
678 | printk(KERN_ERR | ||
679 | "%s: OOM for frozen_buffer\n", | ||
680 | __func__); | ||
681 | JBUFFER_TRACE(jh, "oom!"); | ||
682 | error = -ENOMEM; | ||
683 | jbd_lock_bh_state(bh); | ||
684 | goto done; | ||
685 | } | ||
686 | goto repeat; | ||
687 | } | ||
688 | jh->b_frozen_data = frozen_buffer; | ||
689 | frozen_buffer = NULL; | ||
690 | need_copy = 1; | ||
691 | } | ||
692 | jh->b_next_transaction = transaction; | ||
693 | } | ||
694 | |||
695 | |||
696 | /* | ||
697 | * Finally, if the buffer is not journaled right now, we need to make | ||
698 | * sure it doesn't get written to disk before the caller actually | ||
699 | * commits the new data | ||
700 | */ | ||
701 | if (!jh->b_transaction) { | ||
702 | JBUFFER_TRACE(jh, "no transaction"); | ||
703 | J_ASSERT_JH(jh, !jh->b_next_transaction); | ||
704 | JBUFFER_TRACE(jh, "file as BJ_Reserved"); | ||
705 | spin_lock(&journal->j_list_lock); | ||
706 | __journal_file_buffer(jh, transaction, BJ_Reserved); | ||
707 | spin_unlock(&journal->j_list_lock); | ||
708 | } | ||
709 | |||
710 | done: | ||
711 | if (need_copy) { | ||
712 | struct page *page; | ||
713 | int offset; | ||
714 | char *source; | ||
715 | |||
716 | J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), | ||
717 | "Possible IO failure.\n"); | ||
718 | page = jh2bh(jh)->b_page; | ||
719 | offset = offset_in_page(jh2bh(jh)->b_data); | ||
720 | source = kmap_atomic(page); | ||
721 | memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); | ||
722 | kunmap_atomic(source); | ||
723 | } | ||
724 | jbd_unlock_bh_state(bh); | ||
725 | |||
726 | /* | ||
727 | * If we are about to journal a buffer, then any revoke pending on it is | ||
728 | * no longer valid | ||
729 | */ | ||
730 | journal_cancel_revoke(handle, jh); | ||
731 | |||
732 | out: | ||
733 | if (unlikely(frozen_buffer)) /* It's usually NULL */ | ||
734 | jbd_free(frozen_buffer, bh->b_size); | ||
735 | |||
736 | JBUFFER_TRACE(jh, "exit"); | ||
737 | return error; | ||
738 | } | ||
739 | |||
740 | /** | ||
741 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. | ||
742 | * @handle: transaction to add buffer modifications to | ||
743 | * @bh: bh to be used for metadata writes | ||
744 | * | ||
745 | * Returns an error code or 0 on success. | ||
746 | * | ||
747 | * In full data journalling mode the buffer may be of type BJ_AsyncData, | ||
748 | * because we're write()ing a buffer which is also part of a shared mapping. | ||
749 | */ | ||
750 | |||
751 | int journal_get_write_access(handle_t *handle, struct buffer_head *bh) | ||
752 | { | ||
753 | struct journal_head *jh = journal_add_journal_head(bh); | ||
754 | int rc; | ||
755 | |||
756 | /* We do not want to get caught playing with fields which the | ||
757 | * log thread also manipulates. Make sure that the buffer | ||
758 | * completes any outstanding IO before proceeding. */ | ||
759 | rc = do_get_write_access(handle, jh, 0); | ||
760 | journal_put_journal_head(jh); | ||
761 | return rc; | ||
762 | } | ||
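
A minimal caller sketch (not part of the removed file): this is how a filesystem would typically bracket a metadata update between journal_get_write_access() and journal_dirty_metadata(). The helper name, the one-credit reservation and the counter layout are invented for illustration.

    #include <linux/jbd.h>
    #include <linux/buffer_head.h>

    static int update_counter_block(journal_t *journal, struct buffer_head *bh)
    {
            handle_t *handle = journal_start(journal, 1);   /* reserve one buffer credit */
            int err;

            if (IS_ERR(handle))
                    return PTR_ERR(handle);
            err = journal_get_write_access(handle, bh);     /* declare intent before modifying */
            if (!err) {
                    *(__le32 *)bh->b_data = cpu_to_le32(42);  /* the actual metadata change */
                    err = journal_dirty_metadata(handle, bh); /* file it on this transaction */
            }
            journal_stop(handle);
            return err;
    }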
763 | |||
764 | |||
765 | /* | ||
766 | * When the user wants to journal a newly created buffer_head | ||
767 | * (ie. getblk() returned a new buffer and we are going to populate it | ||
768 | * manually rather than reading off disk), then we need to keep the | ||
769 | * buffer_head locked until it has been completely filled with new | ||
770 | * data. In this case, we should be able to make the assertion that | ||
771 | * the bh is not already part of an existing transaction. | ||
772 | * | ||
773 | * The buffer should already be locked by the caller by this point. | ||
774 | * There is no lock ranking violation: it was a newly created, | ||
775 | * unlocked buffer beforehand. */ | ||
776 | |||
777 | /** | ||
778 | * int journal_get_create_access () - notify intent to use newly created bh | ||
779 | * @handle: transaction to add the new buffer to | ||
780 | * @bh: new buffer. | ||
781 | * | ||
782 | * Call this if you create a new bh. | ||
783 | */ | ||
784 | int journal_get_create_access(handle_t *handle, struct buffer_head *bh) | ||
785 | { | ||
786 | transaction_t *transaction = handle->h_transaction; | ||
787 | journal_t *journal = transaction->t_journal; | ||
788 | struct journal_head *jh = journal_add_journal_head(bh); | ||
789 | int err; | ||
790 | |||
791 | jbd_debug(5, "journal_head %p\n", jh); | ||
792 | err = -EROFS; | ||
793 | if (is_handle_aborted(handle)) | ||
794 | goto out; | ||
795 | err = 0; | ||
796 | |||
797 | JBUFFER_TRACE(jh, "entry"); | ||
798 | /* | ||
799 | * The buffer may already belong to this transaction due to pre-zeroing | ||
800 | * in the filesystem's new_block code. It may also be on the previous, | ||
801 | * committing transaction's lists, but it HAS to be in Forget state in | ||
802 | * that case: the transaction must have deleted the buffer for it to be | ||
803 | * reused here. | ||
804 | */ | ||
805 | jbd_lock_bh_state(bh); | ||
806 | spin_lock(&journal->j_list_lock); | ||
807 | J_ASSERT_JH(jh, (jh->b_transaction == transaction || | ||
808 | jh->b_transaction == NULL || | ||
809 | (jh->b_transaction == journal->j_committing_transaction && | ||
810 | jh->b_jlist == BJ_Forget))); | ||
811 | |||
812 | J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
813 | J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); | ||
814 | |||
815 | if (jh->b_transaction == NULL) { | ||
816 | /* | ||
817 | * Previous journal_forget() could have left the buffer | ||
818 | * with jbddirty bit set because it was being committed. When | ||
819 | * the commit finished, we filed the buffer for | ||
820 | * checkpointing and marked it dirty. Now we are reallocating | ||
821 | * the buffer so the transaction freeing it must have | ||
822 | * committed and so it's safe to clear the dirty bit. | ||
823 | */ | ||
824 | clear_buffer_dirty(jh2bh(jh)); | ||
825 | |||
826 | /* first access by this transaction */ | ||
827 | jh->b_modified = 0; | ||
828 | |||
829 | JBUFFER_TRACE(jh, "file as BJ_Reserved"); | ||
830 | __journal_file_buffer(jh, transaction, BJ_Reserved); | ||
831 | } else if (jh->b_transaction == journal->j_committing_transaction) { | ||
832 | /* first access by this transaction */ | ||
833 | jh->b_modified = 0; | ||
834 | |||
835 | JBUFFER_TRACE(jh, "set next transaction"); | ||
836 | jh->b_next_transaction = transaction; | ||
837 | } | ||
838 | spin_unlock(&journal->j_list_lock); | ||
839 | jbd_unlock_bh_state(bh); | ||
840 | |||
841 | /* | ||
842 | * akpm: I added this. ext3_alloc_branch can pick up new indirect | ||
843 | * blocks which contain freed but then revoked metadata. We need | ||
844 | * to cancel the revoke in case we end up freeing it yet again | ||
845 | * and then reallocating it as data - this would cause a second revoke, | ||
846 | * which hits an assertion error. | ||
847 | */ | ||
848 | JBUFFER_TRACE(jh, "cancelling revoke"); | ||
849 | journal_cancel_revoke(handle, jh); | ||
850 | out: | ||
851 | journal_put_journal_head(jh); | ||
852 | return err; | ||
853 | } | ||
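
A hedged sketch (not from the removed file) of the newly-created-buffer case described above: the block number comes from a hypothetical allocator, and the buffer is kept locked while it is filled, matching the buffer_locked() assertion in journal_get_create_access().

    static int init_new_metadata_block(handle_t *handle, struct super_block *sb,
                                       sector_t blocknr)
    {
            struct buffer_head *bh = sb_getblk(sb, blocknr);
            int err;

            if (!bh)
                    return -ENOMEM;
            lock_buffer(bh);                        /* stays locked until fully filled */
            err = journal_get_create_access(handle, bh);
            if (!err) {
                    memset(bh->b_data, 0, bh->b_size);
                    set_buffer_uptodate(bh);
            }
            unlock_buffer(bh);
            if (!err)
                    err = journal_dirty_metadata(handle, bh);
            brelse(bh);
            return err;
    }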
854 | |||
855 | /** | ||
856 | * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences | ||
857 | * @handle: transaction | ||
858 | * @bh: buffer to undo | ||
859 | * | ||
860 | * Sometimes there is a need to distinguish between metadata which has | ||
861 | * been committed to disk and that which has not. The ext3fs code uses | ||
862 | * this for freeing and allocating space, we have to make sure that we | ||
863 | * do not reuse freed space until the deallocation has been committed, | ||
864 | * since if we overwrote that space we would make the delete | ||
865 | * un-rewindable in case of a crash. | ||
866 | * | ||
867 | * To deal with that, journal_get_undo_access requests write access to a | ||
868 | * buffer for parts of non-rewindable operations such as delete | ||
869 | * operations on the bitmaps. The journaling code must keep a copy of | ||
870 | * the buffer's contents prior to the undo_access call until such time | ||
871 | * as we know that the buffer has definitely been committed to disk. | ||
872 | * | ||
873 | * We never need to know which transaction the committed data is part | ||
874 | * of: buffers touched here are guaranteed to be dirtied later and so | ||
875 | * will be committed to a new transaction in due course, at which point | ||
876 | * we can discard the old committed data pointer. | ||
877 | * | ||
878 | * Returns error number or 0 on success. | ||
879 | */ | ||
880 | int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) | ||
881 | { | ||
882 | int err; | ||
883 | struct journal_head *jh = journal_add_journal_head(bh); | ||
884 | char *committed_data = NULL; | ||
885 | |||
886 | JBUFFER_TRACE(jh, "entry"); | ||
887 | |||
888 | /* | ||
889 | * Do this first --- it can drop the journal lock, so we want to | ||
890 | * make sure that obtaining the committed_data is done | ||
891 | * atomically wrt. completion of any outstanding commits. | ||
892 | */ | ||
893 | err = do_get_write_access(handle, jh, 1); | ||
894 | if (err) | ||
895 | goto out; | ||
896 | |||
897 | repeat: | ||
898 | if (!jh->b_committed_data) { | ||
899 | committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); | ||
900 | if (!committed_data) { | ||
901 | printk(KERN_ERR "%s: No memory for committed data\n", | ||
902 | __func__); | ||
903 | err = -ENOMEM; | ||
904 | goto out; | ||
905 | } | ||
906 | } | ||
907 | |||
908 | jbd_lock_bh_state(bh); | ||
909 | if (!jh->b_committed_data) { | ||
910 | /* Copy out the current buffer contents into the | ||
911 | * preserved, committed copy. */ | ||
912 | JBUFFER_TRACE(jh, "generate b_committed data"); | ||
913 | if (!committed_data) { | ||
914 | jbd_unlock_bh_state(bh); | ||
915 | goto repeat; | ||
916 | } | ||
917 | |||
918 | jh->b_committed_data = committed_data; | ||
919 | committed_data = NULL; | ||
920 | memcpy(jh->b_committed_data, bh->b_data, bh->b_size); | ||
921 | } | ||
922 | jbd_unlock_bh_state(bh); | ||
923 | out: | ||
924 | journal_put_journal_head(jh); | ||
925 | if (unlikely(committed_data)) | ||
926 | jbd_free(committed_data, bh->b_size); | ||
927 | return err; | ||
928 | } | ||
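
Illustrative only (not part of the removed file): clearing a bit in an allocation bitmap is the classic undo-access case. Real filesystems use locked, endian-aware bit helpers; the plain clear_bit() here is a simplification.

    static int free_bitmap_bit(handle_t *handle, struct buffer_head *bitmap_bh,
                               int bit)
    {
            int err = journal_get_undo_access(handle, bitmap_bh);   /* preserves the pre-image */

            if (err)
                    return err;
            clear_bit(bit, (unsigned long *)bitmap_bh->b_data);     /* the non-rewindable change */
            return journal_dirty_metadata(handle, bitmap_bh);
    }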
929 | |||
930 | /** | ||
931 | * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed | ||
932 | * @handle: transaction | ||
933 | * @bh: bufferhead to mark | ||
934 | * | ||
935 | * Description: | ||
936 | * Mark a buffer as containing dirty data which needs to be flushed before | ||
937 | * we can commit the current transaction. | ||
938 | * | ||
939 | * The buffer is placed on the transaction's data list and is marked as | ||
940 | * belonging to the transaction. | ||
941 | * | ||
942 | * Returns error number or 0 on success. | ||
943 | * | ||
944 | * journal_dirty_data() can be called via page_launder->ext3_writepage | ||
945 | * by kswapd. | ||
946 | */ | ||
947 | int journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
948 | { | ||
949 | journal_t *journal = handle->h_transaction->t_journal; | ||
950 | int need_brelse = 0; | ||
951 | struct journal_head *jh; | ||
952 | int ret = 0; | ||
953 | |||
954 | if (is_handle_aborted(handle)) | ||
955 | return ret; | ||
956 | |||
957 | jh = journal_add_journal_head(bh); | ||
958 | JBUFFER_TRACE(jh, "entry"); | ||
959 | |||
960 | /* | ||
961 | * The buffer could *already* be dirty. Writeout can start | ||
962 | * at any time. | ||
963 | */ | ||
964 | jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | ||
965 | |||
966 | /* | ||
967 | * What if the buffer is already part of a running transaction? | ||
968 | * | ||
969 | * There are two cases: | ||
970 | * 1) It is part of the current running transaction. Refile it, | ||
971 | * just in case we have allocated it as metadata, deallocated | ||
972 | * it, then reallocated it as data. | ||
973 | * 2) It is part of the previous, still-committing transaction. | ||
974 | * If all we want to do is to guarantee that the buffer will be | ||
975 | * written to disk before this new transaction commits, then | ||
976 | * being sure that the *previous* transaction has this same | ||
977 | * property is sufficient for us! Just leave it on its old | ||
978 | * transaction. | ||
979 | * | ||
980 | * In case (2), the buffer must not already exist as metadata | ||
981 | * --- that would violate write ordering (a transaction is free | ||
982 | * to write its data at any point, even before the previous | ||
983 | * committing transaction has committed). The caller must | ||
984 | * never, ever allow this to happen: there's nothing we can do | ||
985 | * about it in this layer. | ||
986 | */ | ||
987 | jbd_lock_bh_state(bh); | ||
988 | spin_lock(&journal->j_list_lock); | ||
989 | |||
990 | /* Now that we have bh_state locked, are we really still mapped? */ | ||
991 | if (!buffer_mapped(bh)) { | ||
992 | JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); | ||
993 | goto no_journal; | ||
994 | } | ||
995 | |||
996 | if (jh->b_transaction) { | ||
997 | JBUFFER_TRACE(jh, "has transaction"); | ||
998 | if (jh->b_transaction != handle->h_transaction) { | ||
999 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
1000 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1001 | journal->j_committing_transaction); | ||
1002 | |||
1003 | /* @@@ IS THIS TRUE ? */ | ||
1004 | /* | ||
1005 | * Not any more. Scenario: someone does a write() | ||
1006 | * in data=journal mode. The buffer's transaction has | ||
1007 | * moved into commit. Then someone does another | ||
1008 | * write() to the file. We do the frozen data copyout | ||
1009 | * and set b_next_transaction to point to j_running_t. | ||
1010 | * And while we're in that state, someone does a | ||
1011 | * writepage() in an attempt to pageout the same area | ||
1012 | * of the file via a shared mapping. At present that | ||
1013 | * calls journal_dirty_data(), and we get right here. | ||
1014 | * It may be too late to journal the data. Simply | ||
1015 | * falling through to the next test will suffice: the | ||
1016 | * data will be dirty and will be checkpointed. The | ||
1017 | * ordering comments in the next comment block still | ||
1018 | * apply. | ||
1019 | */ | ||
1020 | //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
1021 | |||
1022 | /* | ||
1023 | * If we're journalling data, and this buffer was | ||
1024 | * subject to a write(), it could be metadata, forget | ||
1025 | * or shadow against the committing transaction. Now, | ||
1026 | * someone has dirtied the same darn page via a mapping | ||
1027 | * and it is being writepage()'d. | ||
1028 | * We *could* just steal the page from commit, with some | ||
1029 | * fancy locking there. Instead, we just skip it - | ||
1030 | * don't tie the page's buffers to the new transaction | ||
1031 | * at all. | ||
1032 | * Implication: if we crash before the writepage() data | ||
1033 | * is written into the filesystem, recovery will replay | ||
1034 | * the write() data. | ||
1035 | */ | ||
1036 | if (jh->b_jlist != BJ_None && | ||
1037 | jh->b_jlist != BJ_SyncData && | ||
1038 | jh->b_jlist != BJ_Locked) { | ||
1039 | JBUFFER_TRACE(jh, "Not stealing"); | ||
1040 | goto no_journal; | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * This buffer may be undergoing writeout in commit. We | ||
1045 | * can't return from here and let the caller dirty it | ||
1046 | * again because that can cause the write-out loop in | ||
1047 | * commit to never terminate. | ||
1048 | */ | ||
1049 | if (buffer_dirty(bh)) { | ||
1050 | get_bh(bh); | ||
1051 | spin_unlock(&journal->j_list_lock); | ||
1052 | jbd_unlock_bh_state(bh); | ||
1053 | need_brelse = 1; | ||
1054 | sync_dirty_buffer(bh); | ||
1055 | jbd_lock_bh_state(bh); | ||
1056 | spin_lock(&journal->j_list_lock); | ||
1057 | /* Since we dropped the lock... */ | ||
1058 | if (!buffer_mapped(bh)) { | ||
1059 | JBUFFER_TRACE(jh, "buffer got unmapped"); | ||
1060 | goto no_journal; | ||
1061 | } | ||
1062 | /* The buffer may become locked again at any | ||
1063 | time if it is redirtied */ | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * We cannot remove the buffer with io error from the | ||
1068 | * committing transaction, because otherwise it would | ||
1069 | * miss the error and the commit would not abort. | ||
1070 | */ | ||
1071 | if (unlikely(!buffer_uptodate(bh))) { | ||
1072 | ret = -EIO; | ||
1073 | goto no_journal; | ||
1074 | } | ||
1075 | /* We might have slept so buffer could be refiled now */ | ||
1076 | if (jh->b_transaction != NULL && | ||
1077 | jh->b_transaction != handle->h_transaction) { | ||
1078 | JBUFFER_TRACE(jh, "unfile from commit"); | ||
1079 | __journal_temp_unlink_buffer(jh); | ||
1080 | /* It still points to the committing | ||
1081 | * transaction; move it to this one so | ||
1082 | * that the refile assert checks are | ||
1083 | * happy. */ | ||
1084 | jh->b_transaction = handle->h_transaction; | ||
1085 | } | ||
1086 | /* The buffer will be refiled below */ | ||
1087 | |||
1088 | } | ||
1089 | /* | ||
1090 | * Special case --- the buffer might actually have been | ||
1091 | * allocated and then immediately deallocated in the previous, | ||
1092 | * committing transaction, so might still be left on that | ||
1093 | * transaction's metadata lists. | ||
1094 | */ | ||
1095 | if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | ||
1096 | JBUFFER_TRACE(jh, "not on correct data list: unfile"); | ||
1097 | J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | ||
1098 | JBUFFER_TRACE(jh, "file as data"); | ||
1099 | __journal_file_buffer(jh, handle->h_transaction, | ||
1100 | BJ_SyncData); | ||
1101 | } | ||
1102 | } else { | ||
1103 | JBUFFER_TRACE(jh, "not on a transaction"); | ||
1104 | __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | ||
1105 | } | ||
1106 | no_journal: | ||
1107 | spin_unlock(&journal->j_list_lock); | ||
1108 | jbd_unlock_bh_state(bh); | ||
1109 | if (need_brelse) { | ||
1110 | BUFFER_TRACE(bh, "brelse"); | ||
1111 | __brelse(bh); | ||
1112 | } | ||
1113 | JBUFFER_TRACE(jh, "exit"); | ||
1114 | journal_put_journal_head(jh); | ||
1115 | return ret; | ||
1116 | } | ||
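
A rough sketch (not from the removed file) of an ordered-mode writepage-style caller: each buffer of the page is filed as sync data so it is written out before the transaction commits. The helper is hypothetical and simplified; ext3 performs this pass through walk_page_buffers().

    static int order_page_buffers(handle_t *handle, struct page *page)
    {
            struct buffer_head *head, *bh;
            int err = 0;

            if (!page_has_buffers(page))
                    return 0;
            bh = head = page_buffers(page);
            do {
                    err = journal_dirty_data(handle, bh);   /* flushed before commit */
                    if (err)
                            break;
                    bh = bh->b_this_page;
            } while (bh != head);
            return err;
    }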
1117 | |||
1118 | /** | ||
1119 | * int journal_dirty_metadata() - mark a buffer as containing dirty metadata | ||
1120 | * @handle: transaction to add buffer to. | ||
1121 | * @bh: buffer to mark | ||
1122 | * | ||
1123 | * Mark dirty metadata which needs to be journaled as part of the current | ||
1124 | * transaction. | ||
1125 | * | ||
1126 | * The buffer is placed on the transaction's metadata list and is marked | ||
1127 | * as belonging to the transaction. | ||
1128 | * | ||
1129 | * Returns error number or 0 on success. | ||
1130 | * | ||
1131 | * Special care needs to be taken if the buffer already belongs to the | ||
1132 | * current committing transaction (in which case we should have frozen | ||
1133 | * data present for that commit). In that case, we don't relink the | ||
1134 | * buffer: that only gets done when the old transaction finally | ||
1135 | * completes its commit. | ||
1136 | */ | ||
1137 | int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | ||
1138 | { | ||
1139 | transaction_t *transaction = handle->h_transaction; | ||
1140 | journal_t *journal = transaction->t_journal; | ||
1141 | struct journal_head *jh = bh2jh(bh); | ||
1142 | |||
1143 | jbd_debug(5, "journal_head %p\n", jh); | ||
1144 | JBUFFER_TRACE(jh, "entry"); | ||
1145 | if (is_handle_aborted(handle)) | ||
1146 | goto out; | ||
1147 | |||
1148 | jbd_lock_bh_state(bh); | ||
1149 | |||
1150 | if (jh->b_modified == 0) { | ||
1151 | /* | ||
1152 | * This buffer has been modified and is becoming part | ||
1153 | * of the transaction. This needs to be done | ||
1154 | * once per transaction -bzzz | ||
1155 | */ | ||
1156 | jh->b_modified = 1; | ||
1157 | J_ASSERT_JH(jh, handle->h_buffer_credits > 0); | ||
1158 | handle->h_buffer_credits--; | ||
1159 | } | ||
1160 | |||
1161 | /* | ||
1162 | * fastpath, to avoid expensive locking. If this buffer is already | ||
1163 | * on the running transaction's metadata list there is nothing to do. | ||
1164 | * Nobody can take it off again because there is a handle open. | ||
1165 | * I _think_ we're OK here with SMP barriers - a mistaken decision will | ||
1166 | * result in this test being false, so we go in and take the locks. | ||
1167 | */ | ||
1168 | if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { | ||
1169 | JBUFFER_TRACE(jh, "fastpath"); | ||
1170 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1171 | journal->j_running_transaction); | ||
1172 | goto out_unlock_bh; | ||
1173 | } | ||
1174 | |||
1175 | set_buffer_jbddirty(bh); | ||
1176 | |||
1177 | /* | ||
1178 | * Metadata already on the current transaction list doesn't | ||
1179 | * need to be filed. Metadata on another transaction's list must | ||
1180 | * be committing, and will be refiled once the commit completes: | ||
1181 | * leave it alone for now. | ||
1182 | */ | ||
1183 | if (jh->b_transaction != transaction) { | ||
1184 | JBUFFER_TRACE(jh, "already on other transaction"); | ||
1185 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1186 | journal->j_committing_transaction); | ||
1187 | J_ASSERT_JH(jh, jh->b_next_transaction == transaction); | ||
1188 | /* And this case is illegal: we can't reuse another | ||
1189 | * transaction's data buffer, ever. */ | ||
1190 | goto out_unlock_bh; | ||
1191 | } | ||
1192 | |||
1193 | /* That test should have eliminated the following case: */ | ||
1194 | J_ASSERT_JH(jh, jh->b_frozen_data == NULL); | ||
1195 | |||
1196 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); | ||
1197 | spin_lock(&journal->j_list_lock); | ||
1198 | __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); | ||
1199 | spin_unlock(&journal->j_list_lock); | ||
1200 | out_unlock_bh: | ||
1201 | jbd_unlock_bh_state(bh); | ||
1202 | out: | ||
1203 | JBUFFER_TRACE(jh, "exit"); | ||
1204 | return 0; | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * journal_release_buffer: undo a get_write_access without any buffer | ||
1209 | * updates, if the update decided in the end that it didn't need access. | ||
1210 | * | ||
1211 | */ | ||
1212 | void | ||
1213 | journal_release_buffer(handle_t *handle, struct buffer_head *bh) | ||
1214 | { | ||
1215 | BUFFER_TRACE(bh, "entry"); | ||
1216 | } | ||
1217 | |||
1218 | /** | ||
1219 | * void journal_forget() - bforget() for potentially-journaled buffers. | ||
1220 | * @handle: transaction handle | ||
1221 | * @bh: bh to 'forget' | ||
1222 | * | ||
1223 | * We can only do the bforget if there are no commits pending against the | ||
1224 | * buffer. If the buffer is dirty in the current running transaction we | ||
1225 | * can safely unlink it. | ||
1226 | * | ||
1227 | * bh may not be a journalled buffer at all - it may be a non-JBD | ||
1228 | * buffer which came off the hashtable. Check for this. | ||
1229 | * | ||
1230 | * Decrements bh->b_count by one. | ||
1231 | * | ||
1232 | * Allow this call even if the handle has aborted --- it may be part of | ||
1233 | * the caller's cleanup after an abort. | ||
1234 | */ | ||
1235 | int journal_forget (handle_t *handle, struct buffer_head *bh) | ||
1236 | { | ||
1237 | transaction_t *transaction = handle->h_transaction; | ||
1238 | journal_t *journal = transaction->t_journal; | ||
1239 | struct journal_head *jh; | ||
1240 | int drop_reserve = 0; | ||
1241 | int err = 0; | ||
1242 | int was_modified = 0; | ||
1243 | |||
1244 | BUFFER_TRACE(bh, "entry"); | ||
1245 | |||
1246 | jbd_lock_bh_state(bh); | ||
1247 | spin_lock(&journal->j_list_lock); | ||
1248 | |||
1249 | if (!buffer_jbd(bh)) | ||
1250 | goto not_jbd; | ||
1251 | jh = bh2jh(bh); | ||
1252 | |||
1253 | /* Critical error: attempting to delete a bitmap buffer, maybe? | ||
1254 | * Don't do any jbd operations, and return an error. */ | ||
1255 | if (!J_EXPECT_JH(jh, !jh->b_committed_data, | ||
1256 | "inconsistent data on disk")) { | ||
1257 | err = -EIO; | ||
1258 | goto not_jbd; | ||
1259 | } | ||
1260 | |||
1261 | /* keep track of whether or not this transaction modified us */ | ||
1262 | was_modified = jh->b_modified; | ||
1263 | |||
1264 | /* | ||
1265 | * The buffer's going from the transaction, we must drop | ||
1266 | * all references -bzzz | ||
1267 | */ | ||
1268 | jh->b_modified = 0; | ||
1269 | |||
1270 | if (jh->b_transaction == handle->h_transaction) { | ||
1271 | J_ASSERT_JH(jh, !jh->b_frozen_data); | ||
1272 | |||
1273 | /* If we are forgetting a buffer which is already part | ||
1274 | * of this transaction, then we can just drop it from | ||
1275 | * the transaction immediately. */ | ||
1276 | clear_buffer_dirty(bh); | ||
1277 | clear_buffer_jbddirty(bh); | ||
1278 | |||
1279 | JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); | ||
1280 | |||
1281 | /* | ||
1282 | * we only want to drop a reference if this transaction | ||
1283 | * modified the buffer | ||
1284 | */ | ||
1285 | if (was_modified) | ||
1286 | drop_reserve = 1; | ||
1287 | |||
1288 | /* | ||
1289 | * We are no longer going to journal this buffer. | ||
1290 | * However, the commit of this transaction is still | ||
1291 | * important to the buffer: the delete that we are now | ||
1292 | * processing might obsolete an old log entry, so by | ||
1293 | * committing, we can satisfy the buffer's checkpoint. | ||
1294 | * | ||
1295 | * So, if we have a checkpoint on the buffer, we should | ||
1296 | * now refile the buffer on our BJ_Forget list so that | ||
1297 | * we know to remove the checkpoint after we commit. | ||
1298 | */ | ||
1299 | |||
1300 | if (jh->b_cp_transaction) { | ||
1301 | __journal_temp_unlink_buffer(jh); | ||
1302 | __journal_file_buffer(jh, transaction, BJ_Forget); | ||
1303 | } else { | ||
1304 | __journal_unfile_buffer(jh); | ||
1305 | if (!buffer_jbd(bh)) { | ||
1306 | spin_unlock(&journal->j_list_lock); | ||
1307 | jbd_unlock_bh_state(bh); | ||
1308 | __bforget(bh); | ||
1309 | goto drop; | ||
1310 | } | ||
1311 | } | ||
1312 | } else if (jh->b_transaction) { | ||
1313 | J_ASSERT_JH(jh, (jh->b_transaction == | ||
1314 | journal->j_committing_transaction)); | ||
1315 | /* However, if the buffer is still owned by a prior | ||
1316 | * (committing) transaction, we can't drop it yet... */ | ||
1317 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
1318 | /* ... but we CAN drop it from the new transaction if we | ||
1319 | * have also modified it since the original commit. */ | ||
1320 | |||
1321 | if (jh->b_next_transaction) { | ||
1322 | J_ASSERT(jh->b_next_transaction == transaction); | ||
1323 | jh->b_next_transaction = NULL; | ||
1324 | |||
1325 | /* | ||
1326 | * only drop a reference if this transaction modified | ||
1327 | * the buffer | ||
1328 | */ | ||
1329 | if (was_modified) | ||
1330 | drop_reserve = 1; | ||
1331 | } | ||
1332 | } | ||
1333 | |||
1334 | not_jbd: | ||
1335 | spin_unlock(&journal->j_list_lock); | ||
1336 | jbd_unlock_bh_state(bh); | ||
1337 | __brelse(bh); | ||
1338 | drop: | ||
1339 | if (drop_reserve) { | ||
1340 | /* no need to reserve log space for this block -bzzz */ | ||
1341 | handle->h_buffer_credits++; | ||
1342 | } | ||
1343 | return err; | ||
1344 | } | ||
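
A hedged sketch (not part of the removed file) of the forget-or-revoke decision a filesystem makes when it frees a block; in this JBD version journal_revoke() consumes a supplied bh reference itself, so the two branches are mutually exclusive. The helper and the is_metadata flag are illustrative only.

    static int forget_or_revoke_block(handle_t *handle, int is_metadata,
                                      struct buffer_head *bh, unsigned long blocknr)
    {
            if (!is_metadata)
                    /* Hand our bh reference to journal_forget() instead of brelse(). */
                    return bh ? journal_forget(handle, bh) : 0;

            /* Metadata: revoke the block so a stale copy in the log is never replayed. */
            return journal_revoke(handle, blocknr, bh);
    }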
1345 | |||
1346 | /** | ||
1347 | * int journal_stop() - complete a transaction | ||
1348 | * @handle: transaction to complete. | ||
1349 | * | ||
1350 | * All done for a particular handle. | ||
1351 | * | ||
1352 | * There is not much action needed here. We just return any remaining | ||
1353 | * buffer credits to the transaction and remove the handle. The only | ||
1354 | * complication is that we need to start a commit operation if the | ||
1355 | * filesystem is marked for synchronous update. | ||
1356 | * | ||
1357 | * journal_stop itself will not usually return an error, but it may | ||
1358 | * do so in unusual circumstances. In particular, expect it to | ||
1359 | * return -EIO if a journal_abort has been executed since the | ||
1360 | * transaction began. | ||
1361 | */ | ||
1362 | int journal_stop(handle_t *handle) | ||
1363 | { | ||
1364 | transaction_t *transaction = handle->h_transaction; | ||
1365 | journal_t *journal = transaction->t_journal; | ||
1366 | int err; | ||
1367 | pid_t pid; | ||
1368 | |||
1369 | J_ASSERT(journal_current_handle() == handle); | ||
1370 | |||
1371 | if (is_handle_aborted(handle)) | ||
1372 | err = -EIO; | ||
1373 | else { | ||
1374 | J_ASSERT(transaction->t_updates > 0); | ||
1375 | err = 0; | ||
1376 | } | ||
1377 | |||
1378 | if (--handle->h_ref > 0) { | ||
1379 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, | ||
1380 | handle->h_ref); | ||
1381 | return err; | ||
1382 | } | ||
1383 | |||
1384 | jbd_debug(4, "Handle %p going down\n", handle); | ||
1385 | |||
1386 | /* | ||
1387 | * Implement synchronous transaction batching. If the handle | ||
1388 | * was synchronous, don't force a commit immediately. Let's | ||
1389 | * yield and let another thread piggyback onto this transaction. | ||
1390 | * Keep doing that while new threads continue to arrive. | ||
1391 | * It doesn't cost much - we're about to run a commit and sleep | ||
1392 | * on IO anyway. Speeds up many-threaded, many-dir operations | ||
1393 | * by 30x or more... | ||
1394 | * | ||
1395 | * We try and optimize the sleep time against what the underlying disk | ||
1396 | * can do, instead of having a static sleep time. This is useful for | ||
1397 | * the case where our storage is so fast that it is more optimal to go | ||
1398 | * ahead and force a flush and wait for the transaction to be committed | ||
1399 | * than it is to wait for an arbitrary amount of time for new writers to | ||
1400 | * join the transaction. We achieve this by measuring how long it takes | ||
1401 | * to commit a transaction, and compare it with how long this | ||
1402 | * transaction has been running, and if run time < commit time then we | ||
1403 | * sleep for the delta and commit. This greatly helps super fast disks | ||
1404 | * that would see slowdowns as more threads started doing fsyncs. | ||
1405 | * | ||
1406 | * But don't do this if this process was the most recent one to | ||
1407 | * perform a synchronous write. We do this to detect the case where a | ||
1408 | * single process is doing a stream of sync writes. No point in waiting | ||
1409 | * for joiners in that case. | ||
1410 | */ | ||
1411 | pid = current->pid; | ||
1412 | if (handle->h_sync && journal->j_last_sync_writer != pid) { | ||
1413 | u64 commit_time, trans_time; | ||
1414 | |||
1415 | journal->j_last_sync_writer = pid; | ||
1416 | |||
1417 | spin_lock(&journal->j_state_lock); | ||
1418 | commit_time = journal->j_average_commit_time; | ||
1419 | spin_unlock(&journal->j_state_lock); | ||
1420 | |||
1421 | trans_time = ktime_to_ns(ktime_sub(ktime_get(), | ||
1422 | transaction->t_start_time)); | ||
1423 | |||
1424 | commit_time = min_t(u64, commit_time, | ||
1425 | 1000*jiffies_to_usecs(1)); | ||
1426 | |||
1427 | if (trans_time < commit_time) { | ||
1428 | ktime_t expires = ktime_add_ns(ktime_get(), | ||
1429 | commit_time); | ||
1430 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1431 | schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); | ||
1432 | } | ||
1433 | } | ||
1434 | |||
1435 | current->journal_info = NULL; | ||
1436 | spin_lock(&journal->j_state_lock); | ||
1437 | spin_lock(&transaction->t_handle_lock); | ||
1438 | transaction->t_outstanding_credits -= handle->h_buffer_credits; | ||
1439 | transaction->t_updates--; | ||
1440 | if (!transaction->t_updates) { | ||
1441 | wake_up(&journal->j_wait_updates); | ||
1442 | if (journal->j_barrier_count) | ||
1443 | wake_up(&journal->j_wait_transaction_locked); | ||
1444 | } | ||
1445 | |||
1446 | /* | ||
1447 | * If the handle is marked SYNC, we need to set another commit | ||
1448 | * going! We also want to force a commit if the current | ||
1449 | * transaction is occupying too much of the log, or if the | ||
1450 | * transaction is too old now. | ||
1451 | */ | ||
1452 | if (handle->h_sync || | ||
1453 | transaction->t_outstanding_credits > | ||
1454 | journal->j_max_transaction_buffers || | ||
1455 | time_after_eq(jiffies, transaction->t_expires)) { | ||
1456 | /* Do this even for aborted journals: an abort still | ||
1457 | * completes the commit thread, it just doesn't write | ||
1458 | * anything to disk. */ | ||
1459 | tid_t tid = transaction->t_tid; | ||
1460 | |||
1461 | spin_unlock(&transaction->t_handle_lock); | ||
1462 | jbd_debug(2, "transaction too old, requesting commit for " | ||
1463 | "handle %p\n", handle); | ||
1464 | /* This is non-blocking */ | ||
1465 | __log_start_commit(journal, transaction->t_tid); | ||
1466 | spin_unlock(&journal->j_state_lock); | ||
1467 | |||
1468 | /* | ||
1469 | * Special case: JFS_SYNC synchronous updates require us | ||
1470 | * to wait for the commit to complete. | ||
1471 | */ | ||
1472 | if (handle->h_sync && !(current->flags & PF_MEMALLOC)) | ||
1473 | err = log_wait_commit(journal, tid); | ||
1474 | } else { | ||
1475 | spin_unlock(&transaction->t_handle_lock); | ||
1476 | spin_unlock(&journal->j_state_lock); | ||
1477 | } | ||
1478 | |||
1479 | lock_map_release(&handle->h_lockdep_map); | ||
1480 | |||
1481 | jbd_free_handle(handle); | ||
1482 | return err; | ||
1483 | } | ||
1484 | |||
1485 | /** | ||
1486 | * int journal_force_commit() - force any uncommitted transactions | ||
1487 | * @journal: journal to force | ||
1488 | * | ||
1489 | * For synchronous operations: force any uncommitted transactions | ||
1490 | * to disk. May seem kludgy, but it reuses all the handle batching | ||
1491 | * code in a very simple manner. | ||
1492 | */ | ||
1493 | int journal_force_commit(journal_t *journal) | ||
1494 | { | ||
1495 | handle_t *handle; | ||
1496 | int ret; | ||
1497 | |||
1498 | handle = journal_start(journal, 1); | ||
1499 | if (IS_ERR(handle)) { | ||
1500 | ret = PTR_ERR(handle); | ||
1501 | } else { | ||
1502 | handle->h_sync = 1; | ||
1503 | ret = journal_stop(handle); | ||
1504 | } | ||
1505 | return ret; | ||
1506 | } | ||
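
A small usage example (not from the removed file), e.g. for a sync or remount-read-only path; journal_force_commit() starts a commit via a one-credit synchronous handle and normally waits for it to finish.

    static int sync_whole_journal(journal_t *journal)
    {
            int err = journal_force_commit(journal);

            if (err)
                    printk(KERN_WARNING "forced journal commit failed: %d\n", err);
            return err;
    }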
1507 | |||
1508 | /* | ||
1509 | * | ||
1510 | * List management code snippets: various functions for manipulating the | ||
1511 | * transaction buffer lists. | ||
1512 | * | ||
1513 | */ | ||
1514 | |||
1515 | /* | ||
1516 | * Append a buffer to a transaction list, given the transaction's list head | ||
1517 | * pointer. | ||
1518 | * | ||
1519 | * j_list_lock is held. | ||
1520 | * | ||
1521 | * jbd_lock_bh_state(jh2bh(jh)) is held. | ||
1522 | */ | ||
1523 | |||
1524 | static inline void | ||
1525 | __blist_add_buffer(struct journal_head **list, struct journal_head *jh) | ||
1526 | { | ||
1527 | if (!*list) { | ||
1528 | jh->b_tnext = jh->b_tprev = jh; | ||
1529 | *list = jh; | ||
1530 | } else { | ||
1531 | /* Insert at the tail of the list to preserve order */ | ||
1532 | struct journal_head *first = *list, *last = first->b_tprev; | ||
1533 | jh->b_tprev = last; | ||
1534 | jh->b_tnext = first; | ||
1535 | last->b_tnext = first->b_tprev = jh; | ||
1536 | } | ||
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Remove a buffer from a transaction list, given the transaction's list | ||
1541 | * head pointer. | ||
1542 | * | ||
1543 | * Called with j_list_lock held, and the journal may not be locked. | ||
1544 | * | ||
1545 | * jbd_lock_bh_state(jh2bh(jh)) is held. | ||
1546 | */ | ||
1547 | |||
1548 | static inline void | ||
1549 | __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | ||
1550 | { | ||
1551 | if (*list == jh) { | ||
1552 | *list = jh->b_tnext; | ||
1553 | if (*list == jh) | ||
1554 | *list = NULL; | ||
1555 | } | ||
1556 | jh->b_tprev->b_tnext = jh->b_tnext; | ||
1557 | jh->b_tnext->b_tprev = jh->b_tprev; | ||
1558 | } | ||
1559 | |||
1560 | /* | ||
1561 | * Remove a buffer from the appropriate transaction list. | ||
1562 | * | ||
1563 | * Note that this function can *change* the value of | ||
1564 | * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | ||
1565 | * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | ||
1566 | * is holding onto a copy of one of these pointers, it could go bad. | ||
1567 | * Generally the caller needs to re-read the pointer from the transaction_t. | ||
1568 | * | ||
1569 | * Called under j_list_lock. The journal may not be locked. | ||
1570 | */ | ||
1571 | static void __journal_temp_unlink_buffer(struct journal_head *jh) | ||
1572 | { | ||
1573 | struct journal_head **list = NULL; | ||
1574 | transaction_t *transaction; | ||
1575 | struct buffer_head *bh = jh2bh(jh); | ||
1576 | |||
1577 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
1578 | transaction = jh->b_transaction; | ||
1579 | if (transaction) | ||
1580 | assert_spin_locked(&transaction->t_journal->j_list_lock); | ||
1581 | |||
1582 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | ||
1583 | if (jh->b_jlist != BJ_None) | ||
1584 | J_ASSERT_JH(jh, transaction != NULL); | ||
1585 | |||
1586 | switch (jh->b_jlist) { | ||
1587 | case BJ_None: | ||
1588 | return; | ||
1589 | case BJ_SyncData: | ||
1590 | list = &transaction->t_sync_datalist; | ||
1591 | break; | ||
1592 | case BJ_Metadata: | ||
1593 | transaction->t_nr_buffers--; | ||
1594 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | ||
1595 | list = &transaction->t_buffers; | ||
1596 | break; | ||
1597 | case BJ_Forget: | ||
1598 | list = &transaction->t_forget; | ||
1599 | break; | ||
1600 | case BJ_IO: | ||
1601 | list = &transaction->t_iobuf_list; | ||
1602 | break; | ||
1603 | case BJ_Shadow: | ||
1604 | list = &transaction->t_shadow_list; | ||
1605 | break; | ||
1606 | case BJ_LogCtl: | ||
1607 | list = &transaction->t_log_list; | ||
1608 | break; | ||
1609 | case BJ_Reserved: | ||
1610 | list = &transaction->t_reserved_list; | ||
1611 | break; | ||
1612 | case BJ_Locked: | ||
1613 | list = &transaction->t_locked_list; | ||
1614 | break; | ||
1615 | } | ||
1616 | |||
1617 | __blist_del_buffer(list, jh); | ||
1618 | jh->b_jlist = BJ_None; | ||
1619 | if (test_clear_buffer_jbddirty(bh)) | ||
1620 | mark_buffer_dirty(bh); /* Expose it to the VM */ | ||
1621 | } | ||
1622 | |||
1623 | /* | ||
1624 | * Remove buffer from all transactions. | ||
1625 | * | ||
1626 | * Called with bh_state lock and j_list_lock | ||
1627 | * | ||
1628 | * jh and bh may be already freed when this function returns. | ||
1629 | */ | ||
1630 | void __journal_unfile_buffer(struct journal_head *jh) | ||
1631 | { | ||
1632 | __journal_temp_unlink_buffer(jh); | ||
1633 | jh->b_transaction = NULL; | ||
1634 | journal_put_journal_head(jh); | ||
1635 | } | ||
1636 | |||
1637 | void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) | ||
1638 | { | ||
1639 | struct buffer_head *bh = jh2bh(jh); | ||
1640 | |||
1641 | /* Get reference so that buffer cannot be freed before we unlock it */ | ||
1642 | get_bh(bh); | ||
1643 | jbd_lock_bh_state(bh); | ||
1644 | spin_lock(&journal->j_list_lock); | ||
1645 | __journal_unfile_buffer(jh); | ||
1646 | spin_unlock(&journal->j_list_lock); | ||
1647 | jbd_unlock_bh_state(bh); | ||
1648 | __brelse(bh); | ||
1649 | } | ||
1650 | |||
1651 | /* | ||
1652 | * Called from journal_try_to_free_buffers(). | ||
1653 | * | ||
1654 | * Called under jbd_lock_bh_state(bh) | ||
1655 | */ | ||
1656 | static void | ||
1657 | __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | ||
1658 | { | ||
1659 | struct journal_head *jh; | ||
1660 | |||
1661 | jh = bh2jh(bh); | ||
1662 | |||
1663 | if (buffer_locked(bh) || buffer_dirty(bh)) | ||
1664 | goto out; | ||
1665 | |||
1666 | if (jh->b_next_transaction != NULL) | ||
1667 | goto out; | ||
1668 | |||
1669 | spin_lock(&journal->j_list_lock); | ||
1670 | if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { | ||
1671 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | ||
1672 | /* A written-back ordered data buffer */ | ||
1673 | JBUFFER_TRACE(jh, "release data"); | ||
1674 | __journal_unfile_buffer(jh); | ||
1675 | } | ||
1676 | } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | ||
1677 | /* written-back checkpointed metadata buffer */ | ||
1678 | if (jh->b_jlist == BJ_None) { | ||
1679 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | ||
1680 | __journal_remove_checkpoint(jh); | ||
1681 | } | ||
1682 | } | ||
1683 | spin_unlock(&journal->j_list_lock); | ||
1684 | out: | ||
1685 | return; | ||
1686 | } | ||
1687 | |||
1688 | /** | ||
1689 | * int journal_try_to_free_buffers() - try to free page buffers. | ||
1690 | * @journal: journal for operation | ||
1691 | * @page: to try and free | ||
1692 | * @gfp_mask: we use the mask to detect how hard should we try to release | ||
1693 | * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to | ||
1694 | * release the buffers. | ||
1695 | * | ||
1696 | * | ||
1697 | * For all the buffers on this page, | ||
1698 | * if they are fully written out ordered data, move them onto BUF_CLEAN | ||
1699 | * so try_to_free_buffers() can reap them. | ||
1700 | * | ||
1701 | * This function returns non-zero if we wish try_to_free_buffers() | ||
1702 | * to be called. We do this if the page is releasable by try_to_free_buffers(). | ||
1703 | * We also do it if the page has locked or dirty buffers and the caller wants | ||
1704 | * us to perform sync or async writeout. | ||
1705 | * | ||
1706 | * This complicates JBD locking somewhat. We aren't protected by the | ||
1707 | * BKL here. We wish to remove the buffer from its committing or | ||
1708 | * running transaction's ->t_datalist via __journal_unfile_buffer. | ||
1709 | * | ||
1710 | * This may *change* the value of transaction_t->t_datalist, so anyone | ||
1711 | * who looks at t_datalist needs to lock against this function. | ||
1712 | * | ||
1713 | * Even worse, someone may be doing a journal_dirty_data on this | ||
1714 | * buffer. So we need to lock against that. journal_dirty_data() | ||
1715 | * will come out of the lock with the buffer dirty, which makes it | ||
1716 | * ineligible for release here. | ||
1717 | * | ||
1718 | * Who else is affected by this? hmm... Really the only contender | ||
1719 | * is do_get_write_access() - it could be looking at the buffer while | ||
1720 | * journal_try_to_free_buffer() is changing its state. But that | ||
1721 | * cannot happen because we never reallocate freed data as metadata | ||
1722 | * while the data is part of a transaction. Yes? | ||
1723 | * | ||
1724 | * Return 0 on failure, 1 on success | ||
1725 | */ | ||
1726 | int journal_try_to_free_buffers(journal_t *journal, | ||
1727 | struct page *page, gfp_t gfp_mask) | ||
1728 | { | ||
1729 | struct buffer_head *head; | ||
1730 | struct buffer_head *bh; | ||
1731 | int ret = 0; | ||
1732 | |||
1733 | J_ASSERT(PageLocked(page)); | ||
1734 | |||
1735 | head = page_buffers(page); | ||
1736 | bh = head; | ||
1737 | do { | ||
1738 | struct journal_head *jh; | ||
1739 | |||
1740 | /* | ||
1741 | * We take our own ref against the journal_head here to avoid | ||
1742 | * having to add tons of locking around each instance of | ||
1743 | * journal_put_journal_head(). | ||
1744 | */ | ||
1745 | jh = journal_grab_journal_head(bh); | ||
1746 | if (!jh) | ||
1747 | continue; | ||
1748 | |||
1749 | jbd_lock_bh_state(bh); | ||
1750 | __journal_try_to_free_buffer(journal, bh); | ||
1751 | journal_put_journal_head(jh); | ||
1752 | jbd_unlock_bh_state(bh); | ||
1753 | if (buffer_jbd(bh)) | ||
1754 | goto busy; | ||
1755 | } while ((bh = bh->b_this_page) != head); | ||
1756 | |||
1757 | ret = try_to_free_buffers(page); | ||
1758 | |||
1759 | busy: | ||
1760 | return ret; | ||
1761 | } | ||
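
A sketch of a releasepage-style caller (not part of the removed file); a real ->releasepage has a different prototype and looks the journal up from the page's inode, so the journal is passed in explicitly here.

    static int my_releasepage(journal_t *journal, struct page *page, gfp_t gfp_mask)
    {
            if (!page_has_buffers(page))
                    return 0;               /* nothing for JBD to decide */
            return journal_try_to_free_buffers(journal, page, gfp_mask);
    }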
1762 | |||
1763 | /* | ||
1764 | * This buffer is no longer needed. If it is on an older transaction's | ||
1765 | * checkpoint list we need to record it on this transaction's forget list | ||
1766 | * to pin this buffer (and hence its checkpointing transaction) down until | ||
1767 | * this transaction commits. If the buffer isn't on a checkpoint list, we | ||
1768 | * release it. | ||
1769 | * Returns non-zero if JBD no longer has an interest in the buffer. | ||
1770 | * | ||
1771 | * Called under j_list_lock. | ||
1772 | * | ||
1773 | * Called under jbd_lock_bh_state(bh). | ||
1774 | */ | ||
1775 | static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) | ||
1776 | { | ||
1777 | int may_free = 1; | ||
1778 | struct buffer_head *bh = jh2bh(jh); | ||
1779 | |||
1780 | if (jh->b_cp_transaction) { | ||
1781 | JBUFFER_TRACE(jh, "on running+cp transaction"); | ||
1782 | __journal_temp_unlink_buffer(jh); | ||
1783 | /* | ||
1784 | * We don't want to write the buffer anymore, clear the | ||
1785 | * bit so that we don't confuse checks in | ||
1786 | * __journal_file_buffer | ||
1787 | */ | ||
1788 | clear_buffer_dirty(bh); | ||
1789 | __journal_file_buffer(jh, transaction, BJ_Forget); | ||
1790 | may_free = 0; | ||
1791 | } else { | ||
1792 | JBUFFER_TRACE(jh, "on running transaction"); | ||
1793 | __journal_unfile_buffer(jh); | ||
1794 | } | ||
1795 | return may_free; | ||
1796 | } | ||
1797 | |||
1798 | /* | ||
1799 | * journal_invalidatepage | ||
1800 | * | ||
1801 | * This code is tricky. It has a number of cases to deal with. | ||
1802 | * | ||
1803 | * There are two invariants which this code relies on: | ||
1804 | * | ||
1805 | * i_size must be updated on disk before we start calling invalidatepage on the | ||
1806 | * data. | ||
1807 | * | ||
1808 | * This is done in ext3 by defining an ext3_setattr method which | ||
1809 | * updates i_size before truncate gets going. By maintaining this | ||
1810 | * invariant, we can be sure that it is safe to throw away any buffers | ||
1811 | * attached to the current transaction: once the transaction commits, | ||
1812 | * we know that the data will not be needed. | ||
1813 | * | ||
1814 | * Note however that we can *not* throw away data belonging to the | ||
1815 | * previous, committing transaction! | ||
1816 | * | ||
1817 | * Any disk blocks which *are* part of the previous, committing | ||
1818 | * transaction (and which therefore cannot be discarded immediately) are | ||
1819 | * not going to be reused in the new running transaction | ||
1820 | * | ||
1821 | * The bitmap committed_data images guarantee this: any block which is | ||
1822 | * allocated in one transaction and removed in the next will be marked | ||
1823 | * as in-use in the committed_data bitmap, so cannot be reused until | ||
1824 | * the next transaction to delete the block commits. This means that | ||
1825 | * leaving committing buffers dirty is quite safe: the disk blocks | ||
1826 | * cannot be reallocated to a different file and so buffer aliasing is | ||
1827 | * not possible. | ||
1828 | * | ||
1829 | * | ||
1830 | * The above applies mainly to ordered data mode. In writeback mode we | ||
1831 | * don't make guarantees about the order in which data hits disk --- in | ||
1832 | * particular we don't guarantee that new dirty data is flushed before | ||
1833 | * transaction commit --- so it is always safe just to discard data | ||
1834 | * immediately in that mode. --sct | ||
1835 | */ | ||
1836 | |||
1837 | /* | ||
1838 | * The journal_unmap_buffer helper function returns zero if the buffer | ||
1839 | * concerned remains pinned as an anonymous buffer belonging to an older | ||
1840 | * transaction. | ||
1841 | * | ||
1842 | * We're outside-transaction here. Either or both of j_running_transaction | ||
1843 | * and j_committing_transaction may be NULL. | ||
1844 | */ | ||
1845 | static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, | ||
1846 | int partial_page) | ||
1847 | { | ||
1848 | transaction_t *transaction; | ||
1849 | struct journal_head *jh; | ||
1850 | int may_free = 1; | ||
1851 | |||
1852 | BUFFER_TRACE(bh, "entry"); | ||
1853 | |||
1854 | retry: | ||
1855 | /* | ||
1856 | * It is safe to proceed here without the j_list_lock because the | ||
1857 | * buffers cannot be stolen by try_to_free_buffers as long as we are | ||
1858 | * holding the page lock. --sct | ||
1859 | */ | ||
1860 | |||
1861 | if (!buffer_jbd(bh)) | ||
1862 | goto zap_buffer_unlocked; | ||
1863 | |||
1864 | spin_lock(&journal->j_state_lock); | ||
1865 | jbd_lock_bh_state(bh); | ||
1866 | spin_lock(&journal->j_list_lock); | ||
1867 | |||
1868 | jh = journal_grab_journal_head(bh); | ||
1869 | if (!jh) | ||
1870 | goto zap_buffer_no_jh; | ||
1871 | |||
1872 | /* | ||
1873 | * We cannot remove the buffer from checkpoint lists until the | ||
1874 | * transaction adding inode to orphan list (let's call it T) | ||
1875 | * is committed. Otherwise if the transaction changing the | ||
1876 | * buffer would be cleaned from the journal before T is | ||
1877 | * committed, a crash will cause that the correct contents of | ||
1878 | * the buffer will be lost. On the other hand we have to | ||
1879 | * clear the buffer dirty bit at latest at the moment when the | ||
1880 | * transaction marking the buffer as freed in the filesystem | ||
1881 | * structures is committed because from that moment on the | ||
1882 | * block can be reallocated and used by a different page. | ||
1883 | * Since the block hasn't been freed yet but the inode has | ||
1884 | * already been added to orphan list, it is safe for us to add | ||
1885 | * the buffer to BJ_Forget list of the newest transaction. | ||
1886 | * | ||
1887 | * Also we have to clear buffer_mapped flag of a truncated buffer | ||
1888 | * because the buffer_head may be attached to the page straddling | ||
1889 | * i_size (can happen only when blocksize < pagesize) and thus the | ||
1890 | * buffer_head can be reused when the file is extended again. So we end | ||
1891 | * up keeping around invalidated buffers attached to transactions' | ||
1892 | * BJ_Forget list just to stop checkpointing code from cleaning up | ||
1893 | * the transaction this buffer was modified in. | ||
1894 | */ | ||
1895 | transaction = jh->b_transaction; | ||
1896 | if (transaction == NULL) { | ||
1897 | /* First case: not on any transaction. If it | ||
1898 | * has no checkpoint link, then we can zap it: | ||
1899 | * it's a writeback-mode buffer so we don't care | ||
1900 | * if it hits disk safely. */ | ||
1901 | if (!jh->b_cp_transaction) { | ||
1902 | JBUFFER_TRACE(jh, "not on any transaction: zap"); | ||
1903 | goto zap_buffer; | ||
1904 | } | ||
1905 | |||
1906 | if (!buffer_dirty(bh)) { | ||
1907 | /* bdflush has written it. We can drop it now */ | ||
1908 | goto zap_buffer; | ||
1909 | } | ||
1910 | |||
1911 | /* OK, it must be in the journal but still not | ||
1912 | * written fully to disk: it's metadata or | ||
1913 | * journaled data... */ | ||
1914 | |||
1915 | if (journal->j_running_transaction) { | ||
1916 | /* ... and once the current transaction has | ||
1917 | * committed, the buffer won't be needed any | ||
1918 | * longer. */ | ||
1919 | JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); | ||
1920 | may_free = __dispose_buffer(jh, | ||
1921 | journal->j_running_transaction); | ||
1922 | goto zap_buffer; | ||
1923 | } else { | ||
1924 | /* There is no currently-running transaction. So the | ||
1925 | * orphan record which we wrote for this file must have | ||
1926 | * passed into commit. We must attach this buffer to | ||
1927 | * the committing transaction, if it exists. */ | ||
1928 | if (journal->j_committing_transaction) { | ||
1929 | JBUFFER_TRACE(jh, "give to committing trans"); | ||
1930 | may_free = __dispose_buffer(jh, | ||
1931 | journal->j_committing_transaction); | ||
1932 | goto zap_buffer; | ||
1933 | } else { | ||
1934 | /* The orphan record's transaction has | ||
1935 | * committed. We can cleanse this buffer */ | ||
1936 | clear_buffer_jbddirty(bh); | ||
1937 | goto zap_buffer; | ||
1938 | } | ||
1939 | } | ||
1940 | } else if (transaction == journal->j_committing_transaction) { | ||
1941 | JBUFFER_TRACE(jh, "on committing transaction"); | ||
1942 | if (jh->b_jlist == BJ_Locked) { | ||
1943 | /* | ||
1944 | * The buffer is on the committing transaction's locked | ||
1945 | * list. We have the buffer locked, so I/O has | ||
1946 | * completed. So we can nail the buffer now. | ||
1947 | */ | ||
1948 | may_free = __dispose_buffer(jh, transaction); | ||
1949 | goto zap_buffer; | ||
1950 | } | ||
1951 | /* | ||
1952 | * The buffer is committing, we simply cannot touch | ||
1953 | * it. If the page is straddling i_size we have to wait | ||
1954 | * for commit and try again. | ||
1955 | */ | ||
1956 | if (partial_page) { | ||
1957 | tid_t tid = journal->j_committing_transaction->t_tid; | ||
1958 | |||
1959 | journal_put_journal_head(jh); | ||
1960 | spin_unlock(&journal->j_list_lock); | ||
1961 | jbd_unlock_bh_state(bh); | ||
1962 | spin_unlock(&journal->j_state_lock); | ||
1963 | unlock_buffer(bh); | ||
1964 | log_wait_commit(journal, tid); | ||
1965 | lock_buffer(bh); | ||
1966 | goto retry; | ||
1967 | } | ||
1968 | /* | ||
1969 | * OK, buffer won't be reachable after truncate. We just set | ||
1970 | * b_next_transaction to the running transaction (if there is | ||
1971 | * one) and mark buffer as freed so that commit code knows it | ||
1972 | * should clear dirty bits when it is done with the buffer. | ||
1973 | */ | ||
1974 | set_buffer_freed(bh); | ||
1975 | if (journal->j_running_transaction && buffer_jbddirty(bh)) | ||
1976 | jh->b_next_transaction = journal->j_running_transaction; | ||
1977 | journal_put_journal_head(jh); | ||
1978 | spin_unlock(&journal->j_list_lock); | ||
1979 | jbd_unlock_bh_state(bh); | ||
1980 | spin_unlock(&journal->j_state_lock); | ||
1981 | return 0; | ||
1982 | } else { | ||
1983 | /* Good, the buffer belongs to the running transaction. | ||
1984 | * We are writing our own transaction's data, not any | ||
1985 | * previous one's, so it is safe to throw it away | ||
1986 | * (remember that we expect the filesystem to have set | ||
1987 | * i_size already for this truncate so recovery will not | ||
1988 | * expose the disk blocks we are discarding here.) */ | ||
1989 | J_ASSERT_JH(jh, transaction == journal->j_running_transaction); | ||
1990 | JBUFFER_TRACE(jh, "on running transaction"); | ||
1991 | may_free = __dispose_buffer(jh, transaction); | ||
1992 | } | ||
1993 | |||
1994 | zap_buffer: | ||
1995 | /* | ||
1996 | * This is tricky. Although the buffer is truncated, it may be reused | ||
1997 | * if blocksize < pagesize and it is attached to the page straddling | ||
1998 | * EOF. Since the buffer might have been added to BJ_Forget list of the | ||
1999 | * running transaction, journal_get_write_access() won't clear | ||
2000 | * b_modified and credit accounting gets confused. So clear b_modified | ||
2001 | * here. */ | ||
2002 | jh->b_modified = 0; | ||
2003 | journal_put_journal_head(jh); | ||
2004 | zap_buffer_no_jh: | ||
2005 | spin_unlock(&journal->j_list_lock); | ||
2006 | jbd_unlock_bh_state(bh); | ||
2007 | spin_unlock(&journal->j_state_lock); | ||
2008 | zap_buffer_unlocked: | ||
2009 | clear_buffer_dirty(bh); | ||
2010 | J_ASSERT_BH(bh, !buffer_jbddirty(bh)); | ||
2011 | clear_buffer_mapped(bh); | ||
2012 | clear_buffer_req(bh); | ||
2013 | clear_buffer_new(bh); | ||
2014 | bh->b_bdev = NULL; | ||
2015 | return may_free; | ||
2016 | } | ||
2017 | |||
2018 | /** | ||
2019 | * void journal_invalidatepage() - invalidate a journal page | ||
2020 | * @journal: journal to use for flush | ||
2021 | * @page: page to flush | ||
2022 | * @offset: offset of the range to invalidate | ||
2023 | * @length: length of the range to invalidate | ||
2024 | * | ||
2025 | * Reap page buffers containing data in specified range in page. | ||
2026 | */ | ||
2027 | void journal_invalidatepage(journal_t *journal, | ||
2028 | struct page *page, | ||
2029 | unsigned int offset, | ||
2030 | unsigned int length) | ||
2031 | { | ||
2032 | struct buffer_head *head, *bh, *next; | ||
2033 | unsigned int stop = offset + length; | ||
2034 | unsigned int curr_off = 0; | ||
2035 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
2036 | int may_free = 1; | ||
2037 | |||
2038 | if (!PageLocked(page)) | ||
2039 | BUG(); | ||
2040 | if (!page_has_buffers(page)) | ||
2041 | return; | ||
2042 | |||
2043 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
2044 | |||
2045 | /* We will potentially be playing with lists other than just the | ||
2046 | * data lists (especially for journaled data mode), so be | ||
2047 | * cautious in our locking. */ | ||
2048 | |||
2049 | head = bh = page_buffers(page); | ||
2050 | do { | ||
2051 | unsigned int next_off = curr_off + bh->b_size; | ||
2052 | next = bh->b_this_page; | ||
2053 | |||
2054 | if (next_off > stop) | ||
2055 | return; | ||
2056 | |||
2057 | if (offset <= curr_off) { | ||
2058 | /* This block is wholly outside the truncation point */ | ||
2059 | lock_buffer(bh); | ||
2060 | may_free &= journal_unmap_buffer(journal, bh, | ||
2061 | partial_page); | ||
2062 | unlock_buffer(bh); | ||
2063 | } | ||
2064 | curr_off = next_off; | ||
2065 | bh = next; | ||
2066 | |||
2067 | } while (bh != head); | ||
2068 | |||
2069 | if (!partial_page) { | ||
2070 | if (may_free && try_to_free_buffers(page)) | ||
2071 | J_ASSERT(!page_has_buffers(page)); | ||
2072 | } | ||
2073 | } | ||
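
For context, a client filesystem normally calls journal_invalidatepage() from its address_space_operations ->invalidatepage handler. The sketch below is loosely modelled on ext3's handler and is only an illustration: EXAMPLE_JOURNAL() is a hypothetical stand-in for however the filesystem reaches its journal_t from an inode.

	static void example_invalidatepage(struct page *page, unsigned int offset,
					   unsigned int length)
	{
		journal_t *journal = EXAMPLE_JOURNAL(page->mapping->host);

		/* On a whole-page invalidation, forget any pending page dirtying. */
		if (offset == 0 && length == PAGE_CACHE_SIZE)
			ClearPageChecked(page);

		journal_invalidatepage(journal, page, offset, length);
	}
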
2074 | |||
2075 | /* | ||
2076 | * File a buffer on the given transaction list. | ||
2077 | */ | ||
2078 | void __journal_file_buffer(struct journal_head *jh, | ||
2079 | transaction_t *transaction, int jlist) | ||
2080 | { | ||
2081 | struct journal_head **list = NULL; | ||
2082 | int was_dirty = 0; | ||
2083 | struct buffer_head *bh = jh2bh(jh); | ||
2084 | |||
2085 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
2086 | assert_spin_locked(&transaction->t_journal->j_list_lock); | ||
2087 | |||
2088 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | ||
2089 | J_ASSERT_JH(jh, jh->b_transaction == transaction || | ||
2090 | jh->b_transaction == NULL); | ||
2091 | |||
2092 | if (jh->b_transaction && jh->b_jlist == jlist) | ||
2093 | return; | ||
2094 | |||
2095 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | ||
2096 | jlist == BJ_Shadow || jlist == BJ_Forget) { | ||
2097 | /* | ||
2098 | * For metadata buffers, we track dirty bit in buffer_jbddirty | ||
2099 | * instead of buffer_dirty. We should not see a dirty bit set | ||
2100 | * here because we clear it in do_get_write_access(), but e.g. | ||
2101 | * tune2fs can modify the sb and set the dirty bit at any time, | ||
2102 | * so we try to handle that gracefully. | ||
2103 | */ | ||
2104 | if (buffer_dirty(bh)) | ||
2105 | warn_dirty_buffer(bh); | ||
2106 | if (test_clear_buffer_dirty(bh) || | ||
2107 | test_clear_buffer_jbddirty(bh)) | ||
2108 | was_dirty = 1; | ||
2109 | } | ||
2110 | |||
2111 | if (jh->b_transaction) | ||
2112 | __journal_temp_unlink_buffer(jh); | ||
2113 | else | ||
2114 | journal_grab_journal_head(bh); | ||
2115 | jh->b_transaction = transaction; | ||
2116 | |||
2117 | switch (jlist) { | ||
2118 | case BJ_None: | ||
2119 | J_ASSERT_JH(jh, !jh->b_committed_data); | ||
2120 | J_ASSERT_JH(jh, !jh->b_frozen_data); | ||
2121 | return; | ||
2122 | case BJ_SyncData: | ||
2123 | list = &transaction->t_sync_datalist; | ||
2124 | break; | ||
2125 | case BJ_Metadata: | ||
2126 | transaction->t_nr_buffers++; | ||
2127 | list = &transaction->t_buffers; | ||
2128 | break; | ||
2129 | case BJ_Forget: | ||
2130 | list = &transaction->t_forget; | ||
2131 | break; | ||
2132 | case BJ_IO: | ||
2133 | list = &transaction->t_iobuf_list; | ||
2134 | break; | ||
2135 | case BJ_Shadow: | ||
2136 | list = &transaction->t_shadow_list; | ||
2137 | break; | ||
2138 | case BJ_LogCtl: | ||
2139 | list = &transaction->t_log_list; | ||
2140 | break; | ||
2141 | case BJ_Reserved: | ||
2142 | list = &transaction->t_reserved_list; | ||
2143 | break; | ||
2144 | case BJ_Locked: | ||
2145 | list = &transaction->t_locked_list; | ||
2146 | break; | ||
2147 | } | ||
2148 | |||
2149 | __blist_add_buffer(list, jh); | ||
2150 | jh->b_jlist = jlist; | ||
2151 | |||
2152 | if (was_dirty) | ||
2153 | set_buffer_jbddirty(bh); | ||
2154 | } | ||
2155 | |||
2156 | void journal_file_buffer(struct journal_head *jh, | ||
2157 | transaction_t *transaction, int jlist) | ||
2158 | { | ||
2159 | jbd_lock_bh_state(jh2bh(jh)); | ||
2160 | spin_lock(&transaction->t_journal->j_list_lock); | ||
2161 | __journal_file_buffer(jh, transaction, jlist); | ||
2162 | spin_unlock(&transaction->t_journal->j_list_lock); | ||
2163 | jbd_unlock_bh_state(jh2bh(jh)); | ||
2164 | } | ||
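
As an illustration of the intended calling convention (an assumption about typical callers, not code from this file): code that already holds the buffer state lock and j_list_lock uses the __ variant directly, while lock-free callers go through the wrapper. Here jh and a live handle are assumed to be in scope.

	/* Already under jbd_lock_bh_state(bh) and journal->j_list_lock: */
	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);

	/* No journal locks held: the wrapper takes and drops them itself. */
	journal_file_buffer(jh, handle->h_transaction, BJ_Forget);
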
2165 | |||
2166 | /* | ||
2167 | * Remove a buffer from its current buffer list in preparation for | ||
2168 | * dropping it from its current transaction entirely. If the buffer has | ||
2169 | * already started to be used by a subsequent transaction, refile the | ||
2170 | * buffer on the appropriate list of that transaction. | ||
2171 | * | ||
2172 | * Called under j_list_lock | ||
2173 | * Called under jbd_lock_bh_state(jh2bh(jh)) | ||
2174 | * | ||
2175 | * jh and bh may be already free when this function returns | ||
2176 | */ | ||
2177 | void __journal_refile_buffer(struct journal_head *jh) | ||
2178 | { | ||
2179 | int was_dirty, jlist; | ||
2180 | struct buffer_head *bh = jh2bh(jh); | ||
2181 | |||
2182 | J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); | ||
2183 | if (jh->b_transaction) | ||
2184 | assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); | ||
2185 | |||
2186 | /* If the buffer is now unused, just drop it. */ | ||
2187 | if (jh->b_next_transaction == NULL) { | ||
2188 | __journal_unfile_buffer(jh); | ||
2189 | return; | ||
2190 | } | ||
2191 | |||
2192 | /* | ||
2193 | * It has been modified by a later transaction: add it to the new | ||
2194 | * transaction's metadata list. | ||
2195 | */ | ||
2196 | |||
2197 | was_dirty = test_clear_buffer_jbddirty(bh); | ||
2198 | __journal_temp_unlink_buffer(jh); | ||
2199 | /* | ||
2200 | * We set b_transaction here because b_next_transaction will inherit | ||
2201 | * our jh reference and thus __journal_file_buffer() must not take a | ||
2202 | * new one. | ||
2203 | */ | ||
2204 | jh->b_transaction = jh->b_next_transaction; | ||
2205 | jh->b_next_transaction = NULL; | ||
2206 | if (buffer_freed(bh)) | ||
2207 | jlist = BJ_Forget; | ||
2208 | else if (jh->b_modified) | ||
2209 | jlist = BJ_Metadata; | ||
2210 | else | ||
2211 | jlist = BJ_Reserved; | ||
2212 | __journal_file_buffer(jh, jh->b_transaction, jlist); | ||
2213 | J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); | ||
2214 | |||
2215 | if (was_dirty) | ||
2216 | set_buffer_jbddirty(bh); | ||
2217 | } | ||
2218 | |||
2219 | /* | ||
2220 | * __journal_refile_buffer() with necessary locking added. We take our bh | ||
2221 | * reference so that we can safely unlock bh. | ||
2222 | * | ||
2223 | * The jh and bh may be freed by this call. | ||
2224 | */ | ||
2225 | void journal_refile_buffer(journal_t *journal, struct journal_head *jh) | ||
2226 | { | ||
2227 | struct buffer_head *bh = jh2bh(jh); | ||
2228 | |||
2229 | /* Get reference so that buffer cannot be freed before we unlock it */ | ||
2230 | get_bh(bh); | ||
2231 | jbd_lock_bh_state(bh); | ||
2232 | spin_lock(&journal->j_list_lock); | ||
2233 | __journal_refile_buffer(jh); | ||
2234 | jbd_unlock_bh_state(bh); | ||
2235 | spin_unlock(&journal->j_list_lock); | ||
2236 | __brelse(bh); | ||
2237 | } | ||
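
For orientation, the commit path is a typical consumer of the refile primitives: once a transaction has committed, each buffer still on its lists is either dropped or handed to the next transaction. The fragment below is a rough, simplified sketch of that per-buffer cleanup (jh, journal and commit_transaction assumed in scope), not the exact code from commit.c.

	struct buffer_head *bh = jh2bh(jh);

	jbd_lock_bh_state(bh);			/* bh state lock first... */
	spin_lock(&journal->j_list_lock);	/* ...then j_list_lock */
	if (buffer_jbddirty(bh))
		/* Metadata must still be checkpointed before its log space
		 * can be reclaimed. */
		__journal_insert_checkpoint(jh, commit_transaction);
	/* Drop the buffer, or move it to the next transaction's list. */
	__journal_refile_buffer(jh);
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
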