diff options
Diffstat (limited to 'fs/jbd2')
-rw-r--r-- | fs/jbd2/checkpoint.c | 1 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 294 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 53 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 365 |
4 files changed, 307 insertions, 406 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022ce..91389c8aee8a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
688 | 688 | ||
689 | J_ASSERT(transaction->t_state == T_FINISHED); | 689 | J_ASSERT(transaction->t_state == T_FINISHED); |
690 | J_ASSERT(transaction->t_buffers == NULL); | 690 | J_ASSERT(transaction->t_buffers == NULL); |
691 | J_ASSERT(transaction->t_sync_datalist == NULL); | ||
692 | J_ASSERT(transaction->t_forget == NULL); | 691 | J_ASSERT(transaction->t_forget == NULL); |
693 | J_ASSERT(transaction->t_iobuf_list == NULL); | 692 | J_ASSERT(transaction->t_iobuf_list == NULL); |
694 | J_ASSERT(transaction->t_shadow_list == NULL); | 693 | J_ASSERT(transaction->t_shadow_list == NULL); |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..f8b3be873226 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/crc32.h> | 24 | #include <linux/crc32.h> |
25 | #include <linux/writeback.h> | ||
26 | #include <linux/backing-dev.h> | ||
25 | 27 | ||
26 | /* | 28 | /* |
27 | * Default IO end handler for temporary BJ_IO buffer_heads. | 29 | * Default IO end handler for temporary BJ_IO buffer_heads. |
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |||
37 | } | 39 | } |
38 | 40 | ||
39 | /* | 41 | /* |
40 | * When an ext3-ordered file is truncated, it is possible that many pages are | 42 | * When an ext4 file is truncated, it is possible that some pages are not |
41 | * not sucessfully freed, because they are attached to a committing transaction. | 43 | * successfully freed, because they are attached to a committing transaction. |
42 | * After the transaction commits, these pages are left on the LRU, with no | 44 | * After the transaction commits, these pages are left on the LRU, with no |
43 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 45 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
44 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 46 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
@@ -80,21 +82,6 @@ nope: | |||
80 | } | 82 | } |
81 | 83 | ||
82 | /* | 84 | /* |
83 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | ||
84 | * held. For ranking reasons we must trylock. If we lose, schedule away and | ||
85 | * return 0. j_list_lock is dropped in this case. | ||
86 | */ | ||
87 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | ||
88 | { | ||
89 | if (!jbd_trylock_bh_state(bh)) { | ||
90 | spin_unlock(&journal->j_list_lock); | ||
91 | schedule(); | ||
92 | return 0; | ||
93 | } | ||
94 | return 1; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Done it all: now submit the commit record. We should have | 85 | * Done it all: now submit the commit record. We should have |
99 | * cleaned up our previous buffers by now, so if we are in abort | 86 | * cleaned up our previous buffers by now, so if we are in abort |
100 | * mode we can now just skip the rest of the journal write | 87 | * mode we can now just skip the rest of the journal write |
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
112 | struct buffer_head *bh; | 99 | struct buffer_head *bh; |
113 | int ret; | 100 | int ret; |
114 | int barrier_done = 0; | 101 | int barrier_done = 0; |
102 | struct timespec now = current_kernel_time(); | ||
115 | 103 | ||
116 | if (is_journal_aborted(journal)) | 104 | if (is_journal_aborted(journal)) |
117 | return 0; | 105 | return 0; |
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal, | |||
126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 114 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 115 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 116 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
117 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | ||
118 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | ||
129 | 119 | ||
130 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 120 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 121 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) | |||
197 | } | 187 | } |
198 | 188 | ||
199 | /* | 189 | /* |
200 | * Wait for all submitted IO to complete. | 190 | * write the filemap data using writepage() address_space_operations. |
191 | * We don't do block allocation here even for delalloc. We don't | ||
192 | * use writepages() because with delayed allocation we may be doing | ||
193 | * block allocation in writepages(). | ||
201 | */ | 194 | */ |
202 | static int journal_wait_on_locked_list(journal_t *journal, | 195 | static int journal_submit_inode_data_buffers(struct address_space *mapping) |
203 | transaction_t *commit_transaction) | ||
204 | { | 196 | { |
205 | int ret = 0; | 197 | int ret; |
206 | struct journal_head *jh; | 198 | struct writeback_control wbc = { |
207 | 199 | .sync_mode = WB_SYNC_ALL, | |
208 | while (commit_transaction->t_locked_list) { | 200 | .nr_to_write = mapping->nrpages * 2, |
209 | struct buffer_head *bh; | 201 | .range_start = 0, |
210 | 202 | .range_end = i_size_read(mapping->host), | |
211 | jh = commit_transaction->t_locked_list->b_tprev; | 203 | .for_writepages = 1, |
212 | bh = jh2bh(jh); | 204 | }; |
213 | get_bh(bh); | 205 | |
214 | if (buffer_locked(bh)) { | 206 | ret = generic_writepages(mapping, &wbc); |
215 | spin_unlock(&journal->j_list_lock); | ||
216 | wait_on_buffer(bh); | ||
217 | if (unlikely(!buffer_uptodate(bh))) | ||
218 | ret = -EIO; | ||
219 | spin_lock(&journal->j_list_lock); | ||
220 | } | ||
221 | if (!inverted_lock(journal, bh)) { | ||
222 | put_bh(bh); | ||
223 | spin_lock(&journal->j_list_lock); | ||
224 | continue; | ||
225 | } | ||
226 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
227 | __jbd2_journal_unfile_buffer(jh); | ||
228 | jbd_unlock_bh_state(bh); | ||
229 | jbd2_journal_remove_journal_head(bh); | ||
230 | put_bh(bh); | ||
231 | } else { | ||
232 | jbd_unlock_bh_state(bh); | ||
233 | } | ||
234 | put_bh(bh); | ||
235 | cond_resched_lock(&journal->j_list_lock); | ||
236 | } | ||
237 | return ret; | 207 | return ret; |
238 | } | 208 | } |
239 | 209 | ||
240 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 210 | /* |
211 | * Submit all the data buffers of inode associated with the transaction to | ||
212 | * disk. | ||
213 | * | ||
214 | * We are in a committing transaction. Therefore no new inode can be added to | ||
215 | * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently | ||
216 | * operate on from being released while we write out pages. | ||
217 | */ | ||
218 | static int journal_submit_data_buffers(journal_t *journal, | ||
219 | transaction_t *commit_transaction) | ||
241 | { | 220 | { |
242 | int i; | 221 | struct jbd2_inode *jinode; |
222 | int err, ret = 0; | ||
223 | struct address_space *mapping; | ||
243 | 224 | ||
244 | for (i = 0; i < bufs; i++) { | 225 | spin_lock(&journal->j_list_lock); |
245 | wbuf[i]->b_end_io = end_buffer_write_sync; | 226 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
246 | /* We use-up our safety reference in submit_bh() */ | 227 | mapping = jinode->i_vfs_inode->i_mapping; |
247 | submit_bh(WRITE, wbuf[i]); | 228 | jinode->i_flags |= JI_COMMIT_RUNNING; |
229 | spin_unlock(&journal->j_list_lock); | ||
230 | /* | ||
231 | * submit the inode data buffers. We use writepage | ||
232 | * instead of writepages, because writepages can do | ||
233 | * block allocation with delalloc. We need to write | ||
234 | * only allocated blocks here. | ||
235 | */ | ||
236 | err = journal_submit_inode_data_buffers(mapping); | ||
237 | if (!ret) | ||
238 | ret = err; | ||
239 | spin_lock(&journal->j_list_lock); | ||
240 | J_ASSERT(jinode->i_transaction == commit_transaction); | ||
241 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | ||
242 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
248 | } | 243 | } |
244 | spin_unlock(&journal->j_list_lock); | ||
245 | return ret; | ||
249 | } | 246 | } |
250 | 247 | ||
251 | /* | 248 | /* |
252 | * Submit all the data buffers to disk | 249 | * Wait for data submitted for writeout, refile inodes to proper |
250 | * transaction if needed. | ||
251 | * | ||
253 | */ | 252 | */ |
254 | static void journal_submit_data_buffers(journal_t *journal, | 253 | static int journal_finish_inode_data_buffers(journal_t *journal, |
255 | transaction_t *commit_transaction) | 254 | transaction_t *commit_transaction) |
256 | { | 255 | { |
257 | struct journal_head *jh; | 256 | struct jbd2_inode *jinode, *next_i; |
258 | struct buffer_head *bh; | 257 | int err, ret = 0; |
259 | int locked; | ||
260 | int bufs = 0; | ||
261 | struct buffer_head **wbuf = journal->j_wbuf; | ||
262 | 258 | ||
263 | /* | 259 | /* For locking, see the comment in journal_submit_data_buffers() */ |
264 | * Whenever we unlock the journal and sleep, things can get added | ||
265 | * onto ->t_sync_datalist, so we have to keep looping back to | ||
266 | * write_out_data until we *know* that the list is empty. | ||
267 | * | ||
268 | * Cleanup any flushed data buffers from the data list. Even in | ||
269 | * abort mode, we want to flush this out as soon as possible. | ||
270 | */ | ||
271 | write_out_data: | ||
272 | cond_resched(); | ||
273 | spin_lock(&journal->j_list_lock); | 260 | spin_lock(&journal->j_list_lock); |
261 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | ||
262 | jinode->i_flags |= JI_COMMIT_RUNNING; | ||
263 | spin_unlock(&journal->j_list_lock); | ||
264 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | ||
265 | if (!ret) | ||
266 | ret = err; | ||
267 | spin_lock(&journal->j_list_lock); | ||
268 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | ||
269 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
270 | } | ||
274 | 271 | ||
275 | while (commit_transaction->t_sync_datalist) { | 272 | /* Now refile inode to proper lists */ |
276 | jh = commit_transaction->t_sync_datalist; | 273 | list_for_each_entry_safe(jinode, next_i, |
277 | bh = jh2bh(jh); | 274 | &commit_transaction->t_inode_list, i_list) { |
278 | locked = 0; | 275 | list_del(&jinode->i_list); |
279 | 276 | if (jinode->i_next_transaction) { | |
280 | /* Get reference just to make sure buffer does not disappear | 277 | jinode->i_transaction = jinode->i_next_transaction; |
281 | * when we are forced to drop various locks */ | 278 | jinode->i_next_transaction = NULL; |
282 | get_bh(bh); | 279 | list_add(&jinode->i_list, |
283 | /* If the buffer is dirty, we need to submit IO and hence | 280 | &jinode->i_transaction->t_inode_list); |
284 | * we need the buffer lock. We try to lock the buffer without | ||
285 | * blocking. If we fail, we need to drop j_list_lock and do | ||
286 | * blocking lock_buffer(). | ||
287 | */ | ||
288 | if (buffer_dirty(bh)) { | ||
289 | if (test_set_buffer_locked(bh)) { | ||
290 | BUFFER_TRACE(bh, "needs blocking lock"); | ||
291 | spin_unlock(&journal->j_list_lock); | ||
292 | /* Write out all data to prevent deadlocks */ | ||
293 | journal_do_submit_data(wbuf, bufs); | ||
294 | bufs = 0; | ||
295 | lock_buffer(bh); | ||
296 | spin_lock(&journal->j_list_lock); | ||
297 | } | ||
298 | locked = 1; | ||
299 | } | ||
300 | /* We have to get bh_state lock. Again out of order, sigh. */ | ||
301 | if (!inverted_lock(journal, bh)) { | ||
302 | jbd_lock_bh_state(bh); | ||
303 | spin_lock(&journal->j_list_lock); | ||
304 | } | ||
305 | /* Someone already cleaned up the buffer? */ | ||
306 | if (!buffer_jbd(bh) | ||
307 | || jh->b_transaction != commit_transaction | ||
308 | || jh->b_jlist != BJ_SyncData) { | ||
309 | jbd_unlock_bh_state(bh); | ||
310 | if (locked) | ||
311 | unlock_buffer(bh); | ||
312 | BUFFER_TRACE(bh, "already cleaned up"); | ||
313 | put_bh(bh); | ||
314 | continue; | ||
315 | } | ||
316 | if (locked && test_clear_buffer_dirty(bh)) { | ||
317 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | ||
318 | wbuf[bufs++] = bh; | ||
319 | __jbd2_journal_file_buffer(jh, commit_transaction, | ||
320 | BJ_Locked); | ||
321 | jbd_unlock_bh_state(bh); | ||
322 | if (bufs == journal->j_wbufsize) { | ||
323 | spin_unlock(&journal->j_list_lock); | ||
324 | journal_do_submit_data(wbuf, bufs); | ||
325 | bufs = 0; | ||
326 | goto write_out_data; | ||
327 | } | ||
328 | } else if (!locked && buffer_locked(bh)) { | ||
329 | __jbd2_journal_file_buffer(jh, commit_transaction, | ||
330 | BJ_Locked); | ||
331 | jbd_unlock_bh_state(bh); | ||
332 | put_bh(bh); | ||
333 | } else { | 281 | } else { |
334 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 282 | jinode->i_transaction = NULL; |
335 | __jbd2_journal_unfile_buffer(jh); | ||
336 | jbd_unlock_bh_state(bh); | ||
337 | if (locked) | ||
338 | unlock_buffer(bh); | ||
339 | jbd2_journal_remove_journal_head(bh); | ||
340 | /* Once for our safety reference, once for | ||
341 | * jbd2_journal_remove_journal_head() */ | ||
342 | put_bh(bh); | ||
343 | put_bh(bh); | ||
344 | } | ||
345 | |||
346 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | ||
347 | spin_unlock(&journal->j_list_lock); | ||
348 | goto write_out_data; | ||
349 | } | 283 | } |
350 | } | 284 | } |
351 | spin_unlock(&journal->j_list_lock); | 285 | spin_unlock(&journal->j_list_lock); |
352 | journal_do_submit_data(wbuf, bufs); | 286 | |
287 | return ret; | ||
353 | } | 288 | } |
354 | 289 | ||
355 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | 290 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
524 | * Now start flushing things to disk, in the order they appear | 459 | * Now start flushing things to disk, in the order they appear |
525 | * on the transaction lists. Data blocks go first. | 460 | * on the transaction lists. Data blocks go first. |
526 | */ | 461 | */ |
527 | err = 0; | 462 | err = journal_submit_data_buffers(journal, commit_transaction); |
528 | journal_submit_data_buffers(journal, commit_transaction); | ||
529 | |||
530 | /* | ||
531 | * Wait for all previously submitted IO to complete if commit | ||
532 | * record is to be written synchronously. | ||
533 | */ | ||
534 | spin_lock(&journal->j_list_lock); | ||
535 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | ||
536 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) | ||
537 | err = journal_wait_on_locked_list(journal, | ||
538 | commit_transaction); | ||
539 | |||
540 | spin_unlock(&journal->j_list_lock); | ||
541 | |||
542 | if (err) | 463 | if (err) |
543 | jbd2_journal_abort(journal, err); | 464 | jbd2_journal_abort(journal, err); |
544 | 465 | ||
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
547 | jbd_debug(3, "JBD: commit phase 2\n"); | 468 | jbd_debug(3, "JBD: commit phase 2\n"); |
548 | 469 | ||
549 | /* | 470 | /* |
550 | * If we found any dirty or locked buffers, then we should have | ||
551 | * looped back up to the write_out_data label. If there weren't | ||
552 | * any then journal_clean_data_list should have wiped the list | ||
553 | * clean by now, so check that it is in fact empty. | ||
554 | */ | ||
555 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | ||
556 | |||
557 | jbd_debug (3, "JBD: commit phase 3\n"); | ||
558 | |||
559 | /* | ||
560 | * Way to go: we have now written out all of the data for a | 471 | * Way to go: we have now written out all of the data for a |
561 | * transaction! Now comes the tricky part: we need to write out | 472 | * transaction! Now comes the tricky part: we need to write out |
562 | * metadata. Loop over the transaction's entire buffer list: | 473 | * metadata. Loop over the transaction's entire buffer list: |
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
574 | J_ASSERT(commit_transaction->t_nr_buffers <= | 485 | J_ASSERT(commit_transaction->t_nr_buffers <= |
575 | commit_transaction->t_outstanding_credits); | 486 | commit_transaction->t_outstanding_credits); |
576 | 487 | ||
488 | err = 0; | ||
577 | descriptor = NULL; | 489 | descriptor = NULL; |
578 | bufs = 0; | 490 | bufs = 0; |
579 | while (commit_transaction->t_buffers) { | 491 | while (commit_transaction->t_buffers) { |
@@ -748,15 +660,19 @@ start_journal_io: | |||
748 | &cbh, crc32_sum); | 660 | &cbh, crc32_sum); |
749 | if (err) | 661 | if (err) |
750 | __jbd2_journal_abort_hard(journal); | 662 | __jbd2_journal_abort_hard(journal); |
751 | |||
752 | spin_lock(&journal->j_list_lock); | ||
753 | err = journal_wait_on_locked_list(journal, | ||
754 | commit_transaction); | ||
755 | spin_unlock(&journal->j_list_lock); | ||
756 | if (err) | ||
757 | __jbd2_journal_abort_hard(journal); | ||
758 | } | 663 | } |
759 | 664 | ||
665 | /* | ||
666 | * This is the right place to wait for data buffers both for ASYNC | ||
667 | * and !ASYNC commit. If commit is ASYNC, we need to wait only after | ||
668 | * the commit block went to disk (which happens above). If commit is | ||
669 | * SYNC, we need to wait for data buffers before we start writing | ||
670 | * commit block, which happens below in such setting. | ||
671 | */ | ||
672 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | ||
673 | if (err) | ||
674 | jbd2_journal_abort(journal, err); | ||
675 | |||
760 | /* Lo and behold: we have just managed to send a transaction to | 676 | /* Lo and behold: we have just managed to send a transaction to |
761 | the log. Before we can commit it, wait for the IO so far to | 677 | the log. Before we can commit it, wait for the IO so far to |
762 | complete. Control buffers being written are on the | 678 | complete. Control buffers being written are on the |
@@ -768,7 +684,7 @@ start_journal_io: | |||
768 | so we incur less scheduling load. | 684 | so we incur less scheduling load. |
769 | */ | 685 | */ |
770 | 686 | ||
771 | jbd_debug(3, "JBD: commit phase 4\n"); | 687 | jbd_debug(3, "JBD: commit phase 3\n"); |
772 | 688 | ||
773 | /* | 689 | /* |
774 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 690 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
@@ -827,7 +743,7 @@ wait_for_iobuf: | |||
827 | 743 | ||
828 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 744 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
829 | 745 | ||
830 | jbd_debug(3, "JBD: commit phase 5\n"); | 746 | jbd_debug(3, "JBD: commit phase 4\n"); |
831 | 747 | ||
832 | /* Here we wait for the revoke record and descriptor record buffers */ | 748 | /* Here we wait for the revoke record and descriptor record buffers */ |
833 | wait_for_ctlbuf: | 749 | wait_for_ctlbuf: |
@@ -854,7 +770,7 @@ wait_for_iobuf: | |||
854 | /* AKPM: bforget here */ | 770 | /* AKPM: bforget here */ |
855 | } | 771 | } |
856 | 772 | ||
857 | jbd_debug(3, "JBD: commit phase 6\n"); | 773 | jbd_debug(3, "JBD: commit phase 5\n"); |
858 | 774 | ||
859 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 775 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
860 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 776 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
@@ -874,9 +790,9 @@ wait_for_iobuf: | |||
874 | transaction can be removed from any checkpoint list it was on | 790 | transaction can be removed from any checkpoint list it was on |
875 | before. */ | 791 | before. */ |
876 | 792 | ||
877 | jbd_debug(3, "JBD: commit phase 7\n"); | 793 | jbd_debug(3, "JBD: commit phase 6\n"); |
878 | 794 | ||
879 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | 795 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); |
880 | J_ASSERT(commit_transaction->t_buffers == NULL); | 796 | J_ASSERT(commit_transaction->t_buffers == NULL); |
881 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 797 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
882 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 798 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
@@ -997,7 +913,7 @@ restart_loop: | |||
997 | 913 | ||
998 | /* Done with this transaction! */ | 914 | /* Done with this transaction! */ |
999 | 915 | ||
1000 | jbd_debug(3, "JBD: commit phase 8\n"); | 916 | jbd_debug(3, "JBD: commit phase 7\n"); |
1001 | 917 | ||
1002 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 918 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
1003 | 919 | ||
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a79..b26c6d9fe6ae 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); | |||
50 | EXPORT_SYMBOL(jbd2_journal_get_write_access); | 50 | EXPORT_SYMBOL(jbd2_journal_get_write_access); |
51 | EXPORT_SYMBOL(jbd2_journal_get_create_access); | 51 | EXPORT_SYMBOL(jbd2_journal_get_create_access); |
52 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); | 52 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); |
53 | EXPORT_SYMBOL(jbd2_journal_dirty_data); | ||
54 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); | 53 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); |
55 | EXPORT_SYMBOL(jbd2_journal_release_buffer); | 54 | EXPORT_SYMBOL(jbd2_journal_release_buffer); |
56 | EXPORT_SYMBOL(jbd2_journal_forget); | 55 | EXPORT_SYMBOL(jbd2_journal_forget); |
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); | |||
82 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); | 81 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); |
83 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); | 82 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); |
84 | EXPORT_SYMBOL(jbd2_journal_force_commit); | 83 | EXPORT_SYMBOL(jbd2_journal_force_commit); |
84 | EXPORT_SYMBOL(jbd2_journal_file_inode); | ||
85 | EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); | ||
86 | EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); | ||
87 | EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); | ||
85 | 88 | ||
86 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | 89 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); |
87 | static void __journal_abort_soft (journal_t *journal, int errno); | 90 | static void __journal_abort_soft (journal_t *journal, int errno); |
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) | |||
2195 | } | 2198 | } |
2196 | 2199 | ||
2197 | /* | 2200 | /* |
2201 | * Initialize jbd inode head | ||
2202 | */ | ||
2203 | void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) | ||
2204 | { | ||
2205 | jinode->i_transaction = NULL; | ||
2206 | jinode->i_next_transaction = NULL; | ||
2207 | jinode->i_vfs_inode = inode; | ||
2208 | jinode->i_flags = 0; | ||
2209 | INIT_LIST_HEAD(&jinode->i_list); | ||
2210 | } | ||
2211 | |||
2212 | /* | ||
2213 | * Function to be called before we start removing inode from memory (i.e., | ||
2214 | * clear_inode() is a fine place to be called from). It removes inode from | ||
2215 | * transaction's lists. | ||
2216 | */ | ||
2217 | void jbd2_journal_release_jbd_inode(journal_t *journal, | ||
2218 | struct jbd2_inode *jinode) | ||
2219 | { | ||
2220 | int writeout = 0; | ||
2221 | |||
2222 | if (!journal) | ||
2223 | return; | ||
2224 | restart: | ||
2225 | spin_lock(&journal->j_list_lock); | ||
2226 | /* Is commit writing out inode - we have to wait */ | ||
2227 | if (jinode->i_flags & JI_COMMIT_RUNNING) { | ||
2228 | wait_queue_head_t *wq; | ||
2229 | DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); | ||
2230 | wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
2231 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
2232 | spin_unlock(&journal->j_list_lock); | ||
2233 | schedule(); | ||
2234 | finish_wait(wq, &wait.wait); | ||
2235 | goto restart; | ||
2236 | } | ||
2237 | |||
2238 | /* Do we need to wait for data writeback? */ | ||
2239 | if (journal->j_committing_transaction == jinode->i_transaction) | ||
2240 | writeout = 1; | ||
2241 | if (jinode->i_transaction) { | ||
2242 | list_del(&jinode->i_list); | ||
2243 | jinode->i_transaction = NULL; | ||
2244 | } | ||
2245 | spin_unlock(&journal->j_list_lock); | ||
2246 | } | ||
2247 | |||
2248 | /* | ||
2198 | * debugfs tunables | 2249 | * debugfs tunables |
2199 | */ | 2250 | */ |
2200 | #ifdef CONFIG_JBD2_DEBUG | 2251 | #ifdef CONFIG_JBD2_DEBUG |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6e006e67804..4f7cadbb19fa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | |||
41 | * new transaction and we can't block without protecting against other | 41 | * new transaction and we can't block without protecting against other |
42 | * processes trying to touch the journal while it is in transition. | 42 | * processes trying to touch the journal while it is in transition. |
43 | * | 43 | * |
44 | * Called under j_state_lock | ||
45 | */ | 44 | */ |
46 | 45 | ||
47 | static transaction_t * | 46 | static transaction_t * |
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
52 | transaction->t_tid = journal->j_transaction_sequence++; | 51 | transaction->t_tid = journal->j_transaction_sequence++; |
53 | transaction->t_expires = jiffies + journal->j_commit_interval; | 52 | transaction->t_expires = jiffies + journal->j_commit_interval; |
54 | spin_lock_init(&transaction->t_handle_lock); | 53 | spin_lock_init(&transaction->t_handle_lock); |
54 | INIT_LIST_HEAD(&transaction->t_inode_list); | ||
55 | 55 | ||
56 | /* Set up the commit timer for the new transaction. */ | 56 | /* Set up the commit timer for the new transaction. */ |
57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | 57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); |
@@ -943,183 +943,6 @@ out: | |||
943 | } | 943 | } |
944 | 944 | ||
945 | /** | 945 | /** |
946 | * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which | ||
947 | * needs to be flushed before we can commit the | ||
948 | * current transaction. | ||
949 | * @handle: transaction | ||
950 | * @bh: bufferhead to mark | ||
951 | * | ||
952 | * The buffer is placed on the transaction's data list and is marked as | ||
953 | * belonging to the transaction. | ||
954 | * | ||
955 | * Returns error number or 0 on success. | ||
956 | * | ||
957 | * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage | ||
958 | * by kswapd. | ||
959 | */ | ||
960 | int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
961 | { | ||
962 | journal_t *journal = handle->h_transaction->t_journal; | ||
963 | int need_brelse = 0; | ||
964 | struct journal_head *jh; | ||
965 | |||
966 | if (is_handle_aborted(handle)) | ||
967 | return 0; | ||
968 | |||
969 | jh = jbd2_journal_add_journal_head(bh); | ||
970 | JBUFFER_TRACE(jh, "entry"); | ||
971 | |||
972 | /* | ||
973 | * The buffer could *already* be dirty. Writeout can start | ||
974 | * at any time. | ||
975 | */ | ||
976 | jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | ||
977 | |||
978 | /* | ||
979 | * What if the buffer is already part of a running transaction? | ||
980 | * | ||
981 | * There are two cases: | ||
982 | * 1) It is part of the current running transaction. Refile it, | ||
983 | * just in case we have allocated it as metadata, deallocated | ||
984 | * it, then reallocated it as data. | ||
985 | * 2) It is part of the previous, still-committing transaction. | ||
986 | * If all we want to do is to guarantee that the buffer will be | ||
987 | * written to disk before this new transaction commits, then | ||
988 | * being sure that the *previous* transaction has this same | ||
989 | * property is sufficient for us! Just leave it on its old | ||
990 | * transaction. | ||
991 | * | ||
992 | * In case (2), the buffer must not already exist as metadata | ||
993 | * --- that would violate write ordering (a transaction is free | ||
994 | * to write its data at any point, even before the previous | ||
995 | * committing transaction has committed). The caller must | ||
996 | * never, ever allow this to happen: there's nothing we can do | ||
997 | * about it in this layer. | ||
998 | */ | ||
999 | jbd_lock_bh_state(bh); | ||
1000 | spin_lock(&journal->j_list_lock); | ||
1001 | |||
1002 | /* Now that we have bh_state locked, are we really still mapped? */ | ||
1003 | if (!buffer_mapped(bh)) { | ||
1004 | JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); | ||
1005 | goto no_journal; | ||
1006 | } | ||
1007 | |||
1008 | if (jh->b_transaction) { | ||
1009 | JBUFFER_TRACE(jh, "has transaction"); | ||
1010 | if (jh->b_transaction != handle->h_transaction) { | ||
1011 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
1012 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1013 | journal->j_committing_transaction); | ||
1014 | |||
1015 | /* @@@ IS THIS TRUE ? */ | ||
1016 | /* | ||
1017 | * Not any more. Scenario: someone does a write() | ||
1018 | * in data=journal mode. The buffer's transaction has | ||
1019 | * moved into commit. Then someone does another | ||
1020 | * write() to the file. We do the frozen data copyout | ||
1021 | * and set b_next_transaction to point to j_running_t. | ||
1022 | * And while we're in that state, someone does a | ||
1023 | * writepage() in an attempt to pageout the same area | ||
1024 | * of the file via a shared mapping. At present that | ||
1025 | * calls jbd2_journal_dirty_data(), and we get right here. | ||
1026 | * It may be too late to journal the data. Simply | ||
1027 | * falling through to the next test will suffice: the | ||
1028 | * data will be dirty and wil be checkpointed. The | ||
1029 | * ordering comments in the next comment block still | ||
1030 | * apply. | ||
1031 | */ | ||
1032 | //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
1033 | |||
1034 | /* | ||
1035 | * If we're journalling data, and this buffer was | ||
1036 | * subject to a write(), it could be metadata, forget | ||
1037 | * or shadow against the committing transaction. Now, | ||
1038 | * someone has dirtied the same darn page via a mapping | ||
1039 | * and it is being writepage()'d. | ||
1040 | * We *could* just steal the page from commit, with some | ||
1041 | * fancy locking there. Instead, we just skip it - | ||
1042 | * don't tie the page's buffers to the new transaction | ||
1043 | * at all. | ||
1044 | * Implication: if we crash before the writepage() data | ||
1045 | * is written into the filesystem, recovery will replay | ||
1046 | * the write() data. | ||
1047 | */ | ||
1048 | if (jh->b_jlist != BJ_None && | ||
1049 | jh->b_jlist != BJ_SyncData && | ||
1050 | jh->b_jlist != BJ_Locked) { | ||
1051 | JBUFFER_TRACE(jh, "Not stealing"); | ||
1052 | goto no_journal; | ||
1053 | } | ||
1054 | |||
1055 | /* | ||
1056 | * This buffer may be undergoing writeout in commit. We | ||
1057 | * can't return from here and let the caller dirty it | ||
1058 | * again because that can cause the write-out loop in | ||
1059 | * commit to never terminate. | ||
1060 | */ | ||
1061 | if (buffer_dirty(bh)) { | ||
1062 | get_bh(bh); | ||
1063 | spin_unlock(&journal->j_list_lock); | ||
1064 | jbd_unlock_bh_state(bh); | ||
1065 | need_brelse = 1; | ||
1066 | sync_dirty_buffer(bh); | ||
1067 | jbd_lock_bh_state(bh); | ||
1068 | spin_lock(&journal->j_list_lock); | ||
1069 | /* Since we dropped the lock... */ | ||
1070 | if (!buffer_mapped(bh)) { | ||
1071 | JBUFFER_TRACE(jh, "buffer got unmapped"); | ||
1072 | goto no_journal; | ||
1073 | } | ||
1074 | /* The buffer may become locked again at any | ||
1075 | time if it is redirtied */ | ||
1076 | } | ||
1077 | |||
1078 | /* journal_clean_data_list() may have got there first */ | ||
1079 | if (jh->b_transaction != NULL) { | ||
1080 | JBUFFER_TRACE(jh, "unfile from commit"); | ||
1081 | __jbd2_journal_temp_unlink_buffer(jh); | ||
1082 | /* It still points to the committing | ||
1083 | * transaction; move it to this one so | ||
1084 | * that the refile assert checks are | ||
1085 | * happy. */ | ||
1086 | jh->b_transaction = handle->h_transaction; | ||
1087 | } | ||
1088 | /* The buffer will be refiled below */ | ||
1089 | |||
1090 | } | ||
1091 | /* | ||
1092 | * Special case --- the buffer might actually have been | ||
1093 | * allocated and then immediately deallocated in the previous, | ||
1094 | * committing transaction, so might still be left on that | ||
1095 | * transaction's metadata lists. | ||
1096 | */ | ||
1097 | if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | ||
1098 | JBUFFER_TRACE(jh, "not on correct data list: unfile"); | ||
1099 | J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | ||
1100 | __jbd2_journal_temp_unlink_buffer(jh); | ||
1101 | jh->b_transaction = handle->h_transaction; | ||
1102 | JBUFFER_TRACE(jh, "file as data"); | ||
1103 | __jbd2_journal_file_buffer(jh, handle->h_transaction, | ||
1104 | BJ_SyncData); | ||
1105 | } | ||
1106 | } else { | ||
1107 | JBUFFER_TRACE(jh, "not on a transaction"); | ||
1108 | __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | ||
1109 | } | ||
1110 | no_journal: | ||
1111 | spin_unlock(&journal->j_list_lock); | ||
1112 | jbd_unlock_bh_state(bh); | ||
1113 | if (need_brelse) { | ||
1114 | BUFFER_TRACE(bh, "brelse"); | ||
1115 | __brelse(bh); | ||
1116 | } | ||
1117 | JBUFFER_TRACE(jh, "exit"); | ||
1118 | jbd2_journal_put_journal_head(jh); | ||
1119 | return 0; | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata | 946 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata |
1124 | * @handle: transaction to add buffer to. | 947 | * @handle: transaction to add buffer to. |
1125 | * @bh: buffer to mark | 948 | * @bh: buffer to mark |
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |||
1541 | * Remove a buffer from the appropriate transaction list. | 1364 | * Remove a buffer from the appropriate transaction list. |
1542 | * | 1365 | * |
1543 | * Note that this function can *change* the value of | 1366 | * Note that this function can *change* the value of |
1544 | * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | 1367 | * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, |
1545 | * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | 1368 | * t_log_list or t_reserved_list. If the caller is holding onto a copy of one |
1546 | * is holding onto a copy of one of thee pointers, it could go bad. | 1369 | * of these pointers, it could go bad. Generally the caller needs to re-read |
1547 | * Generally the caller needs to re-read the pointer from the transaction_t. | 1370 | * the pointer from the transaction_t. |
1548 | * | 1371 | * |
1549 | * Called under j_list_lock. The journal may not be locked. | 1372 | * Called under j_list_lock. The journal may not be locked. |
1550 | */ | 1373 | */ |
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
1566 | switch (jh->b_jlist) { | 1389 | switch (jh->b_jlist) { |
1567 | case BJ_None: | 1390 | case BJ_None: |
1568 | return; | 1391 | return; |
1569 | case BJ_SyncData: | ||
1570 | list = &transaction->t_sync_datalist; | ||
1571 | break; | ||
1572 | case BJ_Metadata: | 1392 | case BJ_Metadata: |
1573 | transaction->t_nr_buffers--; | 1393 | transaction->t_nr_buffers--; |
1574 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | 1394 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); |
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
1589 | case BJ_Reserved: | 1409 | case BJ_Reserved: |
1590 | list = &transaction->t_reserved_list; | 1410 | list = &transaction->t_reserved_list; |
1591 | break; | 1411 | break; |
1592 | case BJ_Locked: | ||
1593 | list = &transaction->t_locked_list; | ||
1594 | break; | ||
1595 | } | 1412 | } |
1596 | 1413 | ||
1597 | __blist_del_buffer(list, jh); | 1414 | __blist_del_buffer(list, jh); |
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |||
1634 | goto out; | 1451 | goto out; |
1635 | 1452 | ||
1636 | spin_lock(&journal->j_list_lock); | 1453 | spin_lock(&journal->j_list_lock); |
1637 | if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { | 1454 | if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { |
1638 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | ||
1639 | /* A written-back ordered data buffer */ | ||
1640 | JBUFFER_TRACE(jh, "release data"); | ||
1641 | __jbd2_journal_unfile_buffer(jh); | ||
1642 | jbd2_journal_remove_journal_head(bh); | ||
1643 | __brelse(bh); | ||
1644 | } | ||
1645 | } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | ||
1646 | /* written-back checkpointed metadata buffer */ | 1455 | /* written-back checkpointed metadata buffer */ |
1647 | if (jh->b_jlist == BJ_None) { | 1456 | if (jh->b_jlist == BJ_None) { |
1648 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 1457 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
@@ -1656,12 +1465,43 @@ out: | |||
1656 | return; | 1465 | return; |
1657 | } | 1466 | } |
1658 | 1467 | ||
1468 | /* | ||
1469 | * jbd2_journal_try_to_free_buffers() could race with | ||
1470 | * jbd2_journal_commit_transaction(). The latter might still hold a | ||
1471 | * reference to the buffers while inspecting them on | ||
1472 | * t_syncdata_list or t_locked_list. | ||
1473 | * | ||
1474 | * jbd2_journal_try_to_free_buffers() calls this function to wait for | ||
1475 | * the committing transaction to finish syncing data buffers before | ||
1476 | * trying to free the buffers again. | ||
1477 | * | ||
1478 | * Takes journal->j_state_lock itself; must be called without it held. | ||
1479 | */ | ||
1480 | static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) | ||
1481 | { | ||
1482 | transaction_t *transaction; | ||
1483 | tid_t tid; | ||
1484 | |||
1485 | spin_lock(&journal->j_state_lock); | ||
1486 | transaction = journal->j_committing_transaction; | ||
1487 | |||
1488 | if (!transaction) { | ||
1489 | spin_unlock(&journal->j_state_lock); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | tid = transaction->t_tid; | ||
1494 | spin_unlock(&journal->j_state_lock); | ||
1495 | jbd2_log_wait_commit(journal, tid); | ||
1496 | } | ||
1659 | 1497 | ||
1660 | /** | 1498 | /** |
1661 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. | 1499 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. |
1662 | * @journal: journal for operation | 1500 | * @journal: journal for operation |
1663 | * @page: to try and free | 1501 | * @page: to try and free |
1664 | * @unused_gfp_mask: unused | 1502 | * @gfp_mask: we use the mask to detect how hard we should try to release |
1503 | * buffers. If __GFP_WAIT and __GFP_FS are both set, we wait for commit code to | ||
1504 | * release the buffers. | ||
1665 | * | 1505 | * |
1666 | * | 1506 | * |
1667 | * For all the buffers on this page, | 1507 | * For all the buffers on this page, |
@@ -1690,9 +1530,11 @@ out: | |||
1690 | * journal_try_to_free_buffer() is changing its state. But that | 1530 | * journal_try_to_free_buffer() is changing its state. But that |
1691 | * cannot happen because we never reallocate freed data as metadata | 1531 | * cannot happen because we never reallocate freed data as metadata |
1692 | * while the data is part of a transaction. Yes? | 1532 | * while the data is part of a transaction. Yes? |
1533 | * | ||
1534 | * Return 0 on failure, 1 on success | ||
1693 | */ | 1535 | */ |
1694 | int jbd2_journal_try_to_free_buffers(journal_t *journal, | 1536 | int jbd2_journal_try_to_free_buffers(journal_t *journal, |
1695 | struct page *page, gfp_t unused_gfp_mask) | 1537 | struct page *page, gfp_t gfp_mask) |
1696 | { | 1538 | { |
1697 | struct buffer_head *head; | 1539 | struct buffer_head *head; |
1698 | struct buffer_head *bh; | 1540 | struct buffer_head *bh; |
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |||
1708 | /* | 1550 | /* |
1709 | * We take our own ref against the journal_head here to avoid | 1551 | * We take our own ref against the journal_head here to avoid |
1710 | * having to add tons of locking around each instance of | 1552 | * having to add tons of locking around each instance of |
1711 | * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). | 1553 | * jbd2_journal_remove_journal_head() and |
1554 | * jbd2_journal_put_journal_head(). | ||
1712 | */ | 1555 | */ |
1713 | jh = jbd2_journal_grab_journal_head(bh); | 1556 | jh = jbd2_journal_grab_journal_head(bh); |
1714 | if (!jh) | 1557 | if (!jh) |
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |||
1721 | if (buffer_jbd(bh)) | 1564 | if (buffer_jbd(bh)) |
1722 | goto busy; | 1565 | goto busy; |
1723 | } while ((bh = bh->b_this_page) != head); | 1566 | } while ((bh = bh->b_this_page) != head); |
1567 | |||
1724 | ret = try_to_free_buffers(page); | 1568 | ret = try_to_free_buffers(page); |
1569 | |||
1570 | /* | ||
1571 | * There are a number of places where jbd2_journal_try_to_free_buffers() | ||
1572 | * could race with jbd2_journal_commit_transaction(): the latter may still | ||
1573 | * hold a reference to the buffers to free while processing them, in | ||
1574 | * which case try_to_free_buffers() fails to free those buffers. Some | ||
1575 | * callers of releasepage() require the page buffers to be dropped and | ||
1576 | * treat a failure to free as an error (such as generic_file_direct_IO()). | ||
1577 | * | ||
1578 | * So, if the caller of try_to_release_page() wants the synchronous | ||
1579 | * behaviour (i.e. make sure buffers are dropped upon return), | ||
1580 | * let's wait for the current transaction to finish flushing | ||
1581 | * dirty data buffers, then try to free those buffers again, | ||
1582 | * with the journal locked. | ||
1583 | */ | ||
1584 | if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { | ||
1585 | jbd2_journal_wait_for_transaction_sync_data(journal); | ||
1586 | ret = try_to_free_buffers(page); | ||
1587 | } | ||
1588 | |||
1725 | busy: | 1589 | busy: |
1726 | return ret; | 1590 | return ret; |
1727 | } | 1591 | } |
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |||
1823 | if (!buffer_jbd(bh)) | 1687 | if (!buffer_jbd(bh)) |
1824 | goto zap_buffer_unlocked; | 1688 | goto zap_buffer_unlocked; |
1825 | 1689 | ||
1690 | /* OK, we have data buffer in journaled mode */ | ||
1826 | spin_lock(&journal->j_state_lock); | 1691 | spin_lock(&journal->j_state_lock); |
1827 | jbd_lock_bh_state(bh); | 1692 | jbd_lock_bh_state(bh); |
1828 | spin_lock(&journal->j_list_lock); | 1693 | spin_lock(&journal->j_list_lock); |
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |||
1886 | } | 1751 | } |
1887 | } else if (transaction == journal->j_committing_transaction) { | 1752 | } else if (transaction == journal->j_committing_transaction) { |
1888 | JBUFFER_TRACE(jh, "on committing transaction"); | 1753 | JBUFFER_TRACE(jh, "on committing transaction"); |
1889 | if (jh->b_jlist == BJ_Locked) { | ||
1890 | /* | ||
1891 | * The buffer is on the committing transaction's locked | ||
1892 | * list. We have the buffer locked, so I/O has | ||
1893 | * completed. So we can nail the buffer now. | ||
1894 | */ | ||
1895 | may_free = __dispose_buffer(jh, transaction); | ||
1896 | goto zap_buffer; | ||
1897 | } | ||
1898 | /* | 1754 | /* |
1899 | * If it is committing, we simply cannot touch it. We | 1755 | * If it is committing, we simply cannot touch it. We |
1900 | * can remove it's next_transaction pointer from the | 1756 | * can remove it's next_transaction pointer from the |
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
2027 | J_ASSERT_JH(jh, !jh->b_committed_data); | 1883 | J_ASSERT_JH(jh, !jh->b_committed_data); |
2028 | J_ASSERT_JH(jh, !jh->b_frozen_data); | 1884 | J_ASSERT_JH(jh, !jh->b_frozen_data); |
2029 | return; | 1885 | return; |
2030 | case BJ_SyncData: | ||
2031 | list = &transaction->t_sync_datalist; | ||
2032 | break; | ||
2033 | case BJ_Metadata: | 1886 | case BJ_Metadata: |
2034 | transaction->t_nr_buffers++; | 1887 | transaction->t_nr_buffers++; |
2035 | list = &transaction->t_buffers; | 1888 | list = &transaction->t_buffers; |
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
2049 | case BJ_Reserved: | 1902 | case BJ_Reserved: |
2050 | list = &transaction->t_reserved_list; | 1903 | list = &transaction->t_reserved_list; |
2051 | break; | 1904 | break; |
2052 | case BJ_Locked: | ||
2053 | list = &transaction->t_locked_list; | ||
2054 | break; | ||
2055 | } | 1905 | } |
2056 | 1906 | ||
2057 | __blist_add_buffer(list, jh); | 1907 | __blist_add_buffer(list, jh); |
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) | |||
2141 | spin_unlock(&journal->j_list_lock); | 1991 | spin_unlock(&journal->j_list_lock); |
2142 | __brelse(bh); | 1992 | __brelse(bh); |
2143 | } | 1993 | } |
1994 | |||
1995 | /* | ||
1996 | * File inode in the inode list of the handle's transaction | ||
1997 | */ | ||
1998 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) | ||
1999 | { | ||
2000 | transaction_t *transaction = handle->h_transaction; | ||
2001 | journal_t *journal = transaction->t_journal; | ||
2002 | |||
2003 | if (is_handle_aborted(handle)) | ||
2004 | return -EIO; | ||
2005 | |||
2006 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, | ||
2007 | transaction->t_tid); | ||
2008 | |||
2009 | /* | ||
2010 | * First check whether inode isn't already on the transaction's | ||
2011 | * lists without taking the lock. Note that this check is safe | ||
2012 | * without the lock as we cannot race with somebody removing inode | ||
2013 | * from the transaction. The reason is that we remove inode from the | ||
2014 | * transaction only in journal_release_jbd_inode() and when we commit | ||
2015 | * the transaction. We are guarded from the first case by holding | ||
2016 | * a reference to the inode. We are safe against the second case | ||
2017 | * because if jinode->i_transaction == transaction, commit code | ||
2018 | * cannot touch the transaction because we hold reference to it, | ||
2019 | * and if jinode->i_next_transaction == transaction, commit code | ||
2020 | * will only file the inode where we want it. | ||
2021 | */ | ||
2022 | if (jinode->i_transaction == transaction || | ||
2023 | jinode->i_next_transaction == transaction) | ||
2024 | return 0; | ||
2025 | |||
2026 | spin_lock(&journal->j_list_lock); | ||
2027 | |||
2028 | if (jinode->i_transaction == transaction || | ||
2029 | jinode->i_next_transaction == transaction) | ||
2030 | goto done; | ||
2031 | |||
2032 | /* On some different transaction's list - should be | ||
2033 | * the committing one */ | ||
2034 | if (jinode->i_transaction) { | ||
2035 | J_ASSERT(jinode->i_next_transaction == NULL); | ||
2036 | J_ASSERT(jinode->i_transaction == | ||
2037 | journal->j_committing_transaction); | ||
2038 | jinode->i_next_transaction = transaction; | ||
2039 | goto done; | ||
2040 | } | ||
2041 | /* Not on any transaction list... */ | ||
2042 | J_ASSERT(!jinode->i_next_transaction); | ||
2043 | jinode->i_transaction = transaction; | ||
2044 | list_add(&jinode->i_list, &transaction->t_inode_list); | ||
2045 | done: | ||
2046 | spin_unlock(&journal->j_list_lock); | ||
2047 | |||
2048 | return 0; | ||
2049 | } | ||
2050 | |||
2051 | /* | ||
2052 | * This function must be called before truncation happens when the inode | ||
2053 | * is journaled in ordered mode. It starts writeout of the truncated part | ||
2054 | * in case the inode is in the committing transaction, so that we uphold | ||
2055 | * the ordered-mode consistency guarantees. | ||
2056 | */ | ||
2057 | int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, | ||
2058 | loff_t new_size) | ||
2059 | { | ||
2060 | journal_t *journal; | ||
2061 | transaction_t *commit_trans; | ||
2062 | int ret = 0; | ||
2063 | |||
2064 | if (!inode->i_transaction && !inode->i_next_transaction) | ||
2065 | goto out; | ||
2066 | journal = inode->i_transaction->t_journal; | ||
2067 | spin_lock(&journal->j_state_lock); | ||
2068 | commit_trans = journal->j_committing_transaction; | ||
2069 | spin_unlock(&journal->j_state_lock); | ||
2070 | if (inode->i_transaction == commit_trans) { | ||
2071 | ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, | ||
2072 | new_size, LLONG_MAX); | ||
2073 | if (ret) | ||
2074 | jbd2_journal_abort(journal, ret); | ||
2075 | } | ||
2076 | out: | ||
2077 | return ret; | ||
2078 | } | ||