aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd2/commit.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd2/commit.c')
-rw-r--r--fs/jbd2/commit.c294
1 files changed, 105 insertions, 189 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/crc32.h> 24#include <linux/crc32.h>
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
25 27
26/* 28/*
27 * Default IO end handler for temporary BJ_IO buffer_heads. 29 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37} 39}
38 40
39/* 41/*
40 * When an ext3-ordered file is truncated, it is possible that many pages are 42 * When an ext4 file is truncated, it is possible that some pages are not
41 * not sucessfully freed, because they are attached to a committing transaction. 43 * successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no 44 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable 45 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes 46 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
80} 82}
81 83
82/* 84/*
83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84 * held. For ranking reasons we must trylock. If we lose, schedule away and
85 * return 0. j_list_lock is dropped in this case.
86 */
87static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88{
89 if (!jbd_trylock_bh_state(bh)) {
90 spin_unlock(&journal->j_list_lock);
91 schedule();
92 return 0;
93 }
94 return 1;
95}
96
97/*
98 * Done it all: now submit the commit record. We should have 85 * Done it all: now submit the commit record. We should have
99 * cleaned up our previous buffers by now, so if we are in abort 86 * cleaned up our previous buffers by now, so if we are in abort
100 * mode we can now just skip the rest of the journal write 87 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
112 struct buffer_head *bh; 99 struct buffer_head *bh;
113 int ret; 100 int ret;
114 int barrier_done = 0; 101 int barrier_done = 0;
102 struct timespec now = current_kernel_time();
115 103
116 if (is_journal_aborted(journal)) 104 if (is_journal_aborted(journal))
117 return 0; 105 return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 114 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 115 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 116 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
117 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
118 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
129 119
130 if (JBD2_HAS_COMPAT_FEATURE(journal, 120 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) { 121 JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
197} 187}
198 188
199/* 189/*
200 * Wait for all submitted IO to complete. 190 * write the filemap data using writepage() address_space_operations.
191 * We don't do block allocation here even for delalloc. We don't
192 * use writepages() because with dealyed allocation we may be doing
193 * block allocation in writepages().
201 */ 194 */
202static int journal_wait_on_locked_list(journal_t *journal, 195static int journal_submit_inode_data_buffers(struct address_space *mapping)
203 transaction_t *commit_transaction)
204{ 196{
205 int ret = 0; 197 int ret;
206 struct journal_head *jh; 198 struct writeback_control wbc = {
207 199 .sync_mode = WB_SYNC_ALL,
208 while (commit_transaction->t_locked_list) { 200 .nr_to_write = mapping->nrpages * 2,
209 struct buffer_head *bh; 201 .range_start = 0,
210 202 .range_end = i_size_read(mapping->host),
211 jh = commit_transaction->t_locked_list->b_tprev; 203 .for_writepages = 1,
212 bh = jh2bh(jh); 204 };
213 get_bh(bh); 205
214 if (buffer_locked(bh)) { 206 ret = generic_writepages(mapping, &wbc);
215 spin_unlock(&journal->j_list_lock);
216 wait_on_buffer(bh);
217 if (unlikely(!buffer_uptodate(bh)))
218 ret = -EIO;
219 spin_lock(&journal->j_list_lock);
220 }
221 if (!inverted_lock(journal, bh)) {
222 put_bh(bh);
223 spin_lock(&journal->j_list_lock);
224 continue;
225 }
226 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
227 __jbd2_journal_unfile_buffer(jh);
228 jbd_unlock_bh_state(bh);
229 jbd2_journal_remove_journal_head(bh);
230 put_bh(bh);
231 } else {
232 jbd_unlock_bh_state(bh);
233 }
234 put_bh(bh);
235 cond_resched_lock(&journal->j_list_lock);
236 }
237 return ret; 207 return ret;
238 } 208}
239 209
240static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 210/*
211 * Submit all the data buffers of inode associated with the transaction to
212 * disk.
213 *
214 * We are in a committing transaction. Therefore no new inode can be added to
215 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
216 * operate on from being released while we write out pages.
217 */
218static int journal_submit_data_buffers(journal_t *journal,
219 transaction_t *commit_transaction)
241{ 220{
242 int i; 221 struct jbd2_inode *jinode;
222 int err, ret = 0;
223 struct address_space *mapping;
243 224
244 for (i = 0; i < bufs; i++) { 225 spin_lock(&journal->j_list_lock);
245 wbuf[i]->b_end_io = end_buffer_write_sync; 226 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
246 /* We use-up our safety reference in submit_bh() */ 227 mapping = jinode->i_vfs_inode->i_mapping;
247 submit_bh(WRITE, wbuf[i]); 228 jinode->i_flags |= JI_COMMIT_RUNNING;
229 spin_unlock(&journal->j_list_lock);
230 /*
231 * submit the inode data buffers. We use writepage
232 * instead of writepages. Because writepages can do
233 * block allocation with delalloc. We need to write
234 * only allocated blocks here.
235 */
236 err = journal_submit_inode_data_buffers(mapping);
237 if (!ret)
238 ret = err;
239 spin_lock(&journal->j_list_lock);
240 J_ASSERT(jinode->i_transaction == commit_transaction);
241 jinode->i_flags &= ~JI_COMMIT_RUNNING;
242 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
248 } 243 }
244 spin_unlock(&journal->j_list_lock);
245 return ret;
249} 246}
250 247
251/* 248/*
252 * Submit all the data buffers to disk 249 * Wait for data submitted for writeout, refile inodes to proper
250 * transaction if needed.
251 *
253 */ 252 */
254static void journal_submit_data_buffers(journal_t *journal, 253static int journal_finish_inode_data_buffers(journal_t *journal,
255 transaction_t *commit_transaction) 254 transaction_t *commit_transaction)
256{ 255{
257 struct journal_head *jh; 256 struct jbd2_inode *jinode, *next_i;
258 struct buffer_head *bh; 257 int err, ret = 0;
259 int locked;
260 int bufs = 0;
261 struct buffer_head **wbuf = journal->j_wbuf;
262 258
263 /* 259 /* For locking, see the comment in journal_submit_data_buffers() */
264 * Whenever we unlock the journal and sleep, things can get added
265 * onto ->t_sync_datalist, so we have to keep looping back to
266 * write_out_data until we *know* that the list is empty.
267 *
268 * Cleanup any flushed data buffers from the data list. Even in
269 * abort mode, we want to flush this out as soon as possible.
270 */
271write_out_data:
272 cond_resched();
273 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
262 jinode->i_flags |= JI_COMMIT_RUNNING;
263 spin_unlock(&journal->j_list_lock);
264 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
265 if (!ret)
266 ret = err;
267 spin_lock(&journal->j_list_lock);
268 jinode->i_flags &= ~JI_COMMIT_RUNNING;
269 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
270 }
274 271
275 while (commit_transaction->t_sync_datalist) { 272 /* Now refile inode to proper lists */
276 jh = commit_transaction->t_sync_datalist; 273 list_for_each_entry_safe(jinode, next_i,
277 bh = jh2bh(jh); 274 &commit_transaction->t_inode_list, i_list) {
278 locked = 0; 275 list_del(&jinode->i_list);
279 276 if (jinode->i_next_transaction) {
280 /* Get reference just to make sure buffer does not disappear 277 jinode->i_transaction = jinode->i_next_transaction;
281 * when we are forced to drop various locks */ 278 jinode->i_next_transaction = NULL;
282 get_bh(bh); 279 list_add(&jinode->i_list,
283 /* If the buffer is dirty, we need to submit IO and hence 280 &jinode->i_transaction->t_inode_list);
284 * we need the buffer lock. We try to lock the buffer without
285 * blocking. If we fail, we need to drop j_list_lock and do
286 * blocking lock_buffer().
287 */
288 if (buffer_dirty(bh)) {
289 if (test_set_buffer_locked(bh)) {
290 BUFFER_TRACE(bh, "needs blocking lock");
291 spin_unlock(&journal->j_list_lock);
292 /* Write out all data to prevent deadlocks */
293 journal_do_submit_data(wbuf, bufs);
294 bufs = 0;
295 lock_buffer(bh);
296 spin_lock(&journal->j_list_lock);
297 }
298 locked = 1;
299 }
300 /* We have to get bh_state lock. Again out of order, sigh. */
301 if (!inverted_lock(journal, bh)) {
302 jbd_lock_bh_state(bh);
303 spin_lock(&journal->j_list_lock);
304 }
305 /* Someone already cleaned up the buffer? */
306 if (!buffer_jbd(bh)
307 || jh->b_transaction != commit_transaction
308 || jh->b_jlist != BJ_SyncData) {
309 jbd_unlock_bh_state(bh);
310 if (locked)
311 unlock_buffer(bh);
312 BUFFER_TRACE(bh, "already cleaned up");
313 put_bh(bh);
314 continue;
315 }
316 if (locked && test_clear_buffer_dirty(bh)) {
317 BUFFER_TRACE(bh, "needs writeout, adding to array");
318 wbuf[bufs++] = bh;
319 __jbd2_journal_file_buffer(jh, commit_transaction,
320 BJ_Locked);
321 jbd_unlock_bh_state(bh);
322 if (bufs == journal->j_wbufsize) {
323 spin_unlock(&journal->j_list_lock);
324 journal_do_submit_data(wbuf, bufs);
325 bufs = 0;
326 goto write_out_data;
327 }
328 } else if (!locked && buffer_locked(bh)) {
329 __jbd2_journal_file_buffer(jh, commit_transaction,
330 BJ_Locked);
331 jbd_unlock_bh_state(bh);
332 put_bh(bh);
333 } else { 281 } else {
334 BUFFER_TRACE(bh, "writeout complete: unfile"); 282 jinode->i_transaction = NULL;
335 __jbd2_journal_unfile_buffer(jh);
336 jbd_unlock_bh_state(bh);
337 if (locked)
338 unlock_buffer(bh);
339 jbd2_journal_remove_journal_head(bh);
340 /* Once for our safety reference, once for
341 * jbd2_journal_remove_journal_head() */
342 put_bh(bh);
343 put_bh(bh);
344 }
345
346 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
347 spin_unlock(&journal->j_list_lock);
348 goto write_out_data;
349 } 283 }
350 } 284 }
351 spin_unlock(&journal->j_list_lock); 285 spin_unlock(&journal->j_list_lock);
352 journal_do_submit_data(wbuf, bufs); 286
287 return ret;
353} 288}
354 289
355static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 290static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
524 * Now start flushing things to disk, in the order they appear 459 * Now start flushing things to disk, in the order they appear
525 * on the transaction lists. Data blocks go first. 460 * on the transaction lists. Data blocks go first.
526 */ 461 */
527 err = 0; 462 err = journal_submit_data_buffers(journal, commit_transaction);
528 journal_submit_data_buffers(journal, commit_transaction);
529
530 /*
531 * Wait for all previously submitted IO to complete if commit
532 * record is to be written synchronously.
533 */
534 spin_lock(&journal->j_list_lock);
535 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
536 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
537 err = journal_wait_on_locked_list(journal,
538 commit_transaction);
539
540 spin_unlock(&journal->j_list_lock);
541
542 if (err) 463 if (err)
543 jbd2_journal_abort(journal, err); 464 jbd2_journal_abort(journal, err);
544 465
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
547 jbd_debug(3, "JBD: commit phase 2\n"); 468 jbd_debug(3, "JBD: commit phase 2\n");
548 469
549 /* 470 /*
550 * If we found any dirty or locked buffers, then we should have
551 * looped back up to the write_out_data label. If there weren't
552 * any then journal_clean_data_list should have wiped the list
553 * clean by now, so check that it is in fact empty.
554 */
555 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
556
557 jbd_debug (3, "JBD: commit phase 3\n");
558
559 /*
560 * Way to go: we have now written out all of the data for a 471 * Way to go: we have now written out all of the data for a
561 * transaction! Now comes the tricky part: we need to write out 472 * transaction! Now comes the tricky part: we need to write out
562 * metadata. Loop over the transaction's entire buffer list: 473 * metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
574 J_ASSERT(commit_transaction->t_nr_buffers <= 485 J_ASSERT(commit_transaction->t_nr_buffers <=
575 commit_transaction->t_outstanding_credits); 486 commit_transaction->t_outstanding_credits);
576 487
488 err = 0;
577 descriptor = NULL; 489 descriptor = NULL;
578 bufs = 0; 490 bufs = 0;
579 while (commit_transaction->t_buffers) { 491 while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
748 &cbh, crc32_sum); 660 &cbh, crc32_sum);
749 if (err) 661 if (err)
750 __jbd2_journal_abort_hard(journal); 662 __jbd2_journal_abort_hard(journal);
751
752 spin_lock(&journal->j_list_lock);
753 err = journal_wait_on_locked_list(journal,
754 commit_transaction);
755 spin_unlock(&journal->j_list_lock);
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758 } 663 }
759 664
665 /*
666 * This is the right place to wait for data buffers both for ASYNC
667 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
668 * the commit block went to disk (which happens above). If commit is
669 * SYNC, we need to wait for data buffers before we start writing
670 * commit block, which happens below in such setting.
671 */
672 err = journal_finish_inode_data_buffers(journal, commit_transaction);
673 if (err)
674 jbd2_journal_abort(journal, err);
675
760 /* Lo and behold: we have just managed to send a transaction to 676 /* Lo and behold: we have just managed to send a transaction to
761 the log. Before we can commit it, wait for the IO so far to 677 the log. Before we can commit it, wait for the IO so far to
762 complete. Control buffers being written are on the 678 complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
768 so we incur less scheduling load. 684 so we incur less scheduling load.
769 */ 685 */
770 686
771 jbd_debug(3, "JBD: commit phase 4\n"); 687 jbd_debug(3, "JBD: commit phase 3\n");
772 688
773 /* 689 /*
774 * akpm: these are BJ_IO, and j_list_lock is not needed. 690 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
827 743
828 J_ASSERT (commit_transaction->t_shadow_list == NULL); 744 J_ASSERT (commit_transaction->t_shadow_list == NULL);
829 745
830 jbd_debug(3, "JBD: commit phase 5\n"); 746 jbd_debug(3, "JBD: commit phase 4\n");
831 747
832 /* Here we wait for the revoke record and descriptor record buffers */ 748 /* Here we wait for the revoke record and descriptor record buffers */
833 wait_for_ctlbuf: 749 wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
854 /* AKPM: bforget here */ 770 /* AKPM: bforget here */
855 } 771 }
856 772
857 jbd_debug(3, "JBD: commit phase 6\n"); 773 jbd_debug(3, "JBD: commit phase 5\n");
858 774
859 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 775 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
860 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
874 transaction can be removed from any checkpoint list it was on 790 transaction can be removed from any checkpoint list it was on
875 before. */ 791 before. */
876 792
877 jbd_debug(3, "JBD: commit phase 7\n"); 793 jbd_debug(3, "JBD: commit phase 6\n");
878 794
879 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 795 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
880 J_ASSERT(commit_transaction->t_buffers == NULL); 796 J_ASSERT(commit_transaction->t_buffers == NULL);
881 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 797 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
882 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 798 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
997 913
998 /* Done with this transaction! */ 914 /* Done with this transaction! */
999 915
1000 jbd_debug(3, "JBD: commit phase 8\n"); 916 jbd_debug(3, "JBD: commit phase 7\n");
1001 917
1002 J_ASSERT(commit_transaction->t_state == T_COMMIT); 918 J_ASSERT(commit_transaction->t_state == T_COMMIT);
1003 919