aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd2/commit.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-07-18 13:53:16 -0400
committerIngo Molnar <mingo@elte.hu>2008-07-18 13:53:16 -0400
commit9b610fda0df5d0f0b0c64242e37441ad1b384aac (patch)
tree0ea14b15f2e6546f37fe18d8ac3dc83077ec0e55 /fs/jbd2/commit.c
parentb8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af (diff)
parent5b664cb235e97afbf34db9c4d77f08ebd725335e (diff)
Merge branch 'linus' into timers/nohz
Diffstat (limited to 'fs/jbd2/commit.c')
-rw-r--r--fs/jbd2/commit.c295
1 files changed, 106 insertions, 189 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4d99685fdce4..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/crc32.h> 24#include <linux/crc32.h>
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
25 27
26/* 28/*
27 * Default IO end handler for temporary BJ_IO buffer_heads. 29 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37} 39}
38 40
39/* 41/*
40 * When an ext3-ordered file is truncated, it is possible that many pages are 42 * When an ext4 file is truncated, it is possible that some pages are not
41 * not sucessfully freed, because they are attached to a committing transaction. 43 * successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no 44 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable 45 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes 46 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
80} 82}
81 83
82/* 84/*
83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84 * held. For ranking reasons we must trylock. If we lose, schedule away and
85 * return 0. j_list_lock is dropped in this case.
86 */
87static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88{
89 if (!jbd_trylock_bh_state(bh)) {
90 spin_unlock(&journal->j_list_lock);
91 schedule();
92 return 0;
93 }
94 return 1;
95}
96
97/*
98 * Done it all: now submit the commit record. We should have 85 * Done it all: now submit the commit record. We should have
99 * cleaned up our previous buffers by now, so if we are in abort 86 * cleaned up our previous buffers by now, so if we are in abort
100 * mode we can now just skip the rest of the journal write 87 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
112 struct buffer_head *bh; 99 struct buffer_head *bh;
113 int ret; 100 int ret;
114 int barrier_done = 0; 101 int barrier_done = 0;
102 struct timespec now = current_kernel_time();
115 103
116 if (is_journal_aborted(journal)) 104 if (is_journal_aborted(journal))
117 return 0; 105 return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 114 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 115 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 116 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
117 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
118 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
129 119
130 if (JBD2_HAS_COMPAT_FEATURE(journal, 120 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) { 121 JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -168,6 +158,7 @@ static int journal_submit_commit_record(journal_t *journal,
168 spin_unlock(&journal->j_state_lock); 158 spin_unlock(&journal->j_state_lock);
169 159
170 /* And try again, without the barrier */ 160 /* And try again, without the barrier */
161 lock_buffer(bh);
171 set_buffer_uptodate(bh); 162 set_buffer_uptodate(bh);
172 set_buffer_dirty(bh); 163 set_buffer_dirty(bh);
173 ret = submit_bh(WRITE, bh); 164 ret = submit_bh(WRITE, bh);
@@ -196,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
196} 187}
197 188
198/* 189/*
199 * Wait for all submitted IO to complete. 190 * write the filemap data using writepage() address_space_operations.
191 * We don't do block allocation here even for delalloc. We don't
192 * use writepages() because with delayed allocation we may be doing
193 * block allocation in writepages().
200 */ 194 */
201static int journal_wait_on_locked_list(journal_t *journal, 195static int journal_submit_inode_data_buffers(struct address_space *mapping)
202 transaction_t *commit_transaction)
203{ 196{
204 int ret = 0; 197 int ret;
205 struct journal_head *jh; 198 struct writeback_control wbc = {
206 199 .sync_mode = WB_SYNC_ALL,
207 while (commit_transaction->t_locked_list) { 200 .nr_to_write = mapping->nrpages * 2,
208 struct buffer_head *bh; 201 .range_start = 0,
209 202 .range_end = i_size_read(mapping->host),
210 jh = commit_transaction->t_locked_list->b_tprev; 203 .for_writepages = 1,
211 bh = jh2bh(jh); 204 };
212 get_bh(bh); 205
213 if (buffer_locked(bh)) { 206 ret = generic_writepages(mapping, &wbc);
214 spin_unlock(&journal->j_list_lock);
215 wait_on_buffer(bh);
216 if (unlikely(!buffer_uptodate(bh)))
217 ret = -EIO;
218 spin_lock(&journal->j_list_lock);
219 }
220 if (!inverted_lock(journal, bh)) {
221 put_bh(bh);
222 spin_lock(&journal->j_list_lock);
223 continue;
224 }
225 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
226 __jbd2_journal_unfile_buffer(jh);
227 jbd_unlock_bh_state(bh);
228 jbd2_journal_remove_journal_head(bh);
229 put_bh(bh);
230 } else {
231 jbd_unlock_bh_state(bh);
232 }
233 put_bh(bh);
234 cond_resched_lock(&journal->j_list_lock);
235 }
236 return ret; 207 return ret;
237 } 208}
238 209
239static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 210/*
211 * Submit all the data buffers of inode associated with the transaction to
212 * disk.
213 *
214 * We are in a committing transaction. Therefore no new inode can be added to
215 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
216 * operate on from being released while we write out pages.
217 */
218static int journal_submit_data_buffers(journal_t *journal,
219 transaction_t *commit_transaction)
240{ 220{
241 int i; 221 struct jbd2_inode *jinode;
222 int err, ret = 0;
223 struct address_space *mapping;
242 224
243 for (i = 0; i < bufs; i++) { 225 spin_lock(&journal->j_list_lock);
244 wbuf[i]->b_end_io = end_buffer_write_sync; 226 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
245 /* We use-up our safety reference in submit_bh() */ 227 mapping = jinode->i_vfs_inode->i_mapping;
246 submit_bh(WRITE, wbuf[i]); 228 jinode->i_flags |= JI_COMMIT_RUNNING;
229 spin_unlock(&journal->j_list_lock);
230 /*
231 * Submit the inode data buffers. We use writepage
232 * instead of writepages, because writepages can do
233 * block allocation with delalloc. We need to write
234 * only allocated blocks here.
235 */
236 err = journal_submit_inode_data_buffers(mapping);
237 if (!ret)
238 ret = err;
239 spin_lock(&journal->j_list_lock);
240 J_ASSERT(jinode->i_transaction == commit_transaction);
241 jinode->i_flags &= ~JI_COMMIT_RUNNING;
242 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
247 } 243 }
244 spin_unlock(&journal->j_list_lock);
245 return ret;
248} 246}
249 247
250/* 248/*
251 * Submit all the data buffers to disk 249 * Wait for data submitted for writeout, refile inodes to proper
250 * transaction if needed.
251 *
252 */ 252 */
253static void journal_submit_data_buffers(journal_t *journal, 253static int journal_finish_inode_data_buffers(journal_t *journal,
254 transaction_t *commit_transaction) 254 transaction_t *commit_transaction)
255{ 255{
256 struct journal_head *jh; 256 struct jbd2_inode *jinode, *next_i;
257 struct buffer_head *bh; 257 int err, ret = 0;
258 int locked;
259 int bufs = 0;
260 struct buffer_head **wbuf = journal->j_wbuf;
261 258
262 /* 259 /* For locking, see the comment in journal_submit_data_buffers() */
263 * Whenever we unlock the journal and sleep, things can get added
264 * onto ->t_sync_datalist, so we have to keep looping back to
265 * write_out_data until we *know* that the list is empty.
266 *
267 * Cleanup any flushed data buffers from the data list. Even in
268 * abort mode, we want to flush this out as soon as possible.
269 */
270write_out_data:
271 cond_resched();
272 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
262 jinode->i_flags |= JI_COMMIT_RUNNING;
263 spin_unlock(&journal->j_list_lock);
264 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
265 if (!ret)
266 ret = err;
267 spin_lock(&journal->j_list_lock);
268 jinode->i_flags &= ~JI_COMMIT_RUNNING;
269 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
270 }
273 271
274 while (commit_transaction->t_sync_datalist) { 272 /* Now refile inode to proper lists */
275 jh = commit_transaction->t_sync_datalist; 273 list_for_each_entry_safe(jinode, next_i,
276 bh = jh2bh(jh); 274 &commit_transaction->t_inode_list, i_list) {
277 locked = 0; 275 list_del(&jinode->i_list);
278 276 if (jinode->i_next_transaction) {
279 /* Get reference just to make sure buffer does not disappear 277 jinode->i_transaction = jinode->i_next_transaction;
280 * when we are forced to drop various locks */ 278 jinode->i_next_transaction = NULL;
281 get_bh(bh); 279 list_add(&jinode->i_list,
282 /* If the buffer is dirty, we need to submit IO and hence 280 &jinode->i_transaction->t_inode_list);
283 * we need the buffer lock. We try to lock the buffer without
284 * blocking. If we fail, we need to drop j_list_lock and do
285 * blocking lock_buffer().
286 */
287 if (buffer_dirty(bh)) {
288 if (test_set_buffer_locked(bh)) {
289 BUFFER_TRACE(bh, "needs blocking lock");
290 spin_unlock(&journal->j_list_lock);
291 /* Write out all data to prevent deadlocks */
292 journal_do_submit_data(wbuf, bufs);
293 bufs = 0;
294 lock_buffer(bh);
295 spin_lock(&journal->j_list_lock);
296 }
297 locked = 1;
298 }
299 /* We have to get bh_state lock. Again out of order, sigh. */
300 if (!inverted_lock(journal, bh)) {
301 jbd_lock_bh_state(bh);
302 spin_lock(&journal->j_list_lock);
303 }
304 /* Someone already cleaned up the buffer? */
305 if (!buffer_jbd(bh)
306 || jh->b_transaction != commit_transaction
307 || jh->b_jlist != BJ_SyncData) {
308 jbd_unlock_bh_state(bh);
309 if (locked)
310 unlock_buffer(bh);
311 BUFFER_TRACE(bh, "already cleaned up");
312 put_bh(bh);
313 continue;
314 }
315 if (locked && test_clear_buffer_dirty(bh)) {
316 BUFFER_TRACE(bh, "needs writeout, adding to array");
317 wbuf[bufs++] = bh;
318 __jbd2_journal_file_buffer(jh, commit_transaction,
319 BJ_Locked);
320 jbd_unlock_bh_state(bh);
321 if (bufs == journal->j_wbufsize) {
322 spin_unlock(&journal->j_list_lock);
323 journal_do_submit_data(wbuf, bufs);
324 bufs = 0;
325 goto write_out_data;
326 }
327 } else if (!locked && buffer_locked(bh)) {
328 __jbd2_journal_file_buffer(jh, commit_transaction,
329 BJ_Locked);
330 jbd_unlock_bh_state(bh);
331 put_bh(bh);
332 } else { 281 } else {
333 BUFFER_TRACE(bh, "writeout complete: unfile"); 282 jinode->i_transaction = NULL;
334 __jbd2_journal_unfile_buffer(jh);
335 jbd_unlock_bh_state(bh);
336 if (locked)
337 unlock_buffer(bh);
338 jbd2_journal_remove_journal_head(bh);
339 /* Once for our safety reference, once for
340 * jbd2_journal_remove_journal_head() */
341 put_bh(bh);
342 put_bh(bh);
343 }
344
345 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
346 spin_unlock(&journal->j_list_lock);
347 goto write_out_data;
348 } 283 }
349 } 284 }
350 spin_unlock(&journal->j_list_lock); 285 spin_unlock(&journal->j_list_lock);
351 journal_do_submit_data(wbuf, bufs); 286
287 return ret;
352} 288}
353 289
354static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 290static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -523,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523 * Now start flushing things to disk, in the order they appear 459 * Now start flushing things to disk, in the order they appear
524 * on the transaction lists. Data blocks go first. 460 * on the transaction lists. Data blocks go first.
525 */ 461 */
526 err = 0; 462 err = journal_submit_data_buffers(journal, commit_transaction);
527 journal_submit_data_buffers(journal, commit_transaction);
528
529 /*
530 * Wait for all previously submitted IO to complete if commit
531 * record is to be written synchronously.
532 */
533 spin_lock(&journal->j_list_lock);
534 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
535 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
536 err = journal_wait_on_locked_list(journal,
537 commit_transaction);
538
539 spin_unlock(&journal->j_list_lock);
540
541 if (err) 463 if (err)
542 jbd2_journal_abort(journal, err); 464 jbd2_journal_abort(journal, err);
543 465
@@ -546,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
546 jbd_debug(3, "JBD: commit phase 2\n"); 468 jbd_debug(3, "JBD: commit phase 2\n");
547 469
548 /* 470 /*
549 * If we found any dirty or locked buffers, then we should have
550 * looped back up to the write_out_data label. If there weren't
551 * any then journal_clean_data_list should have wiped the list
552 * clean by now, so check that it is in fact empty.
553 */
554 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
555
556 jbd_debug (3, "JBD: commit phase 3\n");
557
558 /*
559 * Way to go: we have now written out all of the data for a 471 * Way to go: we have now written out all of the data for a
560 * transaction! Now comes the tricky part: we need to write out 472 * transaction! Now comes the tricky part: we need to write out
561 * metadata. Loop over the transaction's entire buffer list: 473 * metadata. Loop over the transaction's entire buffer list:
@@ -573,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
573 J_ASSERT(commit_transaction->t_nr_buffers <= 485 J_ASSERT(commit_transaction->t_nr_buffers <=
574 commit_transaction->t_outstanding_credits); 486 commit_transaction->t_outstanding_credits);
575 487
488 err = 0;
576 descriptor = NULL; 489 descriptor = NULL;
577 bufs = 0; 490 bufs = 0;
578 while (commit_transaction->t_buffers) { 491 while (commit_transaction->t_buffers) {
@@ -747,15 +660,19 @@ start_journal_io:
747 &cbh, crc32_sum); 660 &cbh, crc32_sum);
748 if (err) 661 if (err)
749 __jbd2_journal_abort_hard(journal); 662 __jbd2_journal_abort_hard(journal);
750
751 spin_lock(&journal->j_list_lock);
752 err = journal_wait_on_locked_list(journal,
753 commit_transaction);
754 spin_unlock(&journal->j_list_lock);
755 if (err)
756 __jbd2_journal_abort_hard(journal);
757 } 663 }
758 664
665 /*
666 * This is the right place to wait for data buffers both for ASYNC
667 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
668 * the commit block went to disk (which happens above). If commit is
669 * SYNC, we need to wait for data buffers before we start writing
670 * commit block, which happens below in such setting.
671 */
672 err = journal_finish_inode_data_buffers(journal, commit_transaction);
673 if (err)
674 jbd2_journal_abort(journal, err);
675
759 /* Lo and behold: we have just managed to send a transaction to 676 /* Lo and behold: we have just managed to send a transaction to
760 the log. Before we can commit it, wait for the IO so far to 677 the log. Before we can commit it, wait for the IO so far to
761 complete. Control buffers being written are on the 678 complete. Control buffers being written are on the
@@ -767,7 +684,7 @@ start_journal_io:
767 so we incur less scheduling load. 684 so we incur less scheduling load.
768 */ 685 */
769 686
770 jbd_debug(3, "JBD: commit phase 4\n"); 687 jbd_debug(3, "JBD: commit phase 3\n");
771 688
772 /* 689 /*
773 * akpm: these are BJ_IO, and j_list_lock is not needed. 690 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -826,7 +743,7 @@ wait_for_iobuf:
826 743
827 J_ASSERT (commit_transaction->t_shadow_list == NULL); 744 J_ASSERT (commit_transaction->t_shadow_list == NULL);
828 745
829 jbd_debug(3, "JBD: commit phase 5\n"); 746 jbd_debug(3, "JBD: commit phase 4\n");
830 747
831 /* Here we wait for the revoke record and descriptor record buffers */ 748 /* Here we wait for the revoke record and descriptor record buffers */
832 wait_for_ctlbuf: 749 wait_for_ctlbuf:
@@ -853,7 +770,7 @@ wait_for_iobuf:
853 /* AKPM: bforget here */ 770 /* AKPM: bforget here */
854 } 771 }
855 772
856 jbd_debug(3, "JBD: commit phase 6\n"); 773 jbd_debug(3, "JBD: commit phase 5\n");
857 774
858 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 775 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
859 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -873,9 +790,9 @@ wait_for_iobuf:
873 transaction can be removed from any checkpoint list it was on 790 transaction can be removed from any checkpoint list it was on
874 before. */ 791 before. */
875 792
876 jbd_debug(3, "JBD: commit phase 7\n"); 793 jbd_debug(3, "JBD: commit phase 6\n");
877 794
878 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 795 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
879 J_ASSERT(commit_transaction->t_buffers == NULL); 796 J_ASSERT(commit_transaction->t_buffers == NULL);
880 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 797 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
881 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 798 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -996,7 +913,7 @@ restart_loop:
996 913
997 /* Done with this transaction! */ 914 /* Done with this transaction! */
998 915
999 jbd_debug(3, "JBD: commit phase 8\n"); 916 jbd_debug(3, "JBD: commit phase 7\n");
1000 917
1001 J_ASSERT(commit_transaction->t_state == T_COMMIT); 918 J_ASSERT(commit_transaction->t_state == T_COMMIT);
1002 919