Merge branch 'linus' into timers/nohz

author: Ingo Molnar <mingo@elte.hu> 2008-07-18 13:53:16 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-07-18 13:53:16 -0400
commit: 9b610fda0df5d0f0b0c64242e37441ad1b384aac (patch)
tree: 0ea14b15f2e6546f37fe18d8ac3dc83077ec0e55 /fs/jbd2/commit.c
parent: b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af (diff)
parent: 5b664cb235e97afbf34db9c4d77f08ebd725335e (diff)
1 files changed, 106 insertions, 189 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4d99685fdce4..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
+ * When an ext4 file is truncated, it is possible that some pages are not
- * not sucessfully freed, because they are attached to a committing transaction.
+ * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
 }
 /*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-        if (!jbd_trylock_bh_state(bh)) {
-                spin_unlock(&journal->j_list_lock);
-                schedule();
-                return 0;
-        }
-        return 1;
-}
-/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;
+        struct timespec now = current_kernel_time();
        if (is_journal_aborted(journal))
                return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -168,6 +158,7 @@ static int journal_submit_commit_record(journal_t *journal,
                spin_unlock(&journal->j_state_lock);
                /* And try again, without the barrier */
+                lock_buffer(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = submit_bh(WRITE, bh);
@@ -196,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 }
 /*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
 */
-static int journal_wait_on_locked_list(journal_t *journal,
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
-                                       transaction_t *commit_transaction)
 {
-        int ret = 0;
+        int ret;
-        struct journal_head *jh;
+        struct writeback_control wbc = {
+                .sync_mode =  WB_SYNC_ALL,
-        while (commit_transaction->t_locked_list) {
+                .nr_to_write = mapping->nrpages * 2,
-                struct buffer_head *bh;
+                .range_start = 0,
+                .range_end = i_size_read(mapping->host),
-                jh = commit_transaction->t_locked_list->b_tprev;
+                .for_writepages = 1,
-                bh = jh2bh(jh);
+        };
-                get_bh(bh);
-                if (buffer_locked(bh)) {
+        ret = generic_writepages(mapping, &wbc);
-                        spin_unlock(&journal->j_list_lock);
-                        wait_on_buffer(bh);
-                        if (unlikely(!buffer_uptodate(bh)))
-                                ret = -EIO;
-                        spin_lock(&journal->j_list_lock);
-                }
-                if (!inverted_lock(journal, bh)) {
-                        put_bh(bh);
-                        spin_lock(&journal->j_list_lock);
-                        continue;
-                }
-                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        put_bh(bh);
-                } else {
-                        jbd_unlock_bh_state(bh);
-                }
-                put_bh(bh);
-                cond_resched_lock(&journal->j_list_lock);
-        }
        return ret;
-  }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+                transaction_t *commit_transaction)
 {
-        int i;
+        struct jbd2_inode *jinode;
+        int err, ret = 0;
+        struct address_space *mapping;
-        for (i = 0; i < bufs; i++) {
+        spin_lock(&journal->j_list_lock);
-                wbuf[i]->b_end_io = end_buffer_write_sync;
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-                /* We use-up our safety reference in submit_bh() */
+                mapping = jinode->i_vfs_inode->i_mapping;
-                submit_bh(WRITE, wbuf[i]);
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                /*
+                 * submit the inode data buffers. We use writepage
+                 * instead of writepages. Because writepages can do
+                 * block allocation  with delalloc. We need to write
+                 * only allocated blocks here.
+                 */
+                err = journal_submit_inode_data_buffers(mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                J_ASSERT(jinode->i_transaction == commit_transaction);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
+        spin_unlock(&journal->j_list_lock);
+        return ret;
 }
 /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
 */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_finish_inode_data_buffers(journal_t *journal,
-                                transaction_t *commit_transaction)
+                transaction_t *commit_transaction)
 {
-        struct journal_head *jh;
+        struct jbd2_inode *jinode, *next_i;
-        struct buffer_head *bh;
+        int err, ret = 0;
-        int locked;
-        int bufs = 0;
-        struct buffer_head **wbuf = journal->j_wbuf;
-        /*
+        /* For locking, see the comment in journal_submit_data_buffers() */
-         * Whenever we unlock the journal and sleep, things can get added
-         * onto ->t_sync_datalist, so we have to keep looping back to
-         * write_out_data until we *know* that the list is empty.
-         *
-         * Cleanup any flushed data buffers from the data list.  Even in
-         * abort mode, we want to flush this out as soon as possible.
-         */
-write_out_data:
-        cond_resched();
        spin_lock(&journal->j_list_lock);
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+        }
-        while (commit_transaction->t_sync_datalist) {
+        /* Now refile inode to proper lists */
-                jh = commit_transaction->t_sync_datalist;
+        list_for_each_entry_safe(jinode, next_i,
-                bh = jh2bh(jh);
+                                 &commit_transaction->t_inode_list, i_list) {
-                locked = 0;
+                list_del(&jinode->i_list);
+                if (jinode->i_next_transaction) {
-                /* Get reference just to make sure buffer does not disappear
+                        jinode->i_transaction = jinode->i_next_transaction;
-                 * when we are forced to drop various locks */
+                        jinode->i_next_transaction = NULL;
-                get_bh(bh);
+                        list_add(&jinode->i_list,
-                /* If the buffer is dirty, we need to submit IO and hence
+                                &jinode->i_transaction->t_inode_list);
-                 * we need the buffer lock. We try to lock the buffer without
-                 * blocking. If we fail, we need to drop j_list_lock and do
-                 * blocking lock_buffer().
-                 */
-                if (buffer_dirty(bh)) {
-                        if (test_set_buffer_locked(bh)) {
-                                BUFFER_TRACE(bh, "needs blocking lock");
-                                spin_unlock(&journal->j_list_lock);
-                                /* Write out all data to prevent deadlocks */
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                lock_buffer(bh);
-                                spin_lock(&journal->j_list_lock);
-                        }
-                        locked = 1;
-                }
-                /* We have to get bh_state lock. Again out of order, sigh. */
-                if (!inverted_lock(journal, bh)) {
-                        jbd_lock_bh_state(bh);
-                        spin_lock(&journal->j_list_lock);
-                }
-                /* Someone already cleaned up the buffer? */
-                if (!buffer_jbd(bh)
-                        || jh->b_transaction != commit_transaction
-                        || jh->b_jlist != BJ_SyncData) {
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "already cleaned up");
-                        put_bh(bh);
-                        continue;
-                }
-                if (locked && test_clear_buffer_dirty(bh)) {
-                        BUFFER_TRACE(bh, "needs writeout, adding to array");
-                        wbuf[bufs++] = bh;
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        if (bufs == journal->j_wbufsize) {
-                                spin_unlock(&journal->j_list_lock);
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                goto write_out_data;
-                        }
-                } else if (!locked && buffer_locked(bh)) {
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        put_bh(bh);
                } else {
-                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                        jinode->i_transaction = NULL;
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        /* Once for our safety reference, once for
-                         * jbd2_journal_remove_journal_head() */
-                        put_bh(bh);
-                        put_bh(bh);
-                }
-                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-                        spin_unlock(&journal->j_list_lock);
-                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
-        journal_do_submit_data(wbuf, bufs);
+        return ret;
 }
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -523,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-        err = 0;
+        err = journal_submit_data_buffers(journal, commit_transaction);
-        journal_submit_data_buffers(journal, commit_transaction);
-        /*
-         * Wait for all previously submitted IO to complete if commit
-         * record is to be written synchronously.
-         */
-        spin_lock(&journal->j_list_lock);
-        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-        spin_unlock(&journal->j_list_lock);
        if (err)
                jbd2_journal_abort(journal, err);
@@ -546,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        jbd_debug(3, "JBD: commit phase 2\n");
        /*
-         * If we found any dirty or locked buffers, then we should have
-         * looped back up to the write_out_data label.  If there weren't
-         * any then journal_clean_data_list should have wiped the list
-         * clean by now, so check that it is in fact empty.
-         */
-        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-        jbd_debug (3, "JBD: commit phase 3\n");
-        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
@@ -573,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);
+        err = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
@@ -747,15 +660,19 @@ start_journal_io:
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-                spin_lock(&journal->j_list_lock);
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-                spin_unlock(&journal->j_list_lock);
-                if (err)
-                        __jbd2_journal_abort_hard(journal);
        }
+        /*
+         * This is the right place to wait for data buffers both for ASYNC
+         * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+         * the commit block went to disk (which happens above). If commit is
+         * SYNC, we need to wait for data buffers before we start writing
+         * commit block, which happens below in such setting.
+         */
+        err = journal_finish_inode_data_buffers(journal, commit_transaction);
+        if (err)
+                jbd2_journal_abort(journal, err);
        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
@@ -767,7 +684,7 @@ start_journal_io:
           so we incur less scheduling load.
        */
-        jbd_debug(3, "JBD: commit phase 4\n");
+        jbd_debug(3, "JBD: commit phase 3\n");
        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -826,7 +743,7 @@ wait_for_iobuf:
        J_ASSERT (commit_transaction->t_shadow_list == NULL);
-        jbd_debug(3, "JBD: commit phase 5\n");
+        jbd_debug(3, "JBD: commit phase 4\n");
        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
@@ -853,7 +770,7 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
-        jbd_debug(3, "JBD: commit phase 6\n");
+        jbd_debug(3, "JBD: commit phase 5\n");
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -873,9 +790,9 @@ wait_for_iobuf:
           transaction can be removed from any checkpoint list it was on
           before. */
-        jbd_debug(3, "JBD: commit phase 7\n");
+        jbd_debug(3, "JBD: commit phase 6\n");
-        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -996,7 +913,7 @@ restart_loop:
        /* Done with this transaction! */
-        jbd_debug(3, "JBD: commit phase 8\n");
+        jbd_debug(3, "JBD: commit phase 7\n");
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
author	Ingo Molnar <mingo@elte.hu>	2008-07-18 13:53:16 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-18 13:53:16 -0400
commit	9b610fda0df5d0f0b0c64242e37441ad1b384aac (patch)
tree	0ea14b15f2e6546f37fe18d8ac3dc83077ec0e55 /fs/jbd2/commit.c
parent	b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af (diff)
parent	5b664cb235e97afbf34db9c4d77f08ebd725335e (diff)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4d99685fdce4..f8b3be873226 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22	#include <linux/pagemap.h>	22	#include <linux/pagemap.h>
23	#include <linux/jiffies.h>	23	#include <linux/jiffies.h>
24	#include <linux/crc32.h>	24	#include <linux/crc32.h>
		25	#include <linux/writeback.h>
		26	#include <linux/backing-dev.h>
25		27
26	/*	28	/*
27	* Default IO end handler for temporary BJ_IO buffer_heads.	29	* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37	}	39	}
38		40
39	/*	41	/*
40	* When an ext3-ordered file is truncated, it is possible that many pages are	42	* When an ext4 file is truncated, it is possible that some pages are not
41	* not sucessfully freed, because they are attached to a committing transaction.	43	* successfully freed, because they are attached to a committing transaction.
42	* After the transaction commits, these pages are left on the LRU, with no	44	* After the transaction commits, these pages are left on the LRU, with no
43	* ->mapping, and with attached buffers. These pages are trivially reclaimable	45	* ->mapping, and with attached buffers. These pages are trivially reclaimable
44	* by the VM, but their apparent absence upsets the VM accounting, and it makes	46	* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
80	}	82	}
81		83
82	/*	84	/*
83	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84	* held. For ranking reasons we must trylock. If we lose, schedule away and
85	* return 0. j_list_lock is dropped in this case.
86	*/
87	static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88	{
89	if (!jbd_trylock_bh_state(bh)) {
90	spin_unlock(&journal->j_list_lock);
91	schedule();
92	return 0;
93	}
94	return 1;
95	}
96
97	/*
98	* Done it all: now submit the commit record. We should have	85	* Done it all: now submit the commit record. We should have
99	* cleaned up our previous buffers by now, so if we are in abort	86	* cleaned up our previous buffers by now, so if we are in abort
100	* mode we can now just skip the rest of the journal write	87	* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
112	struct buffer_head *bh;	99	struct buffer_head *bh;
113	int ret;	100	int ret;
114	int barrier_done = 0;	101	int barrier_done = 0;
		102	struct timespec now = current_kernel_time();
115		103
116	if (is_journal_aborted(journal))	104	if (is_journal_aborted(journal))
117	return 0;	105	return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
126	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);	114	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);	115	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);	116	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
		117	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
		118	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
129		119
130	if (JBD2_HAS_COMPAT_FEATURE(journal,	120	if (JBD2_HAS_COMPAT_FEATURE(journal,
131	JBD2_FEATURE_COMPAT_CHECKSUM)) {	121	JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -168,6 +158,7 @@ static int journal_submit_commit_record(journal_t *journal,
168	spin_unlock(&journal->j_state_lock);	158	spin_unlock(&journal->j_state_lock);
169		159
170	/* And try again, without the barrier */	160	/* And try again, without the barrier */
		161	lock_buffer(bh);
171	set_buffer_uptodate(bh);	162	set_buffer_uptodate(bh);
172	set_buffer_dirty(bh);	163	set_buffer_dirty(bh);
173	ret = submit_bh(WRITE, bh);	164	ret = submit_bh(WRITE, bh);
@@ -196,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
196	}	187	}
197		188
198	/*	189	/*
199	* Wait for all submitted IO to complete.	190	* write the filemap data using writepage() address_space_operations.
		191	* We don't do block allocation here even for delalloc. We don't
		192	* use writepages() because with dealyed allocation we may be doing
		193	* block allocation in writepages().
200	*/	194	*/
201	static int journal_wait_on_locked_list(journal_t *journal,	195	static int journal_submit_inode_data_buffers(struct address_space *mapping)
202	transaction_t *commit_transaction)
203	{	196	{
204	int ret = 0;	197	int ret;
205	struct journal_head *jh;	198	struct writeback_control wbc = {
206		199	.sync_mode = WB_SYNC_ALL,
207	while (commit_transaction->t_locked_list) {	200	.nr_to_write = mapping->nrpages * 2,
208	struct buffer_head *bh;	201	.range_start = 0,
209		202	.range_end = i_size_read(mapping->host),
210	jh = commit_transaction->t_locked_list->b_tprev;	203	.for_writepages = 1,
211	bh = jh2bh(jh);	204	};
212	get_bh(bh);	205
213	if (buffer_locked(bh)) {	206	ret = generic_writepages(mapping, &wbc);
214	spin_unlock(&journal->j_list_lock);
215	wait_on_buffer(bh);
216	if (unlikely(!buffer_uptodate(bh)))
217	ret = -EIO;
218	spin_lock(&journal->j_list_lock);
219	}
220	if (!inverted_lock(journal, bh)) {
221	put_bh(bh);
222	spin_lock(&journal->j_list_lock);
223	continue;
224	}
225	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
226	__jbd2_journal_unfile_buffer(jh);
227	jbd_unlock_bh_state(bh);
228	jbd2_journal_remove_journal_head(bh);
229	put_bh(bh);
230	} else {
231	jbd_unlock_bh_state(bh);
232	}
233	put_bh(bh);
234	cond_resched_lock(&journal->j_list_lock);
235	}
236	return ret;	207	return ret;
237	}	208	}
238		209
239	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)	210	/*
		211	* Submit all the data buffers of inode associated with the transaction to
		212	* disk.
		213	*
		214	* We are in a committing transaction. Therefore no new inode can be added to
		215	* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
		216	* operate on from being released while we write out pages.
		217	*/
		218	static int journal_submit_data_buffers(journal_t *journal,
		219	transaction_t *commit_transaction)
240	{	220	{
241	int i;	221	struct jbd2_inode *jinode;
		222	int err, ret = 0;
		223	struct address_space *mapping;
242		224
243	for (i = 0; i < bufs; i++) {	225	spin_lock(&journal->j_list_lock);
244	wbuf[i]->b_end_io = end_buffer_write_sync;	226	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
245	/* We use-up our safety reference in submit_bh() */	227	mapping = jinode->i_vfs_inode->i_mapping;
246	submit_bh(WRITE, wbuf[i]);	228	jinode->i_flags |= JI_COMMIT_RUNNING;
		229	spin_unlock(&journal->j_list_lock);
		230	/*
		231	* submit the inode data buffers. We use writepage
		232	* instead of writepages. Because writepages can do
		233	* block allocation with delalloc. We need to write
		234	* only allocated blocks here.
		235	*/
		236	err = journal_submit_inode_data_buffers(mapping);
		237	if (!ret)
		238	ret = err;
		239	spin_lock(&journal->j_list_lock);
		240	J_ASSERT(jinode->i_transaction == commit_transaction);
		241	jinode->i_flags &= ~JI_COMMIT_RUNNING;
		242	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
247	}	243	}
		244	spin_unlock(&journal->j_list_lock);
		245	return ret;
248	}	246	}
249		247
250	/*	248	/*
251	* Submit all the data buffers to disk	249	* Wait for data submitted for writeout, refile inodes to proper
		250	* transaction if needed.
		251	*
252	*/	252	*/
253	static void journal_submit_data_buffers(journal_t *journal,	253	static int journal_finish_inode_data_buffers(journal_t *journal,
254	transaction_t *commit_transaction)	254	transaction_t *commit_transaction)
255	{	255	{
256	struct journal_head *jh;	256	struct jbd2_inode *jinode, *next_i;
257	struct buffer_head *bh;	257	int err, ret = 0;
258	int locked;
259	int bufs = 0;
260	struct buffer_head **wbuf = journal->j_wbuf;
261		258
262	/*	259	/* For locking, see the comment in journal_submit_data_buffers() */
263	* Whenever we unlock the journal and sleep, things can get added
264	* onto ->t_sync_datalist, so we have to keep looping back to
265	* write_out_data until we *know* that the list is empty.
266	*
267	* Cleanup any flushed data buffers from the data list. Even in
268	* abort mode, we want to flush this out as soon as possible.
269	*/
270	write_out_data:
271	cond_resched();
272	spin_lock(&journal->j_list_lock);	260	spin_lock(&journal->j_list_lock);
		261	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		262	jinode->i_flags |= JI_COMMIT_RUNNING;
		263	spin_unlock(&journal->j_list_lock);
		264	err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		265	if (!ret)
		266	ret = err;
		267	spin_lock(&journal->j_list_lock);
		268	jinode->i_flags &= ~JI_COMMIT_RUNNING;
		269	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
		270	}
273		271
274	while (commit_transaction->t_sync_datalist) {	272	/* Now refile inode to proper lists */
275	jh = commit_transaction->t_sync_datalist;	273	list_for_each_entry_safe(jinode, next_i,
276	bh = jh2bh(jh);	274	&commit_transaction->t_inode_list, i_list) {
277	locked = 0;	275	list_del(&jinode->i_list);
278		276	if (jinode->i_next_transaction) {
279	/* Get reference just to make sure buffer does not disappear	277	jinode->i_transaction = jinode->i_next_transaction;
280	* when we are forced to drop various locks */	278	jinode->i_next_transaction = NULL;
281	get_bh(bh);	279	list_add(&jinode->i_list,
282	/* If the buffer is dirty, we need to submit IO and hence	280	&jinode->i_transaction->t_inode_list);
283	* we need the buffer lock. We try to lock the buffer without
284	* blocking. If we fail, we need to drop j_list_lock and do
285	* blocking lock_buffer().
286	*/
287	if (buffer_dirty(bh)) {
288	if (test_set_buffer_locked(bh)) {
289	BUFFER_TRACE(bh, "needs blocking lock");
290	spin_unlock(&journal->j_list_lock);
291	/* Write out all data to prevent deadlocks */
292	journal_do_submit_data(wbuf, bufs);
293	bufs = 0;
294	lock_buffer(bh);
295	spin_lock(&journal->j_list_lock);
296	}
297	locked = 1;
298	}
299	/* We have to get bh_state lock. Again out of order, sigh. */
300	if (!inverted_lock(journal, bh)) {
301	jbd_lock_bh_state(bh);
302	spin_lock(&journal->j_list_lock);
303	}
304	/* Someone already cleaned up the buffer? */
305	if (!buffer_jbd(bh)
306	|| jh->b_transaction != commit_transaction
307	|| jh->b_jlist != BJ_SyncData) {
308	jbd_unlock_bh_state(bh);
309	if (locked)
310	unlock_buffer(bh);
311	BUFFER_TRACE(bh, "already cleaned up");
312	put_bh(bh);
313	continue;
314	}
315	if (locked && test_clear_buffer_dirty(bh)) {
316	BUFFER_TRACE(bh, "needs writeout, adding to array");
317	wbuf[bufs++] = bh;
318	__jbd2_journal_file_buffer(jh, commit_transaction,
319	BJ_Locked);
320	jbd_unlock_bh_state(bh);
321	if (bufs == journal->j_wbufsize) {
322	spin_unlock(&journal->j_list_lock);
323	journal_do_submit_data(wbuf, bufs);
324	bufs = 0;
325	goto write_out_data;
326	}
327	} else if (!locked && buffer_locked(bh)) {
328	__jbd2_journal_file_buffer(jh, commit_transaction,
329	BJ_Locked);
330	jbd_unlock_bh_state(bh);
331	put_bh(bh);
332	} else {	281	} else {
333	BUFFER_TRACE(bh, "writeout complete: unfile");	282	jinode->i_transaction = NULL;
334	__jbd2_journal_unfile_buffer(jh);
335	jbd_unlock_bh_state(bh);
336	if (locked)
337	unlock_buffer(bh);
338	jbd2_journal_remove_journal_head(bh);
339	/* Once for our safety reference, once for
340	* jbd2_journal_remove_journal_head() */
341	put_bh(bh);
342	put_bh(bh);
343	}
344
345	if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
346	spin_unlock(&journal->j_list_lock);
347	goto write_out_data;
348	}	283	}
349	}	284	}
350	spin_unlock(&journal->j_list_lock);	285	spin_unlock(&journal->j_list_lock);
351	journal_do_submit_data(wbuf, bufs);	286
		287	return ret;
352	}	288	}
353		289
354	static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)	290	static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -523,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523	* Now start flushing things to disk, in the order they appear	459	* Now start flushing things to disk, in the order they appear
524	* on the transaction lists. Data blocks go first.	460	* on the transaction lists. Data blocks go first.
525	*/	461	*/
526	err = 0;	462	err = journal_submit_data_buffers(journal, commit_transaction);
527	journal_submit_data_buffers(journal, commit_transaction);
528
529	/*
530	* Wait for all previously submitted IO to complete if commit
531	* record is to be written synchronously.
532	*/
533	spin_lock(&journal->j_list_lock);
534	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
535	JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
536	err = journal_wait_on_locked_list(journal,
537	commit_transaction);
538
539	spin_unlock(&journal->j_list_lock);
540
541	if (err)	463	if (err)
542	jbd2_journal_abort(journal, err);	464	jbd2_journal_abort(journal, err);
543		465
@@ -546,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
546	jbd_debug(3, "JBD: commit phase 2\n");	468	jbd_debug(3, "JBD: commit phase 2\n");
547		469
548	/*	470	/*
549	* If we found any dirty or locked buffers, then we should have
550	* looped back up to the write_out_data label. If there weren't
551	* any then journal_clean_data_list should have wiped the list
552	* clean by now, so check that it is in fact empty.
553	*/
554	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
555
556	jbd_debug (3, "JBD: commit phase 3\n");
557
558	/*
559	* Way to go: we have now written out all of the data for a	471	* Way to go: we have now written out all of the data for a
560	* transaction! Now comes the tricky part: we need to write out	472	* transaction! Now comes the tricky part: we need to write out
561	* metadata. Loop over the transaction's entire buffer list:	473	* metadata. Loop over the transaction's entire buffer list:
@@ -573,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
573	J_ASSERT(commit_transaction->t_nr_buffers <=	485	J_ASSERT(commit_transaction->t_nr_buffers <=
574	commit_transaction->t_outstanding_credits);	486	commit_transaction->t_outstanding_credits);
575		487
		488	err = 0;
576	descriptor = NULL;	489	descriptor = NULL;
577	bufs = 0;	490	bufs = 0;
578	while (commit_transaction->t_buffers) {	491	while (commit_transaction->t_buffers) {
@@ -747,15 +660,19 @@ start_journal_io:
747	&cbh, crc32_sum);	660	&cbh, crc32_sum);
748	if (err)	661	if (err)
749	__jbd2_journal_abort_hard(journal);	662	__jbd2_journal_abort_hard(journal);
750
751	spin_lock(&journal->j_list_lock);
752	err = journal_wait_on_locked_list(journal,
753	commit_transaction);
754	spin_unlock(&journal->j_list_lock);
755	if (err)
756	__jbd2_journal_abort_hard(journal);
757	}	663	}
758		664
		665	/*
		666	* This is the right place to wait for data buffers both for ASYNC
		667	* and !ASYNC commit. If commit is ASYNC, we need to wait only after
		668	* the commit block went to disk (which happens above). If commit is
		669	* SYNC, we need to wait for data buffers before we start writing
		670	* commit block, which happens below in such setting.
		671	*/
		672	err = journal_finish_inode_data_buffers(journal, commit_transaction);
		673	if (err)
		674	jbd2_journal_abort(journal, err);
		675
759	/* Lo and behold: we have just managed to send a transaction to	676	/* Lo and behold: we have just managed to send a transaction to
760	the log. Before we can commit it, wait for the IO so far to	677	the log. Before we can commit it, wait for the IO so far to
761	complete. Control buffers being written are on the	678	complete. Control buffers being written are on the
@@ -767,7 +684,7 @@ start_journal_io:
767	so we incur less scheduling load.	684	so we incur less scheduling load.
768	*/	685	*/
769		686
770	jbd_debug(3, "JBD: commit phase 4\n");	687	jbd_debug(3, "JBD: commit phase 3\n");
771		688
772	/*	689	/*
773	* akpm: these are BJ_IO, and j_list_lock is not needed.	690	* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -826,7 +743,7 @@ wait_for_iobuf:
826		743
827	J_ASSERT (commit_transaction->t_shadow_list == NULL);	744	J_ASSERT (commit_transaction->t_shadow_list == NULL);
828		745
829	jbd_debug(3, "JBD: commit phase 5\n");	746	jbd_debug(3, "JBD: commit phase 4\n");
830		747
831	/* Here we wait for the revoke record and descriptor record buffers */	748	/* Here we wait for the revoke record and descriptor record buffers */
832	wait_for_ctlbuf:	749	wait_for_ctlbuf:
@@ -853,7 +770,7 @@ wait_for_iobuf:
853	/* AKPM: bforget here */	770	/* AKPM: bforget here */
854	}	771	}
855		772
856	jbd_debug(3, "JBD: commit phase 6\n");	773	jbd_debug(3, "JBD: commit phase 5\n");
857		774
858	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,	775	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
859	JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {	776	JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -873,9 +790,9 @@ wait_for_iobuf:
873	transaction can be removed from any checkpoint list it was on	790	transaction can be removed from any checkpoint list it was on
874	before. */	791	before. */
875		792
876	jbd_debug(3, "JBD: commit phase 7\n");	793	jbd_debug(3, "JBD: commit phase 6\n");
877		794
878	J_ASSERT(commit_transaction->t_sync_datalist == NULL);	795	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
879	J_ASSERT(commit_transaction->t_buffers == NULL);	796	J_ASSERT(commit_transaction->t_buffers == NULL);
880	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);	797	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
881	J_ASSERT(commit_transaction->t_iobuf_list == NULL);	798	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -996,7 +913,7 @@ restart_loop:
996		913
997	/* Done with this transaction! */	914	/* Done with this transaction! */
998		915
999	jbd_debug(3, "JBD: commit phase 8\n");	916	jbd_debug(3, "JBD: commit phase 7\n");
1000		917
1001	J_ASSERT(commit_transaction->t_state == T_COMMIT);	918	J_ASSERT(commit_transaction->t_state == T_COMMIT);
1002		919