1 files changed, 105 insertions, 189 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
+ * When an ext4 file is truncated, it is possible that some pages are not
- * not sucessfully freed, because they are attached to a committing transaction.
+ * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
 }
 /*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-        if (!jbd_trylock_bh_state(bh)) {
-                spin_unlock(&journal->j_list_lock);
-                schedule();
-                return 0;
-        }
-        return 1;
-}
-/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;
+        struct timespec now = current_kernel_time();
        if (is_journal_aborted(journal))
                return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 }
 /*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
 */
-static int journal_wait_on_locked_list(journal_t *journal,
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
-                                       transaction_t *commit_transaction)
 {
-        int ret = 0;
+        int ret;
-        struct journal_head *jh;
+        struct writeback_control wbc = {
+                .sync_mode =  WB_SYNC_ALL,
-        while (commit_transaction->t_locked_list) {
+                .nr_to_write = mapping->nrpages * 2,
-                struct buffer_head *bh;
+                .range_start = 0,
+                .range_end = i_size_read(mapping->host),
-                jh = commit_transaction->t_locked_list->b_tprev;
+                .for_writepages = 1,
-                bh = jh2bh(jh);
+        };
-                get_bh(bh);
-                if (buffer_locked(bh)) {
+        ret = generic_writepages(mapping, &wbc);
-                        spin_unlock(&journal->j_list_lock);
-                        wait_on_buffer(bh);
-                        if (unlikely(!buffer_uptodate(bh)))
-                                ret = -EIO;
-                        spin_lock(&journal->j_list_lock);
-                }
-                if (!inverted_lock(journal, bh)) {
-                        put_bh(bh);
-                        spin_lock(&journal->j_list_lock);
-                        continue;
-                }
-                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        put_bh(bh);
-                } else {
-                        jbd_unlock_bh_state(bh);
-                }
-                put_bh(bh);
-                cond_resched_lock(&journal->j_list_lock);
-        }
        return ret;
-  }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+                transaction_t *commit_transaction)
 {
-        int i;
+        struct jbd2_inode *jinode;
+        int err, ret = 0;
+        struct address_space *mapping;
-        for (i = 0; i < bufs; i++) {
+        spin_lock(&journal->j_list_lock);
-                wbuf[i]->b_end_io = end_buffer_write_sync;
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-                /* We use-up our safety reference in submit_bh() */
+                mapping = jinode->i_vfs_inode->i_mapping;
-                submit_bh(WRITE, wbuf[i]);
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                /*
+                 * submit the inode data buffers. We use writepage
+                 * instead of writepages. Because writepages can do
+                 * block allocation  with delalloc. We need to write
+                 * only allocated blocks here.
+                 */
+                err = journal_submit_inode_data_buffers(mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                J_ASSERT(jinode->i_transaction == commit_transaction);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
+        spin_unlock(&journal->j_list_lock);
+        return ret;
 }
 /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
 */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_finish_inode_data_buffers(journal_t *journal,
-                                transaction_t *commit_transaction)
+                transaction_t *commit_transaction)
 {
-        struct journal_head *jh;
+        struct jbd2_inode *jinode, *next_i;
-        struct buffer_head *bh;
+        int err, ret = 0;
-        int locked;
-        int bufs = 0;
-        struct buffer_head **wbuf = journal->j_wbuf;
-        /*
+        /* For locking, see the comment in journal_submit_data_buffers() */
-         * Whenever we unlock the journal and sleep, things can get added
-         * onto ->t_sync_datalist, so we have to keep looping back to
-         * write_out_data until we *know* that the list is empty.
-         *
-         * Cleanup any flushed data buffers from the data list.  Even in
-         * abort mode, we want to flush this out as soon as possible.
-         */
-write_out_data:
-        cond_resched();
        spin_lock(&journal->j_list_lock);
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+        }
-        while (commit_transaction->t_sync_datalist) {
+        /* Now refile inode to proper lists */
-                jh = commit_transaction->t_sync_datalist;
+        list_for_each_entry_safe(jinode, next_i,
-                bh = jh2bh(jh);
+                                 &commit_transaction->t_inode_list, i_list) {
-                locked = 0;
+                list_del(&jinode->i_list);
+                if (jinode->i_next_transaction) {
-                /* Get reference just to make sure buffer does not disappear
+                        jinode->i_transaction = jinode->i_next_transaction;
-                 * when we are forced to drop various locks */
+                        jinode->i_next_transaction = NULL;
-                get_bh(bh);
+                        list_add(&jinode->i_list,
-                /* If the buffer is dirty, we need to submit IO and hence
+                                &jinode->i_transaction->t_inode_list);
-                 * we need the buffer lock. We try to lock the buffer without
-                 * blocking. If we fail, we need to drop j_list_lock and do
-                 * blocking lock_buffer().
-                 */
-                if (buffer_dirty(bh)) {
-                        if (test_set_buffer_locked(bh)) {
-                                BUFFER_TRACE(bh, "needs blocking lock");
-                                spin_unlock(&journal->j_list_lock);
-                                /* Write out all data to prevent deadlocks */
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                lock_buffer(bh);
-                                spin_lock(&journal->j_list_lock);
-                        }
-                        locked = 1;
-                }
-                /* We have to get bh_state lock. Again out of order, sigh. */
-                if (!inverted_lock(journal, bh)) {
-                        jbd_lock_bh_state(bh);
-                        spin_lock(&journal->j_list_lock);
-                }
-                /* Someone already cleaned up the buffer? */
-                if (!buffer_jbd(bh)
-                        || jh->b_transaction != commit_transaction
-                        || jh->b_jlist != BJ_SyncData) {
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "already cleaned up");
-                        put_bh(bh);
-                        continue;
-                }
-                if (locked && test_clear_buffer_dirty(bh)) {
-                        BUFFER_TRACE(bh, "needs writeout, adding to array");
-                        wbuf[bufs++] = bh;
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        if (bufs == journal->j_wbufsize) {
-                                spin_unlock(&journal->j_list_lock);
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                goto write_out_data;
-                        }
-                } else if (!locked && buffer_locked(bh)) {
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        put_bh(bh);
                } else {
-                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                        jinode->i_transaction = NULL;
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        /* Once for our safety reference, once for
-                         * jbd2_journal_remove_journal_head() */
-                        put_bh(bh);
-                        put_bh(bh);
-                }
-                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-                        spin_unlock(&journal->j_list_lock);
-                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
-        journal_do_submit_data(wbuf, bufs);
+        return ret;
 }
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-        err = 0;
+        err = journal_submit_data_buffers(journal, commit_transaction);
-        journal_submit_data_buffers(journal, commit_transaction);
-        /*
-         * Wait for all previously submitted IO to complete if commit
-         * record is to be written synchronously.
-         */
-        spin_lock(&journal->j_list_lock);
-        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-        spin_unlock(&journal->j_list_lock);
        if (err)
                jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        jbd_debug(3, "JBD: commit phase 2\n");
        /*
-         * If we found any dirty or locked buffers, then we should have
-         * looped back up to the write_out_data label.  If there weren't
-         * any then journal_clean_data_list should have wiped the list
-         * clean by now, so check that it is in fact empty.
-         */
-        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-        jbd_debug (3, "JBD: commit phase 3\n");
-        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);
+        err = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-                spin_lock(&journal->j_list_lock);
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-                spin_unlock(&journal->j_list_lock);
-                if (err)
-                        __jbd2_journal_abort_hard(journal);
        }
+        /*
+         * This is the right place to wait for data buffers both for ASYNC
+         * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+         * the commit block went to disk (which happens above). If commit is
+         * SYNC, we need to wait for data buffers before we start writing
+         * commit block, which happens below in such setting.
+         */
+        err = journal_finish_inode_data_buffers(journal, commit_transaction);
+        if (err)
+                jbd2_journal_abort(journal, err);
        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
           so we incur less scheduling load.
        */
-        jbd_debug(3, "JBD: commit phase 4\n");
+        jbd_debug(3, "JBD: commit phase 3\n");
        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
        J_ASSERT (commit_transaction->t_shadow_list == NULL);
-        jbd_debug(3, "JBD: commit phase 5\n");
+        jbd_debug(3, "JBD: commit phase 4\n");
        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
-        jbd_debug(3, "JBD: commit phase 6\n");
+        jbd_debug(3, "JBD: commit phase 5\n");
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
           transaction can be removed from any checkpoint list it was on
           before. */
-        jbd_debug(3, "JBD: commit phase 7\n");
+        jbd_debug(3, "JBD: commit phase 6\n");
-        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
        /* Done with this transaction! */
-        jbd_debug(3, "JBD: commit phase 8\n");
+        jbd_debug(3, "JBD: commit phase 7\n");
        J_ASSERT(commit_transaction->t_state == T_COMMIT);

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..f8b3be873226 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22	#include <linux/pagemap.h>	22	#include <linux/pagemap.h>
23	#include <linux/jiffies.h>	23	#include <linux/jiffies.h>
24	#include <linux/crc32.h>	24	#include <linux/crc32.h>
		25	#include <linux/writeback.h>
		26	#include <linux/backing-dev.h>
25		27
26	/*	28	/*
27	* Default IO end handler for temporary BJ_IO buffer_heads.	29	* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37	}	39	}
38		40
39	/*	41	/*
40	* When an ext3-ordered file is truncated, it is possible that many pages are	42	* When an ext4 file is truncated, it is possible that some pages are not
41	* not sucessfully freed, because they are attached to a committing transaction.	43	* successfully freed, because they are attached to a committing transaction.
42	* After the transaction commits, these pages are left on the LRU, with no	44	* After the transaction commits, these pages are left on the LRU, with no
43	* ->mapping, and with attached buffers. These pages are trivially reclaimable	45	* ->mapping, and with attached buffers. These pages are trivially reclaimable
44	* by the VM, but their apparent absence upsets the VM accounting, and it makes	46	* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
80	}	82	}
81		83
82	/*	84	/*
83	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84	* held. For ranking reasons we must trylock. If we lose, schedule away and
85	* return 0. j_list_lock is dropped in this case.
86	*/
87	static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88	{
89	if (!jbd_trylock_bh_state(bh)) {
90	spin_unlock(&journal->j_list_lock);
91	schedule();
92	return 0;
93	}
94	return 1;
95	}
96
97	/*
98	* Done it all: now submit the commit record. We should have	85	* Done it all: now submit the commit record. We should have
99	* cleaned up our previous buffers by now, so if we are in abort	86	* cleaned up our previous buffers by now, so if we are in abort
100	* mode we can now just skip the rest of the journal write	87	* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
112	struct buffer_head *bh;	99	struct buffer_head *bh;
113	int ret;	100	int ret;
114	int barrier_done = 0;	101	int barrier_done = 0;
		102	struct timespec now = current_kernel_time();
115		103
116	if (is_journal_aborted(journal))	104	if (is_journal_aborted(journal))
117	return 0;	105	return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
126	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);	114	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);	115	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);	116	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
		117	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
		118	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
129		119
130	if (JBD2_HAS_COMPAT_FEATURE(journal,	120	if (JBD2_HAS_COMPAT_FEATURE(journal,
131	JBD2_FEATURE_COMPAT_CHECKSUM)) {	121	JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
197	}	187	}
198		188
199	/*	189	/*
200	* Wait for all submitted IO to complete.	190	* write the filemap data using writepage() address_space_operations.
		191	* We don't do block allocation here even for delalloc. We don't
		192	* use writepages() because with delayed allocation we may be doing
		193	* block allocation in writepages().
201	*/	194	*/
202	static int journal_wait_on_locked_list(journal_t *journal,	195	static int journal_submit_inode_data_buffers(struct address_space *mapping)
203	transaction_t *commit_transaction)
204	{	196	{
205	int ret = 0;	197	int ret;
206	struct journal_head *jh;	198	struct writeback_control wbc = {
207		199	.sync_mode = WB_SYNC_ALL,
208	while (commit_transaction->t_locked_list) {	200	.nr_to_write = mapping->nrpages * 2,
209	struct buffer_head *bh;	201	.range_start = 0,
210		202	.range_end = i_size_read(mapping->host),
211	jh = commit_transaction->t_locked_list->b_tprev;	203	.for_writepages = 1,
212	bh = jh2bh(jh);	204	};
213	get_bh(bh);	205
214	if (buffer_locked(bh)) {	206	ret = generic_writepages(mapping, &wbc);
215	spin_unlock(&journal->j_list_lock);
216	wait_on_buffer(bh);
217	if (unlikely(!buffer_uptodate(bh)))
218	ret = -EIO;
219	spin_lock(&journal->j_list_lock);
220	}
221	if (!inverted_lock(journal, bh)) {
222	put_bh(bh);
223	spin_lock(&journal->j_list_lock);
224	continue;
225	}
226	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
227	__jbd2_journal_unfile_buffer(jh);
228	jbd_unlock_bh_state(bh);
229	jbd2_journal_remove_journal_head(bh);
230	put_bh(bh);
231	} else {
232	jbd_unlock_bh_state(bh);
233	}
234	put_bh(bh);
235	cond_resched_lock(&journal->j_list_lock);
236	}
237	return ret;	207	return ret;
238	}	208	}
239		209
240	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)	210	/*
		211	* Submit all the data buffers of inode associated with the transaction to
		212	* disk.
		213	*
		214	* We are in a committing transaction. Therefore no new inode can be added to
		215	* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
		216	* operate on from being released while we write out pages.
		217	*/
		218	static int journal_submit_data_buffers(journal_t *journal,
		219	transaction_t *commit_transaction)
241	{	220	{
242	int i;	221	struct jbd2_inode *jinode;
		222	int err, ret = 0;
		223	struct address_space *mapping;
243		224
244	for (i = 0; i < bufs; i++) {	225	spin_lock(&journal->j_list_lock);
245	wbuf[i]->b_end_io = end_buffer_write_sync;	226	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
246	/* We use-up our safety reference in submit_bh() */	227	mapping = jinode->i_vfs_inode->i_mapping;
247	submit_bh(WRITE, wbuf[i]);	228	jinode->i_flags |= JI_COMMIT_RUNNING;
		229	spin_unlock(&journal->j_list_lock);
		230	/*
		231	* submit the inode data buffers. We use writepage
		232	* instead of writepages. Because writepages can do
		233	* block allocation with delalloc. We need to write
		234	* only allocated blocks here.
		235	*/
		236	err = journal_submit_inode_data_buffers(mapping);
		237	if (!ret)
		238	ret = err;
		239	spin_lock(&journal->j_list_lock);
		240	J_ASSERT(jinode->i_transaction == commit_transaction);
		241	jinode->i_flags &= ~JI_COMMIT_RUNNING;
		242	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
248	}	243	}
		244	spin_unlock(&journal->j_list_lock);
		245	return ret;
249	}	246	}
250		247
251	/*	248	/*
252	* Submit all the data buffers to disk	249	* Wait for data submitted for writeout, refile inodes to proper
		250	* transaction if needed.
		251	*
253	*/	252	*/
254	static void journal_submit_data_buffers(journal_t *journal,	253	static int journal_finish_inode_data_buffers(journal_t *journal,
255	transaction_t *commit_transaction)	254	transaction_t *commit_transaction)
256	{	255	{
257	struct journal_head *jh;	256	struct jbd2_inode *jinode, *next_i;
258	struct buffer_head *bh;	257	int err, ret = 0;
259	int locked;
260	int bufs = 0;
261	struct buffer_head **wbuf = journal->j_wbuf;
262		258
263	/*	259	/* For locking, see the comment in journal_submit_data_buffers() */
264	* Whenever we unlock the journal and sleep, things can get added
265	* onto ->t_sync_datalist, so we have to keep looping back to
266	* write_out_data until we *know* that the list is empty.
267	*
268	* Cleanup any flushed data buffers from the data list. Even in
269	* abort mode, we want to flush this out as soon as possible.
270	*/
271	write_out_data:
272	cond_resched();
273	spin_lock(&journal->j_list_lock);	260	spin_lock(&journal->j_list_lock);
		261	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		262	jinode->i_flags |= JI_COMMIT_RUNNING;
		263	spin_unlock(&journal->j_list_lock);
		264	err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		265	if (!ret)
		266	ret = err;
		267	spin_lock(&journal->j_list_lock);
		268	jinode->i_flags &= ~JI_COMMIT_RUNNING;
		269	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
		270	}
274		271
275	while (commit_transaction->t_sync_datalist) {	272	/* Now refile inode to proper lists */
276	jh = commit_transaction->t_sync_datalist;	273	list_for_each_entry_safe(jinode, next_i,
277	bh = jh2bh(jh);	274	&commit_transaction->t_inode_list, i_list) {
278	locked = 0;	275	list_del(&jinode->i_list);
279		276	if (jinode->i_next_transaction) {
280	/* Get reference just to make sure buffer does not disappear	277	jinode->i_transaction = jinode->i_next_transaction;
281	* when we are forced to drop various locks */	278	jinode->i_next_transaction = NULL;
282	get_bh(bh);	279	list_add(&jinode->i_list,
283	/* If the buffer is dirty, we need to submit IO and hence	280	&jinode->i_transaction->t_inode_list);
284	* we need the buffer lock. We try to lock the buffer without
285	* blocking. If we fail, we need to drop j_list_lock and do
286	* blocking lock_buffer().
287	*/
288	if (buffer_dirty(bh)) {
289	if (test_set_buffer_locked(bh)) {
290	BUFFER_TRACE(bh, "needs blocking lock");
291	spin_unlock(&journal->j_list_lock);
292	/* Write out all data to prevent deadlocks */
293	journal_do_submit_data(wbuf, bufs);
294	bufs = 0;
295	lock_buffer(bh);
296	spin_lock(&journal->j_list_lock);
297	}
298	locked = 1;
299	}
300	/* We have to get bh_state lock. Again out of order, sigh. */
301	if (!inverted_lock(journal, bh)) {
302	jbd_lock_bh_state(bh);
303	spin_lock(&journal->j_list_lock);
304	}
305	/* Someone already cleaned up the buffer? */
306	if (!buffer_jbd(bh)
307	|| jh->b_transaction != commit_transaction
308	|| jh->b_jlist != BJ_SyncData) {
309	jbd_unlock_bh_state(bh);
310	if (locked)
311	unlock_buffer(bh);
312	BUFFER_TRACE(bh, "already cleaned up");
313	put_bh(bh);
314	continue;
315	}
316	if (locked && test_clear_buffer_dirty(bh)) {
317	BUFFER_TRACE(bh, "needs writeout, adding to array");
318	wbuf[bufs++] = bh;
319	__jbd2_journal_file_buffer(jh, commit_transaction,
320	BJ_Locked);
321	jbd_unlock_bh_state(bh);
322	if (bufs == journal->j_wbufsize) {
323	spin_unlock(&journal->j_list_lock);
324	journal_do_submit_data(wbuf, bufs);
325	bufs = 0;
326	goto write_out_data;
327	}
328	} else if (!locked && buffer_locked(bh)) {
329	__jbd2_journal_file_buffer(jh, commit_transaction,
330	BJ_Locked);
331	jbd_unlock_bh_state(bh);
332	put_bh(bh);
333	} else {	281	} else {
334	BUFFER_TRACE(bh, "writeout complete: unfile");	282	jinode->i_transaction = NULL;
335	__jbd2_journal_unfile_buffer(jh);
336	jbd_unlock_bh_state(bh);
337	if (locked)
338	unlock_buffer(bh);
339	jbd2_journal_remove_journal_head(bh);
340	/* Once for our safety reference, once for
341	* jbd2_journal_remove_journal_head() */
342	put_bh(bh);
343	put_bh(bh);
344	}
345
346	if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
347	spin_unlock(&journal->j_list_lock);
348	goto write_out_data;
349	}	283	}
350	}	284	}
351	spin_unlock(&journal->j_list_lock);	285	spin_unlock(&journal->j_list_lock);
352	journal_do_submit_data(wbuf, bufs);	286
		287	return ret;
353	}	288	}
354		289
355	static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)	290	static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
524	* Now start flushing things to disk, in the order they appear	459	* Now start flushing things to disk, in the order they appear
525	* on the transaction lists. Data blocks go first.	460	* on the transaction lists. Data blocks go first.
526	*/	461	*/
527	err = 0;	462	err = journal_submit_data_buffers(journal, commit_transaction);
528	journal_submit_data_buffers(journal, commit_transaction);
529
530	/*
531	* Wait for all previously submitted IO to complete if commit
532	* record is to be written synchronously.
533	*/
534	spin_lock(&journal->j_list_lock);
535	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
536	JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
537	err = journal_wait_on_locked_list(journal,
538	commit_transaction);
539
540	spin_unlock(&journal->j_list_lock);
541
542	if (err)	463	if (err)
543	jbd2_journal_abort(journal, err);	464	jbd2_journal_abort(journal, err);
544		465
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
547	jbd_debug(3, "JBD: commit phase 2\n");	468	jbd_debug(3, "JBD: commit phase 2\n");
548		469
549	/*	470	/*
550	* If we found any dirty or locked buffers, then we should have
551	* looped back up to the write_out_data label. If there weren't
552	* any then journal_clean_data_list should have wiped the list
553	* clean by now, so check that it is in fact empty.
554	*/
555	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
556
557	jbd_debug (3, "JBD: commit phase 3\n");
558
559	/*
560	* Way to go: we have now written out all of the data for a	471	* Way to go: we have now written out all of the data for a
561	* transaction! Now comes the tricky part: we need to write out	472	* transaction! Now comes the tricky part: we need to write out
562	* metadata. Loop over the transaction's entire buffer list:	473	* metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
574	J_ASSERT(commit_transaction->t_nr_buffers <=	485	J_ASSERT(commit_transaction->t_nr_buffers <=
575	commit_transaction->t_outstanding_credits);	486	commit_transaction->t_outstanding_credits);
576		487
		488	err = 0;
577	descriptor = NULL;	489	descriptor = NULL;
578	bufs = 0;	490	bufs = 0;
579	while (commit_transaction->t_buffers) {	491	while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
748	&cbh, crc32_sum);	660	&cbh, crc32_sum);
749	if (err)	661	if (err)
750	__jbd2_journal_abort_hard(journal);	662	__jbd2_journal_abort_hard(journal);
751
752	spin_lock(&journal->j_list_lock);
753	err = journal_wait_on_locked_list(journal,
754	commit_transaction);
755	spin_unlock(&journal->j_list_lock);
756	if (err)
757	__jbd2_journal_abort_hard(journal);
758	}	663	}
759		664
		665	/*
		666	* This is the right place to wait for data buffers both for ASYNC
		667	* and !ASYNC commit. If commit is ASYNC, we need to wait only after
		668	* the commit block went to disk (which happens above). If commit is
		669	* SYNC, we need to wait for data buffers before we start writing
		670	* commit block, which happens below in such setting.
		671	*/
		672	err = journal_finish_inode_data_buffers(journal, commit_transaction);
		673	if (err)
		674	jbd2_journal_abort(journal, err);
		675
760	/* Lo and behold: we have just managed to send a transaction to	676	/* Lo and behold: we have just managed to send a transaction to
761	the log. Before we can commit it, wait for the IO so far to	677	the log. Before we can commit it, wait for the IO so far to
762	complete. Control buffers being written are on the	678	complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
768	so we incur less scheduling load.	684	so we incur less scheduling load.
769	*/	685	*/
770		686
771	jbd_debug(3, "JBD: commit phase 4\n");	687	jbd_debug(3, "JBD: commit phase 3\n");
772		688
773	/*	689	/*
774	* akpm: these are BJ_IO, and j_list_lock is not needed.	690	* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
827		743
828	J_ASSERT (commit_transaction->t_shadow_list == NULL);	744	J_ASSERT (commit_transaction->t_shadow_list == NULL);
829		745
830	jbd_debug(3, "JBD: commit phase 5\n");	746	jbd_debug(3, "JBD: commit phase 4\n");
831		747
832	/* Here we wait for the revoke record and descriptor record buffers */	748	/* Here we wait for the revoke record and descriptor record buffers */
833	wait_for_ctlbuf:	749	wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
854	/* AKPM: bforget here */	770	/* AKPM: bforget here */
855	}	771	}
856		772
857	jbd_debug(3, "JBD: commit phase 6\n");	773	jbd_debug(3, "JBD: commit phase 5\n");
858		774
859	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,	775	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
860	JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {	776	JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
874	transaction can be removed from any checkpoint list it was on	790	transaction can be removed from any checkpoint list it was on
875	before. */	791	before. */
876		792
877	jbd_debug(3, "JBD: commit phase 7\n");	793	jbd_debug(3, "JBD: commit phase 6\n");
878		794
879	J_ASSERT(commit_transaction->t_sync_datalist == NULL);	795	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
880	J_ASSERT(commit_transaction->t_buffers == NULL);	796	J_ASSERT(commit_transaction->t_buffers == NULL);
881	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);	797	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
882	J_ASSERT(commit_transaction->t_iobuf_list == NULL);	798	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
997		913
998	/* Done with this transaction! */	914	/* Done with this transaction! */
999		915
1000	jbd_debug(3, "JBD: commit phase 8\n");	916	jbd_debug(3, "JBD: commit phase 7\n");
1001		917
1002	J_ASSERT(commit_transaction->t_state == T_COMMIT);	918	J_ASSERT(commit_transaction->t_state == T_COMMIT);
1003		919