path: root/fs/jbd/commit.c
author	Jan Kara <jack@suse.cz>	2006-09-26 02:30:53 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-09-26 11:48:44 -0400
commit	3998b9301d3d55be8373add22b6bc5e11c1d9b71 (patch)
tree	9ec65c5b492a9bffc46467f4210739cd07ac510c /fs/jbd/commit.c
parent	632bbfeee4f042c05bc65150b4433a297d3fe387 (diff)
[PATCH] jbd: fix commit of ordered data buffers
Original commit code assumes that when a buffer on the BJ_SyncData list is locked, it is being written to disk. But this is not true, and hence it can lead to potential data loss on a crash. Also, the code did not account for the fact that journal_dirty_data() can steal buffers from the committing transaction, and hence it could write buffers that no longer belong to the committing transaction. Finally, it could happen that we tried writing out one buffer several times.

The patch below tries to solve these problems by a complete rewrite of the data commit code. We go through the buffers on t_sync_datalist, lock buffers needing write out, and store them in an array. Buffers are also immediately refiled to the BJ_Locked list or unfiled (if the write out is completed). When the array is full, or we have to block on the buffer lock, we submit all the accumulated buffers for IO.

[suitable for 2.6.18.x around the 2.6.19-rc2 timeframe]

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
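At its core, the rewrite described above is an accumulate-and-flush pattern: buffers that need write out are collected into the preallocated wbuf array, and the whole batch is submitted whenever the array fills up or the code is about to block. The short user-space C sketch below illustrates only that batching pattern; the names WBUF_SIZE, flush_batch and the dummy integer "buffers" are illustrative stand-ins that do not exist in the kernel sources, and the real implementation is the journal_submit_data_buffers()/journal_do_submit_data() pair in the diff below.

#include <stdio.h>

#define WBUF_SIZE 4			/* stands in for journal->j_wbufsize */

/* Stand-in for journal_do_submit_data(): submit every queued item. */
static void flush_batch(int *wbuf, int bufs)
{
	int i;

	for (i = 0; i < bufs; i++)
		printf("submitting buffer %d\n", wbuf[i]);
}

int main(void)
{
	int wbuf[WBUF_SIZE];
	int bufs = 0;
	int item;

	/* Walk a pretend list of dirty buffers, batching them for submission. */
	for (item = 0; item < 10; item++) {
		wbuf[bufs++] = item;
		if (bufs == WBUF_SIZE) {	/* array full: flush the batch now */
			flush_batch(wbuf, bufs);
			bufs = 0;
		}
	}
	flush_batch(wbuf, bufs);		/* submit whatever is left over */
	return 0;
}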
Diffstat (limited to 'fs/jbd/commit.c')
-rw-r--r--	fs/jbd/commit.c	182
1 file changed, 113 insertions(+), 69 deletions(-)
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 42da60784311..32a8caf0c41e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -160,6 +160,117 @@ static int journal_write_commit_record(journal_t *journal,
 	return (ret == -EIO);
 }
 
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+	int i;
+
+	for (i = 0; i < bufs; i++) {
+		wbuf[i]->b_end_io = end_buffer_write_sync;
+		/* We use-up our safety reference in submit_bh() */
+		submit_bh(WRITE, wbuf[i]);
+	}
+}
+
+/*
+ * Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+				transaction_t *commit_transaction)
+{
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	int locked;
+	int bufs = 0;
+	struct buffer_head **wbuf = journal->j_wbuf;
+
+	/*
+	 * Whenever we unlock the journal and sleep, things can get added
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
+	 *
+	 * Cleanup any flushed data buffers from the data list.  Even in
+	 * abort mode, we want to flush this out as soon as possible.
+	 */
+write_out_data:
+	cond_resched();
+	spin_lock(&journal->j_list_lock);
+
+	while (commit_transaction->t_sync_datalist) {
+		jh = commit_transaction->t_sync_datalist;
+		bh = jh2bh(jh);
+		locked = 0;
+
+		/* Get reference just to make sure buffer does not disappear
+		 * when we are forced to drop various locks */
+		get_bh(bh);
+		/* If the buffer is dirty, we need to submit IO and hence
+		 * we need the buffer lock. We try to lock the buffer without
+		 * blocking. If we fail, we need to drop j_list_lock and do
+		 * blocking lock_buffer().
+		 */
+		if (buffer_dirty(bh)) {
+			if (test_set_buffer_locked(bh)) {
+				BUFFER_TRACE(bh, "needs blocking lock");
+				spin_unlock(&journal->j_list_lock);
+				/* Write out all data to prevent deadlocks */
+				journal_do_submit_data(wbuf, bufs);
+				bufs = 0;
+				lock_buffer(bh);
+				spin_lock(&journal->j_list_lock);
+			}
+			locked = 1;
+		}
+		/* We have to get bh_state lock. Again out of order, sigh. */
+		if (!inverted_lock(journal, bh)) {
+			jbd_lock_bh_state(bh);
+			spin_lock(&journal->j_list_lock);
+		}
+		/* Someone already cleaned up the buffer? */
+		if (!buffer_jbd(bh)
+			|| jh->b_transaction != commit_transaction
+			|| jh->b_jlist != BJ_SyncData) {
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			BUFFER_TRACE(bh, "already cleaned up");
+			put_bh(bh);
+			continue;
+		}
+		if (locked && test_clear_buffer_dirty(bh)) {
+			BUFFER_TRACE(bh, "needs writeout, adding to array");
+			wbuf[bufs++] = bh;
+			__journal_file_buffer(jh, commit_transaction,
+						BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (bufs == journal->j_wbufsize) {
+				spin_unlock(&journal->j_list_lock);
+				journal_do_submit_data(wbuf, bufs);
+				bufs = 0;
+				goto write_out_data;
+			}
+		}
+		else {
+			BUFFER_TRACE(bh, "writeout complete: unfile");
+			__journal_unfile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			journal_remove_journal_head(bh);
+			/* Once for our safety reference, once for
+			 * journal_remove_journal_head() */
+			put_bh(bh);
+			put_bh(bh);
+		}
+
+		if (lock_need_resched(&journal->j_list_lock)) {
+			spin_unlock(&journal->j_list_lock);
+			goto write_out_data;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+	journal_do_submit_data(wbuf, bufs);
+}
+
 /*
  * journal_commit_transaction
  *
@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-
 	err = 0;
-	/*
-	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_sync_datalist, so we have to keep looping back to
-	 * write_out_data until we *know* that the list is empty.
-	 */
-	bufs = 0;
-	/*
-	 * Cleanup any flushed data buffers from the data list.  Even in
-	 * abort mode, we want to flush this out as soon as possible.
-	 */
-write_out_data:
-	cond_resched();
-	spin_lock(&journal->j_list_lock);
-
-	while (commit_transaction->t_sync_datalist) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_sync_datalist;
-		commit_transaction->t_sync_datalist = jh->b_tnext;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			BUFFER_TRACE(bh, "locked");
-			if (!inverted_lock(journal, bh))
-				goto write_out_data;
-			__journal_temp_unlink_buffer(jh);
-			__journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			if (lock_need_resched(&journal->j_list_lock)) {
-				spin_unlock(&journal->j_list_lock);
-				goto write_out_data;
-			}
-		} else {
-			if (buffer_dirty(bh)) {
-				BUFFER_TRACE(bh, "start journal writeout");
-				get_bh(bh);
-				wbuf[bufs++] = bh;
-				if (bufs == journal->j_wbufsize) {
-					jbd_debug(2, "submit %d writes\n",
-							bufs);
-					spin_unlock(&journal->j_list_lock);
-					ll_rw_block(SWRITE, bufs, wbuf);
-					journal_brelse_array(wbuf, bufs);
-					bufs = 0;
-					goto write_out_data;
-				}
-			} else {
-				BUFFER_TRACE(bh, "writeout complete: unfile");
-				if (!inverted_lock(journal, bh))
-					goto write_out_data;
-				__journal_unfile_buffer(jh);
-				jbd_unlock_bh_state(bh);
-				journal_remove_journal_head(bh);
-				put_bh(bh);
-				if (lock_need_resched(&journal->j_list_lock)) {
-					spin_unlock(&journal->j_list_lock);
-					goto write_out_data;
-				}
-			}
-		}
-	}
-
-	if (bufs) {
-		spin_unlock(&journal->j_list_lock);
-		ll_rw_block(SWRITE, bufs, wbuf);
-		journal_brelse_array(wbuf, bufs);
-		spin_lock(&journal->j_list_lock);
-	}
+	journal_submit_data_buffers(journal, commit_transaction);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
 	 */
+	spin_lock(&journal->j_list_lock);
 	while (commit_transaction->t_locked_list) {
 		struct buffer_head *bh;
 