jbd: Fix a race between checkpointing code and journal_get_write_access()

The following race can happen: CPU1 CPU2 checkpointing code checks the buffer, adds it to an array for writeback do_get_write_access() ... lock_buffer() unlock_buffer() flush_batch() submits the buffer for IO __jbd_journal_file_buffer() So a buffer under writeout is returned from do_get_write_access(). Since the filesystem code relies on the fact that journaled buffers cannot be written out, it does not take the buffer lock and so it can modify buffer while it is under writeout. That can lead to a filesystem corruption if we crash at the right moment. The similar problem can happen with the journal_get_create_access() path. We fix the problem by clearing the buffer dirty bit under buffer_lock even if the buffer is on BJ_None list. Actually, we clear the dirty bit regardless the list the buffer is in and warn about the fact if the buffer is already journalled. Thanks for spotting the problem goes to dingdinghua <dingdinghua85@gmail.com>. Reported-by: dingdinghua <dingdinghua85@gmail.com> Signed-off-by: Jan Kara <jack@suse.cz>
author: Jan Kara <jack@suse.cz> 2009-06-24 11:31:40 -0400
committer: Jan Kara <jack@suse.cz> 2009-07-15 15:30:07 -0400
commit: 1e9fd53b783ea646de3ee09a4574afeb6778d504 (patch)
tree: 0b87e9ca83c612a1c3c5b91a359f8604cde48c45 /fs/jbd
parent: 9eaaa2d5759837402ec5eee13b2a97921808c3eb (diff)
1 files changed, 35 insertions, 33 deletions
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b1..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
 }
-/*
+static void warn_dirty_buffer(struct buffer_head *bh)
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        int jlist;
+        char b[BDEVNAME_SIZE];
-        /* If this buffer is one which might reasonably be dirty
-         * --- ie. data, or not part of this journal --- then
-         * we're OK to leave it alone, but otherwise we need to
-         * move the dirty bit to the journal's own internal
-         * JBDDirty bit. */
-        jlist = jh->b_jlist;
-        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+        printk(KERN_WARNING
-            jlist == BJ_Shadow || jlist == BJ_Forget) {
+               "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-                struct buffer_head *bh = jh2bh(jh);
+               "There's a risk of filesystem corruption in case of system "
+               "crash.\n",
-                if (test_clear_buffer_dirty(bh))
+               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
-                        set_buffer_jbddirty(bh);
-        }
 }
 /*
@@ -583,14 +564,16 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
+                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
-                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                JBUFFER_TRACE(jh, "Journalling dirty buffer");
-                jbd_unexpected_dirty_buffer(jh);
+                clear_buffer_dirty(bh);
+                set_buffer_jbddirty(bh);
        }
        unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
        if (jh->b_transaction == NULL) {
+                /*
+                 * Previous journal_forget() could have left the buffer
+                 * with jbddirty bit set because it was being committed. When
+                 * the commit finished, we've filed the buffer for
+                 * checkpointing and marked it dirty. Now we are reallocating
+                 * the buffer so the transaction freeing it must have
+                 * committed and so it's safe to clear the dirty bit.
+                 */
+                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;
                /* first access by this transaction */
@@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                /*
+                 * We don't want to write the buffer anymore, clear the
+                 * bit so that we don't confuse checks in
+                 * __journal_file_buffer
+                 */
+                clear_buffer_dirty(bh);
                __journal_file_buffer(jh, transaction, BJ_Forget);
-                clear_buffer_jbddirty(bh);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
-        /* The following list of buffer states needs to be consistent
-         * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-         * state. */
        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
+                /*
+                 * For metadata buffers, we track dirty bit in buffer_jbddirty
+                 * instead of buffer_dirty. We should not see a dirty bit set
+                 * here because we clear it in do_get_write_access but e.g.
+                 * tune2fs can modify the sb and set the dirty bit at any time
+                 * so we try to gracefully handle that.
+                 */
+                if (buffer_dirty(bh))
+                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
author	Jan Kara <jack@suse.cz>	2009-06-24 11:31:40 -0400
committer	Jan Kara <jack@suse.cz>	2009-07-15 15:30:07 -0400
commit	1e9fd53b783ea646de3ee09a4574afeb6778d504 (patch)
tree	0b87e9ca83c612a1c3c5b91a359f8604cde48c45 /fs/jbd
parent	9eaaa2d5759837402ec5eee13b2a97921808c3eb (diff)

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b1..c03ac11f74be 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
489	wake_up(&journal->j_wait_transaction_locked);	489	wake_up(&journal->j_wait_transaction_locked);
490	}	490	}
491		491
492	/*	492	static void warn_dirty_buffer(struct buffer_head *bh)
493	* Report any unexpected dirty buffers which turn up. Normally those
494	* indicate an error, but they can occur if the user is running (say)
495	* tune2fs to modify the live filesystem, so we need the option of
496	* continuing as gracefully as possible. #
497	*
498	* The caller should already hold the journal lock and
499	* j_list_lock spinlock: most callers will need those anyway
500	* in order to probe the buffer's journaling state safely.
501	*/
502	static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
503	{	493	{
504	int jlist;	494	char b[BDEVNAME_SIZE];
505
506	/* If this buffer is one which might reasonably be dirty
507	* --- ie. data, or not part of this journal --- then
508	* we're OK to leave it alone, but otherwise we need to
509	* move the dirty bit to the journal's own internal
510	* JBDDirty bit. */
511	jlist = jh->b_jlist;
512		495
513	if (jlist == BJ_Metadata \|\| jlist == BJ_Reserved \|\|	496	printk(KERN_WARNING
514	jlist == BJ_Shadow \|\| jlist == BJ_Forget) {	497	"JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
515	struct buffer_head *bh = jh2bh(jh);	498	"There's a risk of filesystem corruption in case of system "
516		499	"crash.\n",
517	if (test_clear_buffer_dirty(bh))	500	bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
518	set_buffer_jbddirty(bh);
519	}
520	}	501	}
521		502
522	/*	503	/*
@@ -583,14 +564,16 @@ repeat:
583	if (jh->b_next_transaction)	564	if (jh->b_next_transaction)
584	J_ASSERT_JH(jh, jh->b_next_transaction ==	565	J_ASSERT_JH(jh, jh->b_next_transaction ==
585	transaction);	566	transaction);
		567	warn_dirty_buffer(bh);
586	}	568	}
587	/*	569	/*
588	* In any case we need to clean the dirty flag and we must	570	* In any case we need to clean the dirty flag and we must
589	* do it under the buffer lock to be sure we don't race	571	* do it under the buffer lock to be sure we don't race
590	* with running write-out.	572	* with running write-out.
591	*/	573	*/
592	JBUFFER_TRACE(jh, "Unexpected dirty buffer");	574	JBUFFER_TRACE(jh, "Journalling dirty buffer");
593	jbd_unexpected_dirty_buffer(jh);	575	clear_buffer_dirty(bh);
		576	set_buffer_jbddirty(bh);
594	}	577	}
595		578
596	unlock_buffer(bh);	579	unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t handle, struct buffer_head bh)
826	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));	809	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
827		810
828	if (jh->b_transaction == NULL) {	811	if (jh->b_transaction == NULL) {
		812	/*
		813	* Previous journal_forget() could have left the buffer
		814	* with jbddirty bit set because it was being committed. When
		815	* the commit finished, we've filed the buffer for
		816	* checkpointing and marked it dirty. Now we are reallocating
		817	* the buffer so the transaction freeing it must have
		818	* committed and so it's safe to clear the dirty bit.
		819	*/
		820	clear_buffer_dirty(jh2bh(jh));
829	jh->b_transaction = transaction;	821	jh->b_transaction = transaction;
830		822
831	/* first access by this transaction */	823	/* first access by this transaction */
@@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head jh, transaction_t transaction)
1782		1774
1783	if (jh->b_cp_transaction) {	1775	if (jh->b_cp_transaction) {
1784	JBUFFER_TRACE(jh, "on running+cp transaction");	1776	JBUFFER_TRACE(jh, "on running+cp transaction");
		1777	/*
		1778	* We don't want to write the buffer anymore, clear the
		1779	* bit so that we don't confuse checks in
		1780	* __journal_file_buffer
		1781	*/
		1782	clear_buffer_dirty(bh);
1785	__journal_file_buffer(jh, transaction, BJ_Forget);	1783	__journal_file_buffer(jh, transaction, BJ_Forget);
1786	clear_buffer_jbddirty(bh);
1787	may_free = 0;	1784	may_free = 0;
1788	} else {	1785	} else {
1789	JBUFFER_TRACE(jh, "on running transaction");	1786	JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
2041	if (jh->b_transaction && jh->b_jlist == jlist)	2038	if (jh->b_transaction && jh->b_jlist == jlist)
2042	return;	2039	return;
2043		2040
2044	/* The following list of buffer states needs to be consistent
2045	* with __jbd_unexpected_dirty_buffer()'s handling of dirty
2046	* state. */
2047
2048	if (jlist == BJ_Metadata \|\| jlist == BJ_Reserved \|\|	2041	if (jlist == BJ_Metadata \|\| jlist == BJ_Reserved \|\|
2049	jlist == BJ_Shadow \|\| jlist == BJ_Forget) {	2042	jlist == BJ_Shadow \|\| jlist == BJ_Forget) {
		2043	/*
		2044	* For metadata buffers, we track dirty bit in buffer_jbddirty
		2045	* instead of buffer_dirty. We should not see a dirty bit set
		2046	* here because we clear it in do_get_write_access but e.g.
		2047	* tune2fs can modify the sb and set the dirty bit at any time
		2048	* so we try to gracefully handle that.
		2049	*/
		2050	if (buffer_dirty(bh))
		2051	warn_dirty_buffer(bh);
2050	if (test_clear_buffer_dirty(bh) \|\|	2052	if (test_clear_buffer_dirty(bh) \|\|
2051	test_clear_buffer_jbddirty(bh))	2053	test_clear_buffer_jbddirty(bh))
2052	was_dirty = 1;	2054	was_dirty = 1;