jbd2: Fix a race between checkpointing code and journal_get_write_access()

The following race can happen:

  CPU1                          CPU2
                                checkpointing code checks the buffer, adds
                                  it to an array for writeback
  do_get_write_access()
    ...
    lock_buffer()
    unlock_buffer()
                                flush_batch() submits the buffer for IO
    __jbd2_journal_file_buffer()

So a buffer under writeout is returned from do_get_write_access(). Since the filesystem code relies on the fact that journaled buffers cannot be written out, it does not take the buffer lock and so it can modify the buffer while it is under writeout. That can lead to a filesystem corruption if we crash at the right moment.

We fix the problem by clearing the buffer dirty bit under buffer_lock even if the buffer is on BJ_None list. Actually, we clear the dirty bit regardless of the list the buffer is in and warn about the fact if the buffer is already journalled.

Thanks for spotting the problem goes to dingdinghua <dingdinghua85@gmail.com>.

Reported-by: dingdinghua <dingdinghua85@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
author: Jan Kara <jack@suse.cz> 2009-07-13 16:16:20 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2009-07-13 16:16:20 -0400
commit: f91d1d04171026e56c7e343ee3cdcc801dd85cfb (patch)
tree: 282341868e06cb09ecd6c0838ea027feec514c2c /fs/jbd2/transaction.c
parent: 3e03f9ca6a2599db1823bb0ea24e0845219a0e69 (diff)
1 files changed, 35 insertions, 33 deletions
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 494501edba6b..6213ac728f30 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
 }
-/*
+static void warn_dirty_buffer(struct buffer_head *bh)
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        int jlist;
+        char b[BDEVNAME_SIZE];
-        /* If this buffer is one which might reasonably be dirty
-         * --- ie. data, or not part of this journal --- then
-         * we're OK to leave it alone, but otherwise we need to
-         * move the dirty bit to the journal's own internal
-         * JBDDirty bit. */
-        jlist = jh->b_jlist;
-        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+        printk(KERN_WARNING
-            jlist == BJ_Shadow || jlist == BJ_Forget) {
+               "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-                struct buffer_head *bh = jh2bh(jh);
+               "There's a risk of filesystem corruption in case of system "
+               "crash.\n",
-                if (test_clear_buffer_dirty(bh))
+               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
-                        set_buffer_jbddirty(bh);
-        }
 }
 /*
@@ -593,14 +574,16 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
+                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
-                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                JBUFFER_TRACE(jh, "Journalling dirty buffer");
-                jbd_unexpected_dirty_buffer(jh);
+                clear_buffer_dirty(bh);
+                set_buffer_jbddirty(bh);
        }
        unlock_buffer(bh);
@@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
        if (jh->b_transaction == NULL) {
+                /*
+                 * Previous jbd2_journal_forget() could have left the buffer
+                 * with jbddirty bit set because it was being committed. When
+                 * the commit finished, we've filed the buffer for
+                 * checkpointing and marked it dirty. Now we are reallocating
+                 * the buffer so the transaction freeing it must have
+                 * committed and so it's safe to clear the dirty bit.
+                 */
+                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;
                /* first access by this transaction */
@@ -1644,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                /*
+                 * We don't want to write the buffer anymore, clear the
+                 * bit so that we don't confuse checks in
+                 * __journal_file_buffer
+                 */
+                clear_buffer_dirty(bh);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
-                clear_buffer_jbddirty(bh);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
@@ -1896,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
-        /* The following list of buffer states needs to be consistent
-         * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-         * state. */
        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
+                /*
+                 * For metadata buffers, we track dirty bit in buffer_jbddirty
+                 * instead of buffer_dirty. We should not see a dirty bit set
+                 * here because we clear it in do_get_write_access but e.g.
+                 * tune2fs can modify the sb and set the dirty bit at any time
+                 * so we try to gracefully handle that.
+                 */
+                if (buffer_dirty(bh))
+                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
author	Jan Kara <jack@suse.cz>	2009-07-13 16:16:20 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2009-07-13 16:16:20 -0400
commit	f91d1d04171026e56c7e343ee3cdcc801dd85cfb (patch)
tree	282341868e06cb09ecd6c0838ea027feec514c2c /fs/jbd2/transaction.c
parent	3e03f9ca6a2599db1823bb0ea24e0845219a0e69 (diff)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 494501edba6b..6213ac728f30 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c
@@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal)
499	wake_up(&journal->j_wait_transaction_locked);	499	wake_up(&journal->j_wait_transaction_locked);
500	}	500	}
501		501
502	/*	502	static void warn_dirty_buffer(struct buffer_head *bh)
503	* Report any unexpected dirty buffers which turn up. Normally those
504	* indicate an error, but they can occur if the user is running (say)
505	* tune2fs to modify the live filesystem, so we need the option of
506	* continuing as gracefully as possible. #
507	*
508	* The caller should already hold the journal lock and
509	* j_list_lock spinlock: most callers will need those anyway
510	* in order to probe the buffer's journaling state safely.
511	*/
512	static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
513	{	503	{
514	int jlist;	504	char b[BDEVNAME_SIZE];
515
516	/* If this buffer is one which might reasonably be dirty
517	* --- ie. data, or not part of this journal --- then
518	* we're OK to leave it alone, but otherwise we need to
519	* move the dirty bit to the journal's own internal
520	* JBDDirty bit. */
521	jlist = jh->b_jlist;
522		505
523	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||	506	printk(KERN_WARNING
524	jlist == BJ_Shadow || jlist == BJ_Forget) {	507	"JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
525	struct buffer_head *bh = jh2bh(jh);	508	"There's a risk of filesystem corruption in case of system "
526		509	"crash.\n",
527	if (test_clear_buffer_dirty(bh))	510	bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
528	set_buffer_jbddirty(bh);
529	}
530	}	511	}
531		512
532	/*	513	/*
@@ -593,14 +574,16 @@ repeat:
593	if (jh->b_next_transaction)	574	if (jh->b_next_transaction)
594	J_ASSERT_JH(jh, jh->b_next_transaction ==	575	J_ASSERT_JH(jh, jh->b_next_transaction ==
595	transaction);	576	transaction);
		577	warn_dirty_buffer(bh);
596	}	578	}
597	/*	579	/*
598	* In any case we need to clean the dirty flag and we must	580	* In any case we need to clean the dirty flag and we must
599	* do it under the buffer lock to be sure we don't race	581	* do it under the buffer lock to be sure we don't race
600	* with running write-out.	582	* with running write-out.
601	*/	583	*/
602	JBUFFER_TRACE(jh, "Unexpected dirty buffer");	584	JBUFFER_TRACE(jh, "Journalling dirty buffer");
603	jbd_unexpected_dirty_buffer(jh);	585	clear_buffer_dirty(bh);
		586	set_buffer_jbddirty(bh);
604	}	587	}
605		588
606	unlock_buffer(bh);	589	unlock_buffer(bh);
@@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
843	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));	826	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
844		827
845	if (jh->b_transaction == NULL) {	828	if (jh->b_transaction == NULL) {
		829	/*
		830	* Previous jbd2_journal_forget() could have left the buffer
		831	* with jbddirty bit set because it was being committed. When
		832	* the commit finished, we've filed the buffer for
		833	* checkpointing and marked it dirty. Now we are reallocating
		834	* the buffer so the transaction freeing it must have
		835	* committed and so it's safe to clear the dirty bit.
		836	*/
		837	clear_buffer_dirty(jh2bh(jh));
846	jh->b_transaction = transaction;	838	jh->b_transaction = transaction;
847		839
848	/* first access by this transaction */	840	/* first access by this transaction */
@@ -1644,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1644		1636
1645	if (jh->b_cp_transaction) {	1637	if (jh->b_cp_transaction) {
1646	JBUFFER_TRACE(jh, "on running+cp transaction");	1638	JBUFFER_TRACE(jh, "on running+cp transaction");
		1639	/*
		1640	* We don't want to write the buffer anymore, clear the
		1641	* bit so that we don't confuse checks in
		1642	* __journal_file_buffer
		1643	*/
		1644	clear_buffer_dirty(bh);
1647	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);	1645	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1648	clear_buffer_jbddirty(bh);
1649	may_free = 0;	1646	may_free = 0;
1650	} else {	1647	} else {
1651	JBUFFER_TRACE(jh, "on running transaction");	1648	JBUFFER_TRACE(jh, "on running transaction");
@@ -1896,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
1896	if (jh->b_transaction && jh->b_jlist == jlist)	1893	if (jh->b_transaction && jh->b_jlist == jlist)
1897	return;	1894	return;
1898		1895
1899	/* The following list of buffer states needs to be consistent
1900	* with __jbd_unexpected_dirty_buffer()'s handling of dirty
1901	* state. */
1902
1903	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||	1896	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1904	jlist == BJ_Shadow || jlist == BJ_Forget) {	1897	jlist == BJ_Shadow || jlist == BJ_Forget) {
		1898	/*
		1899	* For metadata buffers, we track dirty bit in buffer_jbddirty
		1900	* instead of buffer_dirty. We should not see a dirty bit set
		1901	* here because we clear it in do_get_write_access but e.g.
		1902	* tune2fs can modify the sb and set the dirty bit at any time
		1903	* so we try to gracefully handle that.
		1904	*/
		1905	if (buffer_dirty(bh))
		1906	warn_dirty_buffer(bh);
1905	if (test_clear_buffer_dirty(bh) ||	1907	if (test_clear_buffer_dirty(bh) ||
1906	test_clear_buffer_jbddirty(bh))	1908	test_clear_buffer_jbddirty(bh))
1907	was_dirty = 1;	1909	was_dirty = 1;