author	Jan Kara <jack@suse.cz>	2011-11-25 18:35:39 -0500
committer	Jan Kara <jack@suse.cz>	2012-01-11 07:36:57 -0500
commit	353b67d8ced4dc53281c88150ad295e24bc4b4c5 (patch)
tree	a339a47a9899d01108c6167ffbbefcea07f63912 /fs/jbd/checkpoint.c
parent	e4e11180dfa545233e5145919b75b7fac88638df (diff)
jbd: Issue cache flush after checkpointing
When we reach cleanup_journal_tail(), there is no guarantee that checkpointed buffers are on stable storage - especially if buffers were written out by log_do_checkpoint(), they are likely to be only in the disk's caches. Thus when we update the journal superblock, effectively removing the old transaction from the journal, this write of the superblock can reach stable storage before those checkpointed buffers, which can result in filesystem corruption after a crash. A similar problem can happen if we replay the journal and wipe it before flushing the disk's caches.

Thus we must unconditionally issue a cache flush before we update the journal superblock in these cases. The fix is slightly complicated by the fact that we have to get the log tail before we issue the cache flush, but we can store it in the journal superblock only after the cache flush. Otherwise we risk races where the new tail is written before the corresponding cache flush is finished.

I managed to reproduce the corruption using a somewhat tweaked version of Chris Mason's barrier-test scheduler. This should also fix occasional reports of 'Bit already freed' filesystem errors which are totally unreproducible, but inspection of several fs images I've gathered over time points to a problem like this.

CC: stable@kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
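As a minimal sketch of the ordering described above (not the patch itself): the tail is sampled under j_state_lock, the lock is dropped so the cache flush can block, and the superblock is written only after the flush, with a recheck in case another cleanup raced ahead. The helpers read_journal_tail() and write_journal_superblock() are hypothetical stand-ins for jbd internals; blkdev_issue_flush(), tid_gt(), JFS_BARRIER, j_fs_dev and j_state_lock are the same names used in the diff below.

#include <linux/blkdev.h>
#include <linux/jbd.h>
#include <linux/spinlock.h>

/*
 * Sketch only: read_journal_tail() and write_journal_superblock() are
 * hypothetical helpers standing in for the real jbd internals.
 */
static int sketch_cleanup_journal_tail(journal_t *journal)
{
	tid_t first_tid;
	unsigned long blocknr;

	/* 1) Work out the new tail while holding j_state_lock. */
	spin_lock(&journal->j_state_lock);
	read_journal_tail(journal, &first_tid, &blocknr);
	spin_unlock(&journal->j_state_lock);

	/*
	 * 2) Flush the disk cache so checkpointed buffers reach stable
	 *    storage before the tail moves past their transactions.
	 *    The lock must be dropped first because the flush can block.
	 */
	if (journal->j_flags & JFS_BARRIER)
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

	/* 3) Only now publish the new tail, unless someone beat us to it. */
	spin_lock(&journal->j_state_lock);
	if (!tid_gt(first_tid, journal->j_tail_sequence)) {
		spin_unlock(&journal->j_state_lock);
		return 0;	/* another cleanup already advanced the tail */
	}
	write_journal_superblock(journal, first_tid, blocknr);
	spin_unlock(&journal->j_state_lock);
	return 0;
}

The tid_gt() recheck is needed precisely because j_state_lock is dropped across the flush; the patch below adds the same guard to cleanup_journal_tail().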
Diffstat (limited to 'fs/jbd/checkpoint.c')
-rw-r--r--	fs/jbd/checkpoint.c	27
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5d1a00a5041b..05f0754f2b46 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
  *
  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
  *
- * Called with the journal lock held.
- *
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts. Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
 	if (is_journal_aborted(journal))
 		return 1;
 
-	/* OK, work out the oldest transaction remaining in the log, and
+	/*
+	 * OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
 	 * If the log is now empty, we need to work out which is the
 	 * next transaction ID we will write, and where it will
-	 * start. */
-
+	 * start.
+	 */
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
 		spin_unlock(&journal->j_state_lock);
 		return 1;
 	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by log_do_checkpoint() --- are flushed out before we
+	 * drop the transactions from the journal. It's unlikely this will be
+	 * necessary, especially with an appropriately sized journal, but we
+	 * need this to guarantee correctness. Fortunately
+	 * cleanup_journal_tail() doesn't get called all that often.
+	 */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
+	spin_lock(&journal->j_state_lock);
+	if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+		spin_unlock(&journal->j_state_lock);
+		/* Someone else cleaned up journal so return 0 */
+		return 0;
+	}
 	/* OK, update the superblock to recover the freed space.
 	 * Physical blocks come first: have we wrapped beyond the end of
 	 * the log? */