ext4: fix data corruption in inodes with journalled data

When journalling data for an inode (either because it is a symlink or because the filesystem is mounted in data=journal mode), ext4_evict_inode() can discard unwritten data by calling truncate_inode_pages(). This is because we don't mark the buffer / page dirty when journalling data but only add the buffer to the running transaction, and thus mm does not know there is still unwritten data.

Fix the problem by carefully tracking the transaction containing the inode's data, committing this transaction, and writing uncheckpointed buffers when the inode is about to be reaped.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
author: Jan Kara <jack@suse.cz> 2011-07-26 09:07:11 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2011-07-26 09:07:11 -0400
commit: 2d859db3e4a82a365572592d57624a5f996ed0ec (patch)
tree: d725aca3ab9555b8ee92f753a797034ff79f580c /fs/ext4
parent: b7ca1e8ec53259359db5313f923a0a20fa04bdb6 (diff)
1 files changed, 29 insertions, 0 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index de50b16a8f67..43e4abd67be7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -121,6 +121,33 @@ void ext4_evict_inode(struct inode *inode)
        trace_ext4_evict_inode(inode);
        if (inode->i_nlink) {
+                /*
+                 * When journalling data dirty buffers are tracked only in the
+                 * journal. So although mm thinks everything is clean and
+                 * ready for reaping the inode might still have some pages to
+                 * write in the running transaction or waiting to be
+                 * checkpointed. Thus calling jbd2_journal_invalidatepage()
+                 * (via truncate_inode_pages()) to discard these buffers can
+                 * cause data loss. Also even if we did not discard these
+                 * buffers, we would have no way to find them after the inode
+                 * is reaped and thus user could see stale data if he tries to
+                 * read them before the transaction is checkpointed. So be
+                 * careful and force everything to disk here... We use
+                 * ei->i_datasync_tid to store the newest transaction
+                 * containing inode's data.
+                 *
+                 * Note that directories do not have this problem because they
+                 * don't use page cache.
+                 */
+                if (ext4_should_journal_data(inode) &&
+                    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+                        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+                        tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+                        jbd2_log_start_commit(journal, commit_tid);
+                        jbd2_log_wait_commit(journal, commit_tid);
+                        filemap_write_and_wait(&inode->i_data);
+                }
                truncate_inode_pages(&inode->i_data, 0);
                goto no_delete;
        }
@@ -970,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file,
        if (new_i_size > inode->i_size)
                i_size_write(inode, pos+copied);
        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        if (new_i_size > EXT4_I(inode)->i_disksize) {
                ext4_update_i_disksize(inode, new_i_size);
                ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1678,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page,
                                write_end_fn);
        if (ret == 0)
                ret = err;
+        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        err = ext4_journal_stop(handle);
        if (!ret)
                ret = err;
author	Jan Kara <jack@suse.cz>	2011-07-26 09:07:11 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2011-07-26 09:07:11 -0400
commit	2d859db3e4a82a365572592d57624a5f996ed0ec (patch)
tree	d725aca3ab9555b8ee92f753a797034ff79f580c /fs/ext4
parent	b7ca1e8ec53259359db5313f923a0a20fa04bdb6 (diff)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index de50b16a8f67..43e4abd67be7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -121,6 +121,33 @@ void ext4_evict_inode(struct inode *inode)
121		121
122	trace_ext4_evict_inode(inode);	122	trace_ext4_evict_inode(inode);
123	if (inode->i_nlink) {	123	if (inode->i_nlink) {
		124	/*
		125	* When journalling data dirty buffers are tracked only in the
		126	* journal. So although mm thinks everything is clean and
		127	* ready for reaping the inode might still have some pages to
		128	* write in the running transaction or waiting to be
		129	* checkpointed. Thus calling jbd2_journal_invalidatepage()
		130	* (via truncate_inode_pages()) to discard these buffers can
		131	* cause data loss. Also even if we did not discard these
		132	* buffers, we would have no way to find them after the inode
		133	* is reaped and thus user could see stale data if he tries to
		134	* read them before the transaction is checkpointed. So be
		135	* careful and force everything to disk here... We use
		136	* ei->i_datasync_tid to store the newest transaction
		137	* containing inode's data.
		138	*
		139	* Note that directories do not have this problem because they
		140	* don't use page cache.
		141	*/
		142	if (ext4_should_journal_data(inode) &&
		143	(S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
		144	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
		145	tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
		146
		147	jbd2_log_start_commit(journal, commit_tid);
		148	jbd2_log_wait_commit(journal, commit_tid);
		149	filemap_write_and_wait(&inode->i_data);
		150	}
124	truncate_inode_pages(&inode->i_data, 0);	151	truncate_inode_pages(&inode->i_data, 0);
125	goto no_delete;	152	goto no_delete;
126	}	153	}
@@ -970,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file,
970	if (new_i_size > inode->i_size)	997	if (new_i_size > inode->i_size)
971	i_size_write(inode, pos+copied);	998	i_size_write(inode, pos+copied);
972	ext4_set_inode_state(inode, EXT4_STATE_JDATA);	999	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
		1000	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
973	if (new_i_size > EXT4_I(inode)->i_disksize) {	1001	if (new_i_size > EXT4_I(inode)->i_disksize) {
974	ext4_update_i_disksize(inode, new_i_size);	1002	ext4_update_i_disksize(inode, new_i_size);
975	ret2 = ext4_mark_inode_dirty(handle, inode);	1003	ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1678,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page,
1678	write_end_fn);	1706	write_end_fn);
1679	if (ret == 0)	1707	if (ret == 0)
1680	ret = err;	1708	ret = err;
		1709	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1681	err = ext4_journal_stop(handle);	1710	err = ext4_journal_stop(handle);
1682	if (!ret)	1711	if (!ret)
1683	ret = err;	1712	ret = err;