ocfs2: improve fsync efficiency and fix deadlock between aio_write and sync_file

Currently, ocfs2_sync_file grabs i_mutex and forces the current journal transaction to complete. This isn't terribly efficient, since sync_file really only needs to wait for the last transaction involving that inode to complete, and this doesn't require i_mutex.

Therefore, implement the necessary bits to track the newest tid associated with an inode, and teach sync_file to wait for that instead of waiting for everything in the journal to commit. Furthermore, only issue the flush request to the drive if jbd2 hasn't already done so.

This also eliminates the deadlock between ocfs2_file_aio_write() and ocfs2_sync_file(). aio_write takes i_mutex then calls ocfs2_aiodio_wait() to wait for unaligned dio writes to finish. However, if that dio completion involves calling fsync, then we can get into trouble when some ocfs2_sync_file tries to take i_mutex.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Darrick J. Wong <darrick.wong@oracle.com> 2014-04-03 17:46:48 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-03 19:20:53 -0400
commit: 2931cdcb49194503b19345c597b68fdcf78396f8 (patch)
tree: 2492e18b4aa23b3815b6f54112bc4667158e101a /fs/ocfs2/inode.c
parent: a75fe48cad2fb81e0e2671c73aea6c78ce5626d4 (diff)
1 files changed, 28 insertions, 0 deletions
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..28ab8a9e88a1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
        struct inode *inode = NULL;
        struct super_block *sb = osb->sb;
        struct ocfs2_find_inode_args args;
+        journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
        trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
                               sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
                goto bail;
        }
+        /*
+         * Set transaction id's of transactions that have to be committed
+         * to finish f[data]sync. We set them to currently running transaction
+         * as we cannot be sure that the inode or some of its metadata isn't
+         * part of the transaction - the inode could have been reclaimed and
+         * now it is reread from disk.
+         */
+        if (journal) {
+                transaction_t *transaction;
+                tid_t tid;
+                struct ocfs2_inode_info *oi = OCFS2_I(inode);
+                read_lock(&journal->j_state_lock);
+                if (journal->j_running_transaction)
+                        transaction = journal->j_running_transaction;
+                else
+                        transaction = journal->j_committing_transaction;
+                if (transaction)
+                        tid = transaction->t_tid;
+                else
+                        tid = journal->j_commit_sequence;
+                read_unlock(&journal->j_state_lock);
+                oi->i_sync_tid = tid;
+                oi->i_datasync_tid = tid;
+        }
 bail:
        if (!IS_ERR(inode)) {
                trace_ocfs2_iget_end(inode, 
@@ -1260,6 +1287,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
        fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
        ocfs2_journal_dirty(handle, bh);
+        ocfs2_update_inode_fsync_trans(handle, inode, 1);
 leave:
        return status;
 }
author	Darrick J. Wong <darrick.wong@oracle.com>	2014-04-03 17:46:48 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-03 19:20:53 -0400
commit	2931cdcb49194503b19345c597b68fdcf78396f8 (patch)
tree	2492e18b4aa23b3815b6f54112bc4667158e101a /fs/ocfs2/inode.c
parent	a75fe48cad2fb81e0e2671c73aea6c78ce5626d4 (diff)

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..28ab8a9e88a1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
130	struct inode *inode = NULL;	130	struct inode *inode = NULL;
131	struct super_block *sb = osb->sb;	131	struct super_block *sb = osb->sb;
132	struct ocfs2_find_inode_args args;	132	struct ocfs2_find_inode_args args;
		133	journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
133		134
134	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,	135	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
135	sysfile_type);	136	sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
169	goto bail;	170	goto bail;
170	}	171	}
171		172
		173	/*
		174	* Set transaction id's of transactions that have to be committed
		175	* to finish f[data]sync. We set them to currently running transaction
		176	* as we cannot be sure that the inode or some of its metadata isn't
		177	* part of the transaction - the inode could have been reclaimed and
		178	* now it is reread from disk.
		179	*/
		180	if (journal) {
		181	transaction_t *transaction;
		182	tid_t tid;
		183	struct ocfs2_inode_info *oi = OCFS2_I(inode);
		184
		185	read_lock(&journal->j_state_lock);
		186	if (journal->j_running_transaction)
		187	transaction = journal->j_running_transaction;
		188	else
		189	transaction = journal->j_committing_transaction;
		190	if (transaction)
		191	tid = transaction->t_tid;
		192	else
		193	tid = journal->j_commit_sequence;
		194	read_unlock(&journal->j_state_lock);
		195	oi->i_sync_tid = tid;
		196	oi->i_datasync_tid = tid;
		197	}
		198
172	bail:	199	bail:
173	if (!IS_ERR(inode)) {	200	if (!IS_ERR(inode)) {
174	trace_ocfs2_iget_end(inode,	201	trace_ocfs2_iget_end(inode,
@@ -1260,6 +1287,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1260	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);	1287	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1261		1288
1262	ocfs2_journal_dirty(handle, bh);	1289	ocfs2_journal_dirty(handle, bh);
		1290	ocfs2_update_inode_fsync_trans(handle, inode, 1);
1263	leave:	1291	leave:
1264	return status;	1292	return status;
1265	}	1293	}