aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author Darrick J. Wong <darrick.wong@oracle.com> 2014-04-03 17:46:48 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-04-03 19:20:53 -0400
commit 2931cdcb49194503b19345c597b68fdcf78396f8 (patch)
tree 2492e18b4aa23b3815b6f54112bc4667158e101a /fs
parent a75fe48cad2fb81e0e2671c73aea6c78ce5626d4 (diff)
ocfs2: improve fsync efficiency and fix deadlock between aio_write and sync_file
Currently, ocfs2_sync_file grabs i_mutex and forces the current journal transaction to complete. This isn't terribly efficient, since sync_file really only needs to wait for the last transaction involving that inode to complete, and this doesn't require i_mutex.

Therefore, implement the necessary bits to track the newest tid associated with an inode, and teach sync_file to wait for that instead of waiting for everything in the journal to commit. Furthermore, only issue the flush request to the drive if jbd2 hasn't already done so.

This also eliminates the deadlock between ocfs2_file_aio_write() and ocfs2_sync_file(). aio_write takes i_mutex then calls ocfs2_aiodio_wait() to wait for unaligned dio writes to finish. However, if that dio completion involves calling fsync, then we can get into trouble when some ocfs2_sync_file tries to take i_mutex.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- fs/ocfs2/alloc.c | 1
-rw-r--r-- fs/ocfs2/aops.c | 1
-rw-r--r-- fs/ocfs2/dir.c | 4
-rw-r--r-- fs/ocfs2/file.c | 36
-rw-r--r-- fs/ocfs2/inode.c | 28
-rw-r--r-- fs/ocfs2/inode.h | 7
-rw-r--r-- fs/ocfs2/journal.h | 11
-rw-r--r-- fs/ocfs2/namei.c | 4
-rw-r--r-- fs/ocfs2/super.c | 3
9 files changed, 74 insertions, 21 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e2edff38be52..6b97d68e34d3 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6932,6 +6932,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6932 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6932 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6933 spin_unlock(&oi->ip_lock); 6933 spin_unlock(&oi->ip_lock);
6934 6934
6935 ocfs2_update_inode_fsync_trans(handle, inode, 1);
6935 ocfs2_dinode_new_extent_list(inode, di); 6936 ocfs2_dinode_new_extent_list(inode, di);
6936 6937
6937 ocfs2_journal_dirty(handle, di_bh); 6938 ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ebe44f7dce0b..d310d12a9adc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2039,6 +2039,7 @@ out_write_size:
2039 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2039 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2040 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2040 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2041 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2041 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2042 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2042 ocfs2_journal_dirty(handle, wc->w_di_bh); 2043 ocfs2_journal_dirty(handle, wc->w_di_bh);
2043 2044
2044 ocfs2_commit_trans(osb, handle); 2045 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 91a7e85ac8fd..8b48e9b7ad0e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2957 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 2957 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
2958 } 2958 }
2959 2959
2960 ocfs2_update_inode_fsync_trans(handle, dir, 1);
2960 ocfs2_journal_dirty(handle, dirdata_bh); 2961 ocfs2_journal_dirty(handle, dirdata_bh);
2961 2962
2962 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2963 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3338,6 +3339,7 @@ do_extend:
3338 } else { 3339 } else {
3339 de->rec_len = cpu_to_le16(sb->s_blocksize); 3340 de->rec_len = cpu_to_le16(sb->s_blocksize);
3340 } 3341 }
3342 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3341 ocfs2_journal_dirty(handle, new_bh); 3343 ocfs2_journal_dirty(handle, new_bh);
3342 3344
3343 dir_i_size += dir->i_sb->s_blocksize; 3345 dir_i_size += dir->i_sb->s_blocksize;
@@ -3896,6 +3898,7 @@ out_commit:
3896 dquot_free_space_nodirty(dir, 3898 dquot_free_space_nodirty(dir,
3897 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3899 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3898 3900
3901 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3899 ocfs2_commit_trans(osb, handle); 3902 ocfs2_commit_trans(osb, handle);
3900 3903
3901out: 3904out:
@@ -4134,6 +4137,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4134 mlog_errno(ret); 4137 mlog_errno(ret);
4135 did_quota = 0; 4138 did_quota = 0;
4136 4139
4140 ocfs2_update_inode_fsync_trans(handle, dir, 1);
4137 ocfs2_journal_dirty(handle, dx_root_bh); 4141 ocfs2_journal_dirty(handle, dx_root_bh);
4138 4142
4139out_commit: 4143out_commit:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1673438789fe..bd94d26b0b21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
175 int datasync) 175 int datasync)
176{ 176{
177 int err = 0; 177 int err = 0;
178 journal_t *journal;
179 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
180 struct ocfs2_inode_info *oi = OCFS2_I(inode);
181 journal_t *journal = osb->journal->j_journal;
182 int ret;
183 tid_t commit_tid;
184 bool needs_barrier = false;
181 185
182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 186 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
183 OCFS2_I(inode)->ip_blkno, 187 OCFS2_I(inode)->ip_blkno,
@@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
192 if (err) 196 if (err)
193 return err; 197 return err;
194 198
195 /* 199 commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
196 * Probably don't need the i_mutex at all in here, just putting it here 200 if (journal->j_flags & JBD2_BARRIER &&
197 * to be consistent with how fsync used to be called, someone more 201 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
198 * familiar with the fs could possibly remove it. 202 needs_barrier = true;
199 */ 203 err = jbd2_complete_transaction(journal, commit_tid);
200 mutex_lock(&inode->i_mutex); 204 if (needs_barrier) {
201 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 205 ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
202 /* 206 if (!err)
203 * We still have to flush drive's caches to get data to the 207 err = ret;
204 * platter
205 */
206 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
207 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
208 goto bail;
209 } 208 }
210 209
211 journal = osb->journal->j_journal;
212 err = jbd2_journal_force_commit(journal);
213
214bail:
215 if (err) 210 if (err)
216 mlog_errno(err); 211 mlog_errno(err);
217 mutex_unlock(&inode->i_mutex);
218 212
219 return (err < 0) ? -EIO : 0; 213 return (err < 0) ? -EIO : 0;
220} 214}
@@ -650,7 +644,7 @@ restarted_transaction:
650 mlog_errno(status); 644 mlog_errno(status);
651 goto leave; 645 goto leave;
652 } 646 }
653 647 ocfs2_update_inode_fsync_trans(handle, inode, 1);
654 ocfs2_journal_dirty(handle, bh); 648 ocfs2_journal_dirty(handle, bh);
655 649
656 spin_lock(&OCFS2_I(inode)->ip_lock); 650 spin_lock(&OCFS2_I(inode)->ip_lock);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..28ab8a9e88a1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
130 struct inode *inode = NULL; 130 struct inode *inode = NULL;
131 struct super_block *sb = osb->sb; 131 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 132 struct ocfs2_find_inode_args args;
133 journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
133 134
134 trace_ocfs2_iget_begin((unsigned long long)blkno, flags, 135 trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
135 sysfile_type); 136 sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
169 goto bail; 170 goto bail;
170 } 171 }
171 172
173 /*
174 * Set transaction id's of transactions that have to be committed
175 * to finish f[data]sync. We set them to currently running transaction
176 * as we cannot be sure that the inode or some of its metadata isn't
177 * part of the transaction - the inode could have been reclaimed and
178 * now it is reread from disk.
179 */
180 if (journal) {
181 transaction_t *transaction;
182 tid_t tid;
183 struct ocfs2_inode_info *oi = OCFS2_I(inode);
184
185 read_lock(&journal->j_state_lock);
186 if (journal->j_running_transaction)
187 transaction = journal->j_running_transaction;
188 else
189 transaction = journal->j_committing_transaction;
190 if (transaction)
191 tid = transaction->t_tid;
192 else
193 tid = journal->j_commit_sequence;
194 read_unlock(&journal->j_state_lock);
195 oi->i_sync_tid = tid;
196 oi->i_datasync_tid = tid;
197 }
198
172bail: 199bail:
173 if (!IS_ERR(inode)) { 200 if (!IS_ERR(inode)) {
174 trace_ocfs2_iget_end(inode, 201 trace_ocfs2_iget_end(inode,
@@ -1260,6 +1287,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1260 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1287 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1261 1288
1262 ocfs2_journal_dirty(handle, bh); 1289 ocfs2_journal_dirty(handle, bh);
1290 ocfs2_update_inode_fsync_trans(handle, inode, 1);
1263leave: 1291leave:
1264 return status; 1292 return status;
1265} 1293}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9f1580b506a5..837e5e42af85 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -73,6 +73,13 @@ struct ocfs2_inode_info
73 u32 ip_dir_lock_gen; 73 u32 ip_dir_lock_gen;
74 74
75 struct ocfs2_alloc_reservation ip_la_data_resv; 75 struct ocfs2_alloc_reservation ip_la_data_resv;
76
77 /*
78 * Transactions that contain inode's metadata needed to complete
79 * fsync and fdatasync, respectively.
80 */
81 tid_t i_sync_tid;
82 tid_t i_datasync_tid;
76}; 83};
77 84
78/* 85/*
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 9ff4e8cf9d97..7f8cde94abfe 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
626 new_size); 626 new_size);
627} 627}
628 628
629static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
630 struct inode *inode,
631 int datasync)
632{
633 struct ocfs2_inode_info *oi = OCFS2_I(inode);
634
635 oi->i_sync_tid = handle->h_transaction->t_tid;
636 if (datasync)
637 oi->i_datasync_tid = handle->h_transaction->t_tid;
638}
639
629#endif /* OCFS2_JOURNAL_H */ 640#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3683643f3f0e..e61e4c9a077c 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -495,6 +495,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
495 struct ocfs2_dinode *fe = NULL; 495 struct ocfs2_dinode *fe = NULL;
496 struct ocfs2_extent_list *fel; 496 struct ocfs2_extent_list *fel;
497 u16 feat; 497 u16 feat;
498 struct ocfs2_inode_info *oi = OCFS2_I(inode);
498 499
499 *new_fe_bh = NULL; 500 *new_fe_bh = NULL;
500 501
@@ -576,6 +577,9 @@ static int __ocfs2_mknod_locked(struct inode *dir,
576 mlog_errno(status); 577 mlog_errno(status);
577 } 578 }
578 579
580 oi->i_sync_tid = handle->h_transaction->t_tid;
581 oi->i_datasync_tid = handle->h_transaction->t_tid;
582
579 status = 0; /* error in ocfs2_create_new_inode_locks is not 583 status = 0; /* error in ocfs2_create_new_inode_locks is not
580 * critical */ 584 * critical */
581 585
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d7190b2cfd40..9fef73da1ca5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0;
566
564 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
565 return &oi->vfs_inode; 568 return &oi->vfs_inode;
566} 569}