about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2009-10-16 13:26:15 -0400
committer: Jan Kara <jack@suse.cz> 2009-11-11 09:22:49 -0500
commit: fe8bc91c4c30122b357d197117705cfd4fabaf28 (patch)
tree: 33e2895847d812204209444db4093af9a2c3623d
parent: ea0174a7137c8ca9f130ca681f3a99c872da6778 (diff)
ext3: Wait for proper transaction commit on fsync
We cannot rely on buffer dirty bits during fsync because pdflush can run before fsync is called and clear the dirty bits without forcing a transaction commit. Instead, we track which transaction last changed the inode and which transaction last changed its block allocation, and force the appropriate one to disk on fsync.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
-rw-r--r-- fs/ext3/fsync.c 36
-rw-r--r-- fs/ext3/inode.c 32
-rw-r--r-- fs/ext3/super.c 2
-rw-r--r-- include/linux/ext3_fs_i.h 8
4 files changed, 57 insertions, 21 deletions
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 451d166bbe93..8209f266e9ad 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -46,19 +46,21 @@
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = dentry->d_inode;
49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
49 int ret = 0; 51 int ret = 0;
52 tid_t commit_tid;
53
54 if (inode->i_sb->s_flags & MS_RDONLY)
55 return 0;
50 56
51 J_ASSERT(ext3_journal_current_handle() == NULL); 57 J_ASSERT(ext3_journal_current_handle() == NULL);
52 58
53 /* 59 /*
54 * data=writeback: 60 * data=writeback,ordered:
55 * The caller's filemap_fdatawrite()/wait will sync the data. 61 * The caller's filemap_fdatawrite()/wait will sync the data.
56 * sync_inode() will sync the metadata 62 * Metadata is in the journal, we wait for a proper transaction
57 * 63 * to commit here.
58 * data=ordered:
59 * The caller's filemap_fdatawrite() will write the data and
60 * sync_inode() will write the inode if it is dirty. Then the caller's
61 * filemap_fdatawait() will wait on the pages.
62 * 64 *
63 * data=journal: 65 * data=journal:
64 * filemap_fdatawrite won't do anything (the buffers are clean). 66 * filemap_fdatawrite won't do anything (the buffers are clean).
@@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
73 goto out; 75 goto out;
74 } 76 }
75 77
76 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 78 if (datasync)
77 goto flush; 79 commit_tid = atomic_read(&ei->i_datasync_tid);
80 else
81 commit_tid = atomic_read(&ei->i_sync_tid);
78 82
79 /* 83 if (log_start_commit(journal, commit_tid)) {
80 * The VFS has written the file data. If the inode is unaltered 84 log_wait_commit(journal, commit_tid);
81 * then we need not start a commit.
82 */
83 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
84 struct writeback_control wbc = {
85 .sync_mode = WB_SYNC_ALL,
86 .nr_to_write = 0, /* sys_fsync did this */
87 };
88 ret = sync_inode(inode, &wbc);
89 goto out; 85 goto out;
90 } 86 }
91flush: 87
92 /* 88 /*
93 * In case we didn't commit a transaction, we have to flush 89 * In case we didn't commit a transaction, we have to flush
94 * disk caches manually so that data really is on persistent 90 * disk caches manually so that data really is on persistent
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 069a163393b4..354ed3b47b30 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
699 int err = 0; 699 int err = 0;
700 struct ext3_block_alloc_info *block_i; 700 struct ext3_block_alloc_info *block_i;
701 ext3_fsblk_t current_block; 701 ext3_fsblk_t current_block;
702 struct ext3_inode_info *ei = EXT3_I(inode);
702 703
703 block_i = EXT3_I(inode)->i_block_alloc_info; 704 block_i = ei->i_block_alloc_info;
704 /* 705 /*
705 * If we're splicing into a [td]indirect block (as opposed to the 706 * If we're splicing into a [td]indirect block (as opposed to the
706 * inode) then we need to get write access to the [td]indirect block 707 * inode) then we need to get write access to the [td]indirect block
@@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
741 742
742 inode->i_ctime = CURRENT_TIME_SEC; 743 inode->i_ctime = CURRENT_TIME_SEC;
743 ext3_mark_inode_dirty(handle, inode); 744 ext3_mark_inode_dirty(handle, inode);
745 /* ext3_mark_inode_dirty already updated i_sync_tid */
746 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
744 747
745 /* had we spliced it onto indirect block? */ 748 /* had we spliced it onto indirect block? */
746 if (where->bh) { 749 if (where->bh) {
@@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2754 struct ext3_inode_info *ei; 2757 struct ext3_inode_info *ei;
2755 struct buffer_head *bh; 2758 struct buffer_head *bh;
2756 struct inode *inode; 2759 struct inode *inode;
2760 journal_t *journal = EXT3_SB(sb)->s_journal;
2761 transaction_t *transaction;
2757 long ret; 2762 long ret;
2758 int block; 2763 int block;
2759 2764
@@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2831 ei->i_data[block] = raw_inode->i_block[block]; 2836 ei->i_data[block] = raw_inode->i_block[block];
2832 INIT_LIST_HEAD(&ei->i_orphan); 2837 INIT_LIST_HEAD(&ei->i_orphan);
2833 2838
2839 /*
2840 * Set transaction id's of transactions that have to be committed
2841 * to finish f[data]sync. We set them to currently running transaction
2842 * as we cannot be sure that the inode or some of its metadata isn't
2843 * part of the transaction - the inode could have been reclaimed and
2844 * now it is reread from disk.
2845 */
2846 if (journal) {
2847 tid_t tid;
2848
2849 spin_lock(&journal->j_state_lock);
2850 if (journal->j_running_transaction)
2851 transaction = journal->j_running_transaction;
2852 else
2853 transaction = journal->j_committing_transaction;
2854 if (transaction)
2855 tid = transaction->t_tid;
2856 else
2857 tid = journal->j_commit_sequence;
2858 spin_unlock(&journal->j_state_lock);
2859 atomic_set(&ei->i_sync_tid, tid);
2860 atomic_set(&ei->i_datasync_tid, tid);
2861 }
2862
2834 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && 2863 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2835 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { 2864 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2836 /* 2865 /*
@@ -3015,6 +3044,7 @@ again:
3015 err = rc; 3044 err = rc;
3016 ei->i_state &= ~EXT3_STATE_NEW; 3045 ei->i_state &= ~EXT3_STATE_NEW;
3017 3046
3047 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3018out_brelse: 3048out_brelse:
3019 brelse (bh); 3049 brelse (bh);
3020 ext3_std_error(inode->i_sb, err); 3050 ext3_std_error(inode->i_sb, err);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7a520a862f49..427496c4767c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
466 return NULL; 466 return NULL;
467 ei->i_block_alloc_info = NULL; 467 ei->i_block_alloc_info = NULL;
468 ei->vfs_inode.i_version = 1; 468 ei->vfs_inode.i_version = 1;
469 atomic_set(&ei->i_datasync_tid, 0);
470 atomic_set(&ei->i_sync_tid, 0);
469 return &ei->vfs_inode; 471 return &ei->vfs_inode;
470} 472}
471 473
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index ca1bfe90004f..93e7428156ba 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -137,6 +137,14 @@ struct ext3_inode_info {
137 * by other means, so we have truncate_mutex. 137 * by other means, so we have truncate_mutex.
138 */ 138 */
139 struct mutex truncate_mutex; 139 struct mutex truncate_mutex;
140
141 /*
142 * Transactions that contain inode's metadata needed to complete
143 * fsync and fdatasync, respectively.
144 */
145 atomic_t i_sync_tid;
146 atomic_t i_datasync_tid;
147
140 struct inode vfs_inode; 148 struct inode vfs_inode;
141}; 149};
142 150