diff options
author | Jan Kara <jack@suse.cz> | 2009-12-08 23:51:10 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-12-08 23:51:10 -0500 |
commit | b436b9bef84de6893e86346d8fbf7104bc520645 (patch) | |
tree | 50fb9ae167bcd622e9adf47646bcf3b4c7dd111d | |
parent | 194074acacebc169ded90a4657193f5180015051 (diff) |
ext4: Wait for proper transaction commit on fsync
We cannot rely on buffer dirty bits during fsync because pdflush can run
before fsync is called and clear the dirty bits without forcing a transaction
commit. Instead, we track which transaction last changed the inode and which
transaction last changed its block allocation, and on fsync (or fdatasync) we
force the relevant one to disk.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- | fs/ext4/ext4.h | 7 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 13 | ||||
-rw-r--r-- | fs/ext4/extents.c | 14 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 46 | ||||
-rw-r--r-- | fs/ext4/inode.c | 29 | ||||
-rw-r--r-- | fs/ext4/super.c | 2 |
6 files changed, 80 insertions, 31 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cfc2f0edb3f..ab31e65d46d0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -709,6 +709,13 @@ struct ext4_inode_info { | |||
709 | struct list_head i_aio_dio_complete_list; | 709 | struct list_head i_aio_dio_complete_list; |
710 | /* current io_end structure for async DIO write*/ | 710 | /* current io_end structure for async DIO write*/ |
711 | ext4_io_end_t *cur_aio_dio; | 711 | ext4_io_end_t *cur_aio_dio; |
712 | |||
713 | /* | ||
714 | * Transactions that contain inode's metadata needed to complete | ||
715 | * fsync and fdatasync, respectively. | ||
716 | */ | ||
717 | tid_t i_sync_tid; | ||
718 | tid_t i_datasync_tid; | ||
712 | }; | 719 | }; |
713 | 720 | ||
714 | /* | 721 | /* |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2c2b262bd31b..05eca817d704 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
@@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) | |||
249 | return 0; | 249 | return 0; |
250 | } | 250 | } |
251 | 251 | ||
252 | static inline void ext4_update_inode_fsync_trans(handle_t *handle, | ||
253 | struct inode *inode, | ||
254 | int datasync) | ||
255 | { | ||
256 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
257 | |||
258 | if (ext4_handle_valid(handle)) { | ||
259 | ei->i_sync_tid = handle->h_transaction->t_tid; | ||
260 | if (datasync) | ||
261 | ei->i_datasync_tid = handle->h_transaction->t_tid; | ||
262 | } | ||
263 | } | ||
264 | |||
252 | /* super.c */ | 265 | /* super.c */ |
253 | int ext4_force_commit(struct super_block *sb); | 266 | int ext4_force_commit(struct super_block *sb); |
254 | 267 | ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5967f18fd7e7..700206e525da 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | |||
3058 | if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { | 3058 | if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { |
3059 | ret = ext4_convert_unwritten_extents_dio(handle, inode, | 3059 | ret = ext4_convert_unwritten_extents_dio(handle, inode, |
3060 | path); | 3060 | path); |
3061 | if (ret >= 0) | ||
3062 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
3061 | goto out2; | 3063 | goto out2; |
3062 | } | 3064 | } |
3063 | /* buffered IO case */ | 3065 | /* buffered IO case */ |
@@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | |||
3085 | ret = ext4_ext_convert_to_initialized(handle, inode, | 3087 | ret = ext4_ext_convert_to_initialized(handle, inode, |
3086 | path, iblock, | 3088 | path, iblock, |
3087 | max_blocks); | 3089 | max_blocks); |
3090 | if (ret >= 0) | ||
3091 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
3088 | out: | 3092 | out: |
3089 | if (ret <= 0) { | 3093 | if (ret <= 0) { |
3090 | err = ret; | 3094 | err = ret; |
@@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
3323 | allocated = ext4_ext_get_actual_len(&newex); | 3327 | allocated = ext4_ext_get_actual_len(&newex); |
3324 | set_buffer_new(bh_result); | 3328 | set_buffer_new(bh_result); |
3325 | 3329 | ||
3326 | /* Cache only when it is _not_ an uninitialized extent */ | 3330 | /* |
3327 | if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) | 3331 | * Cache the extent and update transaction to commit on fdatasync only |
3332 | * when it is _not_ an uninitialized extent. | ||
3333 | */ | ||
3334 | if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { | ||
3328 | ext4_ext_put_in_cache(inode, iblock, allocated, newblock, | 3335 | ext4_ext_put_in_cache(inode, iblock, allocated, newblock, |
3329 | EXT4_EXT_CACHE_EXTENT); | 3336 | EXT4_EXT_CACHE_EXTENT); |
3337 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
3338 | } else | ||
3339 | ext4_update_inode_fsync_trans(handle, inode, 0); | ||
3330 | out: | 3340 | out: |
3331 | if (allocated > max_blocks) | 3341 | if (allocated > max_blocks) |
3332 | allocated = max_blocks; | 3342 | allocated = max_blocks; |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a3c25076aef1..0b22497d92e1 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -51,25 +51,30 @@ | |||
51 | int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | 51 | int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) |
52 | { | 52 | { |
53 | struct inode *inode = dentry->d_inode; | 53 | struct inode *inode = dentry->d_inode; |
54 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
54 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 55 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
55 | int err, ret = 0; | 56 | int ret; |
57 | tid_t commit_tid; | ||
56 | 58 | ||
57 | J_ASSERT(ext4_journal_current_handle() == NULL); | 59 | J_ASSERT(ext4_journal_current_handle() == NULL); |
58 | 60 | ||
59 | trace_ext4_sync_file(file, dentry, datasync); | 61 | trace_ext4_sync_file(file, dentry, datasync); |
60 | 62 | ||
63 | if (inode->i_sb->s_flags & MS_RDONLY) | ||
64 | return 0; | ||
65 | |||
61 | ret = flush_aio_dio_completed_IO(inode); | 66 | ret = flush_aio_dio_completed_IO(inode); |
62 | if (ret < 0) | 67 | if (ret < 0) |
63 | return ret; | 68 | return ret; |
69 | |||
70 | if (!journal) | ||
71 | return simple_fsync(file, dentry, datasync); | ||
72 | |||
64 | /* | 73 | /* |
65 | * data=writeback: | 74 | * data=writeback,ordered: |
66 | * The caller's filemap_fdatawrite()/wait will sync the data. | 75 | * The caller's filemap_fdatawrite()/wait will sync the data. |
67 | * sync_inode() will sync the metadata | 76 | * Metadata is in the journal, we wait for proper transaction to |
68 | * | 77 | * commit here. |
69 | * data=ordered: | ||
70 | * The caller's filemap_fdatawrite() will write the data and | ||
71 | * sync_inode() will write the inode if it is dirty. Then the caller's | ||
72 | * filemap_fdatawait() will wait on the pages. | ||
73 | * | 78 | * |
74 | * data=journal: | 79 | * data=journal: |
75 | * filemap_fdatawrite won't do anything (the buffers are clean). | 80 | * filemap_fdatawrite won't do anything (the buffers are clean). |
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
82 | if (ext4_should_journal_data(inode)) | 87 | if (ext4_should_journal_data(inode)) |
83 | return ext4_force_commit(inode->i_sb); | 88 | return ext4_force_commit(inode->i_sb); |
84 | 89 | ||
85 | if (!journal) | 90 | commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; |
86 | ret = sync_mapping_buffers(inode->i_mapping); | 91 | if (jbd2_log_start_commit(journal, commit_tid)) |
87 | 92 | jbd2_log_wait_commit(journal, commit_tid); | |
88 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | 93 | else if (journal->j_flags & JBD2_BARRIER) |
89 | goto out; | ||
90 | |||
91 | /* | ||
92 | * The VFS has written the file data. If the inode is unaltered | ||
93 | * then we need not start a commit. | ||
94 | */ | ||
95 | if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { | ||
96 | struct writeback_control wbc = { | ||
97 | .sync_mode = WB_SYNC_ALL, | ||
98 | .nr_to_write = 0, /* sys_fsync did this */ | ||
99 | }; | ||
100 | err = sync_inode(inode, &wbc); | ||
101 | if (ret == 0) | ||
102 | ret = err; | ||
103 | } | ||
104 | out: | ||
105 | if (journal && (journal->j_flags & JBD2_BARRIER)) | ||
106 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 94 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); |
107 | return ret; | 95 | return ret; |
108 | } | 96 | } |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 958c3ff800e9..f1bc1e338828 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | |||
983 | goto cleanup; | 983 | goto cleanup; |
984 | 984 | ||
985 | set_buffer_new(bh_result); | 985 | set_buffer_new(bh_result); |
986 | |||
987 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
986 | got_it: | 988 | got_it: |
987 | map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); | 989 | map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); |
988 | if (count > blocks_to_boundary) | 990 | if (count > blocks_to_boundary) |
@@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4738 | struct ext4_inode *raw_inode; | 4740 | struct ext4_inode *raw_inode; |
4739 | struct ext4_inode_info *ei; | 4741 | struct ext4_inode_info *ei; |
4740 | struct inode *inode; | 4742 | struct inode *inode; |
4743 | journal_t *journal = EXT4_SB(sb)->s_journal; | ||
4741 | long ret; | 4744 | long ret; |
4742 | int block; | 4745 | int block; |
4743 | 4746 | ||
@@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4802 | ei->i_data[block] = raw_inode->i_block[block]; | 4805 | ei->i_data[block] = raw_inode->i_block[block]; |
4803 | INIT_LIST_HEAD(&ei->i_orphan); | 4806 | INIT_LIST_HEAD(&ei->i_orphan); |
4804 | 4807 | ||
4808 | /* | ||
4809 | * Set transaction id's of transactions that have to be committed | ||
4810 | * to finish f[data]sync. We set them to currently running transaction | ||
4811 | * as we cannot be sure that the inode or some of its metadata isn't | ||
4812 | * part of the transaction - the inode could have been reclaimed and | ||
4813 | * now it is reread from disk. | ||
4814 | */ | ||
4815 | if (journal) { | ||
4816 | transaction_t *transaction; | ||
4817 | tid_t tid; | ||
4818 | |||
4819 | spin_lock(&journal->j_state_lock); | ||
4820 | if (journal->j_running_transaction) | ||
4821 | transaction = journal->j_running_transaction; | ||
4822 | else | ||
4823 | transaction = journal->j_committing_transaction; | ||
4824 | if (transaction) | ||
4825 | tid = transaction->t_tid; | ||
4826 | else | ||
4827 | tid = journal->j_commit_sequence; | ||
4828 | spin_unlock(&journal->j_state_lock); | ||
4829 | ei->i_sync_tid = tid; | ||
4830 | ei->i_datasync_tid = tid; | ||
4831 | } | ||
4832 | |||
4805 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 4833 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
4806 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); | 4834 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); |
4807 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > | 4835 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > |
@@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle, | |||
5056 | err = rc; | 5084 | err = rc; |
5057 | ei->i_state &= ~EXT4_STATE_NEW; | 5085 | ei->i_state &= ~EXT4_STATE_NEW; |
5058 | 5086 | ||
5087 | ext4_update_inode_fsync_trans(handle, inode, 0); | ||
5059 | out_brelse: | 5088 | out_brelse: |
5060 | brelse(bh); | 5089 | brelse(bh); |
5061 | ext4_std_error(inode->i_sb, err); | 5090 | ext4_std_error(inode->i_sb, err); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8ab0c9518473..2b13dcfcf775 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
706 | spin_lock_init(&(ei->i_block_reservation_lock)); | 706 | spin_lock_init(&(ei->i_block_reservation_lock)); |
707 | INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); | 707 | INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); |
708 | ei->cur_aio_dio = NULL; | 708 | ei->cur_aio_dio = NULL; |
709 | ei->i_sync_tid = 0; | ||
710 | ei->i_datasync_tid = 0; | ||
709 | 711 | ||
710 | return &ei->vfs_inode; | 712 | return &ei->vfs_inode; |
711 | } | 713 | } |