diff options
author | Dmitry Monakhov <dmonakhov@openvz.org> | 2012-10-05 11:31:55 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2012-10-05 11:31:55 -0400 |
commit | c278531d39f3158bfee93dc67da0b77e09776de2 (patch) | |
tree | b83341e04d54b3f1cd8171f43ec77bbfba06e571 /fs | |
parent | 041bbb6d369811e948ae01f3d00414264076be35 (diff) |
ext4: fix ext4_flush_completed_IO wait semantics
BUG #1) All places where we call ext4_flush_completed_IO are broken
because buffered io and DIO/AIO goes through three stages
1) submitted io,
2) completed io (in i_completed_io_list) conversion pended
3) finished io (conversion done)
And by calling ext4_flush_completed_IO we will flush only
requests which were in (2) stage, which is wrong because:
1) punch_hole and truncate _must_ wait for all outstanding unwritten io
regardless to it's state.
2) fsync and nolock_dio_read should also wait because there is
a time window between end_page_writeback() and ext4_add_complete_io()
As result integrity fsync is broken in case of buffered write
to fallocated region:
fsync blkdev_completion
->filemap_write_and_wait_range
->ext4_end_bio
->end_page_writeback
<-- filemap_write_and_wait_range return
->ext4_flush_completed_IO
sees empty i_completed_io_list but pended
conversion still exist
->ext4_add_complete_io
BUG #2) Race window becomes wider due to the 'ext4: completed_io
locking cleanup V4' patch series
This patch make following changes:
1) ext4_flush_completed_io() now first try to flush completed io and when
wait for any outstanding unwritten io via ext4_unwritten_wait()
2) Rename function to more appropriate name.
3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to
prevent endless wait
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/ext4.h | 3 | ||||
-rw-r--r-- | fs/ext4/extents.c | 6 | ||||
-rw-r--r-- | fs/ext4/file.c | 2 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 2 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 8 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 11 |
6 files changed, 19 insertions, 13 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1be2b4472a83..3ab2539b7b2e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -1947,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); | |||
1947 | 1947 | ||
1948 | /* fsync.c */ | 1948 | /* fsync.c */ |
1949 | extern int ext4_sync_file(struct file *, loff_t, loff_t, int); | 1949 | extern int ext4_sync_file(struct file *, loff_t, loff_t, int); |
1950 | extern int ext4_flush_completed_IO(struct inode *); | 1950 | extern int ext4_flush_unwritten_io(struct inode *); |
1951 | 1951 | ||
1952 | /* hash.c */ | 1952 | /* hash.c */ |
1953 | extern int ext4fs_dirhash(const char *name, int len, struct | 1953 | extern int ext4fs_dirhash(const char *name, int len, struct |
@@ -2371,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations; | |||
2371 | extern const struct inode_operations ext4_file_inode_operations; | 2371 | extern const struct inode_operations ext4_file_inode_operations; |
2372 | extern const struct file_operations ext4_file_operations; | 2372 | extern const struct file_operations ext4_file_operations; |
2373 | extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); | 2373 | extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); |
2374 | extern void ext4_unwritten_wait(struct inode *inode); | ||
2374 | 2375 | ||
2375 | /* namei.c */ | 2376 | /* namei.c */ |
2376 | extern const struct inode_operations ext4_dir_inode_operations; | 2377 | extern const struct inode_operations ext4_dir_inode_operations; |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c1fcf489e056..1c94cca35ed1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -4268,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
4268 | * finish any pending end_io work so we won't run the risk of | 4268 | * finish any pending end_io work so we won't run the risk of |
4269 | * converting any truncated blocks to initialized later | 4269 | * converting any truncated blocks to initialized later |
4270 | */ | 4270 | */ |
4271 | ext4_flush_completed_IO(inode); | 4271 | ext4_flush_unwritten_io(inode); |
4272 | 4272 | ||
4273 | /* | 4273 | /* |
4274 | * probably first extent we're gonna free will be last in block | 4274 | * probably first extent we're gonna free will be last in block |
@@ -4847,10 +4847,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4847 | 4847 | ||
4848 | /* Wait all existing dio workers, newcomers will block on i_mutex */ | 4848 | /* Wait all existing dio workers, newcomers will block on i_mutex */ |
4849 | ext4_inode_block_unlocked_dio(inode); | 4849 | ext4_inode_block_unlocked_dio(inode); |
4850 | inode_dio_wait(inode); | 4850 | err = ext4_flush_unwritten_io(inode); |
4851 | err = ext4_flush_completed_IO(inode); | ||
4852 | if (err) | 4851 | if (err) |
4853 | goto out_dio; | 4852 | goto out_dio; |
4853 | inode_dio_wait(inode); | ||
4854 | 4854 | ||
4855 | credits = ext4_writepage_trans_blocks(inode); | 4855 | credits = ext4_writepage_trans_blocks(inode); |
4856 | handle = ext4_journal_start(inode, credits); | 4856 | handle = ext4_journal_start(inode, credits); |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 39335bda404b..ca6f07afe601 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -55,7 +55,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) | |||
55 | return 0; | 55 | return 0; |
56 | } | 56 | } |
57 | 57 | ||
58 | static void ext4_unwritten_wait(struct inode *inode) | 58 | void ext4_unwritten_wait(struct inode *inode) |
59 | { | 59 | { |
60 | wait_queue_head_t *wq = ext4_ioend_wq(inode); | 60 | wait_queue_head_t *wq = ext4_ioend_wq(inode); |
61 | 61 | ||
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 460000868b8e..be1d89f385b4 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -138,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
138 | if (inode->i_sb->s_flags & MS_RDONLY) | 138 | if (inode->i_sb->s_flags & MS_RDONLY) |
139 | goto out; | 139 | goto out; |
140 | 140 | ||
141 | ret = ext4_flush_completed_IO(inode); | 141 | ret = ext4_flush_unwritten_io(inode); |
142 | if (ret < 0) | 142 | if (ret < 0) |
143 | goto out; | 143 | goto out; |
144 | 144 | ||
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 8d849dae8428..792e388e7b44 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -807,9 +807,11 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
807 | 807 | ||
808 | retry: | 808 | retry: |
809 | if (rw == READ && ext4_should_dioread_nolock(inode)) { | 809 | if (rw == READ && ext4_should_dioread_nolock(inode)) { |
810 | if (unlikely(!list_empty(&ei->i_completed_io_list))) | 810 | if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { |
811 | ext4_flush_completed_IO(inode); | 811 | mutex_lock(&inode->i_mutex); |
812 | 812 | ext4_flush_unwritten_io(inode); | |
813 | mutex_unlock(&inode->i_mutex); | ||
814 | } | ||
813 | /* | 815 | /* |
814 | * Nolock dioread optimization may be dynamically disabled | 816 | * Nolock dioread optimization may be dynamically disabled |
815 | * via ext4_inode_block_unlocked_dio(). Check inode's state | 817 | * via ext4_inode_block_unlocked_dio(). Check inode's state |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5b24c407701b..68e896e12a67 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -189,8 +189,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode, | |||
189 | 189 | ||
190 | list_add_tail(&io->list, &complete); | 190 | list_add_tail(&io->list, &complete); |
191 | } | 191 | } |
192 | /* It is important to update all flags for all end_io in one shot w/o | ||
193 | * dropping the lock.*/ | ||
194 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 192 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
195 | while (!list_empty(&complete)) { | 193 | while (!list_empty(&complete)) { |
196 | io = list_entry(complete.next, ext4_io_end_t, list); | 194 | io = list_entry(complete.next, ext4_io_end_t, list); |
@@ -228,9 +226,14 @@ static void ext4_end_io_work(struct work_struct *work) | |||
228 | ext4_do_flush_completed_IO(io->inode, io); | 226 | ext4_do_flush_completed_IO(io->inode, io); |
229 | } | 227 | } |
230 | 228 | ||
231 | int ext4_flush_completed_IO(struct inode *inode) | 229 | int ext4_flush_unwritten_io(struct inode *inode) |
232 | { | 230 | { |
233 | return ext4_do_flush_completed_IO(inode, NULL); | 231 | int ret; |
232 | WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && | ||
233 | !(inode->i_state & I_FREEING)); | ||
234 | ret = ext4_do_flush_completed_IO(inode, NULL); | ||
235 | ext4_unwritten_wait(inode); | ||
236 | return ret; | ||
234 | } | 237 | } |
235 | 238 | ||
236 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | 239 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) |