author		Jan Kara <jack@suse.com>	2015-12-07 14:28:03 -0500
committer	Theodore Ts'o <tytso@mit.edu>	2015-12-07 14:28:03 -0500
commit		ea3d7209ca01da209cda6f0dea8be9cc4b7a933b
tree		809b37322befdf8dda2d12b991d1c832241bc8bc /fs/ext4/inode.c
parent		f41683a204ea61568f0fd0804d47c19561f2ee39
ext4: fix races between page faults and hole punching
Currently, page faults and hole punching are completely unsynchronized.
This can result in a page fault faulting a page into a range that we
are punching after truncate_pagecache_range() has been called, and thus
we can end up with a page mapped to disk blocks that will shortly be
freed. Filesystem corruption will shortly follow. Note that the same
race is avoided for truncate by checking the page fault offset against
i_size, but there isn't a similar mechanism available for punching holes.
Fix the problem by creating a new rw semaphore, i_mmap_sem, in the inode
and grabbing it for writing over truncate, hole punching, and other
functions removing blocks from the extent tree, and for reading over page
faults. We cannot easily use i_data_sem for this since that ranks below
transaction start and we need something ranking above it so that it can
be held over the whole truncate / hole punching operation. Also remove
various workarounds we had in the code to reduce the race window in which
a page fault could have created pages with stale mapping information.
Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
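
In other words, hole punching becomes a writer of the new i_mmap_sem while every fault path becomes a reader, and the semaphore ranks above transaction start so it can be held across the whole journalled operation. A minimal sketch of that locking pattern, abstracted from the patch below (the locking calls and helpers are the real kernel/ext4 ones, but the function bodies are illustrative placeholders, not the actual ext4 code):

	/*
	 * Sketch of the i_mmap_sem protocol described above.
	 */
	#include <linux/fs.h>
	#include <linux/mm.h>
	#include "ext4.h"

	static void punch_range(struct inode *inode, loff_t first, loff_t last)
	{
		/*
		 * Writer side: block page faults while pages are dropped and
		 * the underlying blocks are freed. i_mmap_sem ranks above
		 * transaction start, so it can stay held across the whole
		 * journalled operation.
		 */
		down_write(&EXT4_I(inode)->i_mmap_sem);
		truncate_pagecache_range(inode, first, last);
		/* ... remove the blocks from the extent tree under a handle ... */
		up_write(&EXT4_I(inode)->i_mmap_sem);
	}

	static int fault_in_range(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);
		int err;

		/*
		 * Reader side: faults run concurrently with each other but are
		 * excluded from truncate / hole punch.
		 */
		down_read(&EXT4_I(inode)->i_mmap_sem);
		err = filemap_fault(vma, vmf);
		up_read(&EXT4_I(inode)->i_mmap_sem);
		return err;
	}
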
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--	fs/ext4/inode.c	36
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ea433a7f4bca..d1207d03c961 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
 	}
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
@@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 		truncate_pagecache_range(inode, first_block_offset,
 					 last_block_offset);
 
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		credits = ext4_writepage_trans_blocks(inode);
 	else
@@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
-	/* Now release the pages again to reduce race window */
-	if (last_block_offset > first_block_offset)
-		truncate_pagecache_range(inode, first_block_offset,
-					 last_block_offset);
-
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
 	ext4_journal_stop(handle);
 out_dio:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
@@ -4823,6 +4824,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		} else
 			ext4_wait_for_tail_page_commit(inode);
 		}
+		down_write(&EXT4_I(inode)->i_mmap_sem);
 		/*
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
@@ -4830,6 +4832,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		truncate_pagecache(inode, inode->i_size);
 		if (shrink)
 			ext4_truncate(inode);
+		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
 	if (!rc) {
@@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
+
+	down_read(&EXT4_I(inode)->i_mmap_sem);
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
@@ -5347,6 +5352,19 @@ retry_alloc:
 out_ret:
 	ret = block_page_mkwrite_return(ret);
 out:
+	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	int err;
+
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	err = filemap_fault(vma, vmf);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+
+	return err;
+}