aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
author Jan Kara <jack@suse.com> 2015-12-07 14:28:03 -0500
committer Theodore Ts'o <tytso@mit.edu> 2015-12-07 14:28:03 -0500
commit ea3d7209ca01da209cda6f0dea8be9cc4b7a933b (patch)
tree 809b37322befdf8dda2d12b991d1c832241bc8bc /fs/ext4/inode.c
parent f41683a204ea61568f0fd0804d47c19561f2ee39 (diff)
ext4: fix races between page faults and hole punching
Currently, page faults and hole punching are completely unsynchronized. This can result in a page fault faulting a page into a range that we are punching after truncate_pagecache_range() has been called, and thus we can end up with a page mapped to disk blocks that will shortly be freed. Filesystem corruption will shortly follow. Note that the same race is avoided for truncate by checking the page fault offset against i_size, but there isn't a similar mechanism available for punching holes. Fix the problem by creating a new rw semaphore, i_mmap_sem, in the inode and grabbing it for writing over truncate, hole punching, and other functions removing blocks from the extent tree, and for reading over page faults. We cannot easily use i_data_sem for this since that ranks below transaction start and we need something ranking above it so that it can be held over the whole truncate / hole punching operation. Also remove various workarounds we had in the code to reduce the race window when a page fault could have created pages with stale mapping information. Signed-off-by: Jan Kara <jack@suse.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c36
1 files changed, 27 insertions, 9 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ea433a7f4bca..d1207d03c961 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3623 3623
3624 } 3624 }
3625 3625
3626 /* Wait all existing dio workers, newcomers will block on i_mutex */
3627 ext4_inode_block_unlocked_dio(inode);
3628 inode_dio_wait(inode);
3629
3630 /*
3631 * Prevent page faults from reinstantiating pages we have released from
3632 * page cache.
3633 */
3634 down_write(&EXT4_I(inode)->i_mmap_sem);
3626 first_block_offset = round_up(offset, sb->s_blocksize); 3635 first_block_offset = round_up(offset, sb->s_blocksize);
3627 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 3636 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3628 3637
@@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3631 truncate_pagecache_range(inode, first_block_offset, 3640 truncate_pagecache_range(inode, first_block_offset,
3632 last_block_offset); 3641 last_block_offset);
3633 3642
3634 /* Wait all existing dio workers, newcomers will block on i_mutex */
3635 ext4_inode_block_unlocked_dio(inode);
3636 inode_dio_wait(inode);
3637
3638 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3643 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3639 credits = ext4_writepage_trans_blocks(inode); 3644 credits = ext4_writepage_trans_blocks(inode);
3640 else 3645 else
@@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3680 if (IS_SYNC(inode)) 3685 if (IS_SYNC(inode))
3681 ext4_handle_sync(handle); 3686 ext4_handle_sync(handle);
3682 3687
3683 /* Now release the pages again to reduce race window */
3684 if (last_block_offset > first_block_offset)
3685 truncate_pagecache_range(inode, first_block_offset,
3686 last_block_offset);
3687
3688 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3688 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3689 ext4_mark_inode_dirty(handle, inode); 3689 ext4_mark_inode_dirty(handle, inode);
3690out_stop: 3690out_stop:
3691 ext4_journal_stop(handle); 3691 ext4_journal_stop(handle);
3692out_dio: 3692out_dio:
3693 up_write(&EXT4_I(inode)->i_mmap_sem);
3693 ext4_inode_resume_unlocked_dio(inode); 3694 ext4_inode_resume_unlocked_dio(inode);
3694out_mutex: 3695out_mutex:
3695 mutex_unlock(&inode->i_mutex); 3696 mutex_unlock(&inode->i_mutex);
@@ -4823,6 +4824,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4823 } else 4824 } else
4824 ext4_wait_for_tail_page_commit(inode); 4825 ext4_wait_for_tail_page_commit(inode);
4825 } 4826 }
4827 down_write(&EXT4_I(inode)->i_mmap_sem);
4826 /* 4828 /*
4827 * Truncate pagecache after we've waited for commit 4829 * Truncate pagecache after we've waited for commit
4828 * in data=journal mode to make pages freeable. 4830 * in data=journal mode to make pages freeable.
@@ -4830,6 +4832,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4830 truncate_pagecache(inode, inode->i_size); 4832 truncate_pagecache(inode, inode->i_size);
4831 if (shrink) 4833 if (shrink)
4832 ext4_truncate(inode); 4834 ext4_truncate(inode);
4835 up_write(&EXT4_I(inode)->i_mmap_sem);
4833 } 4836 }
4834 4837
4835 if (!rc) { 4838 if (!rc) {
@@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5278 5281
5279 sb_start_pagefault(inode->i_sb); 5282 sb_start_pagefault(inode->i_sb);
5280 file_update_time(vma->vm_file); 5283 file_update_time(vma->vm_file);
5284
5285 down_read(&EXT4_I(inode)->i_mmap_sem);
5281 /* Delalloc case is easy... */ 5286 /* Delalloc case is easy... */
5282 if (test_opt(inode->i_sb, DELALLOC) && 5287 if (test_opt(inode->i_sb, DELALLOC) &&
5283 !ext4_should_journal_data(inode) && 5288 !ext4_should_journal_data(inode) &&
@@ -5347,6 +5352,19 @@ retry_alloc:
5347out_ret: 5352out_ret:
5348 ret = block_page_mkwrite_return(ret); 5353 ret = block_page_mkwrite_return(ret);
5349out: 5354out:
5355 up_read(&EXT4_I(inode)->i_mmap_sem);
5350 sb_end_pagefault(inode->i_sb); 5356 sb_end_pagefault(inode->i_sb);
5351 return ret; 5357 return ret;
5352} 5358}
5359
/*
 * ext4_filemap_fault - fault handler that wraps filemap_fault() in i_mmap_sem.
 *
 * Takes the inode's i_mmap_sem for reading around the call to
 * filemap_fault(), then returns whatever filemap_fault() returned.
 * Elsewhere in this diff, ext4_punch_hole() and ext4_setattr() take the
 * same semaphore for writing around their page cache truncation, so a
 * fault through this handler cannot run while they hold it.
 */
5360int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
5361{
5362 struct inode *inode = file_inode(vma->vm_file);
5363 int err;
5364
5365 down_read(&EXT4_I(inode)->i_mmap_sem);
5366 err = filemap_fault(vma, vmf);
5367 up_read(&EXT4_I(inode)->i_mmap_sem);
5368
5369 return err;
5370}