summaryrefslogtreecommitdiffstats
path: root/fs/read_write.c
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2018-10-29 19:40:55 -0400
committerDave Chinner <david@fromorbit.com>2018-10-29 19:40:55 -0400
commit07d19dc9fbe9128378b9e226abe886fd8fd473df (patch)
treeabda6be0e0233dfd2a798546233174085d82c1ff /fs/read_write.c
parent9fd91a90cb9837372af24a804853e15c11aed93e (diff)
vfs: avoid problematic remapping requests into partial EOF block
A deduplication data corruption is exposed in XFS and btrfs. It is caused by extending the block match range to include the partial EOF block, but then allowing unknown data beyond EOF to be considered a "match" to data in the destination file because the comparison is only made to the end of the source file. This corrupts the destination file when the source extent is shared with it. The VFS remapping prep functions only support whole block dedupe, but we still need to appear to support whole file dedupe correctly. Hence if the dedupe request includes the last block of the source file, don't include it in the actual dedupe operation. If the rest of the range dedupes successfully, then reject the entire request. A subsequent patch will enable us to shorten dedupe requests correctly. When reflinking sub-file ranges, a data corruption can occur when the source file range includes a partial EOF block. This shares the unknown data beyond EOF into the second file at a position inside EOF, exposing stale data in the second file. If the reflink request includes the last block of the source file, only proceed with the reflink operation if it lands at or past the destination file's current EOF. If it lands within the destination file EOF, reject the entire request with -EINVAL and make the caller go the hard way. A subsequent patch will enable us to shorten reflink requests correctly. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs/read_write.c')
-rw-r--r--fs/read_write.c33
1 files changed, 33 insertions, 0 deletions
diff --git a/fs/read_write.c b/fs/read_write.c
index 2456da3f8a41..0f0a6efdd502 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1708,6 +1708,34 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1708 1708
1709 return security_file_permission(file, write ? MAY_WRITE : MAY_READ); 1709 return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1710} 1710}
1711/*
1712 * Ensure that we don't remap a partial EOF block in the middle of something
1713 * else. Assume that the offsets have already been checked for block
1714 * alignment.
1715 *
1716 * For deduplication we always scale down to the previous block because we
1717 * can't meaningfully compare post-EOF contents.
1718 *
1719 * For clone we only link a partial EOF block above the destination file's EOF.
1720 */
1721static int generic_remap_check_len(struct inode *inode_in,
1722 struct inode *inode_out,
1723 loff_t pos_out,
1724 u64 *len,
1725 bool is_dedupe)
1726{
1727 u64 blkmask = i_blocksize(inode_in) - 1; /* low-bit mask of the source block size; presumes a power-of-two block size - TODO confirm */
1728 /* A length that is a whole multiple of the source block size needs no adjustment. */
1729 if ((*len & blkmask) == 0)
1730 return 0;
1731 /* Unaligned length: dedupe rounds *len down in place (possibly to 0); clone leaves *len untouched. */
1732 if (is_dedupe)
1733 *len &= ~blkmask;
1734 else if (pos_out + *len < i_size_read(inode_out)) /* clone range would end below the destination's current size */
1735 return -EINVAL;
1736 /* Returns 0 here: either *len was rounded down for dedupe, or the clone ends at or past the destination's size. */
1737 return 0;
1738}
1711 1739
1712/* 1740/*
1713 * Check that the two inodes are eligible for cloning, the ranges make 1741 * Check that the two inodes are eligible for cloning, the ranges make
@@ -1787,6 +1815,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
1787 return -EBADE; 1815 return -EBADE;
1788 } 1816 }
1789 1817
1818 ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
1819 is_dedupe);
1820 if (ret)
1821 return ret;
1822
1790 return 1; 1823 return 1;
1791} 1824}
1792EXPORT_SYMBOL(vfs_clone_file_prep); 1825EXPORT_SYMBOL(vfs_clone_file_prep);