aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2018-10-05 21:44:39 -0400
committerDave Chinner <david@fromorbit.com>2018-10-05 21:44:39 -0400
commitb39989009bdb84992915c9869f58094ed5becf10 (patch)
tree574f76831c2cd648c830512c18dabba768ac96eb
parentdceeb47b0ed65e14de53507a8a9c32a90831cfa1 (diff)
xfs: fix data corruption w/ unaligned reflink ranges
When reflinking sub-file ranges, a data corruption can occur when the source file range includes a partial EOF block. This shares the unknown data beyond EOF into the second file at a position inside EOF, exposing stale data in the second file. XFS only supports whole block sharing, but we still need to support whole file reflink correctly. Hence if the reflink request includes the last block of the source file, only proceed with the reflink operation if it lands at or past the destination file's current EOF. If it lands within the destination file EOF, reject the entire request with -EINVAL and make the caller go the hard way. This avoids the data corruption vector, but also avoids disruption of returning EINVAL to userspace for the common case of whole file cloning. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r--fs/xfs/xfs_reflink.c47
1 files changed, 34 insertions, 13 deletions
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f889398e25d6..42ea7bab9144 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1262,22 +1262,32 @@ xfs_reflink_zero_posteof(
1262 1262
1263/* 1263/*
1264 * Prepare two files for range cloning. Upon a successful return both inodes 1264 * Prepare two files for range cloning. Upon a successful return both inodes
1265 * will have the iolock and mmaplock held, the page cache of the out file 1265 * will have the iolock and mmaplock held, the page cache of the out file will
1266 * will be truncated, and any leases on the out file will have been broken. 1266 * be truncated, and any leases on the out file will have been broken. This
1267 * This function borrows heavily from xfs_file_aio_write_checks. 1267 * function borrows heavily from xfs_file_aio_write_checks.
1268 * 1268 *
1269 * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't 1269 * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
1270 * checked that the bytes beyond EOF physically match. Hence we cannot use the 1270 * checked that the bytes beyond EOF physically match. Hence we cannot use the
1271 * EOF block in the source dedupe range because it's not a complete block match, 1271 * EOF block in the source dedupe range because it's not a complete block match,
1272 * hence can introduce a corruption into the file that has its 1272 * hence can introduce a corruption into the file that has its block replaced.
1273 * block replaced.
1274 * 1273 *
1275 * Despite this issue, we still need to report that range as successfully 1274 * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
1276 * deduped to avoid confusing userspace with EINVAL errors on completely 1275 * "block aligned" for the purposes of cloning entire files. However, if the
1277 * matching file data. The only time that an unaligned length will be passed to 1276 * source file range includes the EOF block and it lands within the existing EOF
1278 * us is when it spans the EOF block of the source file, so if we simply mask it 1277 * of the destination file, then we can expose stale data from beyond the source
1279 * down to be block aligned here the we will dedupe everything but that partial 1278 * file EOF in the destination file.
1280 * EOF block. 1279 *
1280 * XFS doesn't support partial block sharing, so in both cases we have to check
1281 * these cases ourselves. For dedupe, we can simply round the length to dedupe
1282 * down to the previous whole block and ignore the partial EOF block. While this
1283 * means we can't dedupe the last block of a file, this is an acceptable
1284 * tradeoff for simplicity of implementation.
1285 *
1286 * For cloning, we want to share the partial EOF block if it is also the new EOF
1287 * block of the destination file. If the partial EOF block lies inside the
1288 * existing destination EOF, then we have to abort the clone to avoid exposing
1289 * stale data in the destination file. Hence we reject these clone attempts with
1290 * -EINVAL in this case.
1281 */ 1291 */
1282STATIC int 1292STATIC int
1283xfs_reflink_remap_prep( 1293xfs_reflink_remap_prep(
@@ -1293,6 +1303,7 @@ xfs_reflink_remap_prep(
1293 struct inode *inode_out = file_inode(file_out); 1303 struct inode *inode_out = file_inode(file_out);
1294 struct xfs_inode *dest = XFS_I(inode_out); 1304 struct xfs_inode *dest = XFS_I(inode_out);
1295 bool same_inode = (inode_in == inode_out); 1305 bool same_inode = (inode_in == inode_out);
1306 u64 blkmask = i_blocksize(inode_in) - 1;
1296 ssize_t ret; 1307 ssize_t ret;
1297 1308
1298 /* Lock both files against IO */ 1309 /* Lock both files against IO */
@@ -1325,8 +1336,18 @@ xfs_reflink_remap_prep(
1325 * from the source file so we don't try to dedupe the partial 1336 * from the source file so we don't try to dedupe the partial
1326 * EOF block. 1337 * EOF block.
1327 */ 1338 */
1328 if (is_dedupe) 1339 if (is_dedupe) {
1329 *len &= ~((u64)i_blocksize(inode_in) - 1); 1340 *len &= ~blkmask;
1341 } else if (*len & blkmask) {
1342 /*
1343 * The user is attempting to share a partial EOF block,
1344 * if it's inside the destination EOF then reject it.
1345 */
1346 if (pos_out + *len < i_size_read(inode_out)) {
1347 ret = -EINVAL;
1348 goto out_unlock;
1349 }
1350 }
1330 1351
1331 /* Attach dquots to dest inode before changing block map */ 1352 /* Attach dquots to dest inode before changing block map */
1332 ret = xfs_qm_dqattach(dest); 1353 ret = xfs_qm_dqattach(dest);