diff options
| author | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2018-10-11 01:17:42 -0400 |
|---|---|---|
| committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2018-10-11 01:17:42 -0400 |
| commit | 4718dcad7decac3a43b7339b2226f3d987cca75c (patch) | |
| tree | 0b80ffa06cbc6d307203696bde1cbeb50c76dcd7 | |
| parent | b8db9e69dba97075d37d7bc20ef49f39298e3875 (diff) | |
| parent | b39989009bdb84992915c9869f58094ed5becf10 (diff) | |
Merge tag 'xfs-fixes-for-4.19-rc7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Dave writes:
"xfs: fixes for 4.19-rc7
Update for 4.19-rc7 to fix numerous file clone and deduplication issues."
* tag 'xfs-fixes-for-4.19-rc7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
xfs: fix data corruption w/ unaligned reflink ranges
xfs: fix data corruption w/ unaligned dedupe ranges
xfs: update ctime and remove suid before cloning files
xfs: zero posteof blocks when cloning above eof
xfs: refactor clonerange preparation into a separate helper
| -rw-r--r-- | fs/xfs/xfs_reflink.c | 200 |
1 files changed, 165 insertions, 35 deletions
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5289e22cb081..42ea7bab9144 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c | |||
| @@ -1220,35 +1220,92 @@ retry: | |||
| 1220 | return 0; | 1220 | return 0; |
| 1221 | } | 1221 | } |
| 1222 | 1222 | ||
| 1223 | /* Unlock both inodes after they've been prepped for a range clone. */ | ||
| 1224 | STATIC void | ||
| 1225 | xfs_reflink_remap_unlock( | ||
| 1226 | struct file *file_in, | ||
| 1227 | struct file *file_out) | ||
| 1228 | { | ||
| 1229 | struct inode *inode_in = file_inode(file_in); | ||
| 1230 | struct xfs_inode *src = XFS_I(inode_in); | ||
| 1231 | struct inode *inode_out = file_inode(file_out); | ||
| 1232 | struct xfs_inode *dest = XFS_I(inode_out); | ||
| 1233 | bool same_inode = (inode_in == inode_out); | ||
| 1234 | |||
| 1235 | xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); | ||
| 1236 | if (!same_inode) | ||
| 1237 | xfs_iunlock(src, XFS_MMAPLOCK_SHARED); | ||
| 1238 | inode_unlock(inode_out); | ||
| 1239 | if (!same_inode) | ||
| 1240 | inode_unlock_shared(inode_in); | ||
| 1241 | } | ||
| 1242 | |||
| 1223 | /* | 1243 | /* |
| 1224 | * Link a range of blocks from one file to another. | 1244 | * If we're reflinking to a point past the destination file's EOF, we must |
| 1245 | * zero any speculative post-EOF preallocations that sit between the old EOF | ||
| 1246 | * and the destination file offset. | ||
| 1225 | */ | 1247 | */ |
| 1226 | int | 1248 | static int |
| 1227 | xfs_reflink_remap_range( | 1249 | xfs_reflink_zero_posteof( |
| 1250 | struct xfs_inode *ip, | ||
| 1251 | loff_t pos) | ||
| 1252 | { | ||
| 1253 | loff_t isize = i_size_read(VFS_I(ip)); | ||
| 1254 | |||
| 1255 | if (pos <= isize) | ||
| 1256 | return 0; | ||
| 1257 | |||
| 1258 | trace_xfs_zero_eof(ip, isize, pos - isize); | ||
| 1259 | return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, | ||
| 1260 | &xfs_iomap_ops); | ||
| 1261 | } | ||
| 1262 | |||
| 1263 | /* | ||
| 1264 | * Prepare two files for range cloning. Upon a successful return both inodes | ||
| 1265 | * will have the iolock and mmaplock held, the page cache of the out file will | ||
| 1266 | * be truncated, and any leases on the out file will have been broken. This | ||
| 1267 | * function borrows heavily from xfs_file_aio_write_checks. | ||
| 1268 | * | ||
| 1269 | * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't | ||
| 1270 | * checked that the bytes beyond EOF physically match. Hence we cannot use the | ||
| 1271 | * EOF block in the source dedupe range because it's not a complete block match, | ||
| 1273 | * hence can introduce a corruption into the file that has its block replaced. | ||
| 1273 | * | ||
| 1274 | * In similar fashion, the VFS file cloning also allows partial EOF blocks to be | ||
| 1275 | * "block aligned" for the purposes of cloning entire files. However, if the | ||
| 1276 | * source file range includes the EOF block and it lands within the existing EOF | ||
| 1277 | * of the destination file, then we can expose stale data from beyond the source | ||
| 1278 | * file EOF in the destination file. | ||
| 1279 | * | ||
| 1280 | * XFS doesn't support partial block sharing, so in both cases we have to check | ||
| 1281 | * these cases ourselves. For dedupe, we can simply round the length to dedupe | ||
| 1282 | * down to the previous whole block and ignore the partial EOF block. While this | ||
| 1283 | * means we can't dedupe the last block of a file, this is an acceptable | ||
| 1284 | * tradeoff for simplicity of implementation. | ||
| 1285 | * | ||
| 1286 | * For cloning, we want to share the partial EOF block if it is also the new EOF | ||
| 1287 | * block of the destination file. If the partial EOF block lies inside the | ||
| 1288 | * existing destination EOF, then we have to abort the clone to avoid exposing | ||
| 1289 | * stale data in the destination file. Hence we reject these clone attempts with | ||
| 1290 | * -EINVAL in this case. | ||
| 1291 | */ | ||
| 1292 | STATIC int | ||
| 1293 | xfs_reflink_remap_prep( | ||
| 1228 | struct file *file_in, | 1294 | struct file *file_in, |
| 1229 | loff_t pos_in, | 1295 | loff_t pos_in, |
| 1230 | struct file *file_out, | 1296 | struct file *file_out, |
| 1231 | loff_t pos_out, | 1297 | loff_t pos_out, |
| 1232 | u64 len, | 1298 | u64 *len, |
| 1233 | bool is_dedupe) | 1299 | bool is_dedupe) |
| 1234 | { | 1300 | { |
| 1235 | struct inode *inode_in = file_inode(file_in); | 1301 | struct inode *inode_in = file_inode(file_in); |
| 1236 | struct xfs_inode *src = XFS_I(inode_in); | 1302 | struct xfs_inode *src = XFS_I(inode_in); |
| 1237 | struct inode *inode_out = file_inode(file_out); | 1303 | struct inode *inode_out = file_inode(file_out); |
| 1238 | struct xfs_inode *dest = XFS_I(inode_out); | 1304 | struct xfs_inode *dest = XFS_I(inode_out); |
| 1239 | struct xfs_mount *mp = src->i_mount; | ||
| 1240 | bool same_inode = (inode_in == inode_out); | 1305 | bool same_inode = (inode_in == inode_out); |
| 1241 | xfs_fileoff_t sfsbno, dfsbno; | 1306 | u64 blkmask = i_blocksize(inode_in) - 1; |
| 1242 | xfs_filblks_t fsblen; | ||
| 1243 | xfs_extlen_t cowextsize; | ||
| 1244 | ssize_t ret; | 1307 | ssize_t ret; |
| 1245 | 1308 | ||
| 1246 | if (!xfs_sb_version_hasreflink(&mp->m_sb)) | ||
| 1247 | return -EOPNOTSUPP; | ||
| 1248 | |||
| 1249 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
| 1250 | return -EIO; | ||
| 1251 | |||
| 1252 | /* Lock both files against IO */ | 1309 | /* Lock both files against IO */ |
| 1253 | ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); | 1310 | ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); |
| 1254 | if (ret) | 1311 | if (ret) |
| @@ -1270,33 +1327,115 @@ xfs_reflink_remap_range( | |||
| 1270 | goto out_unlock; | 1327 | goto out_unlock; |
| 1271 | 1328 | ||
| 1272 | ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, | 1329 | ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, |
| 1273 | &len, is_dedupe); | 1330 | len, is_dedupe); |
| 1274 | if (ret <= 0) | 1331 | if (ret <= 0) |
| 1275 | goto out_unlock; | 1332 | goto out_unlock; |
| 1276 | 1333 | ||
| 1334 | /* | ||
| 1335 | * If the dedupe data matches, chop off the partial EOF block | ||
| 1336 | * from the source file so we don't try to dedupe the partial | ||
| 1337 | * EOF block. | ||
| 1338 | */ | ||
| 1339 | if (is_dedupe) { | ||
| 1340 | *len &= ~blkmask; | ||
| 1341 | } else if (*len & blkmask) { | ||
| 1342 | /* | ||
| 1343 | * The user is attempting to share a partial EOF block, | ||
| 1344 | * if it's inside the destination EOF then reject it. | ||
| 1345 | */ | ||
| 1346 | if (pos_out + *len < i_size_read(inode_out)) { | ||
| 1347 | ret = -EINVAL; | ||
| 1348 | goto out_unlock; | ||
| 1349 | } | ||
| 1350 | } | ||
| 1351 | |||
| 1277 | /* Attach dquots to dest inode before changing block map */ | 1352 | /* Attach dquots to dest inode before changing block map */ |
| 1278 | ret = xfs_qm_dqattach(dest); | 1353 | ret = xfs_qm_dqattach(dest); |
| 1279 | if (ret) | 1354 | if (ret) |
| 1280 | goto out_unlock; | 1355 | goto out_unlock; |
| 1281 | 1356 | ||
| 1282 | trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); | ||
| 1283 | |||
| 1284 | /* | 1357 | /* |
| 1285 | * Clear out post-eof preallocations because we don't have page cache | 1358 | * Zero existing post-eof speculative preallocations in the destination |
| 1286 | * backing the delayed allocations and they'll never get freed on | 1359 | * file. |
| 1287 | * their own. | ||
| 1288 | */ | 1360 | */ |
| 1289 | if (xfs_can_free_eofblocks(dest, true)) { | 1361 | ret = xfs_reflink_zero_posteof(dest, pos_out); |
| 1290 | ret = xfs_free_eofblocks(dest); | 1362 | if (ret) |
| 1291 | if (ret) | 1363 | goto out_unlock; |
| 1292 | goto out_unlock; | ||
| 1293 | } | ||
| 1294 | 1364 | ||
| 1295 | /* Set flags and remap blocks. */ | 1365 | /* Set flags and remap blocks. */ |
| 1296 | ret = xfs_reflink_set_inode_flag(src, dest); | 1366 | ret = xfs_reflink_set_inode_flag(src, dest); |
| 1297 | if (ret) | 1367 | if (ret) |
| 1298 | goto out_unlock; | 1368 | goto out_unlock; |
| 1299 | 1369 | ||
| 1370 | /* Zap any page cache for the destination file's range. */ | ||
| 1371 | truncate_inode_pages_range(&inode_out->i_data, pos_out, | ||
| 1372 | PAGE_ALIGN(pos_out + *len) - 1); | ||
| 1373 | |||
| 1374 | /* If we're altering the file contents... */ | ||
| 1375 | if (!is_dedupe) { | ||
| 1376 | /* | ||
| 1377 | * ...update the timestamps (which will grab the ilock again | ||
| 1378 | * from xfs_fs_dirty_inode, so we have to call it before we | ||
| 1379 | * take the ilock). | ||
| 1380 | */ | ||
| 1381 | if (!(file_out->f_mode & FMODE_NOCMTIME)) { | ||
| 1382 | ret = file_update_time(file_out); | ||
| 1383 | if (ret) | ||
| 1384 | goto out_unlock; | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | /* | ||
| 1388 | * ...clear the security bits if the process is not being run | ||
| 1389 | * by root. This keeps people from modifying setuid and setgid | ||
| 1390 | * binaries. | ||
| 1391 | */ | ||
| 1392 | ret = file_remove_privs(file_out); | ||
| 1393 | if (ret) | ||
| 1394 | goto out_unlock; | ||
| 1395 | } | ||
| 1396 | |||
| 1397 | return 1; | ||
| 1398 | out_unlock: | ||
| 1399 | xfs_reflink_remap_unlock(file_in, file_out); | ||
| 1400 | return ret; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | /* | ||
| 1404 | * Link a range of blocks from one file to another. | ||
| 1405 | */ | ||
| 1406 | int | ||
| 1407 | xfs_reflink_remap_range( | ||
| 1408 | struct file *file_in, | ||
| 1409 | loff_t pos_in, | ||
| 1410 | struct file *file_out, | ||
| 1411 | loff_t pos_out, | ||
| 1412 | u64 len, | ||
| 1413 | bool is_dedupe) | ||
| 1414 | { | ||
| 1415 | struct inode *inode_in = file_inode(file_in); | ||
| 1416 | struct xfs_inode *src = XFS_I(inode_in); | ||
| 1417 | struct inode *inode_out = file_inode(file_out); | ||
| 1418 | struct xfs_inode *dest = XFS_I(inode_out); | ||
| 1419 | struct xfs_mount *mp = src->i_mount; | ||
| 1420 | xfs_fileoff_t sfsbno, dfsbno; | ||
| 1421 | xfs_filblks_t fsblen; | ||
| 1422 | xfs_extlen_t cowextsize; | ||
| 1423 | ssize_t ret; | ||
| 1424 | |||
| 1425 | if (!xfs_sb_version_hasreflink(&mp->m_sb)) | ||
| 1426 | return -EOPNOTSUPP; | ||
| 1427 | |||
| 1428 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
| 1429 | return -EIO; | ||
| 1430 | |||
| 1431 | /* Prepare and then clone file data. */ | ||
| 1432 | ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, | ||
| 1433 | &len, is_dedupe); | ||
| 1434 | if (ret <= 0) | ||
| 1435 | return ret; | ||
| 1436 | |||
| 1437 | trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); | ||
| 1438 | |||
| 1300 | dfsbno = XFS_B_TO_FSBT(mp, pos_out); | 1439 | dfsbno = XFS_B_TO_FSBT(mp, pos_out); |
| 1301 | sfsbno = XFS_B_TO_FSBT(mp, pos_in); | 1440 | sfsbno = XFS_B_TO_FSBT(mp, pos_in); |
| 1302 | fsblen = XFS_B_TO_FSB(mp, len); | 1441 | fsblen = XFS_B_TO_FSB(mp, len); |
| @@ -1305,10 +1444,6 @@ xfs_reflink_remap_range( | |||
| 1305 | if (ret) | 1444 | if (ret) |
| 1306 | goto out_unlock; | 1445 | goto out_unlock; |
| 1307 | 1446 | ||
| 1308 | /* Zap any page cache for the destination file's range. */ | ||
| 1309 | truncate_inode_pages_range(&inode_out->i_data, pos_out, | ||
| 1310 | PAGE_ALIGN(pos_out + len) - 1); | ||
| 1311 | |||
| 1312 | /* | 1447 | /* |
| 1313 | * Carry the cowextsize hint from src to dest if we're sharing the | 1448 | * Carry the cowextsize hint from src to dest if we're sharing the |
| 1314 | * entire source file to the entire destination file, the source file | 1449 | * entire source file to the entire destination file, the source file |
| @@ -1325,12 +1460,7 @@ xfs_reflink_remap_range( | |||
| 1325 | is_dedupe); | 1460 | is_dedupe); |
| 1326 | 1461 | ||
| 1327 | out_unlock: | 1462 | out_unlock: |
| 1328 | xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); | 1463 | xfs_reflink_remap_unlock(file_in, file_out); |
| 1329 | if (!same_inode) | ||
| 1330 | xfs_iunlock(src, XFS_MMAPLOCK_SHARED); | ||
| 1331 | inode_unlock(inode_out); | ||
| 1332 | if (!same_inode) | ||
| 1333 | inode_unlock_shared(inode_in); | ||
| 1334 | if (ret) | 1464 | if (ret) |
| 1335 | trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); | 1465 | trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); |
| 1336 | return ret; | 1466 | return ret; |
