aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-10-13 10:15:00 -0400
committerFilipe Manana <fdmanana@suse.com>2015-10-13 23:36:43 -0400
commit8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 (patch)
tree121a835cc755dd13c00ca7c20a984ba4addf9ff8
parentb96b1db039ebc584d03a9933b279e0d3e704c528 (diff)
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows cloning an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents: if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check that we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . 
./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). 
od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. 
echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). 
od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
-rw-r--r--fs/btrfs/ioctl.c195
1 files changed, 152 insertions, 43 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 80342d3fa5d2..55a735ae1453 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3328,6 +3328,150 @@ static void clone_update_extent_map(struct inode *inode,
3328 &BTRFS_I(inode)->runtime_flags); 3328 &BTRFS_I(inode)->runtime_flags);
3329} 3329}
3330 3330
3331/*
3332 * Make sure we do not end up inserting an inline extent into a file that has
3333 * already other (non-inline) extents. If a file has an inline extent it can
3334 * not have any other extents and the (single) inline extent must start at the
3335 * file offset 0. Failing to respect these rules will lead to file corruption,
3336 * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
3337 *
3338 * We can have extents that have been already written to disk or we can have
3339 * dirty ranges still in delalloc, in which case the extent maps and items are
3340 * created only when we run delalloc, and the delalloc ranges might fall outside
3341 * the range we are currently locking in the inode's io tree. So we check the
3342 * inode's i_size because of that (i_size updates are done while holding the
3343 * i_mutex, which we are holding here).
3344 * We also check to see if the inode has a size not greater than "datal" but has
3345 * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
3346 * protected against such concurrent fallocate calls by the i_mutex).
3347 *
3348 * If the file has no extents but a size greater than datal, do not allow the
3349 * copy because we would need to turn the inline extent into a non-inline one
3350 * (even with NO_HOLES enabled). If we find our destination inode only has one
3351 * inline extent, just overwrite it with the source inline extent when the
3352 * destination's i_size is not greater than datal. When it is greater, the
3353 * source data would have to be copied into the destination's inline extent,
3354 * which is not supported: -EOPNOTSUPP is returned in that case.
 *
 * Parameters, as used by the code below:
 *   src         - not referenced in this function's body.
 *   dst         - destination inode; its root, inode number and i_size are read
 *                 and its byte count is increased by datal on success.
 *   trans       - transaction handle passed to btrfs_drop_extents() and
 *                 btrfs_insert_empty_item().
 *   path        - path used for the lookups; released before the tree is
 *                 modified and left pointing at the new item on success.
 *   new_key     - key of the item to insert; its offset must be 0.
 *   drop_start  - start of the range handed to btrfs_drop_extents().
 *   datal       - data length of the inline extent being copied.
 *   skip        - number of leading payload bytes of inline_data to discard.
 *   size        - size of the item to insert and of the data written into it.
 *   inline_data - buffer holding the item contents to write (modified in place
 *                 when skip is non-zero).
 *
 * Returns 0 on success, -EOPNOTSUPP when the inline extent can not be safely
 * placed in the destination, or a negative errno propagated from
 * btrfs_search_slot(), btrfs_next_leaf(), btrfs_next_item(),
 * btrfs_drop_extents() or btrfs_insert_empty_item().
3355 */
3356static int clone_copy_inline_extent(struct inode *src,
3357 struct inode *dst,
3358 struct btrfs_trans_handle *trans,
3359 struct btrfs_path *path,
3360 struct btrfs_key *new_key,
3361 const u64 drop_start,
3362 const u64 datal,
3363 const u64 skip,
3364 const u64 size,
3365 char *inline_data)
3366{
3367 struct btrfs_root *root = BTRFS_I(dst)->root;
3368 const u64 aligned_end = ALIGN(new_key->offset + datal,
3369 root->sectorsize);
3370 int ret;
3371 struct btrfs_key key;
3372
3373 if (new_key->offset > 0) /* an inline extent must start at file offset 0 */
3374 return -EOPNOTSUPP;
3375
3376 key.objectid = btrfs_ino(dst);
3377 key.type = BTRFS_EXTENT_DATA_KEY;
3378 key.offset = 0;
3379 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* read-only lookup of dst's extent item at offset 0 */
3380 if (ret < 0) {
3381 return ret;
3382 } else if (ret > 0) { /* no extent item at offset 0 */
3383 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
3384 ret = btrfs_next_leaf(root, path);
3385 if (ret < 0)
3386 return ret;
3387 else if (ret > 0) /* no further leaf, nothing follows the searched key */
3388 goto copy_inline_extent;
3389 }
3390 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3391 if (key.objectid == btrfs_ino(dst) &&
3392 key.type == BTRFS_EXTENT_DATA_KEY) { /* dst has an extent at an offset > 0 */
3393 ASSERT(key.offset > 0);
3394 return -EOPNOTSUPP;
3395 }
3396 } else if (i_size_read(dst) <= datal) {
3397 struct btrfs_file_extent_item *ei;
3398 u64 ext_len;
3399
3400 /*
3401 * If the file size is <= datal, make sure there are no other
3402 * extents following (can happen due to a fallocate call with
3403 * the flag FALLOC_FL_KEEP_SIZE).
3404 */
3405 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3406 struct btrfs_file_extent_item);
3407 /*
3408 * If it's an inline extent, it can not have other extents
3409 * following it.
3410 */
3411 if (btrfs_file_extent_type(path->nodes[0], ei) ==
3412 BTRFS_FILE_EXTENT_INLINE)
3413 goto copy_inline_extent;
3414
3415 ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3416 if (ext_len > aligned_end) /* extent at offset 0 extends past the range that will be dropped */
3417 return -EOPNOTSUPP;
3418
3419 ret = btrfs_next_item(root, path);
3420 if (ret < 0) {
3421 return ret;
3422 } else if (ret == 0) {
3423 btrfs_item_key_to_cpu(path->nodes[0], &key,
3424 path->slots[0]);
3425 if (key.objectid == btrfs_ino(dst) &&
3426 key.type == BTRFS_EXTENT_DATA_KEY) /* another extent follows the one at offset 0 */
3427 return -EOPNOTSUPP;
3428 }
3429 }
3430
3431copy_inline_extent:
3432 /*
3433 * We have no extent items, or we have an extent at offset 0 which may
3434 * or may not be inlined. All these cases are dealt with the same way.
3435 */
3436 if (i_size_read(dst) > datal) {
3437 /*
3438 * If the destination inode has an inline extent...
3439 * This would require copying the data from the source inline
3440 * extent into the beginning of the destination's inline extent.
3441 * But this is really complex, both extents can be compressed
3442 * or just one of them, which would require decompressing and
3443 * re-compressing data (which could increase the new compressed
3444 * size, not allowing the compressed data to fit anymore in an
3445 * inline extent).
3446 * So just don't support this case for now (it should be rare,
3447 * we are not really saving space when cloning inline extents).
3448 */
3449 return -EOPNOTSUPP;
3450 }
3451
3452 btrfs_release_path(path); /* drop the lookup path before modifying the tree */
3453 ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
3454 if (ret)
3455 return ret;
3456 ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
3457 if (ret)
3458 return ret;
3459
3460 if (skip) {
3461 const u32 start = btrfs_file_extent_calc_inline_size(0); /* presumably the offset of the inline payload within the item - TODO confirm */
3462
3463 memmove(inline_data + start, inline_data + start + skip, datal); /* shift the payload down so the first "skip" bytes are discarded */
3464 }
3465
3466 write_extent_buffer(path->nodes[0], inline_data,
3467 btrfs_item_ptr_offset(path->nodes[0],
3468 path->slots[0]),
3469 size);
3470 inode_add_bytes(dst, datal);
3471
3472 return 0;
3473}
3474
3331/** 3475/**
3332 * btrfs_clone() - clone a range from inode file to another 3476 * btrfs_clone() - clone a range from inode file to another
3333 * 3477 *
@@ -3594,21 +3738,6 @@ process_slot:
3594 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3738 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
3595 u64 skip = 0; 3739 u64 skip = 0;
3596 u64 trim = 0; 3740 u64 trim = 0;
3597 u64 aligned_end = 0;
3598
3599 /*
3600 * Don't copy an inline extent into an offset
3601 * greater than zero. Having an inline extent
3602 * at such an offset results in chaos as btrfs
3603 * isn't prepared for such cases. Just skip
3604 * this case for the same reasons as commented
3605 * at btrfs_ioctl_clone().
3606 */
3607 if (last_dest_end > 0) {
3608 ret = -EOPNOTSUPP;
3609 btrfs_end_transaction(trans, root);
3610 goto out;
3611 }
3612 3741
3613 if (off > key.offset) { 3742 if (off > key.offset) {
3614 skip = off - key.offset; 3743 skip = off - key.offset;
@@ -3626,42 +3755,22 @@ process_slot:
3626 size -= skip + trim; 3755 size -= skip + trim;
3627 datal -= skip + trim; 3756 datal -= skip + trim;
3628 3757
3629 aligned_end = ALIGN(new_key.offset + datal, 3758 ret = clone_copy_inline_extent(src, inode,
3630 root->sectorsize); 3759 trans, path,
3631 ret = btrfs_drop_extents(trans, root, inode, 3760 &new_key,
3632 drop_start, 3761 drop_start,
3633 aligned_end, 3762 datal,
3634 1); 3763 skip, size, buf);
3635 if (ret) { 3764 if (ret) {
3636 if (ret != -EOPNOTSUPP) 3765 if (ret != -EOPNOTSUPP)
3637 btrfs_abort_transaction(trans, 3766 btrfs_abort_transaction(trans,
3638 root, ret); 3767 root,
3639 btrfs_end_transaction(trans, root); 3768 ret);
3640 goto out;
3641 }
3642
3643 ret = btrfs_insert_empty_item(trans, root, path,
3644 &new_key, size);
3645 if (ret) {
3646 btrfs_abort_transaction(trans, root,
3647 ret);
3648 btrfs_end_transaction(trans, root); 3769 btrfs_end_transaction(trans, root);
3649 goto out; 3770 goto out;
3650 } 3771 }
3651
3652 if (skip) {
3653 u32 start =
3654 btrfs_file_extent_calc_inline_size(0);
3655 memmove(buf+start, buf+start+skip,
3656 datal);
3657 }
3658
3659 leaf = path->nodes[0]; 3772 leaf = path->nodes[0];
3660 slot = path->slots[0]; 3773 slot = path->slots[0];
3661 write_extent_buffer(leaf, buf,
3662 btrfs_item_ptr_offset(leaf, slot),
3663 size);
3664 inode_add_bytes(inode, datal);
3665 } 3774 }
3666 3775
3667 /* If we have an implicit hole (NO_HOLES feature). */ 3776 /* If we have an implicit hole (NO_HOLES feature). */