aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-08-19 06:09:40 -0400
committerChris Mason <clm@fb.com>2015-08-19 17:27:46 -0400
commitb84b8390d6009cde5134f775a251103c14bbed74 (patch)
tree256265c33abc80a21d1e3dfd2df78f053a9f1895 /fs/btrfs/tree-log.c
parent1f9b8c8fbc9a4d029760b16f477b9d15500e3a34 (diff)
Btrfs: fix file read corruption after extent cloning and fsync
If we partially clone one extent of a file into a lower offset of the file, fsync the file, power fail and then mount the fs to trigger log replay, we can get multiple checksum items in the csum tree that overlap each other and result in checksum lookup failures later. Those failures can make file data read requests assume a checksum value of 0, but they will not return an error (-EIO for example) to userspace exactly because the expected checksum value 0 is a special value that makes the read bio endio callback return success and set all the bytes of the corresponding page with the value 0x01 (at fs/btrfs/inode.c:__readpage_endio_check()). From a userspace perspective this is equivalent to file corruption because we are not returning what was written to the file. Details about how this can happen, and why, are included inline in the following reproducer test case for fstests and the comment added to tree-log.c. seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_flakey _require_cloner _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with a single 100K extent starting at file # offset 800K. We fsync the file here to make sure the fsync log tree gets # a single csum item that covers the whole 100K extent, which causes # the second fsync, done after the cloning operation below, to not # leave in the log tree two csum items covering two sub-ranges # ([0, 20K[ and [20K, 100K[) of our extent. 
$XFS_IO_PROG -f -c "pwrite -S 0xaa 800K 100K" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # Now clone part of our extent into file offset 400K. This adds a file # extent item to our inode's metadata that points to the 100K extent # we created before, using a data offset of 20K and a data length of # 20K, so that it refers to the sub-range [20K, 40K[ of our original # extent. $CLONER_PROG -s $((800 * 1024 + 20 * 1024)) -d $((400 * 1024)) \ -l $((20 * 1024)) $SCRATCH_MNT/foo $SCRATCH_MNT/foo # Now fsync our file to make sure the extent cloning is durably # persisted. This fsync will not add a second csum item to the log # tree containing the checksums for the blocks in the sub-range # [20K, 40K[ of our extent, because there was already a csum item in # the log tree covering the whole extent, added by the first fsync # we did before. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo echo "File digest before power failure:" md5sum $SCRATCH_MNT/foo | _filter_scratch # Silently drop all writes and unmount to simulate a crash/power # failure. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again, mount to trigger log replay and validate file # contents. # The fsync log replay first processes the file extent item # corresponding to the file offset 400K (the one which refers to the # [20K, 40K[ sub-range of our 100K extent) and then processes the file # extent item for file offset 800K. It used to happen that when # processing the latter, it erroneously left in the csum tree 2 csum # items that overlapped each other, 1 for the sub-range [20K, 40K[ and # 1 for the whole range of our extent. This introduced a problem where # subsequent lookups for the checksums of blocks within the range # [40K, 100K[ of our extent would not find anything because lookups in # the csum tree ended up looking only at the smaller csum item, the # one covering the subrange [20K, 40K[. 
This made read requests assume # an expected checksum with a value of 0 for those blocks, which caused # checksum verification failure when the read operations finished. # However those checksum failures did not result in read requests # returning an error to user space (like -EIO for e.g.) because the # expected checksum value had the special value 0, and in that case # btrfs sets all bytes of the corresponding pages with the value 0x01 # and produces the following warning in dmesg/syslog: # # "BTRFS warning (device dm-0): csum failed ino 257 off 917504 csum\ # 1322675045 expected csum 0" # _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey echo "File digest after log replay:" # Must match the same digest it had after cloning the extent and # before the power failure happened. md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey status=0 exit Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c54
1 files changed, 54 insertions, 0 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d650468d21a..1bbaace73383 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -722,12 +722,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
722 &ordered_sums, 0); 722 &ordered_sums, 0);
723 if (ret) 723 if (ret)
724 goto out; 724 goto out;
725 /*
726 * Now delete all existing cums in the csum root that
727 * cover our range. We do this because we can have an
728 * extent that is completely referenced by one file
729 * extent item and partially referenced by another
730 * file extent item (like after using the clone or
731 * extent_same ioctls). In this case if we end up doing
732 * the replay of the one that partially references the
733 * extent first, and we do not do the csum deletion
734 * below, we can get 2 csum items in the csum tree that
735 * overlap each other. For example, imagine our log has
736 * the two following file extent items:
737 *
738 * key (257 EXTENT_DATA 409600)
739 * extent data disk byte 12845056 nr 102400
740 * extent data offset 20480 nr 20480 ram 102400
741 *
742 * key (257 EXTENT_DATA 819200)
743 * extent data disk byte 12845056 nr 102400
744 * extent data offset 0 nr 102400 ram 102400
745 *
746 * Where the second one fully references the 100K extent
747 * that starts at disk byte 12845056, and the log tree
748 * has a single csum item that covers the entire range
749 * of the extent:
750 *
751 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
752 *
753 * After the first file extent item is replayed, the
754 * csum tree gets the following csum item:
755 *
756 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
757 *
758 * Which covers the 20K sub-range starting at offset 20K
759 * of our extent. Now when we replay the second file
760 * extent item, if we do not delete existing csum items
761 * that cover any of its blocks, we end up getting two
762 * csum items in our csum tree that overlap each other:
763 *
764 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
765 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
766 *
767 * Which is a problem, because after this anyone trying
768 * to lookup up for the checksum of any block of our
769 * extent starting at an offset of 40K or higher, will
770 * end up looking at the second csum item only, which
771 * does not contain the checksum for any block starting
772 * at offset 40K or higher of our extent.
773 */
725 while (!list_empty(&ordered_sums)) { 774 while (!list_empty(&ordered_sums)) {
726 struct btrfs_ordered_sum *sums; 775 struct btrfs_ordered_sum *sums;
727 sums = list_entry(ordered_sums.next, 776 sums = list_entry(ordered_sums.next,
728 struct btrfs_ordered_sum, 777 struct btrfs_ordered_sum,
729 list); 778 list);
730 if (!ret) 779 if (!ret)
780 ret = btrfs_del_csums(trans,
781 root->fs_info->csum_root,
782 sums->bytenr,
783 sums->len);
784 if (!ret)
731 ret = btrfs_csum_file_blocks(trans, 785 ret = btrfs_csum_file_blocks(trans,
732 root->fs_info->csum_root, 786 root->fs_info->csum_root,
733 sums); 787 sums);