diff options
author | Filipe Manana <fdmanana@suse.com> | 2014-08-25 05:43:00 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2014-09-02 19:46:05 -0400 |
commit | d9f85963e3f7f5582552fdae54a2b89d6c62daf5 (patch) | |
tree | 676942dbb75c27e47f8d2c3565c9c4b76832e2dc | |
parent | e9512d72e8e61c750c90efacd720abe3c4569822 (diff) |
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r-- | fs/btrfs/inode.c | 12 |
1 files changed, 9 insertions, 3 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d020d6d9ace..7313571e1860 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -778,8 +778,12 @@ retry: | |||
778 | ins.offset, | 778 | ins.offset, |
779 | BTRFS_ORDERED_COMPRESSED, | 779 | BTRFS_ORDERED_COMPRESSED, |
780 | async_extent->compress_type); | 780 | async_extent->compress_type); |
781 | if (ret) | 781 | if (ret) { |
782 | btrfs_drop_extent_cache(inode, async_extent->start, | ||
783 | async_extent->start + | ||
784 | async_extent->ram_size - 1, 0); | ||
782 | goto out_free_reserve; | 785 | goto out_free_reserve; |
786 | } | ||
783 | 787 | ||
784 | /* | 788 | /* |
785 | * clear dirty, set writeback and unlock the pages. | 789 | * clear dirty, set writeback and unlock the pages. |
@@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode, | |||
971 | ret = btrfs_add_ordered_extent(inode, start, ins.objectid, | 975 | ret = btrfs_add_ordered_extent(inode, start, ins.objectid, |
972 | ram_size, cur_alloc_size, 0); | 976 | ram_size, cur_alloc_size, 0); |
973 | if (ret) | 977 | if (ret) |
974 | goto out_reserve; | 978 | goto out_drop_extent_cache; |
975 | 979 | ||
976 | if (root->root_key.objectid == | 980 | if (root->root_key.objectid == |
977 | BTRFS_DATA_RELOC_TREE_OBJECTID) { | 981 | BTRFS_DATA_RELOC_TREE_OBJECTID) { |
978 | ret = btrfs_reloc_clone_csums(inode, start, | 982 | ret = btrfs_reloc_clone_csums(inode, start, |
979 | cur_alloc_size); | 983 | cur_alloc_size); |
980 | if (ret) | 984 | if (ret) |
981 | goto out_reserve; | 985 | goto out_drop_extent_cache; |
982 | } | 986 | } |
983 | 987 | ||
984 | if (disk_num_bytes < cur_alloc_size) | 988 | if (disk_num_bytes < cur_alloc_size) |
@@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode, | |||
1006 | out: | 1010 | out: |
1007 | return ret; | 1011 | return ret; |
1008 | 1012 | ||
1013 | out_drop_extent_cache: | ||
1014 | btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); | ||
1009 | out_reserve: | 1015 | out_reserve: |
1010 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); | 1016 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); |
1011 | out_unlock: | 1017 | out_unlock: |