diff options
| author | Filipe Manana <fdmanana@suse.com> | 2015-06-24 23:17:46 -0400 |
|---|---|---|
| committer | Chris Mason <clm@fb.com> | 2015-07-01 20:17:12 -0400 |
| commit | a89ca6f24ffe435edad57de02eaabd37a2c6bff6 (patch) | |
| tree | 9428cf5260c00f66637d29f3d8c37902c6127ddb | |
| parent | 36283bf777d963fac099213297e155d071096994 (diff) | |
Btrfs: fix fsync after truncate when no_holes feature is enabled
When we have the no_holes feature enabled, if we truncate a file to a
smaller size, truncate it again but to a size greater than or equal to
its original size and fsync it, the log tree will not have any information
about the hole covering the range [truncate_1_offset, new_file_size[.
This means that if the fsync log is replayed, the file will remain in the
state it had before both truncate operations.
Without the no_holes feature this does not happen, since when the inode
is logged (full sync flag is set) it will find in the fs/subvol tree a
leaf with a generation matching the current transaction id that has an
explicit extent item representing the hole.
Fix this by adding an explicit extent item representing a hole between
the last extent and the inode's i_size if we are doing a full sync.
The issue is easy to reproduce with the following test case for fstests:
. ./common/rc
. ./common/filter
. ./common/dmflakey
_need_to_be_root
_supported_fs generic
_supported_os Linux
_require_scratch
_require_dm_flakey
# This test was motivated by an issue found in btrfs when the btrfs
# no-holes feature is enabled (introduced in kernel 3.14). So enable
# the feature if the fs being tested is btrfs.
if [ $FSTYP == "btrfs" ]; then
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes"
fi
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test files and make sure everything is durably persisted.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \
-c "pwrite -S 0xbb 64K 61K" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \
-c "pwrite -S 0xff 64K 61K" \
$SCRATCH_MNT/bar | _filter_xfs_io
sync
# Now truncate our file foo to a smaller size (64Kb) and then truncate
# it to the size it had before the shrinking truncate (125Kb). Then
# fsync our file. If a power failure happens after the fsync, we expect
# our file to have a size of 125Kb, with the first 64Kb of data having
# the value 0xaa and the second 61Kb of data having the value 0x00.
$XFS_IO_PROG -c "truncate 64K" \
-c "truncate 125K" \
-c "fsync" \
$SCRATCH_MNT/foo
# Do something similar to our file bar, but the first truncation sets
# the file size to 0 and the second truncation expands the size to
# roughly double what it was initially (253Kb versus 125Kb).
$XFS_IO_PROG -c "truncate 0" \
-c "truncate 253K" \
-c "fsync" \
$SCRATCH_MNT/bar
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again, mount to trigger log replay and validate file
# contents.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# We expect foo to have a size of 125Kb, the first 64Kb of data all
# having the value 0xaa and the remaining 61Kb to be a hole (all bytes
# with value 0x00).
echo "File foo content after log replay:"
od -t x1 $SCRATCH_MNT/foo
# We expect bar to have a size of 253Kb and no extents (any byte read
# from bar has the value 0x00).
echo "File bar content after log replay:"
od -t x1 $SCRATCH_MNT/bar
status=0
exit
The expected file contents in the golden output are:
File foo content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0200000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0372000
File bar content after log replay:
0000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0772000
Without this fix, their contents are:
File foo content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0200000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
*
0372000
File bar content after log replay:
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0200000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0372000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0772000
A test case submission for fstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
| -rw-r--r-- | fs/btrfs/tree-log.c | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 66f87156882f..9c45431e69ab 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
| @@ -4197,6 +4197,107 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, | |||
| 4197 | return 0; | 4197 | return 0; |
| 4198 | } | 4198 | } |
| 4199 | 4199 | ||
| 4200 | /* | ||
| 4201 | * If the no holes feature is enabled we need to make sure any hole between the | ||
| 4202 | * last extent and the i_size of our inode is explicitly marked in the log. This | ||
| 4203 | * is to make sure that doing something like: | ||
| 4204 | * | ||
| 4205 | * 1) create file with 128Kb of data | ||
| 4206 | * 2) truncate file to 64Kb | ||
| 4207 | * 3) truncate file to 256Kb | ||
| 4208 | * 4) fsync file | ||
| 4209 | * 5) <crash/power failure> | ||
| 4210 | * 6) mount fs and trigger log replay | ||
| 4211 | * | ||
| 4212 | * Will give us a file with a size of 256Kb, the first 64Kb of data match what | ||
| 4213 | * the file had in its first 64Kb of data at step 1 and the last 192Kb of the | ||
| 4214 | * file correspond to a hole. The presence of explicit holes in a log tree is | ||
| 4215 | * what guarantees that log replay will remove/adjust file extent items in the | ||
| 4216 | * fs/subvol tree. | ||
| 4217 | * | ||
| 4218 | * Here we do not need to care about holes between extents, that is already done | ||
| 4219 | * by copy_items(). We also only need to do this in the full sync path, where we | ||
| 4220 | * look up extents from the fs/subvol tree only. In the fast path case, we | ||
| 4221 | * look up the list of modified extent maps and if any represents a hole, we | ||
| 4222 | * insert a corresponding extent representing a hole in the log tree. | ||
| 4223 | */ | ||
| 4224 | static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, | ||
| 4225 | struct btrfs_root *root, | ||
| 4226 | struct inode *inode, | ||
| 4227 | struct btrfs_path *path) | ||
| 4228 | { | ||
| 4229 | int ret; | ||
| 4230 | struct btrfs_key key; | ||
| 4231 | u64 hole_start; | ||
| 4232 | u64 hole_size; | ||
| 4233 | struct extent_buffer *leaf; | ||
| 4234 | struct btrfs_root *log = root->log_root; | ||
| 4235 | const u64 ino = btrfs_ino(inode); | ||
| 4236 | const u64 i_size = i_size_read(inode); | ||
| 4237 | |||
| 4238 | if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) | ||
| 4239 | return 0; | ||
| 4240 | |||
| 4241 | key.objectid = ino; | ||
| 4242 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 4243 | key.offset = (u64)-1; | ||
| 4244 | |||
| 4245 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 4246 | ASSERT(ret != 0); | ||
| 4247 | if (ret < 0) | ||
| 4248 | return ret; | ||
| 4249 | |||
| 4250 | ASSERT(path->slots[0] > 0); | ||
| 4251 | path->slots[0]--; | ||
| 4252 | leaf = path->nodes[0]; | ||
| 4253 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 4254 | |||
| 4255 | if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { | ||
| 4256 | /* inode does not have any extents */ | ||
| 4257 | hole_start = 0; | ||
| 4258 | hole_size = i_size; | ||
| 4259 | } else { | ||
| 4260 | struct btrfs_file_extent_item *extent; | ||
| 4261 | u64 len; | ||
| 4262 | |||
| 4263 | /* | ||
| 4264 | * If there's an extent beyond i_size, an explicit hole was | ||
| 4265 | * already inserted by copy_items(). | ||
| 4266 | */ | ||
| 4267 | if (key.offset >= i_size) | ||
| 4268 | return 0; | ||
| 4269 | |||
| 4270 | extent = btrfs_item_ptr(leaf, path->slots[0], | ||
| 4271 | struct btrfs_file_extent_item); | ||
| 4272 | |||
| 4273 | if (btrfs_file_extent_type(leaf, extent) == | ||
| 4274 | BTRFS_FILE_EXTENT_INLINE) { | ||
| 4275 | len = btrfs_file_extent_inline_len(leaf, | ||
| 4276 | path->slots[0], | ||
| 4277 | extent); | ||
| 4278 | ASSERT(len == i_size); | ||
| 4279 | return 0; | ||
| 4280 | } | ||
| 4281 | |||
| 4282 | len = btrfs_file_extent_num_bytes(leaf, extent); | ||
| 4283 | /* Last extent goes beyond i_size, no need to log a hole. */ | ||
| 4284 | if (key.offset + len > i_size) | ||
| 4285 | return 0; | ||
| 4286 | hole_start = key.offset + len; | ||
| 4287 | hole_size = i_size - hole_start; | ||
| 4288 | } | ||
| 4289 | btrfs_release_path(path); | ||
| 4290 | |||
| 4291 | /* Last extent ends at i_size. */ | ||
| 4292 | if (hole_size == 0) | ||
| 4293 | return 0; | ||
| 4294 | |||
| 4295 | hole_size = ALIGN(hole_size, root->sectorsize); | ||
| 4296 | ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, | ||
| 4297 | hole_size, 0, hole_size, 0, 0, 0); | ||
| 4298 | return ret; | ||
| 4299 | } | ||
| 4300 | |||
| 4200 | /* log a single inode in the tree log. | 4301 | /* log a single inode in the tree log. |
| 4201 | * At least one parent directory for this inode must exist in the tree | 4302 | * At least one parent directory for this inode must exist in the tree |
| 4202 | * or be logged already. | 4303 | * or be logged already. |
| @@ -4460,6 +4561,13 @@ next_slot: | |||
| 4460 | err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); | 4561 | err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); |
| 4461 | if (err) | 4562 | if (err) |
| 4462 | goto out_unlock; | 4563 | goto out_unlock; |
| 4564 | if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { | ||
| 4565 | btrfs_release_path(path); | ||
| 4566 | btrfs_release_path(dst_path); | ||
| 4567 | err = btrfs_log_trailing_hole(trans, root, inode, path); | ||
| 4568 | if (err) | ||
| 4569 | goto out_unlock; | ||
| 4570 | } | ||
| 4463 | log_extents: | 4571 | log_extents: |
| 4464 | btrfs_release_path(path); | 4572 | btrfs_release_path(path); |
| 4465 | btrfs_release_path(dst_path); | 4573 | btrfs_release_path(dst_path); |
