Btrfs: fix fsync after truncate when no_holes feature is enabled

When we have the no_holes feature enabled, if we truncate a file to a smaller size, truncate it again but to a size greater than or equal to its original size and fsync it, the log tree will not have any information about the hole covering the range [truncate_1_offset, new_file_size[. Which means if the fsync log is replayed, the file will remain with the state it had before both truncate operations. Without the no_holes feature this does not happen, since when the inode is logged (full sync flag is set) it will find in the fs/subvol tree a leaf with a generation matching the current transaction id that has an explicit extent item representing the hole. Fix this by adding an explicit extent item representing a hole between the last extent and the inode's i_size if we are doing a full sync. The issue is easy to reproduce with the following test case for fstests: . ./common/rc . ./common/filter . ./common/dmflakey _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_dm_flakey # This test was motivated by an issue found in btrfs when the btrfs # no-holes feature is enabled (introduced in kernel 3.14). So enable # the feature if the fs being tested is btrfs. if [ $FSTYP == "btrfs" ]; then _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes" fi rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test files and make sure everything is durably persisted. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \ -c "pwrite -S 0xbb 64K 61K" \ $SCRATCH_MNT/foo | _filter_xfs_io $XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \ -c "pwrite -S 0xff 64K 61K" \ $SCRATCH_MNT/bar | _filter_xfs_io sync # Now truncate our file foo to a smaller size (64Kb) and then truncate # it to the size it had before the shrinking truncate (125Kb). Then # fsync our file. 
If a power failure happens after the fsync, we expect # our file to have a size of 125Kb, with the first 64Kb of data having # the value 0xaa and the second 61Kb of data having the value 0x00. $XFS_IO_PROG -c "truncate 64K" \ -c "truncate 125K" \ -c "fsync" \ $SCRATCH_MNT/foo # Do something similar to our file bar, but the first truncation sets # the file size to 0 and the second truncation expands the size to the # double of what it was initially. $XFS_IO_PROG -c "truncate 0" \ -c "truncate 253K" \ -c "fsync" \ $SCRATCH_MNT/bar _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again, mount to trigger log replay and validate file # contents. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # We expect foo to have a size of 125Kb, the first 64Kb of data all # having the value 0xaa and the remaining 61Kb to be a hole (all bytes # with value 0x00). echo "File foo content after log replay:" od -t x1 $SCRATCH_MNT/foo # We expect bar to have a size of 253Kb and no extents (any byte read # from bar has the value 0x00). echo "File bar content after log replay:" od -t x1 $SCRATCH_MNT/bar status=0 exit The expected file contents in the golden output are: File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0200000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0372000 File bar content after log replay: 0000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0772000 Without this fix, their contents are: File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0200000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb * 0372000 File bar content after log replay: 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0200000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0372000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0772000 A test case submission for fstests follows soon. 
Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
author: Filipe Manana <fdmanana@suse.com> 2015-06-24 23:17:46 -0400
committer: Chris Mason <clm@fb.com> 2015-07-01 20:17:12 -0400
commit: a89ca6f24ffe435edad57de02eaabd37a2c6bff6 (patch)
tree: 9428cf5260c00f66637d29f3d8c37902c6127ddb /fs/btrfs/tree-log.c
parent: 36283bf777d963fac099213297e155d071096994 (diff)
1 files changed, 108 insertions, 0 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 66f87156882f..9c45431e69ab 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4197,6 +4197,107 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
        return 0;
 }
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ *      1) create file with 128Kb of data
+ *      2) truncate file to 64Kb
+ *      3) truncate file to 256Kb
+ *      4) fsync file
+ *      5) <crash/power failure>
+ *      6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, that is already done
+ * by copy_items(). We also only need to do this in the full sync path, where we
+ * lookup for extents from the fs/subvol tree only. In the fast path case, we
+ * lookup the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ *
+ * @trans: transaction handle, passed on to btrfs_insert_file_extent()
+ * @root:  fs/subvol root that is searched for the inode's last file extent
+ *         item; the hole item is inserted into root->log_root
+ * @inode: inode being logged
+ * @path:  path used for the read-only search in @root
+ *
+ * Returns 0 if the no holes feature is disabled, if no trailing hole has to be
+ * logged, or if the hole item was inserted successfully. Otherwise returns the
+ * negative value from btrfs_search_slot() or the result of
+ * btrfs_insert_file_extent().
+ *
+ * The early "return 0" exits taken while inspecting the last extent item, and
+ * the error return after the search, do not release @path; releasing it is
+ * left to the caller.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct inode *inode,
+                                   struct btrfs_path *path)
+{
+        int ret;
+        struct btrfs_key key;
+        u64 hole_start;
+        u64 hole_size;
+        struct extent_buffer *leaf;
+        struct btrfs_root *log = root->log_root;
+        const u64 ino = btrfs_ino(inode);
+        const u64 i_size = i_size_read(inode);
+        if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+                return 0;
+        /*
+         * Search for the highest possible file extent key of this inode. An
+         * exact match is not expected, so the item we are interested in is the
+         * one right before the returned slot.
+         */
+        key.objectid = ino;
+        key.type = BTRFS_EXTENT_DATA_KEY;
+        key.offset = (u64)-1;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        ASSERT(ret != 0);
+        if (ret < 0)
+                return ret;
+        ASSERT(path->slots[0] > 0);
+        path->slots[0]--;
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+        if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+                /* inode does not have any extents */
+                hole_start = 0;
+                hole_size = i_size;
+        } else {
+                struct btrfs_file_extent_item *extent;
+                u64 len;
+                /*
+                 * If there's an extent beyond i_size, an explicit hole was
+                 * already inserted by copy_items().
+                 */
+                if (key.offset >= i_size)
+                        return 0;
+                extent = btrfs_item_ptr(leaf, path->slots[0],
+                                        struct btrfs_file_extent_item);
+                /*
+                 * Nothing is logged after an inline extent. Do not assert that
+                 * the inline length equals i_size: the two can legitimately
+                 * differ (e.g. the file size was extended past a short inline
+                 * extent), and we return 0 in either case.
+                 */
+                if (btrfs_file_extent_type(leaf, extent) ==
+                    BTRFS_FILE_EXTENT_INLINE)
+                        return 0;
+                len = btrfs_file_extent_num_bytes(leaf, extent);
+                /* Last extent goes beyond i_size, no need to log a hole. */
+                if (key.offset + len > i_size)
+                        return 0;
+                hole_start = key.offset + len;
+                hole_size = i_size - hole_start;
+        }
+        /* Done reading from the fs/subvol tree leaf. */
+        btrfs_release_path(path);
+        /* Last extent ends at i_size. */
+        if (hole_size == 0)
+                return 0;
+        /* Round the hole length up to a multiple of the sector size. */
+        hole_size = ALIGN(hole_size, root->sectorsize);
+        ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+                                       hole_size, 0, hole_size, 0, 0, 0);
+        return ret;
+}
 /* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
@@ -4460,6 +4561,13 @@ next_slot:
        err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
        if (err)
                goto out_unlock;
+        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+                btrfs_release_path(path);
+                btrfs_release_path(dst_path);
+                err = btrfs_log_trailing_hole(trans, root, inode, path);
+                if (err)
+                        goto out_unlock;
+        }
 log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
author	Filipe Manana <fdmanana@suse.com>	2015-06-24 23:17:46 -0400
committer	Chris Mason <clm@fb.com>	2015-07-01 20:17:12 -0400
commit	a89ca6f24ffe435edad57de02eaabd37a2c6bff6 (patch)
tree	9428cf5260c00f66637d29f3d8c37902c6127ddb /fs/btrfs/tree-log.c
parent	36283bf777d963fac099213297e155d071096994 (diff)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 66f87156882f..9c45431e69ab 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c
@@ -4197,6 +4197,107 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4197	return 0;	4197	return 0;
4198	}	4198	}
4199		4199
		4200	/*
		4201	* If the no holes feature is enabled we need to make sure any hole between the
		4202	* last extent and the i_size of our inode is explicitly marked in the log. This
		4203	* is to make sure that doing something like:
		4204	*
		4205	* 1) create file with 128Kb of data
		4206	* 2) truncate file to 64Kb
		4207	* 3) truncate file to 256Kb
		4208	* 4) fsync file
		4209	* 5) <crash/power failure>
		4210	* 6) mount fs and trigger log replay
		4211	*
		4212	* Will give us a file with a size of 256Kb, the first 64Kb of data match what
		4213	* the file had in its first 64Kb of data at step 1 and the last 192Kb of the
		4214	* file correspond to a hole. The presence of explicit holes in a log tree is
		4215	* what guarantees that log replay will remove/adjust file extent items in the
		4216	* fs/subvol tree.
		4217	*
		4218	* Here we do not need to care about holes between extents, that is already done
		4219	* by copy_items(). We also only need to do this in the full sync path, where we
		4220	* lookup for extents from the fs/subvol tree only. In the fast path case, we
		4221	* lookup the list of modified extent maps and if any represents a hole, we
		4222	* insert a corresponding extent representing a hole in the log tree.
		4223	*/
		4224	static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
		4225	struct btrfs_root *root,
		4226	struct inode *inode,
		4227	struct btrfs_path *path)
		4228	{
		4229	int ret;
		4230	struct btrfs_key key;
		4231	u64 hole_start;
		4232	u64 hole_size;
		4233	struct extent_buffer *leaf;
		4234	struct btrfs_root *log = root->log_root;
		4235	const u64 ino = btrfs_ino(inode);
		4236	const u64 i_size = i_size_read(inode);
		4237
		4238	if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
		4239	return 0;
		4240
		4241	key.objectid = ino;
		4242	key.type = BTRFS_EXTENT_DATA_KEY;
		4243	key.offset = (u64)-1;
		4244
		4245	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		4246	ASSERT(ret != 0);
		4247	if (ret < 0)
		4248	return ret;
		4249
		4250	ASSERT(path->slots[0] > 0);
		4251	path->slots[0]--;
		4252	leaf = path->nodes[0];
		4253	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		4254
		4255	if (key.objectid != ino \|\| key.type != BTRFS_EXTENT_DATA_KEY) {
		4256	/* inode does not have any extents */
		4257	hole_start = 0;
		4258	hole_size = i_size;
		4259	} else {
		4260	struct btrfs_file_extent_item *extent;
		4261	u64 len;
		4262
		4263	/*
		4264	* If there's an extent beyond i_size, an explicit hole was
		4265	* already inserted by copy_items().
		4266	*/
		4267	if (key.offset >= i_size)
		4268	return 0;
		4269
		4270	extent = btrfs_item_ptr(leaf, path->slots[0],
		4271	struct btrfs_file_extent_item);
		4272
		4273	if (btrfs_file_extent_type(leaf, extent) ==
		4274	BTRFS_FILE_EXTENT_INLINE) {
		4275	len = btrfs_file_extent_inline_len(leaf,
		4276	path->slots[0],
		4277	extent);
		4278	ASSERT(len == i_size);
		4279	return 0;
		4280	}
		4281
		4282	len = btrfs_file_extent_num_bytes(leaf, extent);
		4283	/* Last extent goes beyond i_size, no need to log a hole. */
		4284	if (key.offset + len > i_size)
		4285	return 0;
		4286	hole_start = key.offset + len;
		4287	hole_size = i_size - hole_start;
		4288	}
		4289	btrfs_release_path(path);
		4290
		4291	/* Last extent ends at i_size. */
		4292	if (hole_size == 0)
		4293	return 0;
		4294
		4295	hole_size = ALIGN(hole_size, root->sectorsize);
		4296	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
		4297	hole_size, 0, hole_size, 0, 0, 0);
		4298	return ret;
		4299	}
		4300
4200	/* log a single inode in the tree log.	4301	/* log a single inode in the tree log.
4201	* At least one parent directory for this inode must exist in the tree	4302	* At least one parent directory for this inode must exist in the tree
4202	* or be logged already.	4303	* or be logged already.
@@ -4460,6 +4561,13 @@ next_slot:
4460	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);	4561	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
4461	if (err)	4562	if (err)
4462	goto out_unlock;	4563	goto out_unlock;
		4564	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
		4565	btrfs_release_path(path);
		4566	btrfs_release_path(dst_path);
		4567	err = btrfs_log_trailing_hole(trans, root, inode, path);
		4568	if (err)
		4569	goto out_unlock;
		4570	}
4463	log_extents:	4571	log_extents:
4464	btrfs_release_path(path);	4572	btrfs_release_path(path);
4465	btrfs_release_path(dst_path);	4573	btrfs_release_path(dst_path);