diff options
author | Filipe Manana <fdmanana@suse.com> | 2015-06-17 07:49:23 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2015-06-30 17:36:47 -0400 |
commit | e4545de5b035c7debb73d260c78377dbb69cbfb5 (patch) | |
tree | 7e88530b6232a24f09ff6dd6ad61c056a13dcb5c /fs/btrfs/tree-log.c | |
parent | da288d280d16f4d7e4ada331cb33d381b408b10c (diff) |
Btrfs: fix fsync data loss after append write
If we do an append write to a file (which increases its inode's i_size)
that does not have the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode,
and the previous transaction added a new hard link to the file, which sets
the flag BTRFS_INODE_COPY_EVERYTHING in the file's inode, and then fsync
the file, the inode's new i_size isn't logged. This has the consequence
that after the fsync log is replayed, the file size remains what it was
before the append write operation, which means users/applications will
not be able to read the data that was successsfully fsync'ed before.
This happens because neither the inode item nor the delayed inode get
their i_size updated when the append write is made - doing so would
require starting a transaction in the buffered write path, something that
we do not do intentionally for performance reasons.
Fix this by making sure that when the flag BTRFS_INODE_COPY_EVERYTHING is
set the inode is logged with its current i_size (log the in-memory inode
into the log tree).
This issue is not a recent regression and is easy to reproduce with the
following test case for fstests:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
here=`pwd`
tmp=/tmp/$$
status=1 # failure is the default!
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
trap "_cleanup; exit \$status" 0 1 2 3 15
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_supported_fs generic
_supported_os Linux
_need_to_be_root
_require_scratch
_require_dm_flakey
_require_metadata_journaling $SCRATCH_DEV
_crash_and_mount()
{
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again and mount. This makes the fs replay its fsync log.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
}
rm -f $seqres.full
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create the test file with some initial data and then fsync it.
# The fsync here is only needed to trigger the issue in btrfs, as it causes the
# the flag BTRFS_INODE_NEEDS_FULL_SYNC to be removed from the btrfs inode.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0 32k" \
-c "fsync" \
$SCRATCH_MNT/foo | _filter_xfs_io
sync
# Add a hard link to our file.
# On btrfs this sets the flag BTRFS_INODE_COPY_EVERYTHING on the btrfs inode,
# which is a necessary condition to trigger the issue.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Sync the filesystem to force a commit of the current btrfs transaction, this
# is a necessary condition to trigger the bug on btrfs.
sync
# Now append more data to our file, increasing its size, and fsync the file.
# In btrfs because the inode flag BTRFS_INODE_COPY_EVERYTHING was set and the
# write path did not update the inode item in the btree nor the delayed inode
# item (in memory struture) in the current transaction (created by the fsync
# handler), the fsync did not record the inode's new i_size in the fsync
# log/journal. This made the data unavailable after the fsync log/journal is
# replayed.
$XFS_IO_PROG -c "pwrite -S 0xbb 32K 32K" \
-c "fsync" \
$SCRATCH_MNT/foo | _filter_xfs_io
echo "File content after fsync and before crash:"
od -t x1 $SCRATCH_MNT/foo
_crash_and_mount
echo "File content after crash and log replay:"
od -t x1 $SCRATCH_MNT/foo
status=0
exit
The expected file output before and after the crash/power failure expects the
appended data to be available, which is:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0100000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
*
0200000
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 14 |
1 files changed, 9 insertions, 5 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1ce80c1c4eb6..76c4f9d1b80a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -4155,6 +4155,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
4155 | u64 ino = btrfs_ino(inode); | 4155 | u64 ino = btrfs_ino(inode); |
4156 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | 4156 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
4157 | u64 logged_isize = 0; | 4157 | u64 logged_isize = 0; |
4158 | bool need_log_inode_item = true; | ||
4158 | 4159 | ||
4159 | path = btrfs_alloc_path(); | 4160 | path = btrfs_alloc_path(); |
4160 | if (!path) | 4161 | if (!path) |
@@ -4263,11 +4264,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
4263 | } else { | 4264 | } else { |
4264 | if (inode_only == LOG_INODE_ALL) | 4265 | if (inode_only == LOG_INODE_ALL) |
4265 | fast_search = true; | 4266 | fast_search = true; |
4266 | ret = log_inode_item(trans, log, dst_path, inode); | ||
4267 | if (ret) { | ||
4268 | err = ret; | ||
4269 | goto out_unlock; | ||
4270 | } | ||
4271 | goto log_extents; | 4267 | goto log_extents; |
4272 | } | 4268 | } |
4273 | 4269 | ||
@@ -4290,6 +4286,9 @@ again: | |||
4290 | if (min_key.type > max_key.type) | 4286 | if (min_key.type > max_key.type) |
4291 | break; | 4287 | break; |
4292 | 4288 | ||
4289 | if (min_key.type == BTRFS_INODE_ITEM_KEY) | ||
4290 | need_log_inode_item = false; | ||
4291 | |||
4293 | src = path->nodes[0]; | 4292 | src = path->nodes[0]; |
4294 | if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { | 4293 | if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { |
4295 | ins_nr++; | 4294 | ins_nr++; |
@@ -4360,6 +4359,11 @@ next_slot: | |||
4360 | log_extents: | 4359 | log_extents: |
4361 | btrfs_release_path(path); | 4360 | btrfs_release_path(path); |
4362 | btrfs_release_path(dst_path); | 4361 | btrfs_release_path(dst_path); |
4362 | if (need_log_inode_item) { | ||
4363 | err = log_inode_item(trans, log, dst_path, inode); | ||
4364 | if (err) | ||
4365 | goto out_unlock; | ||
4366 | } | ||
4363 | if (fast_search) { | 4367 | if (fast_search) { |
4364 | /* | 4368 | /* |
4365 | * Some ordered extents started by fsync might have completed | 4369 | * Some ordered extents started by fsync might have completed |