summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-06-19 08:05:39 -0400
committerDavid Sterba <dsterba@suse.com>2019-07-02 06:30:50 -0400
commit803f0f64d17769071d7287d9e3e3b79a3e1ae937 (patch)
tree03a5e0b4665fdd17b04890e096767d62db93aea0 /fs/btrfs/tree-log.c
parent89b798ad1b42b1de10d64feda241e35e90c7b102 (diff)
Btrfs: fix fsync not persisting dentry deletions due to inode evictions
In order to avoid searches on a log tree when unlinking an inode, we check if the inode being unlinked was logged in the current transaction, as well as the inode of its parent directory. When any of the inodes are logged, we proceed to delete directory items and inode reference items from the log, to ensure that a subsequent fsync of only the inode being unlinked, or only of the parent directory (when the other is not fsync'ed as well), does not result in the entry still existing after a power failure. That check however is not reliable when one of the inodes involved (the one being unlinked or its parent directory's inode) is evicted, since the logged_trans field is transient, that is, it is not stored on disk, so it is lost when the inode is evicted and loaded into memory again (the field is set to zero on load). As a consequence the checks currently being done by btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log() always return true if the inode was evicted before, regardless of the inode having been logged or not before (and in the current transaction). This results in the dentry being unlinked still existing after a log replay if after the unlink operation only one of the inodes involved is fsync'ed. Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/dir $ touch /mnt/dir/foo $ xfs_io -c fsync /mnt/dir/foo # Keep an open file descriptor on our directory while we evict inodes. # We just want to evict the file's inode, the directory's inode must not # be evicted. $ ( cd /mnt/dir; while true; do :; done ) & $ pid=$! # Wait a bit to give time to background process to chdir to our test # directory. $ sleep 0.5 # Trigger eviction of the file's inode. $ echo 2 > /proc/sys/vm/drop_caches # Unlink our file and fsync the parent directory. After a power failure # we don't expect to see the file anymore, since we fsync'ed the parent # directory. 
$ rm -f /mnt/dir/foo $ xfs_io -c fsync /mnt/dir <power failure> $ mount /dev/sdb /mnt $ ls /mnt/dir foo $ --> file still there, unlink not persisted despite explicit fsync on dir Fix this by checking if the inode has the full_sync bit set in its runtime flags as well, since that bit is set every time an inode is loaded from disk, or for other less common cases such as after a shrinking truncate or failure to allocate extent maps for holes, and gets cleared after the first fsync. Also consider the inode as possibly logged only if it was last modified in the current transaction (besides having the full_sync flag set). Fixes: 3a5f1d458ad161 ("Btrfs: Optimize btree walking while logging inodes") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c28
1 files changed, 26 insertions, 2 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a04659fded7..6c8297bcfeb7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3323,6 +3323,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3323} 3323}
3324 3324
3325/* 3325/*
3326 * Check if an inode was logged in the current transaction. We can't always rely
3327 * on an inode's logged_trans value, because it's an in-memory only field and
3328 * therefore not persisted. This means that its value is lost if the inode gets
3329 * evicted and loaded again from disk (in which case it has a value of 0, and
3330 * certainly it is smaller than any possible transaction ID), when that happens
3331 * the full_sync flag is set in the inode's runtime flags, so on that case we
3332 * assume eviction happened and ignore the logged_trans value, assuming the
3333 * worst case, that the inode was logged before in the current transaction.
3334 */
3335static bool inode_logged(struct btrfs_trans_handle *trans,
3336 struct btrfs_inode *inode)
3337{
3338 if (inode->logged_trans == trans->transid)
3339 return true;
3340
3341 if (inode->last_trans == trans->transid &&
3342 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3343 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3344 return true;
3345
3346 return false;
3347}
3348
3349/*
3326 * If both a file and directory are logged, and unlinks or renames are 3350 * If both a file and directory are logged, and unlinks or renames are
3327 * mixed in, we have a few interesting corners: 3351 * mixed in, we have a few interesting corners:
3328 * 3352 *
@@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3356 int bytes_del = 0; 3380 int bytes_del = 0;
3357 u64 dir_ino = btrfs_ino(dir); 3381 u64 dir_ino = btrfs_ino(dir);
3358 3382
3359 if (dir->logged_trans < trans->transid) 3383 if (!inode_logged(trans, dir))
3360 return 0; 3384 return 0;
3361 3385
3362 ret = join_running_log_trans(root); 3386 ret = join_running_log_trans(root);
@@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3460 u64 index; 3484 u64 index;
3461 int ret; 3485 int ret;
3462 3486
3463 if (inode->logged_trans < trans->transid) 3487 if (!inode_logged(trans, inode))
3464 return 0; 3488 return 0;
3465 3489
3466 ret = join_running_log_trans(root); 3490 ret = join_running_log_trans(root);