aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-08-05 11:49:08 -0400
committerChris Mason <clm@fb.com>2015-08-09 09:17:04 -0400
commit18aa09229741364280d0a1670597b5207fc05b8d (patch)
tree1c19a2b79ba242a23cac13c347420b09da8b4614 /fs/btrfs/tree-log.c
parentdd81d459a37d73cfa39896bd070e7b92e66e3628 (diff)
Btrfs: fix stale dir entries after removing a link and fsync
We have one more case where after a log tree is replayed we get inconsistent metadata leading to stale directory entries, due to some directories having entries pointing to some inode while the inode does not have a matching BTRFS_INODE_[REF|EXTREF]_KEY item. To trigger the problem we need to have a file with multiple hard links belonging to different parent directories. Then if one of those hard links is removed and we fsync the file using one of its other links that belongs to a different parent directory, we end up not logging the fact that the removed hard link doesn't exists anymore in the parent directory. Simple reproducer: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_dm_flakey _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test directory and file. mkdir $SCRATCH_MNT/testdir touch $SCRATCH_MNT/foo ln $SCRATCH_MNT/foo $SCRATCH_MNT/testdir/foo2 ln $SCRATCH_MNT/foo $SCRATCH_MNT/testdir/foo3 # Make sure everything done so far is durably persisted. sync # Now we remove one of our file's hardlinks in the directory testdir. unlink $SCRATCH_MNT/testdir/foo3 # We now fsync our file using the "foo" link, which has a parent that # is not the directory "testdir". $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Silently drop all writes and unmount to simulate a crash/power # failure. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again, mount to trigger journal/log replay. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After the journal/log is replayed we expect to not see the "foo3" # link anymore and we should be able to remove all names in the # directory "testdir" and then remove it (no stale directory entries # left after the journal/log replay). echo "Entries in testdir:" ls -1 $SCRATCH_MNT/testdir rm -f $SCRATCH_MNT/testdir/* rmdir $SCRATCH_MNT/testdir _unmount_flakey status=0 exit The test fails with: $ ./check generic/107 FSTYP -- btrfs PLATFORM -- Linux/x86_64 debian3 4.1.0-rc6-btrfs-next-11+ MKFS_OPTIONS -- /dev/sdc MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 generic/107 3s ... - output mismatch (see .../results/generic/107.out.bad) --- tests/generic/107.out 2015-08-01 01:39:45.807462161 +0100 +++ /home/fdmanana/git/hub/xfstests/results//generic/107.out.bad @@ -1,3 +1,5 @@ QA output created by 107 Entries in testdir: foo2 +foo3 +rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty ... _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent \ (see /home/fdmanana/git/hub/xfstests/results//generic/107.full) _check_dmesg: something found in dmesg (see .../results/generic/107.dmesg) Ran: generic/107 Failures: generic/107 Failed 1 of 1 tests $ cat /home/fdmanana/git/hub/xfstests/results//generic/107.full (...) checking fs roots root 5 inode 257 errors 200, dir isize wrong unresolved ref dir 257 index 3 namelen 4 name foo3 filetype 1 errors 5, no dir item, no inode ref (...) And produces the following warning in dmesg: [127298.759064] BTRFS info (device dm-0): failed to delete reference to foo3, inode 258 parent 257 [127298.762081] ------------[ cut here ]------------ [127298.763311] WARNING: CPU: 10 PID: 7891 at fs/btrfs/inode.c:3956 __btrfs_unlink_inode+0x182/0x35a [btrfs]() [127298.767327] BTRFS: Transaction aborted (error -2) (...) [127298.788611] Call Trace: [127298.789137] [<ffffffff8145f077>] dump_stack+0x4f/0x7b [127298.790090] [<ffffffff81095de5>] ? console_unlock+0x356/0x3a2 [127298.791157] [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb [127298.792323] [<ffffffffa065ad09>] ? __btrfs_unlink_inode+0x182/0x35a [btrfs] [127298.793633] [<ffffffff8104b410>] warn_slowpath_fmt+0x46/0x48 [127298.794699] [<ffffffffa065ad09>] __btrfs_unlink_inode+0x182/0x35a [btrfs] [127298.797640] [<ffffffffa065be8f>] btrfs_unlink_inode+0x1e/0x40 [btrfs] [127298.798876] [<ffffffffa065bf11>] btrfs_unlink+0x60/0x9b [btrfs] [127298.800154] [<ffffffff8116fb48>] vfs_unlink+0x9c/0xed [127298.801303] [<ffffffff81173481>] do_unlinkat+0x12b/0x1fb [127298.802450] [<ffffffff81253855>] ? lockdep_sys_exit_thunk+0x12/0x14 [127298.803797] [<ffffffff81174056>] SyS_unlinkat+0x29/0x2b [127298.805017] [<ffffffff81465197>] system_call_fastpath+0x12/0x6f [127298.806310] ---[ end trace bbfddacb7aaada7b ]--- [127298.807325] BTRFS warning (device dm-0): __btrfs_unlink_inode:3956: Aborting unused transaction(No such entry). So fix this by logging all parent inodes, current and old ones, to make sure we do not get stale entries after log replay. This is not a simple solution such as triggering a full transaction commit because it would imply full transaction commit when an inode is fsynced in the same transaction that modified it and reloaded it after eviction (because its last_unlink_trans is set to the same value as its last_trans as of the commit with the title "Btrfs: fix stale dir entries after unlink, inode eviction and fsync"), and it would also make fstest generic/066 fail since one of the fsyncs triggers a full commit and the next fsync will not find the inode in the log anymore (therefore not removing the xattr). Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c158
1 files changed, 138 insertions, 20 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cb5666e7c3f9..9314adeba946 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4960,6 +4960,94 @@ next_dir_inode:
4960 return ret; 4960 return ret;
4961} 4961}
4962 4962
4963static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
4964 struct inode *inode,
4965 struct btrfs_log_ctx *ctx)
4966{
4967 int ret;
4968 struct btrfs_path *path;
4969 struct btrfs_key key;
4970 struct btrfs_root *root = BTRFS_I(inode)->root;
4971 const u64 ino = btrfs_ino(inode);
4972
4973 path = btrfs_alloc_path();
4974 if (!path)
4975 return -ENOMEM;
4976 path->skip_locking = 1;
4977 path->search_commit_root = 1;
4978
4979 key.objectid = ino;
4980 key.type = BTRFS_INODE_REF_KEY;
4981 key.offset = 0;
4982 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4983 if (ret < 0)
4984 goto out;
4985
4986 while (true) {
4987 struct extent_buffer *leaf = path->nodes[0];
4988 int slot = path->slots[0];
4989 u32 cur_offset = 0;
4990 u32 item_size;
4991 unsigned long ptr;
4992
4993 if (slot >= btrfs_header_nritems(leaf)) {
4994 ret = btrfs_next_leaf(root, path);
4995 if (ret < 0)
4996 goto out;
4997 else if (ret > 0)
4998 break;
4999 continue;
5000 }
5001
5002 btrfs_item_key_to_cpu(leaf, &key, slot);
5003 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5004 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5005 break;
5006
5007 item_size = btrfs_item_size_nr(leaf, slot);
5008 ptr = btrfs_item_ptr_offset(leaf, slot);
5009 while (cur_offset < item_size) {
5010 struct btrfs_key inode_key;
5011 struct inode *dir_inode;
5012
5013 inode_key.type = BTRFS_INODE_ITEM_KEY;
5014 inode_key.offset = 0;
5015
5016 if (key.type == BTRFS_INODE_EXTREF_KEY) {
5017 struct btrfs_inode_extref *extref;
5018
5019 extref = (struct btrfs_inode_extref *)
5020 (ptr + cur_offset);
5021 inode_key.objectid = btrfs_inode_extref_parent(
5022 leaf, extref);
5023 cur_offset += sizeof(*extref);
5024 cur_offset += btrfs_inode_extref_name_len(leaf,
5025 extref);
5026 } else {
5027 inode_key.objectid = key.offset;
5028 cur_offset = item_size;
5029 }
5030
5031 dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
5032 root, NULL);
5033 /* If parent inode was deleted, skip it. */
5034 if (IS_ERR(dir_inode))
5035 continue;
5036
5037 ret = btrfs_log_inode(trans, root, dir_inode,
5038 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5039 iput(dir_inode);
5040 if (ret)
5041 goto out;
5042 }
5043 path->slots[0]++;
5044 }
5045 ret = 0;
5046out:
5047 btrfs_free_path(path);
5048 return ret;
5049}
5050
4963/* 5051/*
4964 * helper function around btrfs_log_inode to make sure newly created 5052 * helper function around btrfs_log_inode to make sure newly created
4965 * parent directories also end up in the log. A minimal inode and backref 5053 * parent directories also end up in the log. A minimal inode and backref
@@ -4979,9 +5067,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4979 struct dentry *old_parent = NULL; 5067 struct dentry *old_parent = NULL;
4980 int ret = 0; 5068 int ret = 0;
4981 u64 last_committed = root->fs_info->last_trans_committed; 5069 u64 last_committed = root->fs_info->last_trans_committed;
4982 const struct dentry * const first_parent = parent;
4983 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
4984 last_committed);
4985 bool log_dentries = false; 5070 bool log_dentries = false;
4986 struct inode *orig_inode = inode; 5071 struct inode *orig_inode = inode;
4987 5072
@@ -5042,6 +5127,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5042 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) 5127 if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
5043 log_dentries = true; 5128 log_dentries = true;
5044 5129
5130 /*
5131 * On unlink we must make sure all our current and old parent directores
5132 * inodes are fully logged. This is to prevent leaving dangling
5133 * directory index entries in directories that were our parents but are
5134 * not anymore. Not doing this results in old parent directory being
5135 * impossible to delete after log replay (rmdir will always fail with
5136 * error -ENOTEMPTY).
5137 *
5138 * Example 1:
5139 *
5140 * mkdir testdir
5141 * touch testdir/foo
5142 * ln testdir/foo testdir/bar
5143 * sync
5144 * unlink testdir/bar
5145 * xfs_io -c fsync testdir/foo
5146 * <power failure>
5147 * mount fs, triggers log replay
5148 *
5149 * If we don't log the parent directory (testdir), after log replay the
5150 * directory still has an entry pointing to the file inode using the bar
5151 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5152 * the file inode has a link count of 1.
5153 *
5154 * Example 2:
5155 *
5156 * mkdir testdir
5157 * touch foo
5158 * ln foo testdir/foo2
5159 * ln foo testdir/foo3
5160 * sync
5161 * unlink testdir/foo3
5162 * xfs_io -c fsync foo
5163 * <power failure>
5164 * mount fs, triggers log replay
5165 *
5166 * Similar as the first example, after log replay the parent directory
5167 * testdir still has an entry pointing to the inode file with name foo3
5168 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5169 * and has a link count of 2.
5170 */
5171 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
5172 ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5173 if (ret)
5174 goto end_trans;
5175 }
5176
5045 while (1) { 5177 while (1) {
5046 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) 5178 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
5047 break; 5179 break;
@@ -5050,23 +5182,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5050 if (root != BTRFS_I(inode)->root) 5182 if (root != BTRFS_I(inode)->root)
5051 break; 5183 break;
5052 5184
5053 /* 5185 if (BTRFS_I(inode)->generation > last_committed) {
5054 * On unlink we must make sure our immediate parent directory 5186 ret = btrfs_log_inode(trans, root, inode,
5055 * inode is fully logged. This is to prevent leaving dangling 5187 LOG_INODE_EXISTS,
5056 * directory index entries and a wrong directory inode's i_size.
5057 * Not doing so can result in a directory being impossible to
5058 * delete after log replay (rmdir will always fail with error
5059 * -ENOTEMPTY).
5060 */
5061 if (did_unlink && parent == first_parent)
5062 inode_only = LOG_INODE_ALL;
5063 else
5064 inode_only = LOG_INODE_EXISTS;
5065
5066 if (BTRFS_I(inode)->generation >
5067 root->fs_info->last_trans_committed ||
5068 inode_only == LOG_INODE_ALL) {
5069 ret = btrfs_log_inode(trans, root, inode, inode_only,
5070 0, LLONG_MAX, ctx); 5188 0, LLONG_MAX, ctx);
5071 if (ret) 5189 if (ret)
5072 goto end_trans; 5190 goto end_trans;