summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-02-13 07:14:03 -0500
committerDavid Sterba <dsterba@suse.com>2019-02-25 08:13:40 -0500
commit6b5fc433a7ad6711052d1aa4be0debc6316b219f (patch)
tree25f35fdab4d3a15c353fe1ccb8bc0e34d5f21560 /fs/btrfs/tree-log.c
parent38e3eebff643db725633657d1d87a3be019d1018 (diff)
Btrfs: fix fsync after succession of renames of different files
After a succession of rename operations of different files and fsyncing one of them, such that each file gets a new name that corresponds to an old name of another file, we can end up with a log that will cause a failure when we attempt to replay it at mount time (an EEXIST error). We currently have correct behaviour when such a succession of renames involves only two files, but if there are more files involved, we end up not logging all the inodes that are needed, therefore resulting in a failure when attempting to replay the log.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkdir /mnt/testdir
  $ touch /mnt/testdir/fname1
  $ touch /mnt/testdir/fname2
  $ sync
  $ mv /mnt/testdir/fname1 /mnt/testdir/fname3
  $ mv /mnt/testdir/fname2 /mnt/testdir/fname4
  $ ln /mnt/testdir/fname3 /mnt/testdir/fname2
  $ touch /mnt/testdir/fname1
  $ xfs_io -c "fsync" /mnt/testdir/fname1
  <power failure>
  $ mount /dev/sdb /mnt
  mount: mount /dev/sdb on /mnt failed: File exists

So fix this by checking all inode dependencies when logging an inode. That is, if one logged inode A has a new name that matches the old name of some other inode B, check if inode B has a new name that matches the old name of some other inode C, and so on. This fix is implemented not by doing any recursive function calls but by using an iterative method with a linked list that is used in a first-in-first-out fashion.

A test case for fstests follows soon.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c241
1 files changed, 197 insertions, 44 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1a69a45ae926..70d41f669025 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1330,6 +1330,67 @@ out:
1330 return ret; 1330 return ret;
1331} 1331}
1332 1332
1333static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1334 struct inode *dir, struct inode *inode, const char *name,
1335 int namelen, u64 ref_index)
1336{
1337 struct btrfs_dir_item *dir_item;
1338 struct btrfs_key key;
1339 struct btrfs_path *path;
1340 struct inode *other_inode = NULL;
1341 int ret;
1342
1343 path = btrfs_alloc_path();
1344 if (!path)
1345 return -ENOMEM;
1346
1347 dir_item = btrfs_lookup_dir_item(NULL, root, path,
1348 btrfs_ino(BTRFS_I(dir)),
1349 name, namelen, 0);
1350 if (!dir_item) {
1351 btrfs_release_path(path);
1352 goto add_link;
1353 } else if (IS_ERR(dir_item)) {
1354 ret = PTR_ERR(dir_item);
1355 goto out;
1356 }
1357
1358 /*
1359 * Our inode's dentry collides with the dentry of another inode which is
1360 * in the log but not yet processed since it has a higher inode number.
1361 * So delete that other dentry.
1362 */
1363 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1364 btrfs_release_path(path);
1365 other_inode = read_one_inode(root, key.objectid);
1366 if (!other_inode) {
1367 ret = -ENOENT;
1368 goto out;
1369 }
1370 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1371 name, namelen);
1372 if (ret)
1373 goto out;
1374 /*
1375 * If we dropped the link count to 0, bump it so that later the iput()
1376 * on the inode will not free it. We will fixup the link count later.
1377 */
1378 if (other_inode->i_nlink == 0)
1379 inc_nlink(other_inode);
1380
1381 ret = btrfs_run_delayed_items(trans);
1382 if (ret)
1383 goto out;
1384add_link:
1385 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1386 name, namelen, 0, ref_index);
1387out:
1388 iput(other_inode);
1389 btrfs_free_path(path);
1390
1391 return ret;
1392}
1393
1333/* 1394/*
1334 * replay one inode back reference item found in the log tree. 1395 * replay one inode back reference item found in the log tree.
1335 * eb, slot and key refer to the buffer and key found in the log tree. 1396 * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1527,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1466 goto out; 1527 goto out;
1467 1528
1468 /* insert our name */ 1529 /* insert our name */
1469 ret = btrfs_add_link(trans, BTRFS_I(dir), 1530 ret = add_link(trans, root, dir, inode, name, namelen,
1470 BTRFS_I(inode), 1531 ref_index);
1471 name, namelen, 0, ref_index);
1472 if (ret) 1532 if (ret)
1473 goto out; 1533 goto out;
1474 1534
@@ -4780,8 +4840,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4780 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4840 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4781 di, &di_key); 4841 di, &di_key);
4782 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4842 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4783 ret = 1; 4843 if (di_key.objectid != key->objectid) {
4784 *other_ino = di_key.objectid; 4844 ret = 1;
4845 *other_ino = di_key.objectid;
4846 } else {
4847 ret = 0;
4848 }
4785 } else { 4849 } else {
4786 ret = -EAGAIN; 4850 ret = -EAGAIN;
4787 } 4851 }
@@ -4801,6 +4865,126 @@ out:
4801 return ret; 4865 return ret;
4802} 4866}
4803 4867
4868struct btrfs_ino_list {
4869 u64 ino;
4870 struct list_head list;
4871};
4872
4873static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4874 struct btrfs_root *root,
4875 struct btrfs_path *path,
4876 struct btrfs_log_ctx *ctx,
4877 u64 ino)
4878{
4879 struct btrfs_ino_list *ino_elem;
4880 LIST_HEAD(inode_list);
4881 int ret = 0;
4882
4883 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4884 if (!ino_elem)
4885 return -ENOMEM;
4886 ino_elem->ino = ino;
4887 list_add_tail(&ino_elem->list, &inode_list);
4888
4889 while (!list_empty(&inode_list)) {
4890 struct btrfs_fs_info *fs_info = root->fs_info;
4891 struct btrfs_key key;
4892 struct inode *inode;
4893
4894 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4895 list);
4896 ino = ino_elem->ino;
4897 list_del(&ino_elem->list);
4898 kfree(ino_elem);
4899 if (ret)
4900 continue;
4901
4902 btrfs_release_path(path);
4903
4904 key.objectid = ino;
4905 key.type = BTRFS_INODE_ITEM_KEY;
4906 key.offset = 0;
4907 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4908 /*
4909 * If the other inode that had a conflicting dir entry was
4910 * deleted in the current transaction, we don't need to do more
4911 * work nor fallback to a transaction commit.
4912 */
4913 if (IS_ERR(inode)) {
4914 ret = PTR_ERR(inode);
4915 if (ret == -ENOENT)
4916 ret = 0;
4917 continue;
4918 }
4919 /*
4920 * We are safe logging the other inode without acquiring its
4921 * lock as long as we log with the LOG_INODE_EXISTS mode. We
4922 * are safe against concurrent renames of the other inode as
4923 * well because during a rename we pin the log and update the
4924 * log with the new name before we unpin it.
4925 */
4926 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
4927 LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
4928 if (ret) {
4929 iput(inode);
4930 continue;
4931 }
4932
4933 key.objectid = ino;
4934 key.type = BTRFS_INODE_REF_KEY;
4935 key.offset = 0;
4936 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4937 if (ret < 0) {
4938 iput(inode);
4939 continue;
4940 }
4941
4942 while (true) {
4943 struct extent_buffer *leaf = path->nodes[0];
4944 int slot = path->slots[0];
4945 u64 other_ino = 0;
4946
4947 if (slot >= btrfs_header_nritems(leaf)) {
4948 ret = btrfs_next_leaf(root, path);
4949 if (ret < 0) {
4950 break;
4951 } else if (ret > 0) {
4952 ret = 0;
4953 break;
4954 }
4955 continue;
4956 }
4957
4958 btrfs_item_key_to_cpu(leaf, &key, slot);
4959 if (key.objectid != ino ||
4960 (key.type != BTRFS_INODE_REF_KEY &&
4961 key.type != BTRFS_INODE_EXTREF_KEY)) {
4962 ret = 0;
4963 break;
4964 }
4965
4966 ret = btrfs_check_ref_name_override(leaf, slot, &key,
4967 BTRFS_I(inode), &other_ino);
4968 if (ret < 0)
4969 break;
4970 if (ret > 0) {
4971 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4972 if (!ino_elem) {
4973 ret = -ENOMEM;
4974 break;
4975 }
4976 ino_elem->ino = other_ino;
4977 list_add_tail(&ino_elem->list, &inode_list);
4978 ret = 0;
4979 }
4980 path->slots[0]++;
4981 }
4982 iput(inode);
4983 }
4984
4985 return ret;
4986}
4987
4804/* log a single inode in the tree log. 4988/* log a single inode in the tree log.
4805 * At least one parent directory for this inode must exist in the tree 4989 * At least one parent directory for this inode must exist in the tree
4806 * or be logged already. 4990 * or be logged already.
@@ -4840,6 +5024,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4840 u64 logged_isize = 0; 5024 u64 logged_isize = 0;
4841 bool need_log_inode_item = true; 5025 bool need_log_inode_item = true;
4842 bool xattrs_logged = false; 5026 bool xattrs_logged = false;
5027 bool recursive_logging = (inode_only == LOG_OTHER_INODE);
4843 5028
4844 path = btrfs_alloc_path(); 5029 path = btrfs_alloc_path();
4845 if (!path) 5030 if (!path)
@@ -4981,7 +5166,8 @@ again:
4981 5166
4982 if ((min_key.type == BTRFS_INODE_REF_KEY || 5167 if ((min_key.type == BTRFS_INODE_REF_KEY ||
4983 min_key.type == BTRFS_INODE_EXTREF_KEY) && 5168 min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4984 inode->generation == trans->transid) { 5169 inode->generation == trans->transid &&
5170 !recursive_logging) {
4985 u64 other_ino = 0; 5171 u64 other_ino = 0;
4986 5172
4987 ret = btrfs_check_ref_name_override(path->nodes[0], 5173 ret = btrfs_check_ref_name_override(path->nodes[0],
@@ -4992,9 +5178,6 @@ again:
4992 goto out_unlock; 5178 goto out_unlock;
4993 } else if (ret > 0 && ctx && 5179 } else if (ret > 0 && ctx &&
4994 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5180 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4995 struct btrfs_key inode_key;
4996 struct inode *other_inode;
4997
4998 if (ins_nr > 0) { 5181 if (ins_nr > 0) {
4999 ins_nr++; 5182 ins_nr++;
5000 } else { 5183 } else {
@@ -5010,43 +5193,13 @@ again:
5010 goto out_unlock; 5193 goto out_unlock;
5011 } 5194 }
5012 ins_nr = 0; 5195 ins_nr = 0;
5013 btrfs_release_path(path); 5196
5014 inode_key.objectid = other_ino; 5197 err = log_conflicting_inodes(trans, root, path,
5015 inode_key.type = BTRFS_INODE_ITEM_KEY; 5198 ctx, other_ino);
5016 inode_key.offset = 0;
5017 other_inode = btrfs_iget(fs_info->sb,
5018 &inode_key, root,
5019 NULL);
5020 /*
5021 * If the other inode that had a conflicting dir
5022 * entry was deleted in the current transaction,
5023 * we don't need to do more work nor fallback to
5024 * a transaction commit.
5025 */
5026 if (other_inode == ERR_PTR(-ENOENT)) {
5027 goto next_key;
5028 } else if (IS_ERR(other_inode)) {
5029 err = PTR_ERR(other_inode);
5030 goto out_unlock;
5031 }
5032 /*
5033 * We are safe logging the other inode without
5034 * acquiring its i_mutex as long as we log with
5035 * the LOG_INODE_EXISTS mode. We're safe against
5036 * concurrent renames of the other inode as well
5037 * because during a rename we pin the log and
5038 * update the log with the new name before we
5039 * unpin it.
5040 */
5041 err = btrfs_log_inode(trans, root,
5042 BTRFS_I(other_inode),
5043 LOG_OTHER_INODE, 0, LLONG_MAX,
5044 ctx);
5045 iput(other_inode);
5046 if (err) 5199 if (err)
5047 goto out_unlock; 5200 goto out_unlock;
5048 else 5201 btrfs_release_path(path);
5049 goto next_key; 5202 goto next_key;
5050 } 5203 }
5051 } 5204 }
5052 5205