summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-04-17 06:31:06 -0400
committerDavid Sterba <dsterba@suse.com>2019-04-29 13:02:52 -0400
commitb8aa330d2acb122563be87c42d82c5c8649cf658 (patch)
treea5a916bcec3216bbaacda81043dd0495472b9f07 /fs/btrfs/tree-log.c
parent62d54f3a7fa27ef6a74d6cdf643ce04beba3afa7 (diff)
Btrfs: improve performance on fsync of files with multiple hardlinks
Commit 41bd6067692382 ("Btrfs: fix fsync of files with multiple hard links in new directories") introduced a path that makes fsync fallback to a full transaction commit in order to avoid losing hard links and new ancestors of the fsynced inode. That path is triggered only when the inode has more than one hard link and either has a new hard link created in the current transaction or the inode was evicted and reloaded in the current transaction. That path ends up getting triggered very often (hundreds of times) during the course of pgbench benchmarks, resulting in performance drops of about 20%. This change restores the performance by not triggering the full transaction commit in those cases, and instead iterate the fs/subvolume tree in search of all possible new ancestors, for all hard links, to log them. Reported-by: Zhao Yuhu <zyuhu@suse.com> Tested-by: James Wang <jnwang@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c228
1 files changed, 188 insertions, 40 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 67cd144e6be1..6adcd8a2c5c7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5832,6 +5832,190 @@ out:
5832 return ret; 5832 return ret;
5833} 5833}
5834 5834
5835static int log_new_ancestors(struct btrfs_trans_handle *trans,
5836 struct btrfs_root *root,
5837 struct btrfs_path *path,
5838 struct btrfs_log_ctx *ctx)
5839{
5840 struct btrfs_key found_key;
5841
5842 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5843
5844 while (true) {
5845 struct btrfs_fs_info *fs_info = root->fs_info;
5846 const u64 last_committed = fs_info->last_trans_committed;
5847 struct extent_buffer *leaf = path->nodes[0];
5848 int slot = path->slots[0];
5849 struct btrfs_key search_key;
5850 struct inode *inode;
5851 int ret = 0;
5852
5853 btrfs_release_path(path);
5854
5855 search_key.objectid = found_key.offset;
5856 search_key.type = BTRFS_INODE_ITEM_KEY;
5857 search_key.offset = 0;
5858 inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
5859 if (IS_ERR(inode))
5860 return PTR_ERR(inode);
5861
5862 if (BTRFS_I(inode)->generation > last_committed)
5863 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5864 LOG_INODE_EXISTS,
5865 0, LLONG_MAX, ctx);
5866 iput(inode);
5867 if (ret)
5868 return ret;
5869
5870 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5871 break;
5872
5873 search_key.type = BTRFS_INODE_REF_KEY;
5874 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5875 if (ret < 0)
5876 return ret;
5877
5878 leaf = path->nodes[0];
5879 slot = path->slots[0];
5880 if (slot >= btrfs_header_nritems(leaf)) {
5881 ret = btrfs_next_leaf(root, path);
5882 if (ret < 0)
5883 return ret;
5884 else if (ret > 0)
5885 return -ENOENT;
5886 leaf = path->nodes[0];
5887 slot = path->slots[0];
5888 }
5889
5890 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5891 if (found_key.objectid != search_key.objectid ||
5892 found_key.type != BTRFS_INODE_REF_KEY)
5893 return -ENOENT;
5894 }
5895 return 0;
5896}
5897
5898static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5899 struct btrfs_inode *inode,
5900 struct dentry *parent,
5901 struct btrfs_log_ctx *ctx)
5902{
5903 struct btrfs_root *root = inode->root;
5904 struct btrfs_fs_info *fs_info = root->fs_info;
5905 struct dentry *old_parent = NULL;
5906 struct super_block *sb = inode->vfs_inode.i_sb;
5907 int ret = 0;
5908
5909 while (true) {
5910 if (!parent || d_really_is_negative(parent) ||
5911 sb != parent->d_sb)
5912 break;
5913
5914 inode = BTRFS_I(d_inode(parent));
5915 if (root != inode->root)
5916 break;
5917
5918 if (inode->generation > fs_info->last_trans_committed) {
5919 ret = btrfs_log_inode(trans, root, inode,
5920 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5921 if (ret)
5922 break;
5923 }
5924 if (IS_ROOT(parent))
5925 break;
5926
5927 parent = dget_parent(parent);
5928 dput(old_parent);
5929 old_parent = parent;
5930 }
5931 dput(old_parent);
5932
5933 return ret;
5934}
5935
5936static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
5937 struct btrfs_inode *inode,
5938 struct dentry *parent,
5939 struct btrfs_log_ctx *ctx)
5940{
5941 struct btrfs_root *root = inode->root;
5942 const u64 ino = btrfs_ino(inode);
5943 struct btrfs_path *path;
5944 struct btrfs_key search_key;
5945 int ret;
5946
5947 /*
5948 * For a single hard link case, go through a fast path that does not
5949 * need to iterate the fs/subvolume tree.
5950 */
5951 if (inode->vfs_inode.i_nlink < 2)
5952 return log_new_ancestors_fast(trans, inode, parent, ctx);
5953
5954 path = btrfs_alloc_path();
5955 if (!path)
5956 return -ENOMEM;
5957
5958 search_key.objectid = ino;
5959 search_key.type = BTRFS_INODE_REF_KEY;
5960 search_key.offset = 0;
5961again:
5962 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5963 if (ret < 0)
5964 goto out;
5965 if (ret == 0)
5966 path->slots[0]++;
5967
5968 while (true) {
5969 struct extent_buffer *leaf = path->nodes[0];
5970 int slot = path->slots[0];
5971 struct btrfs_key found_key;
5972
5973 if (slot >= btrfs_header_nritems(leaf)) {
5974 ret = btrfs_next_leaf(root, path);
5975 if (ret < 0)
5976 goto out;
5977 else if (ret > 0)
5978 break;
5979 continue;
5980 }
5981
5982 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5983 if (found_key.objectid != ino ||
5984 found_key.type > BTRFS_INODE_EXTREF_KEY)
5985 break;
5986
5987 /*
5988 * Don't deal with extended references because they are rare
5989 * cases and too complex to deal with (we would need to keep
5990 * track of which subitem we are processing for each item in
5991 * this loop, etc). So just return some error to fallback to
5992 * a transaction commit.
5993 */
5994 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
5995 ret = -EMLINK;
5996 goto out;
5997 }
5998
5999 /*
6000 * Logging ancestors needs to do more searches on the fs/subvol
6001 * tree, so it releases the path as needed to avoid deadlocks.
6002 * Keep track of the last inode ref key and resume from that key
6003 * after logging all new ancestors for the current hard link.
6004 */
6005 memcpy(&search_key, &found_key, sizeof(search_key));
6006
6007 ret = log_new_ancestors(trans, root, path, ctx);
6008 if (ret)
6009 goto out;
6010 btrfs_release_path(path);
6011 goto again;
6012 }
6013 ret = 0;
6014out:
6015 btrfs_free_path(path);
6016 return ret;
6017}
6018
5835/* 6019/*
5836 * helper function around btrfs_log_inode to make sure newly created 6020 * helper function around btrfs_log_inode to make sure newly created
5837 * parent directories also end up in the log. A minimal inode and backref 6021 * parent directories also end up in the log. A minimal inode and backref
@@ -5849,11 +6033,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5849 struct btrfs_root *root = inode->root; 6033 struct btrfs_root *root = inode->root;
5850 struct btrfs_fs_info *fs_info = root->fs_info; 6034 struct btrfs_fs_info *fs_info = root->fs_info;
5851 struct super_block *sb; 6035 struct super_block *sb;
5852 struct dentry *old_parent = NULL;
5853 int ret = 0; 6036 int ret = 0;
5854 u64 last_committed = fs_info->last_trans_committed; 6037 u64 last_committed = fs_info->last_trans_committed;
5855 bool log_dentries = false; 6038 bool log_dentries = false;
5856 struct btrfs_inode *orig_inode = inode;
5857 6039
5858 sb = inode->vfs_inode.i_sb; 6040 sb = inode->vfs_inode.i_sb;
5859 6041
@@ -5959,54 +6141,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5959 * and has a link count of 2. 6141 * and has a link count of 2.
5960 */ 6142 */
5961 if (inode->last_unlink_trans > last_committed) { 6143 if (inode->last_unlink_trans > last_committed) {
5962 ret = btrfs_log_all_parents(trans, orig_inode, ctx); 6144 ret = btrfs_log_all_parents(trans, inode, ctx);
5963 if (ret) 6145 if (ret)
5964 goto end_trans; 6146 goto end_trans;
5965 } 6147 }
5966 6148
5967 /* 6149 ret = log_all_new_ancestors(trans, inode, parent, ctx);
5968 * If a new hard link was added to the inode in the current transaction 6150 if (ret)
5969 * and its link count is now greater than 1, we need to fallback to a
5970 * transaction commit, otherwise we can end up not logging all its new
5971 * parents for all the hard links. Here just from the dentry used to
5972 * fsync, we can not visit the ancestor inodes for all the other hard
5973 * links to figure out if any is new, so we fallback to a transaction
5974 * commit (instead of adding a lot of complexity of scanning a btree,
5975 * since this scenario is not a common use case).
5976 */
5977 if (inode->vfs_inode.i_nlink > 1 &&
5978 inode->last_link_trans > last_committed) {
5979 ret = -EMLINK;
5980 goto end_trans; 6151 goto end_trans;
5981 }
5982
5983 while (1) {
5984 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5985 break;
5986
5987 inode = BTRFS_I(d_inode(parent));
5988 if (root != inode->root)
5989 break;
5990 6152
5991 if (inode->generation > last_committed) {
5992 ret = btrfs_log_inode(trans, root, inode,
5993 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5994 if (ret)
5995 goto end_trans;
5996 }
5997 if (IS_ROOT(parent))
5998 break;
5999
6000 parent = dget_parent(parent);
6001 dput(old_parent);
6002 old_parent = parent;
6003 }
6004 if (log_dentries) 6153 if (log_dentries)
6005 ret = log_new_dir_dentries(trans, root, orig_inode, ctx); 6154 ret = log_new_dir_dentries(trans, root, inode, ctx);
6006 else 6155 else
6007 ret = 0; 6156 ret = 0;
6008end_trans: 6157end_trans:
6009 dput(old_parent);
6010 if (ret < 0) { 6158 if (ret < 0) {
6011 btrfs_set_log_full_commit(trans); 6159 btrfs_set_log_full_commit(trans);
6012 ret = 1; 6160 ret = 1;