aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorQu Wenruo <wqu@suse.com>2019-01-23 02:15:16 -0500
committerDavid Sterba <dsterba@suse.com>2019-02-25 08:13:26 -0500
commit370a11b8114bcca3738fe6a5d7ed8babcc212f39 (patch)
tree8b20eaa243737a85b16bff94284ac11673cdc80d
parent5aea1a4fcf1e4fe3daea6f18fb66cbe49439bd8e (diff)
btrfs: qgroup: Introduce per-root swapped blocks infrastructure
To allow delayed subtree swap rescan, btrfs needs to record per-root information about which tree blocks get swapped. This patch introduces the required infrastructure. The designed workflow will be: 1) Record the subtree root block that gets swapped. During subtree swap: O = Old tree blocks N = New tree blocks reloc tree subvolume tree X Root Root / \ / \ NA OB OA OB / | | \ / | | \ NC ND OE OF OC OD OE OF In this case, NA and OA are going to be swapped, record (NA, OA) into subvolume tree X. 2) After subtree swap. reloc tree subvolume tree X Root Root / \ / \ OA OB NA OB / | | \ / | | \ OC OD OE OF NC ND OE OF 3a) COW happens for OB If we are going to COW tree block OB, we check OB's bytenr against tree X's swapped_blocks structure. If it doesn't fit any, nothing will happen. 3b) COW happens for NA Check NA's bytenr against tree X's swapped_blocks, and get a hit. Then we do subtree scan on both subtrees OA and NA. This results in 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND). Then no matter what we do to subvolume tree X, qgroup numbers will still be correct. Then NA's record gets removed from X's swapped_blocks. 4) Transaction commit Any record in X's swapped_blocks gets removed, since there is no modification to the swapped subtrees, so there is no need to trigger heavy qgroup subtree rescan for them. This will introduce 128 bytes of overhead for each btrfs_root even if qgroup is not enabled. This is to reduce memory allocations and potential failures. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/ctree.h14
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/qgroup.c150
-rw-r--r--fs/btrfs/qgroup.h92
-rw-r--r--fs/btrfs/relocation.c7
-rw-r--r--fs/btrfs/transaction.c1
6 files changed, 265 insertions, 0 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dd0ccc6403b0..007b0e81992a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1209,6 +1209,17 @@ enum {
1209}; 1209};
1210 1210
1211/* 1211/*
1212 * Record swapped tree blocks of a subvolume tree for delayed subtree trace
1213 * code. For detail check comment in fs/btrfs/qgroup.c.
1214 */
1215struct btrfs_qgroup_swapped_blocks {
1216 spinlock_t lock;
1217 /* RB_EMPTY_ROOT() of above blocks[] */
1218 bool swapped;
1219 struct rb_root blocks[BTRFS_MAX_LEVEL];
1220};
1221
1222/*
1212 * in ram representation of the tree. extent_root is used for all allocations 1223 * in ram representation of the tree. extent_root is used for all allocations
1213 * and for the extent tree extent_root root. 1224 * and for the extent tree extent_root root.
1214 */ 1225 */
@@ -1343,6 +1354,9 @@ struct btrfs_root {
1343 /* Number of active swapfiles */ 1354 /* Number of active swapfiles */
1344 atomic_t nr_swapfiles; 1355 atomic_t nr_swapfiles;
1345 1356
1357 /* Record pairs of swapped blocks for qgroup */
1358 struct btrfs_qgroup_swapped_blocks swapped_blocks;
1359
1346#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1360#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
1347 u64 alloc_bytenr; 1361 u64 alloc_bytenr;
1348#endif 1362#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9e24fdb57453..3d233608fa0f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1220,6 +1220,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1220 root->anon_dev = 0; 1220 root->anon_dev = 0;
1221 1221
1222 spin_lock_init(&root->root_item_lock); 1222 spin_lock_init(&root->root_item_lock);
1223 btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1223} 1224}
1224 1225
1225static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, 1226static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7a6948989655..7166d202b26a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3818,3 +3818,153 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
3818 } 3818 }
3819 extent_changeset_release(&changeset); 3819 extent_changeset_release(&changeset);
3820} 3820}
3821
3822void btrfs_qgroup_init_swapped_blocks(
3823 struct btrfs_qgroup_swapped_blocks *swapped_blocks)
3824{
3825 int i;
3826
3827 spin_lock_init(&swapped_blocks->lock);
3828 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
3829 swapped_blocks->blocks[i] = RB_ROOT;
3830 swapped_blocks->swapped = false;
3831}
3832
3833/*
3834 * Delete all swapped block records of @root.
3835 * Every record here means we skipped a full subtree scan for qgroup.
3836 *
3837 * Gets called when committing one transaction.
3838 */
3839void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
3840{
3841 struct btrfs_qgroup_swapped_blocks *swapped_blocks;
3842 int i;
3843
3844 swapped_blocks = &root->swapped_blocks;
3845
3846 spin_lock(&swapped_blocks->lock);
3847 if (!swapped_blocks->swapped)
3848 goto out;
3849 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3850 struct rb_root *cur_root = &swapped_blocks->blocks[i];
3851 struct btrfs_qgroup_swapped_block *entry;
3852 struct btrfs_qgroup_swapped_block *next;
3853
3854 rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
3855 node)
3856 kfree(entry);
3857 swapped_blocks->blocks[i] = RB_ROOT;
3858 }
3859 swapped_blocks->swapped = false;
3860out:
3861 spin_unlock(&swapped_blocks->lock);
3862}
3863
3864/*
3865 * Add subtree roots record into @subvol_root.
3866 *
3867 * @subvol_root: tree root of the subvolume tree that gets swapped
3868 * @bg: block group under balance
3869 * @subvol_parent/slot: pointer to the subtree root in subvolume tree
3870 * @reloc_parent/slot: pointer to the subtree root in reloc tree
3871 * BOTH POINTERS ARE BEFORE TREE SWAP
3872 * @last_snapshot: last snapshot generation of the subvolume tree
3873 */
3874int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
3875 struct btrfs_root *subvol_root,
3876 struct btrfs_block_group_cache *bg,
3877 struct extent_buffer *subvol_parent, int subvol_slot,
3878 struct extent_buffer *reloc_parent, int reloc_slot,
3879 u64 last_snapshot)
3880{
3881 struct btrfs_fs_info *fs_info = subvol_root->fs_info;
3882 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
3883 struct btrfs_qgroup_swapped_block *block;
3884 struct rb_node **cur;
3885 struct rb_node *parent = NULL;
3886 int level = btrfs_header_level(subvol_parent) - 1;
3887 int ret = 0;
3888
3889 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3890 return 0;
3891
3892 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
3893 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
3894 btrfs_err_rl(fs_info,
3895 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
3896 __func__,
3897 btrfs_node_ptr_generation(subvol_parent, subvol_slot),
3898 btrfs_node_ptr_generation(reloc_parent, reloc_slot));
3899 return -EUCLEAN;
3900 }
3901
3902 block = kmalloc(sizeof(*block), GFP_NOFS);
3903 if (!block) {
3904 ret = -ENOMEM;
3905 goto out;
3906 }
3907
3908 /*
3909 * @reloc_parent/slot is still before swap, while @block is going to
3910 * record the bytenr after swap, so we do the swap here.
3911 */
3912 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
3913 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
3914 reloc_slot);
3915 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
3916 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
3917 subvol_slot);
3918 block->last_snapshot = last_snapshot;
3919 block->level = level;
3920 if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
3921 block->trace_leaf = true;
3922 else
3923 block->trace_leaf = false;
3924 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
3925
3926 /* Insert @block into @blocks */
3927 spin_lock(&blocks->lock);
3928 cur = &blocks->blocks[level].rb_node;
3929 while (*cur) {
3930 struct btrfs_qgroup_swapped_block *entry;
3931
3932 parent = *cur;
3933 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
3934 node);
3935
3936 if (entry->subvol_bytenr < block->subvol_bytenr) {
3937 cur = &(*cur)->rb_left;
3938 } else if (entry->subvol_bytenr > block->subvol_bytenr) {
3939 cur = &(*cur)->rb_right;
3940 } else {
3941 if (entry->subvol_generation !=
3942 block->subvol_generation ||
3943 entry->reloc_bytenr != block->reloc_bytenr ||
3944 entry->reloc_generation !=
3945 block->reloc_generation) {
3946 /*
3947 * Duplicated but mismatch entry found.
3948 * Shouldn't happen.
3949 *
3950 * Marking qgroup inconsistent should be enough
3951 * for end users.
3952 */
3953 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
3954 ret = -EEXIST;
3955 }
3956 kfree(block);
3957 goto out_unlock;
3958 }
3959 }
3960 rb_link_node(&block->node, parent, cur);
3961 rb_insert_color(&block->node, &blocks->blocks[level]);
3962 blocks->swapped = true;
3963out_unlock:
3964 spin_unlock(&blocks->lock);
3965out:
3966 if (ret < 0)
3967 fs_info->qgroup_flags |=
3968 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3969 return ret;
3970}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 20c6bd5fa701..8dc17020e5be 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
6#ifndef BTRFS_QGROUP_H 6#ifndef BTRFS_QGROUP_H
7#define BTRFS_QGROUP_H 7#define BTRFS_QGROUP_H
8 8
9#include <linux/spinlock.h>
10#include <linux/rbtree.h>
9#include "ulist.h" 11#include "ulist.h"
10#include "delayed-ref.h" 12#include "delayed-ref.h"
11 13
@@ -38,6 +40,66 @@
38 */ 40 */
39 41
40/* 42/*
43 * Special performance optimization for balance.
44 *
45 * For balance, we need to swap subtree of subvolume and reloc trees.
46 * In theory, we need to trace all subtree blocks of both subvolume and reloc
47 * trees, since their owner has changed during such swap.
48 *
49 * However since balance has ensured that both subtrees are containing the
50 * same contents and have the same tree structures, such swap won't cause
51 * qgroup number change.
52 *
53 * But there is a race window between subtree swap and transaction commit,
54 * during that window, if we increase/decrease tree level or merge/split tree
55 * blocks, we still need to trace the original subtrees.
56 *
57 * So for balance, we use a delayed subtree tracing, whose workflow is:
58 *
59 * 1) Record the subtree root block that gets swapped.
60 *
61 * During subtree swap:
62 * O = Old tree blocks
63 * N = New tree blocks
64 * reloc tree subvolume tree X
65 * Root Root
66 * / \ / \
67 * NA OB OA OB
68 * / | | \ / | | \
69 * NC ND OE OF OC OD OE OF
70 *
71 * In this case, NA and OA are going to be swapped, record (NA, OA) into
72 * subvolume tree X.
73 *
74 * 2) After subtree swap.
75 * reloc tree subvolume tree X
76 * Root Root
77 * / \ / \
78 * OA OB NA OB
79 * / | | \ / | | \
80 * OC OD OE OF NC ND OE OF
81 *
82 * 3a) COW happens for OB
83 * If we are going to COW tree block OB, we check OB's bytenr against
84 * tree X's swapped_blocks structure.
85 * If it doesn't fit any, nothing will happen.
86 *
87 * 3b) COW happens for NA
88 * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
89 * Then we do subtree scan on both subtrees OA and NA.
90 * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
91 *
92 * Then no matter what we do to subvolume tree X, qgroup numbers will
93 * still be correct.
94 * Then NA's record gets removed from X's swapped_blocks.
95 *
96 * 4) Transaction commit
97 * Any record in X's swapped_blocks gets removed, since there is no
98 * modification to the swapped subtrees, no need to trigger heavy qgroup
99 * subtree rescan for them.
100 */
101
102/*
41 * Record a dirty extent, and info qgroup to update quota on it 103 * Record a dirty extent, and info qgroup to update quota on it
42 * TODO: Use kmem cache to alloc it. 104 * TODO: Use kmem cache to alloc it.
43 */ 105 */
@@ -48,6 +110,24 @@ struct btrfs_qgroup_extent_record {
48 struct ulist *old_roots; 110 struct ulist *old_roots;
49}; 111};
50 112
113struct btrfs_qgroup_swapped_block {
114 struct rb_node node;
115
116 int level;
117 bool trace_leaf;
118
119 /* bytenr/generation of the tree block in subvolume tree after swap */
120 u64 subvol_bytenr;
121 u64 subvol_generation;
122
123 /* bytenr/generation of the tree block in reloc tree after swap */
124 u64 reloc_bytenr;
125 u64 reloc_generation;
126
127 u64 last_snapshot;
128 struct btrfs_key first_key;
129};
130
51/* 131/*
52 * Qgroup reservation types: 132 * Qgroup reservation types:
53 * 133 *
@@ -325,4 +405,16 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
325 405
326void btrfs_qgroup_check_reserved_leak(struct inode *inode); 406void btrfs_qgroup_check_reserved_leak(struct inode *inode);
327 407
408/* btrfs_qgroup_swapped_blocks related functions */
409void btrfs_qgroup_init_swapped_blocks(
410 struct btrfs_qgroup_swapped_blocks *swapped_blocks);
411
412void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
413int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
414 struct btrfs_root *subvol_root,
415 struct btrfs_block_group_cache *bg,
416 struct extent_buffer *subvol_parent, int subvol_slot,
417 struct extent_buffer *reloc_parent, int reloc_slot,
418 u64 last_snapshot);
419
328#endif 420#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b915f3e157bd..0c528918c844 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1898,6 +1898,13 @@ again:
1898 if (ret < 0) 1898 if (ret < 0)
1899 break; 1899 break;
1900 1900
1901 btrfs_node_key_to_cpu(parent, &first_key, slot);
1902 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1903 rc->block_group, parent, slot,
1904 path->nodes[level], path->slots[level],
1905 last_snapshot);
1906 if (ret < 0)
1907 break;
1901 /* 1908 /*
1902 * swap blocks in fs tree and reloc tree. 1909 * swap blocks in fs tree and reloc tree.
1903 */ 1910 */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fdffe5d61739..0cc6d8b58191 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
122 if (is_fstree(root->root_key.objectid)) 122 if (is_fstree(root->root_key.objectid))
123 btrfs_unpin_free_ino(root); 123 btrfs_unpin_free_ino(root);
124 clear_btree_io_tree(&root->dirty_log_pages); 124 clear_btree_io_tree(&root->dirty_log_pages);
125 btrfs_qgroup_clean_swapped_blocks(root);
125 } 126 }
126 127
127 /* We can free old roots now. */ 128 /* We can free old roots now. */