author	Mark Fasheh <mfasheh@suse.de>	2014-07-17 15:39:01 -0400
committer	Chris Mason <clm@fb.com>	2014-08-15 10:43:14 -0400
commit	1152651a081720ef6a8c76bb7da676e8c900ac30 (patch)
tree	a7d5b02f161a7c4f9cdab947a68569475a93146c
parent	6f7ff6d7832c6be13e8c95598884dbc40ad69fb7 (diff)
btrfs: qgroup: account shared subtrees during snapshot delete
During its tree walk, btrfs_drop_snapshot() will skip any shared subtrees it encounters. This is incorrect when we have qgroups turned on, as those subtrees need to have their contents accounted. In particular, the case we're concerned with is when removing our snapshot root leaves the subtree with only one root reference.

In those cases we need to find the last remaining root and add each extent in the subtree to the corresponding qgroup exclusive counts.

This patch implements the shared subtree walk and a new qgroup operation, BTRFS_QGROUP_OPER_SUB_SUBTREE. When an operation of this type is encountered during qgroup accounting, we search for any root references to that extent. If we find only one reference left, we go ahead and do the math on its exclusive counts.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
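Before the diff, it may help to see the new accounting rule in isolation. The following standalone userspace sketch is not part of the patch (the sim_* names and sim_account_sub_subtree() are invented for illustration); it mimics what the BTRFS_QGROUP_OPER_SUB_SUBTREE handler does once btrfs_find_all_roots() reports the remaining references: if exactly one root is left and it is not the root being deleted, that root's qgroup absorbs the extent's bytes as exclusive.

#include <stdio.h>
#include <stdint.h>

struct sim_qgroup {
	uint64_t qgroupid;
	uint64_t excl;		/* bytes referenced by this root only */
};

/* One queued operation: extent + the root that is dropping its ref */
struct sim_oper {
	uint64_t bytenr;
	uint64_t num_bytes;
	uint64_t ref_root;
};

/*
 * roots[] holds the ids of roots still referencing oper->bytenr,
 * as btrfs_find_all_roots() would report them.
 */
static void sim_account_sub_subtree(struct sim_oper *oper,
				    const uint64_t *roots, int nr_roots,
				    struct sim_qgroup *qgroups, int nr_qgroups)
{
	int i;

	/* More than one root left: the extent is still shared, do nothing */
	if (nr_roots != 1)
		return;

	/* Our own ref is still visible: let the final ref drive the update */
	if (roots[0] == oper->ref_root)
		return;

	/* The last remaining root becomes the exclusive owner */
	for (i = 0; i < nr_qgroups; i++) {
		if (qgroups[i].qgroupid == roots[0]) {
			qgroups[i].excl += oper->num_bytes;
			break;
		}
	}
}

int main(void)
{
	struct sim_qgroup qgroups[] = { { 257, 0 }, { 258, 0 } };
	uint64_t remaining[] = { 258 };	/* only root 258 still refers to it */
	struct sim_oper oper = { 12582912, 16384, 257 }; /* 257 drops its ref */

	sim_account_sub_subtree(&oper, remaining, 1, qgroups, 2);
	printf("qgroup %llu excl = %llu bytes\n",
	       (unsigned long long)qgroups[1].qgroupid,
	       (unsigned long long)qgroups[1].excl);
	return 0;
}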
-rw-r--r--	fs/btrfs/extent-tree.c	261
-rw-r--r--	fs/btrfs/qgroup.c	164
-rw-r--r--	fs/btrfs/qgroup.h	1
3 files changed, 426 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 591893fe58cb..102ed3143976 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7478,6 +7478,220 @@ reada:
 	wc->reada_slot = slot;
 }
 
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *eb)
+{
+	int nr = btrfs_header_nritems(eb);
+	int i, extent_type, ret;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	u64 bytenr, num_bytes;
+
+	for (i = 0; i < nr; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		/* filter out non qgroup-accountable extents */
+		extent_type = btrfs_file_extent_type(eb, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+
+		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (!bytenr)
+			continue;
+
+		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+		ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+					      root->objectid,
+					      bytenr, num_bytes,
+					      BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have its slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root node's slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+				struct btrfs_path *path, int root_level)
+{
+	int level = 0;
+	int nr, slot;
+	struct extent_buffer *eb;
+
+	if (root_level == 0)
+		return 1;
+
+	while (level <= root_level) {
+		eb = path->nodes[level];
+		nr = btrfs_header_nritems(eb);
+		path->slots[level]++;
+		slot = path->slots[level];
+		if (slot >= nr || level == 0) {
+			/*
+			 * Don't free the root - we will detect this
+			 * condition after our loop and return a
+			 * positive value for caller to stop walking the tree.
+			 */
+			if (level != root_level) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+
+				free_extent_buffer(eb);
+				path->nodes[level] = NULL;
+				path->slots[level] = 0;
+			}
+		} else {
+			/*
+			 * We have a valid slot to walk back down
+			 * from. Stop here so caller can process these
+			 * new nodes.
+			 */
+			break;
+		}
+
+		level++;
+	}
+
+	eb = path->nodes[root_level];
+	if (path->slots[root_level] >= btrfs_header_nritems(eb))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct extent_buffer *root_eb,
+				  u64 root_gen,
+				  int root_level)
+{
+	int ret = 0;
+	int level;
+	struct extent_buffer *eb = root_eb;
+	struct btrfs_path *path = NULL;
+
+	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+	BUG_ON(root_eb == NULL);
+
+	if (!root->fs_info->quota_enabled)
+		return 0;
+
+	if (!extent_buffer_uptodate(root_eb)) {
+		ret = btrfs_read_buffer(root_eb, root_gen);
+		if (ret)
+			goto out;
+	}
+
+	if (root_level == 0) {
+		ret = account_leaf_items(trans, root, root_eb);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Walk down the tree. Missing extent blocks are filled in as
+	 * we go. Metadata is accounted every time we read a new
+	 * extent block.
+	 *
+	 * When we reach a leaf, we account for file extent items in it,
+	 * walk back up the tree (adjusting slot pointers as we go)
+	 * and restart the search process.
+	 */
+	extent_buffer_get(root_eb); /* For path */
+	path->nodes[root_level] = root_eb;
+	path->slots[root_level] = 0;
+	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+	level = root_level;
+	while (level >= 0) {
+		if (path->nodes[level] == NULL) {
+			int child_bsize = root->nodesize;
+			int parent_slot;
+			u64 child_gen;
+			u64 child_bytenr;
+
+			/* We need to get child blockptr/gen from
+			 * parent before we can read it. */
+			eb = path->nodes[level + 1];
+			parent_slot = path->slots[level + 1];
+			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+			eb = read_tree_block(root, child_bytenr, child_bsize,
+					     child_gen);
+			if (!eb || !extent_buffer_uptodate(eb)) {
+				ret = -EIO;
+				goto out;
+			}
+
+			path->nodes[level] = eb;
+			path->slots[level] = 0;
+
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+						      root->objectid,
+						      child_bytenr,
+						      child_bsize,
+						      BTRFS_QGROUP_OPER_SUB_SUBTREE,
+						      0);
+			if (ret)
+				goto out;
+
+		}
+
+		if (level == 0) {
+			ret = account_leaf_items(trans, root, path->nodes[level]);
+			if (ret)
+				goto out;
+
+			/* Nonzero return here means we completed our search */
+			ret = adjust_slots_upwards(root, path, root_level);
+			if (ret)
+				break;
+
+			/* Restart search with new slots */
+			goto walk_down;
+		}
+
+		level--;
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 /*
  * helper to process tree block while walking down the tree.
  *
@@ -7581,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	int level = wc->level;
 	int reada = 0;
 	int ret = 0;
+	bool need_account = false;
 
 	generation = btrfs_node_ptr_generation(path->nodes[level],
 					       path->slots[level]);
@@ -7626,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 
 	if (wc->stage == DROP_REFERENCE) {
 		if (wc->refs[level - 1] > 1) {
+			need_account = true;
 			if (level == 1 &&
 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 				goto skip;
@@ -7689,6 +7905,16 @@ skip:
 		parent = 0;
 	}
 
+	if (need_account) {
+		ret = account_shared_subtree(trans, root, next,
+					     generation, level - 1);
+		if (ret) {
+			printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+					   "%d accounting shared subtree. Quota "
+					   "is out of sync, rescan required.\n",
+					   root->fs_info->sb->s_id, ret);
+		}
+	}
 	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
 				root->root_key.objectid, level - 1, 0, 0);
 	BUG_ON(ret); /* -ENOMEM */
@@ -7773,6 +7999,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 		else
 			ret = btrfs_dec_ref(trans, root, eb, 0);
 		BUG_ON(ret); /* -ENOMEM */
+		ret = account_leaf_items(trans, root, eb);
+		if (ret) {
+			printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+					   "%d accounting leaf items. Quota "
+					   "is out of sync, rescan required.\n",
+					   root->fs_info->sb->s_id, ret);
+		}
 	}
 	/* make block locked assertion in clean_tree_block happy */
 	if (!path->locks[level] &&
@@ -7898,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	int level;
 	bool root_dropped = false;
 
+	btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		err = -ENOMEM;
@@ -8023,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 			goto out_end_trans;
 		}
 
+		/*
+		 * Qgroup update accounting is run from
+		 * delayed ref handling. This usually works
+		 * out because delayed refs are normally the
+		 * only way qgroup updates are added. However,
+		 * we may have added updates during our tree
+		 * walk so run qgroups here to make sure we
+		 * don't lose any updates.
+		 */
+		ret = btrfs_delayed_qgroup_accounting(trans,
+						      root->fs_info);
+		if (ret)
+			printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+					   "running qgroup updates "
+					   "during snapshot delete. "
+					   "Quota is out of sync, "
+					   "rescan required.\n", ret);
+
 		btrfs_end_transaction_throttle(trans, tree_root);
 		if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
 			pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8076,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	}
 	root_dropped = true;
 out_end_trans:
+	ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+	if (ret)
+		printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+				   "running qgroup updates "
+				   "during snapshot delete. "
+				   "Quota is out of sync, "
+				   "rescan required.\n", ret);
+
 	btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
 	kfree(wc);
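The tree walk added above is easier to see outside the kernel. Below is a self-contained sketch of the same nodes[]/slots[] traversal pattern that account_shared_subtree() and adjust_slots_upwards() implement: descend to a leaf, process it, then bump slots upward until some level still has an unvisited child. The sim_node type, walk_subtree(), and the fixed 4-way fanout are all invented for illustration; leaves here carry a single payload (nritems == 1) because the kernel consumes a leaf's items wholesale via account_leaf_items().

#include <stdio.h>

#define MAX_LEVEL 8

struct sim_node {
	int nritems;
	struct sim_node *child[4];	/* NULL at leaf level */
	int value;			/* payload for leaves */
};

static int adjust_slots_upwards(struct sim_node **nodes, int *slots,
				int root_level)
{
	int level;

	if (root_level == 0)
		return 1;		/* single node, one visit is enough */

	for (level = 0; level <= root_level; level++) {
		slots[level]++;
		if (slots[level] < nodes[level]->nritems)
			return 0;	/* valid slot to walk back down from */
		if (level != root_level) {
			nodes[level] = NULL;	/* this subtree is finished */
			slots[level] = 0;
		}
	}
	return 1;			/* root exhausted, search complete */
}

static void walk_subtree(struct sim_node *root, int root_level)
{
	struct sim_node *nodes[MAX_LEVEL] = { NULL };
	int slots[MAX_LEVEL] = { 0 };
	int level;

	nodes[root_level] = root;
walk_down:
	for (level = root_level; level >= 0; level--) {
		if (!nodes[level]) {
			/* read the child picked by the parent's slot */
			nodes[level] =
				nodes[level + 1]->child[slots[level + 1]];
			slots[level] = 0;
		}
		if (level == 0) {
			printf("leaf value %d\n", nodes[0]->value);
			if (adjust_slots_upwards(nodes, slots, root_level))
				return;
			goto walk_down;	/* restart search with new slots */
		}
	}
}

int main(void)
{
	struct sim_node leaves[4] = {
		{ 1, { 0 }, 10 }, { 1, { 0 }, 20 },
		{ 1, { 0 }, 30 }, { 1, { 0 }, 40 },
	};
	struct sim_node root = {
		4, { &leaves[0], &leaves[1], &leaves[2], &leaves[3] }, 0
	};

	walk_subtree(&root, 1);		/* prints 10 20 30 40 */
	return 0;
}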
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..65b62c467e28 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1201,6 +1201,50 @@ out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
+
+static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
+			   struct btrfs_qgroup_operation *oper2)
+{
+	/*
+	 * Ignore seq and type here, we're looking for any operation
+	 * at all related to this extent on that root.
+	 */
+	if (oper1->bytenr < oper2->bytenr)
+		return -1;
+	if (oper1->bytenr > oper2->bytenr)
+		return 1;
+	if (oper1->ref_root < oper2->ref_root)
+		return -1;
+	if (oper1->ref_root > oper2->ref_root)
+		return 1;
+	return 0;
+}
+
+static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
+			      struct btrfs_qgroup_operation *oper)
+{
+	struct rb_node *n;
+	struct btrfs_qgroup_operation *cur;
+	int cmp;
+
+	spin_lock(&fs_info->qgroup_op_lock);
+	n = fs_info->qgroup_op_tree.rb_node;
+	while (n) {
+		cur = rb_entry(n, struct btrfs_qgroup_operation, n);
+		cmp = comp_oper_exist(cur, oper);
+		if (cmp < 0) {
+			n = n->rb_right;
+		} else if (cmp) {
+			n = n->rb_left;
+		} else {
+			spin_unlock(&fs_info->qgroup_op_lock);
+			return -EEXIST;
+		}
+	}
+	spin_unlock(&fs_info->qgroup_op_lock);
+	return 0;
+}
+
 static int comp_oper(struct btrfs_qgroup_operation *oper1,
 		     struct btrfs_qgroup_operation *oper2)
 {
@@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 	oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
 	INIT_LIST_HEAD(&oper->elem.list);
 	oper->elem.seq = 0;
+
+	if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
+		/*
+		 * If any operation for this bytenr/ref_root combo
+		 * exists, then we know it's not exclusively owned and
+		 * shouldn't be queued up.
+		 *
+		 * This also catches the case where we have a cloned
+		 * extent that gets queued up multiple times during
+		 * drop snapshot.
+		 */
+		if (qgroup_oper_exists(fs_info, oper)) {
+			kfree(oper);
+			return 0;
+		}
+	}
+
 	ret = insert_qgroup_oper(fs_info, oper);
 	if (ret) {
 		/* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1945,106 @@ out:
 }
 
 /*
+ * Process a reference to a shared subtree. This type of operation is
+ * queued during snapshot removal when we encounter extents which are
+ * shared between more than one root.
+ */
+static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     struct btrfs_qgroup_operation *oper)
+{
+	struct ulist *roots = NULL;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup_list *glist;
+	struct ulist *parents;
+	int ret = 0;
+	struct btrfs_qgroup *qg;
+	u64 root_obj = 0;
+	struct seq_list elem = {};
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	btrfs_get_tree_mod_seq(fs_info, &elem);
+	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+				   elem.seq, &roots);
+	btrfs_put_tree_mod_seq(fs_info, &elem);
+	if (ret < 0)
+		return ret;
+
+	if (roots->nnodes != 1)
+		goto out;
+
+	ULIST_ITER_INIT(&uiter);
+	unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
+	/*
+	 * If we find our ref root then that means all refs
+	 * this extent has to the root have not yet been
+	 * deleted. In that case, we do nothing and let the
+	 * last ref for this bytenr drive our update.
+	 *
+	 * This can happen for example if an extent is
+	 * referenced multiple times in a snapshot (clone,
+	 * etc). If we are in the middle of snapshot removal,
+	 * queued updates for such an extent will find the
+	 * root if we have not yet finished removing the
+	 * snapshot.
+	 */
+	if (unode->val == oper->ref_root)
+		goto out;
+
+	root_obj = unode->val;
+	BUG_ON(!root_obj);
+
+	spin_lock(&fs_info->qgroup_lock);
+	qg = find_qgroup_rb(fs_info, root_obj);
+	if (!qg)
+		goto out_unlock;
+
+	qg->excl += oper->num_bytes;
+	qg->excl_cmpr += oper->num_bytes;
+	qgroup_dirty(fs_info, qg);
+
+	/*
+	 * Adjust counts for parent groups. First we find all
+	 * parents, then in the 2nd loop we do the adjustment
+	 * while adding parents of the parents to our ulist.
+	 */
+	list_for_each_entry(glist, &qg->groups, next_group) {
+		ret = ulist_add(parents, glist->group->qgroupid,
+				ptr_to_u64(glist->group), GFP_ATOMIC);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(parents, &uiter))) {
+		qg = u64_to_ptr(unode->aux);
+		qg->excl += oper->num_bytes;
+		qg->excl_cmpr += oper->num_bytes;
+		qgroup_dirty(fs_info, qg);
+
+		/* Add any parents of the parents */
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(parents, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				goto out_unlock;
+		}
+	}
+
+out_unlock:
+	spin_unlock(&fs_info->qgroup_lock);
+
+out:
+	ulist_free(roots);
+	ulist_free(parents);
+	return ret;
+}
+
+/*
  * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
  * from the fs. First, all roots referencing the extent are searched, and
  * then the space is accounted accordingly to the different roots. The
@@ -1920,6 +2081,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
 	case BTRFS_QGROUP_OPER_SUB_SHARED:
 		ret = qgroup_shared_accounting(trans, fs_info, oper);
 		break;
+	case BTRFS_QGROUP_OPER_SUB_SUBTREE:
+		ret = qgroup_subtree_accounting(trans, fs_info, oper);
+		break;
 	default:
 		ASSERT(0);
 	}
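For reference, here is the duplicate-suppression logic from qgroup_oper_exists()/comp_oper_exist() above, restated as a standalone userspace sketch (not kernel code). A sorted array plus binary search stands in for the kernel's rb-tree; the ordering is the same, keyed on (bytenr, ref_root) while deliberately ignoring seq and type, so any pending operation on the same extent/root pair suppresses queueing another SUB_SUBTREE operation.

#include <stdio.h>
#include <stdint.h>

struct sim_oper {
	uint64_t bytenr;
	uint64_t ref_root;
};

/* Same ordering comp_oper_exist() uses: bytenr first, then ref_root */
static int comp_oper_exist(const struct sim_oper *a, const struct sim_oper *b)
{
	if (a->bytenr != b->bytenr)
		return a->bytenr < b->bytenr ? -1 : 1;
	if (a->ref_root != b->ref_root)
		return a->ref_root < b->ref_root ? -1 : 1;
	return 0;
}

/* Binary search over a sorted pending[] array, like the rb-tree walk */
static int oper_exists(const struct sim_oper *pending, int n,
		       const struct sim_oper *oper)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		int cmp = comp_oper_exist(&pending[mid], oper);

		if (cmp < 0)
			lo = mid + 1;
		else if (cmp > 0)
			hi = mid - 1;
		else
			return 1;	/* kernel would return -EEXIST here */
	}
	return 0;
}

int main(void)
{
	struct sim_oper pending[] = {
		{ 4096, 257 }, { 8192, 257 }, { 8192, 258 },
	};
	struct sim_oper dup = { 8192, 257 };
	struct sim_oper fresh = { 8192, 259 };

	printf("dup queued?   %s\n", oper_exists(pending, 3, &dup) ? "no" : "yes");
	printf("fresh queued? %s\n", oper_exists(pending, 3, &fresh) ? "no" : "yes");
	return 0;
}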
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
 	BTRFS_QGROUP_OPER_ADD_SHARED,
 	BTRFS_QGROUP_OPER_SUB_EXCL,
 	BTRFS_QGROUP_OPER_SUB_SHARED,
+	BTRFS_QGROUP_OPER_SUB_SUBTREE,
 };
 
 struct btrfs_qgroup_operation {
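Finally, a hedged sketch of the parent-propagation step in qgroup_subtree_accounting(): after the surviving root's qgroup is credited, a worklist seeded with its direct parents keeps absorbing parents-of-parents, so each ancestor qgroup sees the same exclusive-byte bump exactly once. The worklist_add() helper approximates ulist_add()'s add-if-absent semantics; all sim_* names and the three-level hierarchy are invented for illustration.

#include <stdio.h>
#include <stdint.h>

#define MAX_QGROUPS 8

struct sim_qgroup {
	uint64_t id;
	uint64_t excl;
	int nr_parents;
	int parent[MAX_QGROUPS];	/* indices into the qgroup table */
};

/* ulist_add() analog: append only if the index is not already queued */
static int worklist_add(int *list, int *n, int idx)
{
	int i;

	for (i = 0; i < *n; i++)
		if (list[i] == idx)
			return 0;
	list[(*n)++] = idx;
	return 1;
}

static void bump_excl(struct sim_qgroup *qgroups, int start, uint64_t bytes)
{
	int list[MAX_QGROUPS], n = 0, i, j;

	qgroups[start].excl += bytes;

	/* Seed the worklist with the direct parents... */
	for (j = 0; j < qgroups[start].nr_parents; j++)
		worklist_add(list, &n, qgroups[start].parent[j]);

	/* ...then absorb parents of parents as we go, like the ulist loop */
	for (i = 0; i < n; i++) {
		struct sim_qgroup *qg = &qgroups[list[i]];

		qg->excl += bytes;
		for (j = 0; j < qg->nr_parents; j++)
			worklist_add(list, &n, qg->parent[j]);
	}
}

int main(void)
{
	/* 0/257 is a subvolume qgroup, 1/100 its parent, 2/200 grandparent */
	struct sim_qgroup qgroups[] = {
		{ 257, 0, 1, { 1 } },
		{ 100, 0, 1, { 2 } },
		{ 200, 0, 0, { 0 } },
	};
	int i;

	bump_excl(qgroups, 0, 16384);
	for (i = 0; i < 3; i++)
		printf("qgroup %llu excl=%llu\n",
		       (unsigned long long)qgroups[i].id,
		       (unsigned long long)qgroups[i].excl);
	return 0;
}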