aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2008-10-29 14:49:05 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:05 -0400
commitf82d02d9d8222183b7945e893111a6d1bf67ae4a (patch)
tree70be1bb231f4cc2e673920774e759359f3dcf1a5
parentc8b978188c9a0fd3d535c13debd19d522b726f1f (diff)
Btrfs: Improve space balancing code
This patch improves the space balancing code to keep more sharing of tree blocks. The only case that breaks sharing of tree blocks is data extents get fragmented during balancing. The main changes in this patch are: Add a 'drop sub-tree' function. This solves the problem in old code that BTRFS_HEADER_FLAG_WRITTEN check breaks sharing of tree block. Remove relocation mapping tree. Relocation mappings are stored in struct btrfs_ref_path and updated dynamically during walking up/down the reference path. This reduces CPU usage and simplifies code. This patch also fixes a bug. Root items for reloc trees should be updated in btrfs_free_reloc_root. Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
-rw-r--r--fs/btrfs/ctree.c93
-rw-r--r--fs/btrfs/ctree.h13
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c364
-rw-r--r--fs/btrfs/transaction.c4
5 files changed, 277 insertions, 199 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9caeb377de6..73899d0f9d8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -287,7 +287,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
287 /* 287 /*
288 * There are only two places that can drop reference to 288 * There are only two places that can drop reference to
289 * tree blocks owned by living reloc trees, one is here, 289 * tree blocks owned by living reloc trees, one is here,
290 * the other place is btrfs_merge_path. In both places, 290 * the other place is btrfs_drop_subtree. In both places,
291 * we check reference count while tree block is locked. 291 * we check reference count while tree block is locked.
292 * Furthermore, if reference count is one, it won't get 292 * Furthermore, if reference count is one, it won't get
293 * increased by someone else. 293 * increased by someone else.
@@ -312,9 +312,6 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
312 } 312 }
313 313
314 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 314 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
315 ret = btrfs_add_reloc_mapping(root, buf->start,
316 buf->len, cow->start);
317 BUG_ON(ret);
318 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); 315 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
319 WARN_ON(ret); 316 WARN_ON(ret);
320 } 317 }
@@ -1627,61 +1624,57 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1627 btrfs_node_key_to_cpu(eb, &key, slot); 1624 btrfs_node_key_to_cpu(eb, &key, slot);
1628 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); 1625 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1629 1626
1627 if (generation == trans->transid) {
1628 eb = read_tree_block(root, bytenr, blocksize,
1629 generation);
1630 btrfs_tree_lock(eb);
1631 }
1632
1630 /* 1633 /*
1631 * if node keys match and node pointer hasn't been modified 1634 * if node keys match and node pointer hasn't been modified
1632 * in the running transaction, we can merge the path. for 1635 * in the running transaction, we can merge the path. for
1633 * reloc trees, the node pointer check is skipped, this is 1636 * blocks owened by reloc trees, the node pointer check is
1634 * because the reloc trees are fully controlled by the space 1637 * skipped, this is because these blocks are fully controlled
1635 * balance code, no one else can modify them. 1638 * by the space balance code, no one else can modify them.
1636 */ 1639 */
1637 if (!nodes[level - 1] || !key_match || 1640 if (!nodes[level - 1] || !key_match ||
1638 (generation == trans->transid && 1641 (generation == trans->transid &&
1639 root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) { 1642 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1640next_level: 1643 if (level == 1 || level == lowest_level + 1) {
1641 if (level == 1 || level == lowest_level + 1) 1644 if (generation == trans->transid) {
1645 btrfs_tree_unlock(eb);
1646 free_extent_buffer(eb);
1647 }
1642 break; 1648 break;
1649 }
1643 1650
1644 eb = read_tree_block(root, bytenr, blocksize, 1651 if (generation != trans->transid) {
1645 generation); 1652 eb = read_tree_block(root, bytenr, blocksize,
1646 btrfs_tree_lock(eb); 1653 generation);
1654 btrfs_tree_lock(eb);
1655 }
1647 1656
1648 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1657 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1649 &eb, 0); 1658 &eb, 0);
1650 BUG_ON(ret); 1659 BUG_ON(ret);
1651 1660
1661 if (root->root_key.objectid ==
1662 BTRFS_TREE_RELOC_OBJECTID) {
1663 if (!nodes[level - 1]) {
1664 nodes[level - 1] = eb->start;
1665 memcpy(&node_keys[level - 1], &key,
1666 sizeof(node_keys[0]));
1667 } else {
1668 WARN_ON(1);
1669 }
1670 }
1671
1652 btrfs_tree_unlock(parent); 1672 btrfs_tree_unlock(parent);
1653 free_extent_buffer(parent); 1673 free_extent_buffer(parent);
1654 parent = eb; 1674 parent = eb;
1655 continue; 1675 continue;
1656 } 1676 }
1657 1677
1658 if (generation == trans->transid) {
1659 u32 refs;
1660 BUG_ON(btrfs_header_owner(eb) !=
1661 BTRFS_TREE_RELOC_OBJECTID);
1662 /*
1663 * lock the block to keep __btrfs_cow_block from
1664 * changing the reference count.
1665 */
1666 eb = read_tree_block(root, bytenr, blocksize,
1667 generation);
1668 btrfs_tree_lock(eb);
1669
1670 ret = btrfs_lookup_extent_ref(trans, root, bytenr,
1671 blocksize, &refs);
1672 BUG_ON(ret);
1673 /*
1674 * if replace block whose reference count is one,
1675 * we have to "drop the subtree". so skip it for
1676 * simplicity
1677 */
1678 if (refs == 1) {
1679 btrfs_tree_unlock(eb);
1680 free_extent_buffer(eb);
1681 goto next_level;
1682 }
1683 }
1684
1685 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); 1678 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1686 btrfs_set_node_ptr_generation(parent, slot, trans->transid); 1679 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1687 btrfs_mark_buffer_dirty(parent); 1680 btrfs_mark_buffer_dirty(parent);
@@ -1693,16 +1686,24 @@ next_level:
1693 btrfs_header_generation(parent), 1686 btrfs_header_generation(parent),
1694 level - 1); 1687 level - 1);
1695 BUG_ON(ret); 1688 BUG_ON(ret);
1696 ret = btrfs_free_extent(trans, root, bytenr,
1697 blocksize, parent->start,
1698 btrfs_header_owner(parent),
1699 btrfs_header_generation(parent),
1700 level - 1, 1);
1701 BUG_ON(ret);
1702 1689
1690 /*
1691 * If the block was created in the running transaction,
1692 * it's possible this is the last reference to it, so we
1693 * should drop the subtree.
1694 */
1703 if (generation == trans->transid) { 1695 if (generation == trans->transid) {
1696 ret = btrfs_drop_subtree(trans, root, eb, parent);
1697 BUG_ON(ret);
1704 btrfs_tree_unlock(eb); 1698 btrfs_tree_unlock(eb);
1705 free_extent_buffer(eb); 1699 free_extent_buffer(eb);
1700 } else {
1701 ret = btrfs_free_extent(trans, root, bytenr,
1702 blocksize, parent->start,
1703 btrfs_header_owner(parent),
1704 btrfs_header_generation(parent),
1705 level - 1, 1);
1706 BUG_ON(ret);
1706 } 1707 }
1707 break; 1708 break;
1708 } 1709 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 793d8fdda24..117090995e7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -684,7 +684,6 @@ struct btrfs_fs_info {
684 int thread_pool_size; 684 int thread_pool_size;
685 685
686 /* tree relocation relocated fields */ 686 /* tree relocation relocated fields */
687 struct extent_io_tree reloc_mapping_tree;
688 struct list_head dead_reloc_roots; 687 struct list_head dead_reloc_roots;
689 struct btrfs_leaf_ref_tree reloc_ref_tree; 688 struct btrfs_leaf_ref_tree reloc_ref_tree;
690 struct btrfs_leaf_ref_tree shared_ref_tree; 689 struct btrfs_leaf_ref_tree shared_ref_tree;
@@ -1636,13 +1635,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1636int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 1635int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1637 struct btrfs_root *root, u64 group_start); 1636 struct btrfs_root *root, u64 group_start);
1638int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 1637int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1639int btrfs_free_reloc_root(struct btrfs_root *root); 1638int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1639 struct btrfs_root *root);
1640int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); 1640int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1641int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
1642 u64 num_bytes, u64 new_bytenr);
1643int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
1644 u64 num_bytes, u64 *new_bytenr);
1645void btrfs_free_reloc_mappings(struct btrfs_root *root);
1646int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, 1641int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1647 struct btrfs_root *root, 1642 struct btrfs_root *root,
1648 struct extent_buffer *buf, u64 orig_start); 1643 struct extent_buffer *buf, u64 orig_start);
@@ -1726,6 +1721,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1726int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 1721int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1727int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 1722int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1728 *root); 1723 *root);
1724int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1725 struct btrfs_root *root,
1726 struct extent_buffer *node,
1727 struct extent_buffer *parent);
1729/* root-item.c */ 1728/* root-item.c */
1730int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1729int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1731 struct btrfs_key *key); 1730 struct btrfs_key *key);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dc95f636a11..796256440df 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1448,8 +1448,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1448 fs_info->btree_inode->i_mapping, GFP_NOFS); 1448 fs_info->btree_inode->i_mapping, GFP_NOFS);
1449 fs_info->do_barriers = 1; 1449 fs_info->do_barriers = 1;
1450 1450
1451 extent_io_tree_init(&fs_info->reloc_mapping_tree,
1452 fs_info->btree_inode->i_mapping, GFP_NOFS);
1453 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1451 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1454 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); 1452 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1455 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); 1453 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bbf04e80a1a..56e41369d71 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3032,13 +3032,103 @@ out:
3032} 3032}
3033 3033
3034/* 3034/*
3035 * helper function for drop_subtree, this function is similar to
3036 * walk_down_tree. The main difference is that it checks reference
3037 * counts while tree blocks are locked.
3038 */
3039static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root,
3041 struct btrfs_path *path, int *level)
3042{
3043 struct extent_buffer *next;
3044 struct extent_buffer *cur;
3045 struct extent_buffer *parent;
3046 u64 bytenr;
3047 u64 ptr_gen;
3048 u32 blocksize;
3049 u32 refs;
3050 int ret;
3051
3052 cur = path->nodes[*level];
3053 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3054 &refs);
3055 BUG_ON(ret);
3056 if (refs > 1)
3057 goto out;
3058
3059 while (*level >= 0) {
3060 cur = path->nodes[*level];
3061 if (*level == 0) {
3062 ret = btrfs_drop_leaf_ref(trans, root, cur);
3063 BUG_ON(ret);
3064 clean_tree_block(trans, root, cur);
3065 break;
3066 }
3067 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3068 clean_tree_block(trans, root, cur);
3069 break;
3070 }
3071
3072 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3073 blocksize = btrfs_level_size(root, *level - 1);
3074 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3075
3076 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3077 btrfs_tree_lock(next);
3078
3079 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3080 &refs);
3081 BUG_ON(ret);
3082 if (refs > 1) {
3083 parent = path->nodes[*level];
3084 ret = btrfs_free_extent(trans, root, bytenr,
3085 blocksize, parent->start,
3086 btrfs_header_owner(parent),
3087 btrfs_header_generation(parent),
3088 *level - 1, 1);
3089 BUG_ON(ret);
3090 path->slots[*level]++;
3091 btrfs_tree_unlock(next);
3092 free_extent_buffer(next);
3093 continue;
3094 }
3095
3096 *level = btrfs_header_level(next);
3097 path->nodes[*level] = next;
3098 path->slots[*level] = 0;
3099 path->locks[*level] = 1;
3100 cond_resched();
3101 }
3102out:
3103 parent = path->nodes[*level + 1];
3104 bytenr = path->nodes[*level]->start;
3105 blocksize = path->nodes[*level]->len;
3106
3107 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3108 parent->start, btrfs_header_owner(parent),
3109 btrfs_header_generation(parent), *level, 1);
3110 BUG_ON(ret);
3111
3112 if (path->locks[*level]) {
3113 btrfs_tree_unlock(path->nodes[*level]);
3114 path->locks[*level] = 0;
3115 }
3116 free_extent_buffer(path->nodes[*level]);
3117 path->nodes[*level] = NULL;
3118 *level += 1;
3119 cond_resched();
3120 return 0;
3121}
3122
3123/*
3035 * helper for dropping snapshots. This walks back up the tree in the path 3124 * helper for dropping snapshots. This walks back up the tree in the path
3036 * to find the first node higher up where we haven't yet gone through 3125 * to find the first node higher up where we haven't yet gone through
3037 * all the slots 3126 * all the slots
3038 */ 3127 */
3039static int noinline walk_up_tree(struct btrfs_trans_handle *trans, 3128static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root, 3129 struct btrfs_root *root,
3041 struct btrfs_path *path, int *level) 3130 struct btrfs_path *path,
3131 int *level, int max_level)
3042{ 3132{
3043 u64 root_owner; 3133 u64 root_owner;
3044 u64 root_gen; 3134 u64 root_gen;
@@ -3047,7 +3137,7 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3047 int slot; 3137 int slot;
3048 int ret; 3138 int ret;
3049 3139
3050 for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 3140 for (i = *level; i < max_level && path->nodes[i]; i++) {
3051 slot = path->slots[i]; 3141 slot = path->slots[i];
3052 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 3142 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3053 struct extent_buffer *node; 3143 struct extent_buffer *node;
@@ -3070,12 +3160,18 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3070 3160
3071 root_owner = btrfs_header_owner(parent); 3161 root_owner = btrfs_header_owner(parent);
3072 root_gen = btrfs_header_generation(parent); 3162 root_gen = btrfs_header_generation(parent);
3163
3164 clean_tree_block(trans, root, path->nodes[*level]);
3073 ret = btrfs_free_extent(trans, root, 3165 ret = btrfs_free_extent(trans, root,
3074 path->nodes[*level]->start, 3166 path->nodes[*level]->start,
3075 path->nodes[*level]->len, 3167 path->nodes[*level]->len,
3076 parent->start, root_owner, 3168 parent->start, root_owner,
3077 root_gen, *level, 1); 3169 root_gen, *level, 1);
3078 BUG_ON(ret); 3170 BUG_ON(ret);
3171 if (path->locks[*level]) {
3172 btrfs_tree_unlock(path->nodes[*level]);
3173 path->locks[*level] = 0;
3174 }
3079 free_extent_buffer(path->nodes[*level]); 3175 free_extent_buffer(path->nodes[*level]);
3080 path->nodes[*level] = NULL; 3176 path->nodes[*level] = NULL;
3081 *level = i + 1; 3177 *level = i + 1;
@@ -3145,7 +3241,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3145 if (wret < 0) 3241 if (wret < 0)
3146 ret = wret; 3242 ret = wret;
3147 3243
3148 wret = walk_up_tree(trans, root, path, &level); 3244 wret = walk_up_tree(trans, root, path, &level,
3245 BTRFS_MAX_LEVEL);
3149 if (wret > 0) 3246 if (wret > 0)
3150 break; 3247 break;
3151 if (wret < 0) 3248 if (wret < 0)
@@ -3168,6 +3265,50 @@ out:
3168 return ret; 3265 return ret;
3169} 3266}
3170 3267
3268int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3269 struct btrfs_root *root,
3270 struct extent_buffer *node,
3271 struct extent_buffer *parent)
3272{
3273 struct btrfs_path *path;
3274 int level;
3275 int parent_level;
3276 int ret = 0;
3277 int wret;
3278
3279 path = btrfs_alloc_path();
3280 BUG_ON(!path);
3281
3282 BUG_ON(!btrfs_tree_locked(parent));
3283 parent_level = btrfs_header_level(parent);
3284 extent_buffer_get(parent);
3285 path->nodes[parent_level] = parent;
3286 path->slots[parent_level] = btrfs_header_nritems(parent);
3287
3288 BUG_ON(!btrfs_tree_locked(node));
3289 level = btrfs_header_level(node);
3290 extent_buffer_get(node);
3291 path->nodes[level] = node;
3292 path->slots[level] = 0;
3293
3294 while (1) {
3295 wret = walk_down_subtree(trans, root, path, &level);
3296 if (wret < 0)
3297 ret = wret;
3298 if (wret != 0)
3299 break;
3300
3301 wret = walk_up_tree(trans, root, path, &level, parent_level);
3302 if (wret < 0)
3303 ret = wret;
3304 if (wret != 0)
3305 break;
3306 }
3307
3308 btrfs_free_path(path);
3309 return ret;
3310}
3311
3171static unsigned long calc_ra(unsigned long start, unsigned long last, 3312static unsigned long calc_ra(unsigned long start, unsigned long last,
3172 unsigned long nr) 3313 unsigned long nr)
3173{ 3314{
@@ -3312,6 +3453,10 @@ struct btrfs_ref_path {
3312 u32 num_refs; 3453 u32 num_refs;
3313 int lowest_level; 3454 int lowest_level;
3314 int current_level; 3455 int current_level;
3456 int shared_level;
3457
3458 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
3459 u64 new_nodes[BTRFS_MAX_LEVEL];
3315}; 3460};
3316 3461
3317struct disk_extent { 3462struct disk_extent {
@@ -3360,6 +3505,7 @@ static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
3360 if (first_time) { 3505 if (first_time) {
3361 ref_path->lowest_level = -1; 3506 ref_path->lowest_level = -1;
3362 ref_path->current_level = -1; 3507 ref_path->current_level = -1;
3508 ref_path->shared_level = -1;
3363 goto walk_up; 3509 goto walk_up;
3364 } 3510 }
3365walk_down: 3511walk_down:
@@ -3403,8 +3549,11 @@ walk_down:
3403 3549
3404 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3550 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3405 if (found_key.objectid == bytenr && 3551 if (found_key.objectid == bytenr &&
3406 found_key.type == BTRFS_EXTENT_REF_KEY) 3552 found_key.type == BTRFS_EXTENT_REF_KEY) {
3553 if (level < ref_path->shared_level)
3554 ref_path->shared_level = level;
3407 goto found; 3555 goto found;
3556 }
3408next: 3557next:
3409 level--; 3558 level--;
3410 btrfs_release_path(extent_root, path); 3559 btrfs_release_path(extent_root, path);
@@ -3992,51 +4141,6 @@ out:
3992 return ret; 4141 return ret;
3993} 4142}
3994 4143
3995int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
3996 u64 num_bytes, u64 new_bytenr)
3997{
3998 set_extent_bits(&root->fs_info->reloc_mapping_tree,
3999 orig_bytenr, orig_bytenr + num_bytes - 1,
4000 EXTENT_LOCKED, GFP_NOFS);
4001 set_state_private(&root->fs_info->reloc_mapping_tree,
4002 orig_bytenr, new_bytenr);
4003 return 0;
4004}
4005
4006int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
4007 u64 num_bytes, u64 *new_bytenr)
4008{
4009 u64 bytenr;
4010 u64 cur_bytenr = orig_bytenr;
4011 u64 prev_bytenr = orig_bytenr;
4012 int ret;
4013
4014 while (1) {
4015 ret = get_state_private(&root->fs_info->reloc_mapping_tree,
4016 cur_bytenr, &bytenr);
4017 if (ret)
4018 break;
4019 prev_bytenr = cur_bytenr;
4020 cur_bytenr = bytenr;
4021 }
4022
4023 if (orig_bytenr == cur_bytenr)
4024 return -ENOENT;
4025
4026 if (prev_bytenr != orig_bytenr) {
4027 set_state_private(&root->fs_info->reloc_mapping_tree,
4028 orig_bytenr, cur_bytenr);
4029 }
4030 *new_bytenr = cur_bytenr;
4031 return 0;
4032}
4033
4034void btrfs_free_reloc_mappings(struct btrfs_root *root)
4035{
4036 clear_extent_bits(&root->fs_info->reloc_mapping_tree,
4037 0, (u64)-1, -1, GFP_NOFS);
4038}
4039
4040int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, 4144int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4041 struct btrfs_root *root, 4145 struct btrfs_root *root,
4042 struct extent_buffer *buf, u64 orig_start) 4146 struct extent_buffer *buf, u64 orig_start)
@@ -4222,15 +4326,30 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4222 return 0; 4326 return 0;
4223} 4327}
4224 4328
4225int btrfs_free_reloc_root(struct btrfs_root *root) 4329int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4330 struct btrfs_root *root)
4226{ 4331{
4227 struct btrfs_root *reloc_root; 4332 struct btrfs_root *reloc_root;
4333 int ret;
4228 4334
4229 if (root->reloc_root) { 4335 if (root->reloc_root) {
4230 reloc_root = root->reloc_root; 4336 reloc_root = root->reloc_root;
4231 root->reloc_root = NULL; 4337 root->reloc_root = NULL;
4232 list_add(&reloc_root->dead_list, 4338 list_add(&reloc_root->dead_list,
4233 &root->fs_info->dead_reloc_roots); 4339 &root->fs_info->dead_reloc_roots);
4340
4341 btrfs_set_root_bytenr(&reloc_root->root_item,
4342 reloc_root->node->start);
4343 btrfs_set_root_level(&root->root_item,
4344 btrfs_header_level(reloc_root->node));
4345 memset(&reloc_root->root_item.drop_progress, 0,
4346 sizeof(struct btrfs_disk_key));
4347 reloc_root->root_item.drop_level = 0;
4348
4349 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4350 &reloc_root->root_key,
4351 &reloc_root->root_item);
4352 BUG_ON(ret);
4234 } 4353 }
4235 return 0; 4354 return 0;
4236} 4355}
@@ -4356,8 +4475,6 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
4356 btrfs_set_root_refs(root_item, 0); 4475 btrfs_set_root_refs(root_item, 0);
4357 btrfs_set_root_bytenr(root_item, eb->start); 4476 btrfs_set_root_bytenr(root_item, eb->start);
4358 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 4477 btrfs_set_root_level(root_item, btrfs_header_level(eb));
4359 memset(&root_item->drop_progress, 0, sizeof(root_item->drop_progress));
4360 root_item->drop_level = 0;
4361 4478
4362 btrfs_tree_unlock(eb); 4479 btrfs_tree_unlock(eb);
4363 free_extent_buffer(eb); 4480 free_extent_buffer(eb);
@@ -4382,15 +4499,19 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
4382 * Core function of space balance. 4499 * Core function of space balance.
4383 * 4500 *
4384 * The idea is using reloc trees to relocate tree blocks in reference 4501 * The idea is using reloc trees to relocate tree blocks in reference
4385 * counted roots. There is one reloc tree for each subvol, all reloc 4502 * counted roots. There is one reloc tree for each subvol, and all
4386 * trees share same key objectid. Reloc trees are snapshots of the 4503 * reloc trees share same root key objectid. Reloc trees are snapshots
4387 * latest committed roots (subvol root->commit_root). To relocate a tree 4504 * of the latest committed roots of subvols (root->commit_root).
4388 * block referenced by a subvol, the code COW the block through the reloc 4505 *
4389 * tree, then update pointer in the subvol to point to the new block. 4506 * To relocate a tree block referenced by a subvol, there are two steps.
4390 * Since all reloc trees share same key objectid, we can easily do special 4507 * COW the block through subvol's reloc tree, then update block pointer
4391 * handing to share tree blocks between reloc trees. Once a tree block has 4508 * in the subvol to point to the new block. Since all reloc trees share
4392 * been COWed in one reloc tree, we can use the result when the same block 4509 * same root key objectid, doing special handing for tree blocks owned
4393 * is COWed again through other reloc trees. 4510 * by them is easy. Once a tree block has been COWed in one reloc tree,
4511 * we can use the resulting new block directly when the same block is
4512 * required to COW again through other reloc trees. By this way, relocated
4513 * tree blocks are shared between reloc trees, so they are also shared
4514 * between subvols.
4394 */ 4515 */
4395static int noinline relocate_one_path(struct btrfs_trans_handle *trans, 4516static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
4396 struct btrfs_root *root, 4517 struct btrfs_root *root,
@@ -4405,15 +4526,14 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
4405 struct btrfs_key *keys; 4526 struct btrfs_key *keys;
4406 u64 *nodes; 4527 u64 *nodes;
4407 int level; 4528 int level;
4408 int lowest_merge; 4529 int shared_level;
4409 int lowest_level = 0; 4530 int lowest_level = 0;
4410 int update_refs;
4411 int ret; 4531 int ret;
4412 4532
4413 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 4533 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
4414 lowest_level = ref_path->owner_objectid; 4534 lowest_level = ref_path->owner_objectid;
4415 4535
4416 if (is_cowonly_root(ref_path->root_objectid)) { 4536 if (!root->ref_cows) {
4417 path->lowest_level = lowest_level; 4537 path->lowest_level = lowest_level;
4418 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); 4538 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
4419 BUG_ON(ret < 0); 4539 BUG_ON(ret < 0);
@@ -4422,91 +4542,49 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
4422 return 0; 4542 return 0;
4423 } 4543 }
4424 4544
4425 keys = kzalloc(sizeof(*keys) * BTRFS_MAX_LEVEL, GFP_NOFS);
4426 BUG_ON(!keys);
4427 nodes = kzalloc(sizeof(*nodes) * BTRFS_MAX_LEVEL, GFP_NOFS);
4428 BUG_ON(!nodes);
4429
4430 mutex_lock(&root->fs_info->tree_reloc_mutex); 4545 mutex_lock(&root->fs_info->tree_reloc_mutex);
4431 ret = init_reloc_tree(trans, root); 4546 ret = init_reloc_tree(trans, root);
4432 BUG_ON(ret); 4547 BUG_ON(ret);
4433 reloc_root = root->reloc_root; 4548 reloc_root = root->reloc_root;
4434 4549
4435 path->lowest_level = lowest_level; 4550 shared_level = ref_path->shared_level;
4436 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 0); 4551 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
4437 BUG_ON(ret);
4438 /*
4439 * get relocation mapping for tree blocks in the path
4440 */
4441 lowest_merge = BTRFS_MAX_LEVEL;
4442 for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
4443 u64 new_bytenr;
4444 eb = path->nodes[level];
4445 if (!eb || eb == reloc_root->node)
4446 continue;
4447 ret = btrfs_get_reloc_mapping(reloc_root, eb->start, eb->len,
4448 &new_bytenr);
4449 if (ret)
4450 continue;
4451 if (level == 0)
4452 btrfs_item_key_to_cpu(eb, &keys[level], 0);
4453 else
4454 btrfs_node_key_to_cpu(eb, &keys[level], 0);
4455 nodes[level] = new_bytenr;
4456 lowest_merge = level;
4457 }
4458 4552
4459 update_refs = 0; 4553 keys = ref_path->node_keys;
4460 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 4554 nodes = ref_path->new_nodes;
4461 eb = path->nodes[0]; 4555 memset(&keys[shared_level + 1], 0,
4462 if (btrfs_header_generation(eb) < trans->transid) 4556 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
4463 update_refs = 1; 4557 memset(&nodes[shared_level + 1], 0,
4464 } 4558 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
4465 4559
4466 btrfs_release_path(reloc_root, path); 4560 if (nodes[lowest_level] == 0) {
4467 /* 4561 path->lowest_level = lowest_level;
4468 * merge tree blocks that already relocated in other reloc trees 4562 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
4469 */ 4563 0, 1);
4470 if (lowest_merge != BTRFS_MAX_LEVEL) { 4564 BUG_ON(ret);
4565 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
4566 eb = path->nodes[level];
4567 if (!eb || eb == reloc_root->node)
4568 break;
4569 nodes[level] = eb->start;
4570 if (level == 0)
4571 btrfs_item_key_to_cpu(eb, &keys[level], 0);
4572 else
4573 btrfs_node_key_to_cpu(eb, &keys[level], 0);
4574 }
4575 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4576 eb = path->nodes[0];
4577 ret = replace_extents_in_leaf(trans, reloc_root, eb,
4578 group, reloc_inode);
4579 BUG_ON(ret);
4580 }
4581 btrfs_release_path(reloc_root, path);
4582 } else {
4471 ret = btrfs_merge_path(trans, reloc_root, keys, nodes, 4583 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
4472 lowest_merge); 4584 lowest_level);
4473 BUG_ON(ret < 0);
4474 }
4475 /*
4476 * cow any tree blocks that still haven't been relocated
4477 */
4478 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 1);
4479 BUG_ON(ret);
4480 /*
4481 * if we are relocating data block group, update extent pointers
4482 * in the newly created tree leaf.
4483 */
4484 eb = path->nodes[0];
4485 if (update_refs && nodes[0] != eb->start) {
4486 ret = replace_extents_in_leaf(trans, reloc_root, eb, group,
4487 reloc_inode);
4488 BUG_ON(ret); 4585 BUG_ON(ret);
4489 } 4586 }
4490 4587
4491 memset(keys, 0, sizeof(*keys) * BTRFS_MAX_LEVEL);
4492 memset(nodes, 0, sizeof(*nodes) * BTRFS_MAX_LEVEL);
4493 for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
4494 eb = path->nodes[level];
4495 if (!eb || eb == reloc_root->node)
4496 continue;
4497 BUG_ON(btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID);
4498 nodes[level] = eb->start;
4499 if (level == 0)
4500 btrfs_item_key_to_cpu(eb, &keys[level], 0);
4501 else
4502 btrfs_node_key_to_cpu(eb, &keys[level], 0);
4503 }
4504
4505 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4506 eb = path->nodes[0];
4507 extent_buffer_get(eb);
4508 }
4509 btrfs_release_path(reloc_root, path);
4510 /* 4588 /*
4511 * replace tree blocks in the fs tree with tree blocks in 4589 * replace tree blocks in the fs tree with tree blocks in
4512 * the reloc tree. 4590 * the reloc tree.
@@ -4515,15 +4593,19 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
4515 BUG_ON(ret < 0); 4593 BUG_ON(ret < 0);
4516 4594
4517 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 4595 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4596 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
4597 0, 0);
4598 BUG_ON(ret);
4599 extent_buffer_get(path->nodes[0]);
4600 eb = path->nodes[0];
4601 btrfs_release_path(reloc_root, path);
4518 ret = invalidate_extent_cache(reloc_root, eb, group, root); 4602 ret = invalidate_extent_cache(reloc_root, eb, group, root);
4519 BUG_ON(ret); 4603 BUG_ON(ret);
4520 free_extent_buffer(eb); 4604 free_extent_buffer(eb);
4521 } 4605 }
4522 mutex_unlock(&root->fs_info->tree_reloc_mutex);
4523 4606
4607 mutex_unlock(&root->fs_info->tree_reloc_mutex);
4524 path->lowest_level = 0; 4608 path->lowest_level = 0;
4525 kfree(nodes);
4526 kfree(keys);
4527 return 0; 4609 return 0;
4528} 4610}
4529 4611
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5ecc24d634a..1df67129cc3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -521,7 +521,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
521 dirty = root->dirty_root; 521 dirty = root->dirty_root;
522 522
523 btrfs_free_log(trans, root); 523 btrfs_free_log(trans, root);
524 btrfs_free_reloc_root(root); 524 btrfs_free_reloc_root(trans, root);
525 525
526 if (root->commit_root == root->node) { 526 if (root->commit_root == root->node) {
527 WARN_ON(root->node->start != 527 WARN_ON(root->node->start !=
@@ -930,8 +930,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
930 */ 930 */
931 btrfs_free_log_root_tree(trans, root->fs_info); 931 btrfs_free_log_root_tree(trans, root->fs_info);
932 932
933 btrfs_free_reloc_mappings(root);
934
935 ret = btrfs_commit_tree_roots(trans, root); 933 ret = btrfs_commit_tree_roots(trans, root);
936 BUG_ON(ret); 934 BUG_ON(ret);
937 935