aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2009-06-10 10:45:14 -0400
committerChris Mason <chris.mason@oracle.com>2009-06-10 11:29:46 -0400
commit5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 (patch)
treec611d7d824cbcdb777dd2d8e33e2ed1c5df8a9c6 /fs/btrfs/inode.c
parent5c939df56c3ea018b58e5aa76181284c2053d699 (diff)
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng <zheng.yan@oracle.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c132
1 files changed, 73 insertions, 59 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8b0190d031..917bf10597c6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -48,7 +48,6 @@
48#include "ordered-data.h" 48#include "ordered-data.h"
49#include "xattr.h" 49#include "xattr.h"
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h"
52#include "compression.h" 51#include "compression.h"
53#include "locking.h" 52#include "locking.h"
54 53
@@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
944 u64 cow_start; 943 u64 cow_start;
945 u64 cur_offset; 944 u64 cur_offset;
946 u64 extent_end; 945 u64 extent_end;
946 u64 extent_offset;
947 u64 disk_bytenr; 947 u64 disk_bytenr;
948 u64 num_bytes; 948 u64 num_bytes;
949 int extent_type; 949 int extent_type;
@@ -1005,6 +1005,7 @@ next_slot:
1005 if (extent_type == BTRFS_FILE_EXTENT_REG || 1005 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1006 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1006 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1007 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1007 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1008 extent_offset = btrfs_file_extent_offset(leaf, fi);
1008 extent_end = found_key.offset + 1009 extent_end = found_key.offset +
1009 btrfs_file_extent_num_bytes(leaf, fi); 1010 btrfs_file_extent_num_bytes(leaf, fi);
1010 if (extent_end <= start) { 1011 if (extent_end <= start) {
@@ -1022,9 +1023,10 @@ next_slot:
1022 if (btrfs_extent_readonly(root, disk_bytenr)) 1023 if (btrfs_extent_readonly(root, disk_bytenr))
1023 goto out_check; 1024 goto out_check;
1024 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 1025 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1025 disk_bytenr)) 1026 found_key.offset -
1027 extent_offset, disk_bytenr))
1026 goto out_check; 1028 goto out_check;
1027 disk_bytenr += btrfs_file_extent_offset(leaf, fi); 1029 disk_bytenr += extent_offset;
1028 disk_bytenr += cur_offset - found_key.offset; 1030 disk_bytenr += cur_offset - found_key.offset;
1029 num_bytes = min(end + 1, extent_end) - cur_offset; 1031 num_bytes = min(end + 1, extent_end) - cur_offset;
1030 /* 1032 /*
@@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1489 ins.objectid = disk_bytenr; 1491 ins.objectid = disk_bytenr;
1490 ins.offset = disk_num_bytes; 1492 ins.offset = disk_num_bytes;
1491 ins.type = BTRFS_EXTENT_ITEM_KEY; 1493 ins.type = BTRFS_EXTENT_ITEM_KEY;
1492 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1494 ret = btrfs_alloc_reserved_file_extent(trans, root,
1493 root->root_key.objectid, 1495 root->root_key.objectid,
1494 trans->transid, inode->i_ino, &ins); 1496 inode->i_ino, file_pos, &ins);
1495 BUG_ON(ret); 1497 BUG_ON(ret);
1496 btrfs_free_path(path); 1498 btrfs_free_path(path);
1497 1499
@@ -1956,23 +1958,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
1956 * crossing root thing. we store the inode number in the 1958 * crossing root thing. we store the inode number in the
1957 * offset of the orphan item. 1959 * offset of the orphan item.
1958 */ 1960 */
1959 inode = btrfs_iget_locked(root->fs_info->sb, 1961 found_key.objectid = found_key.offset;
1960 found_key.offset, root); 1962 found_key.type = BTRFS_INODE_ITEM_KEY;
1961 if (!inode) 1963 found_key.offset = 0;
1964 inode = btrfs_iget(root->fs_info->sb, &found_key, root);
1965 if (IS_ERR(inode))
1962 break; 1966 break;
1963 1967
1964 if (inode->i_state & I_NEW) {
1965 BTRFS_I(inode)->root = root;
1966
1967 /* have to set the location manually */
1968 BTRFS_I(inode)->location.objectid = inode->i_ino;
1969 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1970 BTRFS_I(inode)->location.offset = 0;
1971
1972 btrfs_read_locked_inode(inode);
1973 unlock_new_inode(inode);
1974 }
1975
1976 /* 1968 /*
1977 * add this inode to the orphan list so btrfs_orphan_del does 1969 * add this inode to the orphan list so btrfs_orphan_del does
1978 * the proper thing when we hit it 1970 * the proper thing when we hit it
@@ -2069,7 +2061,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2069/* 2061/*
2070 * read an inode from the btree into the in-memory inode 2062 * read an inode from the btree into the in-memory inode
2071 */ 2063 */
2072void btrfs_read_locked_inode(struct inode *inode) 2064static void btrfs_read_locked_inode(struct inode *inode)
2073{ 2065{
2074 struct btrfs_path *path; 2066 struct btrfs_path *path;
2075 struct extent_buffer *leaf; 2067 struct extent_buffer *leaf;
@@ -2599,9 +2591,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2599 struct btrfs_file_extent_item *fi; 2591 struct btrfs_file_extent_item *fi;
2600 u64 extent_start = 0; 2592 u64 extent_start = 0;
2601 u64 extent_num_bytes = 0; 2593 u64 extent_num_bytes = 0;
2594 u64 extent_offset = 0;
2602 u64 item_end = 0; 2595 u64 item_end = 0;
2603 u64 root_gen = 0;
2604 u64 root_owner = 0;
2605 int found_extent; 2596 int found_extent;
2606 int del_item; 2597 int del_item;
2607 int pending_del_nr = 0; 2598 int pending_del_nr = 0;
@@ -2716,6 +2707,9 @@ search_again:
2716 extent_num_bytes = 2707 extent_num_bytes =
2717 btrfs_file_extent_disk_num_bytes(leaf, 2708 btrfs_file_extent_disk_num_bytes(leaf,
2718 fi); 2709 fi);
2710 extent_offset = found_key.offset -
2711 btrfs_file_extent_offset(leaf, fi);
2712
2719 /* FIXME blocksize != 4096 */ 2713 /* FIXME blocksize != 4096 */
2720 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 2714 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2721 if (extent_start != 0) { 2715 if (extent_start != 0) {
@@ -2723,8 +2717,6 @@ search_again:
2723 if (root->ref_cows) 2717 if (root->ref_cows)
2724 inode_sub_bytes(inode, num_dec); 2718 inode_sub_bytes(inode, num_dec);
2725 } 2719 }
2726 root_gen = btrfs_header_generation(leaf);
2727 root_owner = btrfs_header_owner(leaf);
2728 } 2720 }
2729 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2721 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2730 /* 2722 /*
@@ -2768,12 +2760,12 @@ delete:
2768 } else { 2760 } else {
2769 break; 2761 break;
2770 } 2762 }
2771 if (found_extent) { 2763 if (found_extent && root->ref_cows) {
2772 btrfs_set_path_blocking(path); 2764 btrfs_set_path_blocking(path);
2773 ret = btrfs_free_extent(trans, root, extent_start, 2765 ret = btrfs_free_extent(trans, root, extent_start,
2774 extent_num_bytes, 2766 extent_num_bytes, 0,
2775 leaf->start, root_owner, 2767 btrfs_header_owner(leaf),
2776 root_gen, inode->i_ino, 0); 2768 inode->i_ino, extent_offset);
2777 BUG_ON(ret); 2769 BUG_ON(ret);
2778 } 2770 }
2779next: 2771next:
@@ -3105,6 +3097,45 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3105 return 0; 3097 return 0;
3106} 3098}
3107 3099
3100static void inode_tree_add(struct inode *inode)
3101{
3102 struct btrfs_root *root = BTRFS_I(inode)->root;
3103 struct btrfs_inode *entry;
3104 struct rb_node **p = &root->inode_tree.rb_node;
3105 struct rb_node *parent = NULL;
3106
3107 spin_lock(&root->inode_lock);
3108 while (*p) {
3109 parent = *p;
3110 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3111
3112 if (inode->i_ino < entry->vfs_inode.i_ino)
3113 p = &(*p)->rb_left;
3114 else if (inode->i_ino > entry->vfs_inode.i_ino)
3115 p = &(*p)->rb_right;
3116 else {
3117 WARN_ON(!(entry->vfs_inode.i_state &
3118 (I_WILL_FREE | I_FREEING | I_CLEAR)));
3119 break;
3120 }
3121 }
3122 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
3123 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3124 spin_unlock(&root->inode_lock);
3125}
3126
3127static void inode_tree_del(struct inode *inode)
3128{
3129 struct btrfs_root *root = BTRFS_I(inode)->root;
3130
3131 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3132 spin_lock(&root->inode_lock);
3133 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3134 spin_unlock(&root->inode_lock);
3135 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3136 }
3137}
3138
3108static noinline void init_btrfs_i(struct inode *inode) 3139static noinline void init_btrfs_i(struct inode *inode)
3109{ 3140{
3110 struct btrfs_inode *bi = BTRFS_I(inode); 3141 struct btrfs_inode *bi = BTRFS_I(inode);
@@ -3130,6 +3161,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3130 inode->i_mapping, GFP_NOFS); 3161 inode->i_mapping, GFP_NOFS);
3131 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3162 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3132 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3163 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3164 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3133 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3165 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3134 mutex_init(&BTRFS_I(inode)->extent_mutex); 3166 mutex_init(&BTRFS_I(inode)->extent_mutex);
3135 mutex_init(&BTRFS_I(inode)->log_mutex); 3167 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3152,26 +3184,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
3152 args->root == BTRFS_I(inode)->root; 3184 args->root == BTRFS_I(inode)->root;
3153} 3185}
3154 3186
3155struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, 3187static struct inode *btrfs_iget_locked(struct super_block *s,
3156 struct btrfs_root *root, int wait) 3188 u64 objectid,
3157{ 3189 struct btrfs_root *root)
3158 struct inode *inode;
3159 struct btrfs_iget_args args;
3160 args.ino = objectid;
3161 args.root = root;
3162
3163 if (wait) {
3164 inode = ilookup5(s, objectid, btrfs_find_actor,
3165 (void *)&args);
3166 } else {
3167 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3168 (void *)&args);
3169 }
3170 return inode;
3171}
3172
3173struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3174 struct btrfs_root *root)
3175{ 3190{
3176 struct inode *inode; 3191 struct inode *inode;
3177 struct btrfs_iget_args args; 3192 struct btrfs_iget_args args;
@@ -3188,24 +3203,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3188 * Returns in *is_new if the inode was read from disk 3203 * Returns in *is_new if the inode was read from disk
3189 */ 3204 */
3190struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3205struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3191 struct btrfs_root *root, int *is_new) 3206 struct btrfs_root *root)
3192{ 3207{
3193 struct inode *inode; 3208 struct inode *inode;
3194 3209
3195 inode = btrfs_iget_locked(s, location->objectid, root); 3210 inode = btrfs_iget_locked(s, location->objectid, root);
3196 if (!inode) 3211 if (!inode)
3197 return ERR_PTR(-EACCES); 3212 return ERR_PTR(-ENOMEM);
3198 3213
3199 if (inode->i_state & I_NEW) { 3214 if (inode->i_state & I_NEW) {
3200 BTRFS_I(inode)->root = root; 3215 BTRFS_I(inode)->root = root;
3201 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3216 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3202 btrfs_read_locked_inode(inode); 3217 btrfs_read_locked_inode(inode);
3218
3219 inode_tree_add(inode);
3203 unlock_new_inode(inode); 3220 unlock_new_inode(inode);
3204 if (is_new)
3205 *is_new = 1;
3206 } else {
3207 if (is_new)
3208 *is_new = 0;
3209 } 3221 }
3210 3222
3211 return inode; 3223 return inode;
@@ -3218,7 +3230,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3218 struct btrfs_root *root = bi->root; 3230 struct btrfs_root *root = bi->root;
3219 struct btrfs_root *sub_root = root; 3231 struct btrfs_root *sub_root = root;
3220 struct btrfs_key location; 3232 struct btrfs_key location;
3221 int ret, new; 3233 int ret;
3222 3234
3223 if (dentry->d_name.len > BTRFS_NAME_LEN) 3235 if (dentry->d_name.len > BTRFS_NAME_LEN)
3224 return ERR_PTR(-ENAMETOOLONG); 3236 return ERR_PTR(-ENAMETOOLONG);
@@ -3236,7 +3248,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3236 return ERR_PTR(ret); 3248 return ERR_PTR(ret);
3237 if (ret > 0) 3249 if (ret > 0)
3238 return ERR_PTR(-ENOENT); 3250 return ERR_PTR(-ENOENT);
3239 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); 3251 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3240 if (IS_ERR(inode)) 3252 if (IS_ERR(inode))
3241 return ERR_CAST(inode); 3253 return ERR_CAST(inode);
3242 } 3254 }
@@ -3631,6 +3643,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3631 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3643 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3632 3644
3633 insert_inode_hash(inode); 3645 insert_inode_hash(inode);
3646 inode_tree_add(inode);
3634 return inode; 3647 return inode;
3635fail: 3648fail:
3636 if (dir) 3649 if (dir)
@@ -4683,6 +4696,7 @@ void btrfs_destroy_inode(struct inode *inode)
4683 btrfs_put_ordered_extent(ordered); 4696 btrfs_put_ordered_extent(ordered);
4684 } 4697 }
4685 } 4698 }
4699 inode_tree_del(inode);
4686 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 4700 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4687 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 4701 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4688} 4702}