aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2009-06-10 10:45:14 -0400
committerChris Mason <chris.mason@oracle.com>2009-06-10 11:29:46 -0400
commit5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 (patch)
treec611d7d824cbcdb777dd2d8e33e2ed1c5df8a9c6 /fs/btrfs/disk-io.c
parent5c939df56c3ea018b58e5aa76181284c2053d699 (diff)
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng <zheng.yan@oracle.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c95
1 files changed, 50 insertions, 45 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b80c23..7f5c6e3e9992 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -36,7 +36,6 @@
36#include "print-tree.h" 36#include "print-tree.h"
37#include "async-thread.h" 37#include "async-thread.h"
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h"
40#include "tree-log.h" 39#include "tree-log.h"
41#include "free-space-cache.h" 40#include "free-space-cache.h"
42 41
@@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
884{ 883{
885 root->node = NULL; 884 root->node = NULL;
886 root->commit_root = NULL; 885 root->commit_root = NULL;
887 root->ref_tree = NULL;
888 root->sectorsize = sectorsize; 886 root->sectorsize = sectorsize;
889 root->nodesize = nodesize; 887 root->nodesize = nodesize;
890 root->leafsize = leafsize; 888 root->leafsize = leafsize;
@@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
899 root->last_inode_alloc = 0; 897 root->last_inode_alloc = 0;
900 root->name = NULL; 898 root->name = NULL;
901 root->in_sysfs = 0; 899 root->in_sysfs = 0;
900 root->inode_tree.rb_node = NULL;
902 901
903 INIT_LIST_HEAD(&root->dirty_list); 902 INIT_LIST_HEAD(&root->dirty_list);
904 INIT_LIST_HEAD(&root->orphan_list); 903 INIT_LIST_HEAD(&root->orphan_list);
905 INIT_LIST_HEAD(&root->dead_list); 904 INIT_LIST_HEAD(&root->root_list);
906 spin_lock_init(&root->node_lock); 905 spin_lock_init(&root->node_lock);
907 spin_lock_init(&root->list_lock); 906 spin_lock_init(&root->list_lock);
907 spin_lock_init(&root->inode_lock);
908 mutex_init(&root->objectid_mutex); 908 mutex_init(&root->objectid_mutex);
909 mutex_init(&root->log_mutex); 909 mutex_init(&root->log_mutex);
910 init_waitqueue_head(&root->log_writer_wait); 910 init_waitqueue_head(&root->log_writer_wait);
@@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
918 extent_io_tree_init(&root->dirty_log_pages, 918 extent_io_tree_init(&root->dirty_log_pages,
919 fs_info->btree_inode->i_mapping, GFP_NOFS); 919 fs_info->btree_inode->i_mapping, GFP_NOFS);
920 920
921 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
922 root->ref_tree = &root->ref_tree_struct;
923
924 memset(&root->root_key, 0, sizeof(root->root_key)); 921 memset(&root->root_key, 0, sizeof(root->root_key));
925 memset(&root->root_item, 0, sizeof(root->root_item)); 922 memset(&root->root_item, 0, sizeof(root->root_item));
926 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); 923 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
959 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 956 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
960 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 957 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
961 blocksize, generation); 958 blocksize, generation);
959 root->commit_root = btrfs_root_node(root);
962 BUG_ON(!root->node); 960 BUG_ON(!root->node);
963 return 0; 961 return 0;
964} 962}
@@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1025 */ 1023 */
1026 root->ref_cows = 0; 1024 root->ref_cows = 0;
1027 1025
1028 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 1026 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1029 0, BTRFS_TREE_LOG_OBJECTID, 1027 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
1030 trans->transid, 0, 0, 0);
1031 if (IS_ERR(leaf)) { 1028 if (IS_ERR(leaf)) {
1032 kfree(root); 1029 kfree(root);
1033 return ERR_CAST(leaf); 1030 return ERR_CAST(leaf);
1034 } 1031 }
1035 1032
1033 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1034 btrfs_set_header_bytenr(leaf, leaf->start);
1035 btrfs_set_header_generation(leaf, trans->transid);
1036 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1037 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1036 root->node = leaf; 1038 root->node = leaf;
1037 btrfs_set_header_nritems(root->node, 0);
1038 btrfs_set_header_level(root->node, 0);
1039 btrfs_set_header_bytenr(root->node, root->node->start);
1040 btrfs_set_header_generation(root->node, trans->transid);
1041 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
1042 1039
1043 write_extent_buffer(root->node, root->fs_info->fsid, 1040 write_extent_buffer(root->node, root->fs_info->fsid,
1044 (unsigned long)btrfs_header_fsid(root->node), 1041 (unsigned long)btrfs_header_fsid(root->node),
@@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1081 inode_item->nbytes = cpu_to_le64(root->leafsize); 1078 inode_item->nbytes = cpu_to_le64(root->leafsize);
1082 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 1079 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1083 1080
1084 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); 1081 btrfs_set_root_node(&log_root->root_item, log_root->node);
1085 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1086 1082
1087 WARN_ON(root->log_root); 1083 WARN_ON(root->log_root);
1088 root->log_root = log_root; 1084 root->log_root = log_root;
@@ -1144,6 +1140,7 @@ out:
1144 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1140 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1145 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1141 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1146 blocksize, generation); 1142 blocksize, generation);
1143 root->commit_root = btrfs_root_node(root);
1147 BUG_ON(!root->node); 1144 BUG_ON(!root->node);
1148insert: 1145insert:
1149 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1146 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1210 } 1207 }
1211 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 1208 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1212 ret = btrfs_find_dead_roots(fs_info->tree_root, 1209 ret = btrfs_find_dead_roots(fs_info->tree_root,
1213 root->root_key.objectid, root); 1210 root->root_key.objectid);
1214 BUG_ON(ret); 1211 BUG_ON(ret);
1215 btrfs_orphan_cleanup(root); 1212 btrfs_orphan_cleanup(root);
1216 } 1213 }
@@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1569 atomic_set(&fs_info->async_delalloc_pages, 0); 1566 atomic_set(&fs_info->async_delalloc_pages, 0);
1570 atomic_set(&fs_info->async_submit_draining, 0); 1567 atomic_set(&fs_info->async_submit_draining, 0);
1571 atomic_set(&fs_info->nr_async_bios, 0); 1568 atomic_set(&fs_info->nr_async_bios, 0);
1572 atomic_set(&fs_info->throttles, 0);
1573 atomic_set(&fs_info->throttle_gen, 0);
1574 fs_info->sb = sb; 1569 fs_info->sb = sb;
1575 fs_info->max_extent = (u64)-1; 1570 fs_info->max_extent = (u64)-1;
1576 fs_info->max_inline = 8192 * 1024; 1571 fs_info->max_inline = 8192 * 1024;
@@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1598 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 1593 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1599 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; 1594 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1600 1595
1596 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1601 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 1597 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1602 fs_info->btree_inode->i_mapping, 1598 fs_info->btree_inode->i_mapping,
1603 GFP_NOFS); 1599 GFP_NOFS);
@@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1609 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 fs_info->do_barriers = 1; 1610 fs_info->do_barriers = 1;
1615 1611
1616 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1617 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1618 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1619
1620 BTRFS_I(fs_info->btree_inode)->root = tree_root; 1612 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1621 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 1613 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1622 sizeof(struct btrfs_key)); 1614 sizeof(struct btrfs_key));
@@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1674 goto fail_iput; 1666 goto fail_iput;
1675 } 1667 }
1676 1668
1669 features = btrfs_super_incompat_flags(disk_super);
1670 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
1671 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1672 btrfs_set_super_incompat_flags(disk_super, features);
1673 }
1674
1677 features = btrfs_super_compat_ro_flags(disk_super) & 1675 features = btrfs_super_compat_ro_flags(disk_super) &
1678 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1676 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1679 if (!(sb->s_flags & MS_RDONLY) && features) { 1677 if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1771 if (ret) { 1769 if (ret) {
1772 printk(KERN_WARNING "btrfs: failed to read the system " 1770 printk(KERN_WARNING "btrfs: failed to read the system "
1773 "array on %s\n", sb->s_id); 1771 "array on %s\n", sb->s_id);
1774 goto fail_sys_array; 1772 goto fail_sb_buffer;
1775 } 1773 }
1776 1774
1777 blocksize = btrfs_level_size(tree_root, 1775 blocksize = btrfs_level_size(tree_root,
@@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1785 btrfs_super_chunk_root(disk_super), 1783 btrfs_super_chunk_root(disk_super),
1786 blocksize, generation); 1784 blocksize, generation);
1787 BUG_ON(!chunk_root->node); 1785 BUG_ON(!chunk_root->node);
1786 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1787 chunk_root->commit_root = btrfs_root_node(chunk_root);
1788 1788
1789 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, 1789 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1790 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 1790 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1810 blocksize, generation); 1810 blocksize, generation);
1811 if (!tree_root->node) 1811 if (!tree_root->node)
1812 goto fail_chunk_root; 1812 goto fail_chunk_root;
1813 1813 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1814 tree_root->commit_root = btrfs_root_node(tree_root);
1814 1815
1815 ret = find_and_setup_root(tree_root, fs_info, 1816 ret = find_and_setup_root(tree_root, fs_info,
1816 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 1817 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1820 1821
1821 ret = find_and_setup_root(tree_root, fs_info, 1822 ret = find_and_setup_root(tree_root, fs_info,
1822 BTRFS_DEV_TREE_OBJECTID, dev_root); 1823 BTRFS_DEV_TREE_OBJECTID, dev_root);
1823 dev_root->track_dirty = 1;
1824 if (ret) 1824 if (ret)
1825 goto fail_extent_root; 1825 goto fail_extent_root;
1826 dev_root->track_dirty = 1;
1826 1827
1827 ret = find_and_setup_root(tree_root, fs_info, 1828 ret = find_and_setup_root(tree_root, fs_info,
1828 BTRFS_CSUM_TREE_OBJECTID, csum_root); 1829 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1829 if (ret) 1830 if (ret)
1830 goto fail_extent_root; 1831 goto fail_dev_root;
1831 1832
1832 csum_root->track_dirty = 1; 1833 csum_root->track_dirty = 1;
1833 1834
@@ -1881,7 +1882,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1881 } 1882 }
1882 1883
1883 if (!(sb->s_flags & MS_RDONLY)) { 1884 if (!(sb->s_flags & MS_RDONLY)) {
1884 ret = btrfs_cleanup_reloc_trees(tree_root); 1885 ret = btrfs_recover_relocation(tree_root);
1885 BUG_ON(ret); 1886 BUG_ON(ret);
1886 } 1887 }
1887 1888
@@ -1908,14 +1909,19 @@ fail_cleaner:
1908 1909
1909fail_csum_root: 1910fail_csum_root:
1910 free_extent_buffer(csum_root->node); 1911 free_extent_buffer(csum_root->node);
1912 free_extent_buffer(csum_root->commit_root);
1913fail_dev_root:
1914 free_extent_buffer(dev_root->node);
1915 free_extent_buffer(dev_root->commit_root);
1911fail_extent_root: 1916fail_extent_root:
1912 free_extent_buffer(extent_root->node); 1917 free_extent_buffer(extent_root->node);
1918 free_extent_buffer(extent_root->commit_root);
1913fail_tree_root: 1919fail_tree_root:
1914 free_extent_buffer(tree_root->node); 1920 free_extent_buffer(tree_root->node);
1921 free_extent_buffer(tree_root->commit_root);
1915fail_chunk_root: 1922fail_chunk_root:
1916 free_extent_buffer(chunk_root->node); 1923 free_extent_buffer(chunk_root->node);
1917fail_sys_array: 1924 free_extent_buffer(chunk_root->commit_root);
1918 free_extent_buffer(dev_root->node);
1919fail_sb_buffer: 1925fail_sb_buffer:
1920 btrfs_stop_workers(&fs_info->fixup_workers); 1926 btrfs_stop_workers(&fs_info->fixup_workers);
1921 btrfs_stop_workers(&fs_info->delalloc_workers); 1927 btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2173,6 +2179,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
2173 2179
2174int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2180int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2175{ 2181{
2182 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2176 radix_tree_delete(&fs_info->fs_roots_radix, 2183 radix_tree_delete(&fs_info->fs_roots_radix,
2177 (unsigned long)root->root_key.objectid); 2184 (unsigned long)root->root_key.objectid);
2178 if (root->anon_super.s_dev) { 2185 if (root->anon_super.s_dev) {
@@ -2219,10 +2226,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2219 ARRAY_SIZE(gang)); 2226 ARRAY_SIZE(gang));
2220 if (!ret) 2227 if (!ret)
2221 break; 2228 break;
2229
2230 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2222 for (i = 0; i < ret; i++) { 2231 for (i = 0; i < ret; i++) {
2223 root_objectid = gang[i]->root_key.objectid; 2232 root_objectid = gang[i]->root_key.objectid;
2224 ret = btrfs_find_dead_roots(fs_info->tree_root, 2233 ret = btrfs_find_dead_roots(fs_info->tree_root,
2225 root_objectid, gang[i]); 2234 root_objectid);
2226 BUG_ON(ret); 2235 BUG_ON(ret);
2227 btrfs_orphan_cleanup(gang[i]); 2236 btrfs_orphan_cleanup(gang[i]);
2228 } 2237 }
@@ -2278,20 +2287,16 @@ int close_ctree(struct btrfs_root *root)
2278 (unsigned long long)fs_info->total_ref_cache_size); 2287 (unsigned long long)fs_info->total_ref_cache_size);
2279 } 2288 }
2280 2289
2281 if (fs_info->extent_root->node) 2290 free_extent_buffer(fs_info->extent_root->node);
2282 free_extent_buffer(fs_info->extent_root->node); 2291 free_extent_buffer(fs_info->extent_root->commit_root);
2283 2292 free_extent_buffer(fs_info->tree_root->node);
2284 if (fs_info->tree_root->node) 2293 free_extent_buffer(fs_info->tree_root->commit_root);
2285 free_extent_buffer(fs_info->tree_root->node); 2294 free_extent_buffer(root->fs_info->chunk_root->node);
2286 2295 free_extent_buffer(root->fs_info->chunk_root->commit_root);
2287 if (root->fs_info->chunk_root->node) 2296 free_extent_buffer(root->fs_info->dev_root->node);
2288 free_extent_buffer(root->fs_info->chunk_root->node); 2297 free_extent_buffer(root->fs_info->dev_root->commit_root);
2289 2298 free_extent_buffer(root->fs_info->csum_root->node);
2290 if (root->fs_info->dev_root->node) 2299 free_extent_buffer(root->fs_info->csum_root->commit_root);
2291 free_extent_buffer(root->fs_info->dev_root->node);
2292
2293 if (root->fs_info->csum_root->node)
2294 free_extent_buffer(root->fs_info->csum_root->node);
2295 2300
2296 btrfs_free_block_groups(root->fs_info); 2301 btrfs_free_block_groups(root->fs_info);
2297 2302