aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2009-06-10 10:45:14 -0400
committerChris Mason <chris.mason@oracle.com>2009-06-10 11:29:46 -0400
commit5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 (patch)
treec611d7d824cbcdb777dd2d8e33e2ed1c5df8a9c6 /fs/btrfs/tree-log.c
parent5c939df56c3ea018b58e5aa76181284c2053d699 (diff)
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng <zheng.yan@oracle.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c102
1 files changed, 36 insertions, 66 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212e8445..2b41fc08c34a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -430,18 +430,16 @@ no_copy:
430static noinline struct inode *read_one_inode(struct btrfs_root *root, 430static noinline struct inode *read_one_inode(struct btrfs_root *root,
431 u64 objectid) 431 u64 objectid)
432{ 432{
433 struct btrfs_key key;
433 struct inode *inode; 434 struct inode *inode;
434 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
435 if (inode->i_state & I_NEW) {
436 BTRFS_I(inode)->root = root;
437 BTRFS_I(inode)->location.objectid = objectid;
438 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
439 BTRFS_I(inode)->location.offset = 0;
440 btrfs_read_locked_inode(inode);
441 unlock_new_inode(inode);
442 435
443 } 436 key.objectid = objectid;
444 if (is_bad_inode(inode)) { 437 key.type = BTRFS_INODE_ITEM_KEY;
438 key.offset = 0;
439 inode = btrfs_iget(root->fs_info->sb, &key, root);
440 if (IS_ERR(inode)) {
441 inode = NULL;
442 } else if (is_bad_inode(inode)) {
445 iput(inode); 443 iput(inode);
446 inode = NULL; 444 inode = NULL;
447 } 445 }
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
541 539
542 if (found_type == BTRFS_FILE_EXTENT_REG || 540 if (found_type == BTRFS_FILE_EXTENT_REG ||
543 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 541 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
542 u64 offset;
544 unsigned long dest_offset; 543 unsigned long dest_offset;
545 struct btrfs_key ins; 544 struct btrfs_key ins;
546 545
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
555 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 554 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
556 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 555 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
557 ins.type = BTRFS_EXTENT_ITEM_KEY; 556 ins.type = BTRFS_EXTENT_ITEM_KEY;
557 offset = key->offset - btrfs_file_extent_offset(eb, item);
558 558
559 if (ins.objectid > 0) { 559 if (ins.objectid > 0) {
560 u64 csum_start; 560 u64 csum_start;
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
569 if (ret == 0) { 569 if (ret == 0) {
570 ret = btrfs_inc_extent_ref(trans, root, 570 ret = btrfs_inc_extent_ref(trans, root,
571 ins.objectid, ins.offset, 571 ins.objectid, ins.offset,
572 path->nodes[0]->start, 572 0, root->root_key.objectid,
573 root->root_key.objectid, 573 key->objectid, offset);
574 trans->transid, key->objectid);
575 } else { 574 } else {
576 /* 575 /*
577 * insert the extent pointer in the extent 576 * insert the extent pointer in the extent
578 * allocation tree 577 * allocation tree
579 */ 578 */
580 ret = btrfs_alloc_logged_extent(trans, root, 579 ret = btrfs_alloc_logged_file_extent(trans,
581 path->nodes[0]->start, 580 root, root->root_key.objectid,
582 root->root_key.objectid, 581 key->objectid, offset, &ins);
583 trans->transid, key->objectid,
584 &ins);
585 BUG_ON(ret); 582 BUG_ON(ret);
586 } 583 }
587 btrfs_release_path(root, path); 584 btrfs_release_path(root, path);
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1706 btrfs_wait_tree_block_writeback(next); 1703 btrfs_wait_tree_block_writeback(next);
1707 btrfs_tree_unlock(next); 1704 btrfs_tree_unlock(next);
1708 1705
1709 ret = btrfs_drop_leaf_ref(trans, root, next);
1710 BUG_ON(ret);
1711
1712 WARN_ON(root_owner != 1706 WARN_ON(root_owner !=
1713 BTRFS_TREE_LOG_OBJECTID); 1707 BTRFS_TREE_LOG_OBJECTID);
1714 ret = btrfs_free_reserved_extent(root, 1708 ret = btrfs_free_reserved_extent(root,
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1753 btrfs_wait_tree_block_writeback(next); 1747 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1748 btrfs_tree_unlock(next);
1755 1749
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1750 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1751 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret); 1752 BUG_ON(ret);
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1811 btrfs_wait_tree_block_writeback(next); 1801 btrfs_wait_tree_block_writeback(next);
1812 btrfs_tree_unlock(next); 1802 btrfs_tree_unlock(next);
1813 1803
1814 if (*level == 0) {
1815 ret = btrfs_drop_leaf_ref(trans, root,
1816 next);
1817 BUG_ON(ret);
1818 }
1819
1820 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1804 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1821 ret = btrfs_free_reserved_extent(root, 1805 ret = btrfs_free_reserved_extent(root,
1822 path->nodes[*level]->start, 1806 path->nodes[*level]->start,
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1884 btrfs_wait_tree_block_writeback(next); 1868 btrfs_wait_tree_block_writeback(next);
1885 btrfs_tree_unlock(next); 1869 btrfs_tree_unlock(next);
1886 1870
1887 if (orig_level == 0) {
1888 ret = btrfs_drop_leaf_ref(trans, log,
1889 next);
1890 BUG_ON(ret);
1891 }
1892 WARN_ON(log->root_key.objectid != 1871 WARN_ON(log->root_key.objectid !=
1893 BTRFS_TREE_LOG_OBJECTID); 1872 BTRFS_TREE_LOG_OBJECTID);
1894 ret = btrfs_free_reserved_extent(log, next->start, 1873 ret = btrfs_free_reserved_extent(log, next->start,
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
2028 BUG_ON(ret); 2007 BUG_ON(ret);
2029 2008
2030 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2009 btrfs_set_root_node(&log->root_item, log->node);
2031 btrfs_set_root_generation(&log->root_item, trans->transid);
2032 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2033 2010
2034 root->log_batch = 0; 2011 root->log_batch = 0;
2035 root->log_transid++; 2012 root->log_transid++;
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2581 ins_keys, ins_sizes, nr); 2558 ins_keys, ins_sizes, nr);
2582 BUG_ON(ret); 2559 BUG_ON(ret);
2583 2560
2584 for (i = 0; i < nr; i++) { 2561 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2585 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2562 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2586 dst_path->slots[0]); 2563 dst_path->slots[0]);
2587 2564
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2617 found_type = btrfs_file_extent_type(src, extent); 2594 found_type = btrfs_file_extent_type(src, extent);
2618 if (found_type == BTRFS_FILE_EXTENT_REG || 2595 if (found_type == BTRFS_FILE_EXTENT_REG ||
2619 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2596 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2620 u64 ds = btrfs_file_extent_disk_bytenr(src, 2597 u64 ds, dl, cs, cl;
2621 extent); 2598 ds = btrfs_file_extent_disk_bytenr(src,
2622 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2599 extent);
2623 extent); 2600 /* ds == 0 is a hole */
2624 u64 cs = btrfs_file_extent_offset(src, extent); 2601 if (ds == 0)
2625 u64 cl = btrfs_file_extent_num_bytes(src, 2602 continue;
2626 extent);; 2603
2604 dl = btrfs_file_extent_disk_num_bytes(src,
2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);;
2627 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2628 extent)) { 2610 extent)) {
2629 cs = 0; 2611 cs = 0;
2630 cl = dl; 2612 cl = dl;
2631 } 2613 }
2632 /* ds == 0 is a hole */ 2614
2633 if (ds != 0) { 2615 ret = btrfs_lookup_csums_range(
2634 ret = btrfs_inc_extent_ref(trans, log, 2616 log->fs_info->csum_root,
2635 ds, dl, 2617 ds + cs, ds + cs + cl - 1,
2636 dst_path->nodes[0]->start, 2618 &ordered_sums);
2637 BTRFS_TREE_LOG_OBJECTID, 2619 BUG_ON(ret);
2638 trans->transid,
2639 ins_keys[i].objectid);
2640 BUG_ON(ret);
2641 ret = btrfs_lookup_csums_range(
2642 log->fs_info->csum_root,
2643 ds + cs, ds + cs + cl - 1,
2644 &ordered_sums);
2645 BUG_ON(ret);
2646 }
2647 } 2620 }
2648 } 2621 }
2649 dst_path->slots[0]++;
2650 } 2622 }
2651 2623
2652 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2624 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
@@ -3029,9 +3001,7 @@ again:
3029 BUG_ON(!wc.replay_dest); 3001 BUG_ON(!wc.replay_dest);
3030 3002
3031 wc.replay_dest->log_root = log; 3003 wc.replay_dest->log_root = log;
3032 mutex_lock(&fs_info->trans_mutex); 3004 btrfs_record_root_in_trans(trans, wc.replay_dest);
3033 btrfs_record_root_in_trans(wc.replay_dest);
3034 mutex_unlock(&fs_info->trans_mutex);
3035 ret = walk_log_tree(trans, log, &wc); 3005 ret = walk_log_tree(trans, log, &wc);
3036 BUG_ON(ret); 3006 BUG_ON(ret);
3037 3007