author    Chris Mason <chris.mason@oracle.com>  2009-02-04 09:27:02 -0500
committer Chris Mason <chris.mason@oracle.com>  2009-02-04 09:27:02 -0500
commit    bd56b30205bc09da0beb80d4ba3d4c7309792da5
tree      a5cb3104687b27e923b73b2840f053abc1229a92  /fs/btrfs/extent-tree.c
parent    b4ce94de9b4d64e8ab3cf155d13653c666e22b9b
Btrfs: Make btrfs_drop_snapshot work in larger and more efficient chunks
Every transaction in btrfs creates a new snapshot, and then schedules the snapshot from the last transaction for deletion. Snapshot deletion works by walking down the btree and dropping the reference counts on each btree block during the walk.

If a given leaf or node has a reference count greater than one, the reference count is decremented and the subtree pointed to by that node is ignored. If the reference count is one, walking continues down into that node or leaf, and the references of everything it points to are decremented.

The old code would try to work in small pieces, walking down the tree until it found the lowest leaf or node to free and then returning. This was very friendly to the rest of the FS because it didn't have a huge impact on other operations. But it wouldn't always keep up with the rate that new commits added new snapshots for deletion, and it wasn't very optimal for the extent allocation tree because it wasn't finding leaves that were close together on disk and processing them at the same time.

This changes things to walk down to a level 1 node and then process it in bulk. All the leaf pointers are sorted and the leaves are dropped in order based on their extent number. The extent allocation tree and commit code are now fast enough for this kind of bulk processing to work without slowing the rest of the FS down. Overall it does less IO and is better able to keep up with snapshot deletions under high load.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
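[Editor's note] The heart of the change is a two-pass pattern: collect a (bytenr, slot) pair for every reference in a block, sort the array by bytenr, and only then do the freeing so the extent allocation tree is touched in something close to disk order. Below is a minimal sketch of that pattern. struct refsort mirrors the one in extent-tree.c, but refsort_cmp, drop_refs_sorted() and process_ref() here are illustrative stand-ins rather than the exact helpers from the patch.

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/sort.h>

/* matches a byte number on disk to a slot in the btree block */
struct refsort {
	u64 bytenr;
	u32 slot;
};

/* illustrative comparison callback for sort(): ascending bytenr order */
static int refsort_cmp(const void *a, const void *b)
{
	const struct refsort *ra = a;
	const struct refsort *rb = b;

	if (ra->bytenr < rb->bytenr)
		return -1;
	if (ra->bytenr > rb->bytenr)
		return 1;
	return 0;
}

/*
 * sketch of the two-pass pattern: pass one records every reference that
 * needs dropping, pass two processes them in sorted (disk) order.
 * process_ref() is a hypothetical stand-in for __btrfs_free_extent().
 */
static int drop_refs_sorted(const u64 *bytenrs, int nritems,
			    int (*process_ref)(u64 bytenr, int slot))
{
	struct refsort *sorted;
	int i;
	int refi = 0;
	int ret = 0;

	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
	if (!sorted)
		return -ENOMEM;

	/* pass one: collect (bytenr, slot) pairs, skipping holes */
	for (i = 0; i < nritems; i++) {
		if (bytenrs[i] == 0)
			continue;
		sorted[refi].bytenr = bytenrs[i];
		sorted[refi].slot = i;
		refi++;
	}

	/* sort by byte number so the freeing walks the disk in order */
	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);

	/* pass two: drop the references in ascending bytenr order */
	for (i = 0; i < refi; i++) {
		ret = process_ref(sorted[i].bytenr, sorted[i].slot);
		if (ret)
			break;
	}

	kfree(sorted);
	return ret;
}

In the patch itself, btrfs_drop_leaf_ref() and drop_level_one_refs() follow this same shape, but they also skip blocks whose reference count is greater than one and consult the leaf reference cache before reading any leaf from disk.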
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c  306
1 file changed, 262 insertions(+), 44 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ed1e25d72483..1d3e9262a9da 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1533,6 +1533,11 @@ out:
  * struct refsort is used to match byte number to slot in the btree block.
  * we sort based on the byte number and then use the slot to actually
  * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
  */
 struct refsort {
 	u64 bytenr;
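[Editor's note] The size bound in the comment added above can be made concrete. The figures below assume the usual packed on-disk layouts (a 17-byte btrfs_disk_key, so a 33-byte btrfs_key_ptr and a 25-byte btrfs_item) and a 4K btree block; they are an illustration, not part of the patch.

/*
 * illustrative bound (not part of the patch): every slot in a node costs
 * at least sizeof(struct btrfs_key_ptr) = 33 bytes on disk, so a 4K block
 * holds at most 4096 / 33 = 124 pointers.  A struct refsort is at most
 * 16 bytes in memory (u64 + u32, padded), so the kmalloc'd array is at
 * most 124 * 16 = 1984 bytes -- comfortably under one 4K page.
 */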
@@ -3457,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
+	struct refsort *sorted;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
+	int refi = 0;
+	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	/* we do this loop twice.  The first time we build a list
+	 * of the extents we have a reference on, then we sort the list
+	 * by bytenr.  The second time around we actually do the
+	 * extent freeing.
+	 */
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
+
+		/* only extents have references, skip everything else */
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
+
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+		/* inline extents live in the btree, they don't have refs */
 		if (btrfs_file_extent_type(leaf, fi) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
-		/*
-		 * FIXME make sure to insert a trans record that
-		 * repeats the snapshot del on crash
-		 */
+
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		/* holes don't have refs */
 		if (disk_bytenr == 0)
 			continue;
 
+		sorted[refi].bytenr = disk_bytenr;
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		u64 disk_bytenr;
+
+		disk_bytenr = sorted[i].bytenr;
+		slot = sorted[i].slot;
+
+		cond_resched();
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
@@ -3497,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
 	}
+out:
+	kfree(sorted);
 	return 0;
 }
 
@@ -3506,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int ret;
-	struct btrfs_extent_info *info = ref->extents;
+	struct btrfs_extent_info *info;
+	struct refsort *sorted;
+
+	if (ref->nritems == 0)
+		return 0;
 
+	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+	for (i = 0; i < ref->nritems; i++) {
+		sorted[i].bytenr = ref->extents[i].bytenr;
+		sorted[i].slot = i;
+	}
+	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the items in the ref were sorted when the ref was inserted
+	 * into the ref cache, so this is already in order
+	 */
 	for (i = 0; i < ref->nritems; i++) {
+		info = ref->extents + sorted[i].slot;
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
@@ -3566,6 +3626,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 }
 
 /*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 root_owner;
+	u64 root_gen;
+	struct extent_buffer *eb = path->nodes[1];
+	struct extent_buffer *leaf;
+	struct btrfs_leaf_ref *ref;
+	struct refsort *sorted = NULL;
+	int nritems = btrfs_header_nritems(eb);
+	int ret;
+	int i;
+	int refi = 0;
+	int slot = path->slots[1];
+	u32 blocksize = btrfs_level_size(root, 0);
+	u32 refs;
+
+	if (nritems == 0)
+		goto out;
+
+	root_owner = btrfs_header_owner(eb);
+	root_gen = btrfs_header_generation(eb);
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+	/*
+	 * step one, sort all the leaf pointers so we don't scribble
+	 * randomly into the extent allocation tree
+	 */
+	for (i = slot; i < nritems; i++) {
+		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	/*
+	 * nritems won't be zero, but if we're picking up drop_snapshot
+	 * after a crash, slot might be > 0, so double check things
+	 * just in case.
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the first loop frees everything the leaves point to
+	 */
+	for (i = 0; i < refi; i++) {
+		u64 ptr_gen;
+
+		bytenr = sorted[i].bytenr;
+
+		/*
+		 * check the reference count on this leaf.  If it is > 1
+		 * we just decrement it below and don't update any
+		 * of the refs the leaf points to.
+		 */
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+
+		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+		/*
+		 * the leaf only had one reference, which means the
+		 * only thing pointing to this leaf is the snapshot
+		 * we're deleting.  It isn't possible for the reference
+		 * count to increase again later
+		 *
+		 * The reference cache is checked for the leaf,
+		 * and if found we'll be able to drop any refs held by
+		 * the leaf without needing to read it in.
+		 */
+		ref = btrfs_lookup_leaf_ref(root, bytenr);
+		if (ref && ref->generation != ptr_gen) {
+			btrfs_free_leaf_ref(root, ref);
+			ref = NULL;
+		}
+		if (ref) {
+			ret = cache_drop_leaf_ref(trans, root, ref);
+			BUG_ON(ret);
+			btrfs_remove_leaf_ref(root, ref);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			/*
+			 * the leaf wasn't in the reference cache, so
+			 * we have to read it.
+			 */
+			leaf = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
+			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+			BUG_ON(ret);
+			free_extent_buffer(leaf);
+		}
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+
+	/*
+	 * run through the loop again to free the refs on the leaves.
+	 * This is faster than doing it in the loop above because
+	 * the leaves are likely to be clustered together.  We end up
+	 * working in nice chunks on the extent allocation tree.
+	 */
+	for (i = 0; i < refi; i++) {
+		bytenr = sorted[i].bytenr;
+		ret = __btrfs_free_extent(trans, root, bytenr,
+					blocksize, eb->start,
+					root_owner, root_gen, 0, 1);
+		BUG_ON(ret);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+out:
+	kfree(sorted);
+
+	/*
+	 * update the path to show we've processed the entire level 1
+	 * node.  This will get saved into the root's drop_snapshot_progress
+	 * field so these drops are not repeated again if this transaction
+	 * commits.
+	 */
+	path->slots[1] = nritems;
+	return 0;
+}
+
+/*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
@@ -3580,7 +3786,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
-	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -3607,17 +3812,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
 			break;
+
+		/* the new code goes down to level 1 and does all the
+		 * leaves pointed to by that node in bulk.  So, this check
+		 * for level 0 will always be false.
+		 *
+		 * But, the disk format allows the drop_snapshot_progress
+		 * field in the root to leave things in a state where
+		 * a leaf will need cleaning up here.  If someone crashes
+		 * with the old code and then boots with the new code,
+		 * we might find a leaf here.
+		 */
 		if (*level == 0) {
 			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
+
+		/*
+		 * once we get to level one, process the whole node
+		 * at once, including everything below it.
+		 */
+		if (*level == 1) {
+			ret = drop_level_one_refs(trans, root, path);
+			BUG_ON(ret);
+			break;
+		}
+
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
+
+		/*
+		 * if there is more than one reference, we don't need
+		 * to read that node to drop any references it has.  We
+		 * just drop the ref we hold on that node and move on to the
+		 * next slot in this level.
+		 */
 		if (refs != 1) {
 			parent = path->nodes[*level];
 			root_owner = btrfs_header_owner(parent);
@@ -3636,46 +3870,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 			continue;
 		}
+
 		/*
-		 * at this point, we have a single ref, and since the
-		 * only place referencing this extent is a dead root
-		 * the reference count should never go higher.
-		 * So, we don't need to check it again
+		 * we need to keep freeing things in the next level down.
+		 * read the block and loop around to process it
 		 */
-		if (*level == 1) {
-			ref = btrfs_lookup_leaf_ref(root, bytenr);
-			if (ref && ref->generation != ptr_gen) {
-				btrfs_free_leaf_ref(root, ref);
-				ref = NULL;
-			}
-			if (ref) {
-				ret = cache_drop_leaf_ref(trans, root, ref);
-				BUG_ON(ret);
-				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(root, ref);
-				*level = 0;
-				break;
-			}
-		}
-		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-			free_extent_buffer(next);
-
-			next = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			cond_resched();
-#if 0
-			/*
-			 * this is a debugging check and can go away
-			 * the ref should never go all the way down to 1
-			 * at this point
-			 */
-			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-						&refs);
-			BUG_ON(ret);
-			WARN_ON(refs != 1);
-#endif
-		}
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			free_extent_buffer(path->nodes[*level-1]);
@@ -3700,11 +3900,16 @@ out:
 		root_owner = btrfs_header_owner(parent);
 		root_gen = btrfs_header_generation(parent);
 
+		/*
+		 * cleanup and free the reference on the last node
+		 * we processed
+		 */
 		ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 					  parent->start, root_owner, root_gen,
 					  *level, 1);
 		free_extent_buffer(path->nodes[*level]);
 		path->nodes[*level] = NULL;
+
 		*level += 1;
 		BUG_ON(ret);
 
@@ -3824,6 +4029,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
 			struct btrfs_disk_key disk_key;
+
+			/*
+			 * there is more work to do in this level.
+			 * Update the drop_progress marker to reflect
+			 * the work we've done so far, and then bump
+			 * the slot number
+			 */
 			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
@@ -3835,6 +4047,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			struct extent_buffer *parent;
+
+			/*
+			 * this whole node is done, free our reference
+			 * on it and go up one level
+			 */
 			if (path->nodes[*level] == root->node)
 				parent = path->nodes[*level];
 			else
@@ -4849,6 +5066,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 	ref->bytenr = buf->start;
 	ref->owner = btrfs_header_owner(buf);
 	ref->generation = btrfs_header_generation(buf);
+
 	ret = btrfs_add_leaf_ref(root, ref, 0);
 	WARN_ON(ret);
 	btrfs_free_leaf_ref(root, ref);