aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c306
1 files changed, 262 insertions, 44 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ed1e25d72483..1d3e9262a9da 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1533,6 +1533,11 @@ out:
1533 * struct refsort is used to match byte number to slot in the btree block. 1533 * struct refsort is used to match byte number to slot in the btree block.
1534 * we sort based on the byte number and then use the slot to actually 1534 * we sort based on the byte number and then use the slot to actually
1535 * find the item. 1535 * find the item.
1536 *
1537 * struct refsort is smaller than strcut btrfs_item and smaller than
1538 * struct btrfs_key_ptr. Since we're currently limited to the page size
1539 * for a btree block, there's no way for a kmalloc of refsorts for a
1540 * single node to be bigger than a page.
1536 */ 1541 */
1537struct refsort { 1542struct refsort {
1538 u64 bytenr; 1543 u64 bytenr;
@@ -3457,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3457{ 3462{
3458 u64 leaf_owner; 3463 u64 leaf_owner;
3459 u64 leaf_generation; 3464 u64 leaf_generation;
3465 struct refsort *sorted;
3460 struct btrfs_key key; 3466 struct btrfs_key key;
3461 struct btrfs_file_extent_item *fi; 3467 struct btrfs_file_extent_item *fi;
3462 int i; 3468 int i;
3463 int nritems; 3469 int nritems;
3464 int ret; 3470 int ret;
3471 int refi = 0;
3472 int slot;
3465 3473
3466 BUG_ON(!btrfs_is_leaf(leaf)); 3474 BUG_ON(!btrfs_is_leaf(leaf));
3467 nritems = btrfs_header_nritems(leaf); 3475 nritems = btrfs_header_nritems(leaf);
3468 leaf_owner = btrfs_header_owner(leaf); 3476 leaf_owner = btrfs_header_owner(leaf);
3469 leaf_generation = btrfs_header_generation(leaf); 3477 leaf_generation = btrfs_header_generation(leaf);
3470 3478
3479 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3480 /* we do this loop twice. The first time we build a list
3481 * of the extents we have a reference on, then we sort the list
3482 * by bytenr. The second time around we actually do the
3483 * extent freeing.
3484 */
3471 for (i = 0; i < nritems; i++) { 3485 for (i = 0; i < nritems; i++) {
3472 u64 disk_bytenr; 3486 u64 disk_bytenr;
3473 cond_resched(); 3487 cond_resched();
3474 3488
3475 btrfs_item_key_to_cpu(leaf, &key, i); 3489 btrfs_item_key_to_cpu(leaf, &key, i);
3490
3491 /* only extents have references, skip everything else */
3476 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3492 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3477 continue; 3493 continue;
3494
3478 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3495 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3496
3497 /* inline extents live in the btree, they don't have refs */
3479 if (btrfs_file_extent_type(leaf, fi) == 3498 if (btrfs_file_extent_type(leaf, fi) ==
3480 BTRFS_FILE_EXTENT_INLINE) 3499 BTRFS_FILE_EXTENT_INLINE)
3481 continue; 3500 continue;
3482 /* 3501
3483 * FIXME make sure to insert a trans record that
3484 * repeats the snapshot del on crash
3485 */
3486 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3502 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3503
3504 /* holes don't have refs */
3487 if (disk_bytenr == 0) 3505 if (disk_bytenr == 0)
3488 continue; 3506 continue;
3489 3507
3508 sorted[refi].bytenr = disk_bytenr;
3509 sorted[refi].slot = i;
3510 refi++;
3511 }
3512
3513 if (refi == 0)
3514 goto out;
3515
3516 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3517
3518 for (i = 0; i < refi; i++) {
3519 u64 disk_bytenr;
3520
3521 disk_bytenr = sorted[i].bytenr;
3522 slot = sorted[i].slot;
3523
3524 cond_resched();
3525
3526 btrfs_item_key_to_cpu(leaf, &key, slot);
3527 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3528 continue;
3529
3530 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3531
3490 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3532 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3491 btrfs_file_extent_disk_num_bytes(leaf, fi), 3533 btrfs_file_extent_disk_num_bytes(leaf, fi),
3492 leaf->start, leaf_owner, leaf_generation, 3534 leaf->start, leaf_owner, leaf_generation,
@@ -3497,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3497 wake_up(&root->fs_info->transaction_throttle); 3539 wake_up(&root->fs_info->transaction_throttle);
3498 cond_resched(); 3540 cond_resched();
3499 } 3541 }
3542out:
3543 kfree(sorted);
3500 return 0; 3544 return 0;
3501} 3545}
3502 3546
@@ -3506,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3506{ 3550{
3507 int i; 3551 int i;
3508 int ret; 3552 int ret;
3509 struct btrfs_extent_info *info = ref->extents; 3553 struct btrfs_extent_info *info;
3554 struct refsort *sorted;
3555
3556 if (ref->nritems == 0)
3557 return 0;
3510 3558
3559 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3560 for (i = 0; i < ref->nritems; i++) {
3561 sorted[i].bytenr = ref->extents[i].bytenr;
3562 sorted[i].slot = i;
3563 }
3564 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3565
3566 /*
3567 * the items in the ref were sorted when the ref was inserted
3568 * into the ref cache, so this is already in order
3569 */
3511 for (i = 0; i < ref->nritems; i++) { 3570 for (i = 0; i < ref->nritems; i++) {
3571 info = ref->extents + sorted[i].slot;
3512 ret = __btrfs_free_extent(trans, root, info->bytenr, 3572 ret = __btrfs_free_extent(trans, root, info->bytenr,
3513 info->num_bytes, ref->bytenr, 3573 info->num_bytes, ref->bytenr,
3514 ref->owner, ref->generation, 3574 ref->owner, ref->generation,
@@ -3566,6 +3626,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3566} 3626}
3567 3627
3568/* 3628/*
3629 * this is used while deleting old snapshots, and it drops the refs
3630 * on a whole subtree starting from a level 1 node.
3631 *
3632 * The idea is to sort all the leaf pointers, and then drop the
3633 * ref on all the leaves in order. Most of the time the leaves
3634 * will have ref cache entries, so no leaf IOs will be required to
3635 * find the extents they have references on.
3636 *
3637 * For each leaf, any references it has are also dropped in order
3638 *
3639 * This ends up dropping the references in something close to optimal
3640 * order for reading and modifying the extent allocation tree.
3641 */
3642static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3643 struct btrfs_root *root,
3644 struct btrfs_path *path)
3645{
3646 u64 bytenr;
3647 u64 root_owner;
3648 u64 root_gen;
3649 struct extent_buffer *eb = path->nodes[1];
3650 struct extent_buffer *leaf;
3651 struct btrfs_leaf_ref *ref;
3652 struct refsort *sorted = NULL;
3653 int nritems = btrfs_header_nritems(eb);
3654 int ret;
3655 int i;
3656 int refi = 0;
3657 int slot = path->slots[1];
3658 u32 blocksize = btrfs_level_size(root, 0);
3659 u32 refs;
3660
3661 if (nritems == 0)
3662 goto out;
3663
3664 root_owner = btrfs_header_owner(eb);
3665 root_gen = btrfs_header_generation(eb);
3666 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3667
3668 /*
3669 * step one, sort all the leaf pointers so we don't scribble
3670 * randomly into the extent allocation tree
3671 */
3672 for (i = slot; i < nritems; i++) {
3673 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3674 sorted[refi].slot = i;
3675 refi++;
3676 }
3677
3678 /*
3679 * nritems won't be zero, but if we're picking up drop_snapshot
3680 * after a crash, slot might be > 0, so double check things
3681 * just in case.
3682 */
3683 if (refi == 0)
3684 goto out;
3685
3686 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3687
3688 /*
3689 * the first loop frees everything the leaves point to
3690 */
3691 for (i = 0; i < refi; i++) {
3692 u64 ptr_gen;
3693
3694 bytenr = sorted[i].bytenr;
3695
3696 /*
3697 * check the reference count on this leaf. If it is > 1
3698 * we just decrement it below and don't update any
3699 * of the refs the leaf points to.
3700 */
3701 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3702 BUG_ON(ret);
3703 if (refs != 1)
3704 continue;
3705
3706 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3707
3708 /*
3709 * the leaf only had one reference, which means the
3710 * only thing pointing to this leaf is the snapshot
3711 * we're deleting. It isn't possible for the reference
3712 * count to increase again later
3713 *
3714 * The reference cache is checked for the leaf,
3715 * and if found we'll be able to drop any refs held by
3716 * the leaf without needing to read it in.
3717 */
3718 ref = btrfs_lookup_leaf_ref(root, bytenr);
3719 if (ref && ref->generation != ptr_gen) {
3720 btrfs_free_leaf_ref(root, ref);
3721 ref = NULL;
3722 }
3723 if (ref) {
3724 ret = cache_drop_leaf_ref(trans, root, ref);
3725 BUG_ON(ret);
3726 btrfs_remove_leaf_ref(root, ref);
3727 btrfs_free_leaf_ref(root, ref);
3728 } else {
3729 /*
3730 * the leaf wasn't in the reference cache, so
3731 * we have to read it.
3732 */
3733 leaf = read_tree_block(root, bytenr, blocksize,
3734 ptr_gen);
3735 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3736 BUG_ON(ret);
3737 free_extent_buffer(leaf);
3738 }
3739 atomic_inc(&root->fs_info->throttle_gen);
3740 wake_up(&root->fs_info->transaction_throttle);
3741 cond_resched();
3742 }
3743
3744 /*
3745 * run through the loop again to free the refs on the leaves.
3746 * This is faster than doing it in the loop above because
3747 * the leaves are likely to be clustered together. We end up
3748 * working in nice chunks on the extent allocation tree.
3749 */
3750 for (i = 0; i < refi; i++) {
3751 bytenr = sorted[i].bytenr;
3752 ret = __btrfs_free_extent(trans, root, bytenr,
3753 blocksize, eb->start,
3754 root_owner, root_gen, 0, 1);
3755 BUG_ON(ret);
3756
3757 atomic_inc(&root->fs_info->throttle_gen);
3758 wake_up(&root->fs_info->transaction_throttle);
3759 cond_resched();
3760 }
3761out:
3762 kfree(sorted);
3763
3764 /*
3765 * update the path to show we've processed the entire level 1
3766 * node. This will get saved into the root's drop_snapshot_progress
3767 * field so these drops are not repeated again if this transaction
3768 * commits.
3769 */
3770 path->slots[1] = nritems;
3771 return 0;
3772}
3773
3774/*
3569 * helper function for drop_snapshot, this walks down the tree dropping ref 3775 * helper function for drop_snapshot, this walks down the tree dropping ref
3570 * counts as it goes. 3776 * counts as it goes.
3571 */ 3777 */
@@ -3580,7 +3786,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3580 struct extent_buffer *next; 3786 struct extent_buffer *next;
3581 struct extent_buffer *cur; 3787 struct extent_buffer *cur;
3582 struct extent_buffer *parent; 3788 struct extent_buffer *parent;
3583 struct btrfs_leaf_ref *ref;
3584 u32 blocksize; 3789 u32 blocksize;
3585 int ret; 3790 int ret;
3586 u32 refs; 3791 u32 refs;
@@ -3607,17 +3812,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3607 if (path->slots[*level] >= 3812 if (path->slots[*level] >=
3608 btrfs_header_nritems(cur)) 3813 btrfs_header_nritems(cur))
3609 break; 3814 break;
3815
3816 /* the new code goes down to level 1 and does all the
3817 * leaves pointed to that node in bulk. So, this check
3818 * for level 0 will always be false.
3819 *
3820 * But, the disk format allows the drop_snapshot_progress
3821 * field in the root to leave things in a state where
3822 * a leaf will need cleaning up here. If someone crashes
3823 * with the old code and then boots with the new code,
3824 * we might find a leaf here.
3825 */
3610 if (*level == 0) { 3826 if (*level == 0) {
3611 ret = btrfs_drop_leaf_ref(trans, root, cur); 3827 ret = btrfs_drop_leaf_ref(trans, root, cur);
3612 BUG_ON(ret); 3828 BUG_ON(ret);
3613 break; 3829 break;
3614 } 3830 }
3831
3832 /*
3833 * once we get to level one, process the whole node
3834 * at once, including everything below it.
3835 */
3836 if (*level == 1) {
3837 ret = drop_level_one_refs(trans, root, path);
3838 BUG_ON(ret);
3839 break;
3840 }
3841
3615 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 3842 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3616 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3843 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3617 blocksize = btrfs_level_size(root, *level - 1); 3844 blocksize = btrfs_level_size(root, *level - 1);
3618 3845
3619 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3846 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3620 BUG_ON(ret); 3847 BUG_ON(ret);
3848
3849 /*
3850 * if there is more than one reference, we don't need
3851 * to read that node to drop any references it has. We
3852 * just drop the ref we hold on that node and move on to the
3853 * next slot in this level.
3854 */
3621 if (refs != 1) { 3855 if (refs != 1) {
3622 parent = path->nodes[*level]; 3856 parent = path->nodes[*level];
3623 root_owner = btrfs_header_owner(parent); 3857 root_owner = btrfs_header_owner(parent);
@@ -3636,46 +3870,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3636 3870
3637 continue; 3871 continue;
3638 } 3872 }
3873
3639 /* 3874 /*
3640 * at this point, we have a single ref, and since the 3875 * we need to keep freeing things in the next level down.
3641 * only place referencing this extent is a dead root 3876 * read the block and loop around to process it
3642 * the reference count should never go higher.
3643 * So, we don't need to check it again
3644 */ 3877 */
3645 if (*level == 1) { 3878 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3646 ref = btrfs_lookup_leaf_ref(root, bytenr);
3647 if (ref && ref->generation != ptr_gen) {
3648 btrfs_free_leaf_ref(root, ref);
3649 ref = NULL;
3650 }
3651 if (ref) {
3652 ret = cache_drop_leaf_ref(trans, root, ref);
3653 BUG_ON(ret);
3654 btrfs_remove_leaf_ref(root, ref);
3655 btrfs_free_leaf_ref(root, ref);
3656 *level = 0;
3657 break;
3658 }
3659 }
3660 next = btrfs_find_tree_block(root, bytenr, blocksize);
3661 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3662 free_extent_buffer(next);
3663
3664 next = read_tree_block(root, bytenr, blocksize,
3665 ptr_gen);
3666 cond_resched();
3667#if 0
3668 /*
3669 * this is a debugging check and can go away
3670 * the ref should never go all the way down to 1
3671 * at this point
3672 */
3673 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3674 &refs);
3675 BUG_ON(ret);
3676 WARN_ON(refs != 1);
3677#endif
3678 }
3679 WARN_ON(*level <= 0); 3879 WARN_ON(*level <= 0);
3680 if (path->nodes[*level-1]) 3880 if (path->nodes[*level-1])
3681 free_extent_buffer(path->nodes[*level-1]); 3881 free_extent_buffer(path->nodes[*level-1]);
@@ -3700,11 +3900,16 @@ out:
3700 root_owner = btrfs_header_owner(parent); 3900 root_owner = btrfs_header_owner(parent);
3701 root_gen = btrfs_header_generation(parent); 3901 root_gen = btrfs_header_generation(parent);
3702 3902
3903 /*
3904 * cleanup and free the reference on the last node
3905 * we processed
3906 */
3703 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3907 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3704 parent->start, root_owner, root_gen, 3908 parent->start, root_owner, root_gen,
3705 *level, 1); 3909 *level, 1);
3706 free_extent_buffer(path->nodes[*level]); 3910 free_extent_buffer(path->nodes[*level]);
3707 path->nodes[*level] = NULL; 3911 path->nodes[*level] = NULL;
3912
3708 *level += 1; 3913 *level += 1;
3709 BUG_ON(ret); 3914 BUG_ON(ret);
3710 3915
@@ -3824,6 +4029,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3824 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4029 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3825 struct extent_buffer *node; 4030 struct extent_buffer *node;
3826 struct btrfs_disk_key disk_key; 4031 struct btrfs_disk_key disk_key;
4032
4033 /*
4034 * there is more work to do in this level.
4035 * Update the drop_progress marker to reflect
4036 * the work we've done so far, and then bump
4037 * the slot number
4038 */
3827 node = path->nodes[i]; 4039 node = path->nodes[i];
3828 path->slots[i]++; 4040 path->slots[i]++;
3829 *level = i; 4041 *level = i;
@@ -3835,6 +4047,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3835 return 0; 4047 return 0;
3836 } else { 4048 } else {
3837 struct extent_buffer *parent; 4049 struct extent_buffer *parent;
4050
4051 /*
4052 * this whole node is done, free our reference
4053 * on it and go up one level
4054 */
3838 if (path->nodes[*level] == root->node) 4055 if (path->nodes[*level] == root->node)
3839 parent = path->nodes[*level]; 4056 parent = path->nodes[*level];
3840 else 4057 else
@@ -4849,6 +5066,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4849 ref->bytenr = buf->start; 5066 ref->bytenr = buf->start;
4850 ref->owner = btrfs_header_owner(buf); 5067 ref->owner = btrfs_header_owner(buf);
4851 ref->generation = btrfs_header_generation(buf); 5068 ref->generation = btrfs_header_generation(buf);
5069
4852 ret = btrfs_add_leaf_ref(root, ref, 0); 5070 ret = btrfs_add_leaf_ref(root, ref, 0);
4853 WARN_ON(ret); 5071 WARN_ON(ret);
4854 btrfs_free_leaf_ref(root, ref); 5072 btrfs_free_leaf_ref(root, ref);