Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c | 438
1 file changed, 365 insertions(+), 73 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..7527523c2d2d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
-#include <linux/version.h>
+#include <linux/sort.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -30,7 +30,6 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
-#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -326,10 +325,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 					     u64 flags)
 {
 	struct list_head *head = &info->space_info;
-	struct list_head *cur;
 	struct btrfs_space_info *found;
-	list_for_each(cur, head) {
-		found = list_entry(cur, struct btrfs_space_info, list);
+	list_for_each_entry(found, head, list) {
 		if (found->flags == flags)
 			return found;
 	}
@@ -1525,15 +1522,55 @@ out:
 	return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
-		  u32 *nr_extents)
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to. The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr. Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
+ */
+struct refsort {
+	u64 bytenr;
+	u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+	const struct refsort *a = a_void;
+	const struct refsort *b = b_void;
+
+	if (a->bytenr < b->bytenr)
+		return -1;
+	if (a->bytenr > b->bytenr)
+		return 1;
+	return 0;
+}
+
+
+noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *orig_buf,
+			   struct extent_buffer *buf, u32 *nr_extents)
 {
 	u64 bytenr;
 	u64 ref_root;
 	u64 orig_root;
 	u64 ref_generation;
 	u64 orig_generation;
+	struct refsort *sorted;
 	u32 nritems;
 	u32 nr_file_extents = 0;
 	struct btrfs_key key;
@@ -1542,6 +1579,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int level;
 	int ret = 0;
 	int faili = 0;
+	int refi = 0;
+	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, u64, u64);
 
@@ -1553,6 +1592,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
+	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
+	BUG_ON(!sorted);
+
 	if (root->ref_cows) {
 		process_func = __btrfs_inc_extent_ref;
 	} else {
@@ -1565,6 +1607,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		process_func = __btrfs_update_extent_ref;
 	}
 
+	/*
+	 * we make two passes through the items. In the first pass we
+	 * only record the byte number and slot. Then we sort based on
+	 * byte number and do the actual work based on the sorted results
+	 */
 	for (i = 0; i < nritems; i++) {
 		cond_resched();
 		if (level == 0) {
@@ -1581,6 +1628,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 
 			nr_file_extents++;
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		} else {
+			bytenr = btrfs_node_blockptr(buf, i);
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		}
+	}
+	/*
+	 * if refi == 0, we didn't actually put anything into the sorted
+	 * array and we're done
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		cond_resched();
+		slot = sorted[i].slot;
+		bytenr = sorted[i].bytenr;
+
+		if (level == 0) {
+			btrfs_item_key_to_cpu(buf, &key, slot);
 
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
@@ -1589,25 +1662,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   key.objectid);
 
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		}
 	}
 out:
+	kfree(sorted);
 	if (nr_extents) {
 		if (level == 0)
 			*nr_extents = nr_file_extents;
@@ -1616,6 +1689,7 @@ out:
 	}
 	return 0;
 fail:
+	kfree(sorted);
 	WARN_ON(1);
 	return ret;
 }
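The sorting scheme the hunks above introduce — record (bytenr, slot) pairs in a first pass, sort(), then do the reference updates in ascending bytenr order — can be exercised on its own. Below is a minimal userspace sketch, assuming libc qsort() in place of the kernel's sort() and an invented process_ref() callback with made-up block pointer values; it illustrates the pattern, not the btrfs implementation.

```c
/* Standalone sketch of the refsort idea: collect (bytenr, slot) pairs,
 * sort by bytenr, then process in ascending disk order.  Uses libc
 * qsort() instead of the kernel's sort(); process_ref() and the sample
 * block pointers are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct refsort {
	uint64_t bytenr;	/* disk byte number of the referenced extent */
	uint32_t slot;		/* slot in the btree block it came from */
};

static int refsort_cmp(const void *a_void, const void *b_void)
{
	const struct refsort *a = a_void;
	const struct refsort *b = b_void;

	if (a->bytenr < b->bytenr)
		return -1;
	if (a->bytenr > b->bytenr)
		return 1;
	return 0;
}

static void process_ref(uint64_t bytenr, uint32_t slot)
{
	printf("update ref at bytenr %llu (slot %u)\n",
	       (unsigned long long)bytenr, (unsigned int)slot);
}

int main(void)
{
	/* pretend these came out of a btree block in pointer order */
	uint64_t blockptrs[] = { 883949568, 12582912, 529530880, 12587008 };
	int nritems = sizeof(blockptrs) / sizeof(blockptrs[0]);
	struct refsort *sorted;
	int refi = 0;
	int i;

	sorted = calloc(nritems, sizeof(*sorted));
	if (!sorted)
		return 1;

	/* pass one: record byte number and slot only */
	for (i = 0; i < nritems; i++) {
		sorted[refi].bytenr = blockptrs[i];
		sorted[refi].slot = i;
		refi++;
	}

	/* pass two: sort, then do the real work in disk order */
	qsort(sorted, refi, sizeof(*sorted), refsort_cmp);
	for (i = 0; i < refi; i++)
		process_ref(sorted[i].bytenr, sorted[i].slot);

	free(sorted);
	return 0;
}
```

Note that the comparator compares instead of subtracting: truncating a 64-bit difference into the int return value could misreport the order of distant byte numbers, which is presumably why refsort_cmp is written with explicit comparisons.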
@@ -2159,7 +2233,8 @@ again:
 	ret = find_first_extent_bit(&info->extent_ins, search, &start,
 				    &end, EXTENT_WRITEBACK);
 	if (ret) {
-		if (skipped && all && !num_inserts) {
+		if (skipped && all && !num_inserts &&
+		    list_empty(&update_list)) {
 			skipped = 0;
 			search = 0;
 			continue;
@@ -2547,6 +2622,7 @@ again:
 	if (ret) {
 		if (all && skipped && !nr) {
 			search = 0;
+			skipped = 0;
 			continue;
 		}
 		mutex_unlock(&info->extent_ins_mutex);
@@ -2700,13 +2776,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	/* if metadata always pin */
 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			struct btrfs_block_group_cache *cache;
-
-			/* btrfs_free_reserved_extent */
-			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-			BUG_ON(!cache);
-			btrfs_add_free_space(cache, bytenr, num_bytes);
-			put_block_group(cache);
+			mutex_lock(&root->fs_info->pinned_mutex);
+			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
@@ -3014,7 +3086,6 @@ loop_check:
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 {
 	struct btrfs_block_group_cache *cache;
-	struct list_head *l;
 
 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
@@ -3022,8 +3093,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       (info->full) ? "" : "not ");
 
 	down_read(&info->groups_sem);
-	list_for_each(l, &info->block_groups) {
-		cache = list_entry(l, struct btrfs_block_group_cache, list);
+	list_for_each_entry(cache, &info->block_groups, list) {
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
 		       "%llu pinned %llu reserved\n",
@@ -3342,7 +3412,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
+
+	btrfs_set_lock_blocking(buf);
 	btrfs_set_buffer_uptodate(buf);
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		set_extent_dirty(&root->dirty_log_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3424,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	}
 	trans->blocks_used++;
+	/* this returns a buffer locked for blocking */
 	return buf;
 }
 
@@ -3388,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
+	struct refsort *sorted;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
+	int refi = 0;
+	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	/* we do this loop twice. The first time we build a list
+	 * of the extents we have a reference on, then we sort the list
+	 * by bytenr. The second time around we actually do the
+	 * extent freeing.
+	 */
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
+
+		/* only extents have references, skip everything else */
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
+
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+		/* inline extents live in the btree, they don't have refs */
 		if (btrfs_file_extent_type(leaf, fi) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
-		/*
-		 * FIXME make sure to insert a trans record that
-		 * repeats the snapshot del on crash
-		 */
+
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		/* holes don't have refs */
 		if (disk_bytenr == 0)
 			continue;
 
+		sorted[refi].bytenr = disk_bytenr;
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		u64 disk_bytenr;
+
+		disk_bytenr = sorted[i].bytenr;
+		slot = sorted[i].slot;
+
+		cond_resched();
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
 	}
+out:
+	kfree(sorted);
 	return 0;
 }
 
@@ -3437,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int ret;
-	struct btrfs_extent_info *info = ref->extents;
+	struct btrfs_extent_info *info;
+	struct refsort *sorted;
+
+	if (ref->nritems == 0)
+		return 0;
 
+	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
 	for (i = 0; i < ref->nritems; i++) {
+		sorted[i].bytenr = ref->extents[i].bytenr;
+		sorted[i].slot = i;
+	}
+	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the items in the ref were sorted when the ref was inserted
+	 * into the ref cache, so this is already in order
+	 */
+	for (i = 0; i < ref->nritems; i++) {
+		info = ref->extents + sorted[i].slot;
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
@@ -3453,6 +3582,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		info++;
 	}
 
+	kfree(sorted);
 	return 0;
 }
 
@@ -3497,6 +3627,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 }
 
 /*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order. Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 root_owner;
+	u64 root_gen;
+	struct extent_buffer *eb = path->nodes[1];
+	struct extent_buffer *leaf;
+	struct btrfs_leaf_ref *ref;
+	struct refsort *sorted = NULL;
+	int nritems = btrfs_header_nritems(eb);
+	int ret;
+	int i;
+	int refi = 0;
+	int slot = path->slots[1];
+	u32 blocksize = btrfs_level_size(root, 0);
+	u32 refs;
+
+	if (nritems == 0)
+		goto out;
+
+	root_owner = btrfs_header_owner(eb);
+	root_gen = btrfs_header_generation(eb);
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+	/*
+	 * step one, sort all the leaf pointers so we don't scribble
+	 * randomly into the extent allocation tree
+	 */
+	for (i = slot; i < nritems; i++) {
+		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	/*
+	 * nritems won't be zero, but if we're picking up drop_snapshot
+	 * after a crash, slot might be > 0, so double check things
+	 * just in case.
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the first loop frees everything the leaves point to
+	 */
+	for (i = 0; i < refi; i++) {
+		u64 ptr_gen;
+
+		bytenr = sorted[i].bytenr;
+
+		/*
+		 * check the reference count on this leaf. If it is > 1
+		 * we just decrement it below and don't update any
+		 * of the refs the leaf points to.
+		 */
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+
+		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+		/*
+		 * the leaf only had one reference, which means the
+		 * only thing pointing to this leaf is the snapshot
+		 * we're deleting. It isn't possible for the reference
+		 * count to increase again later
+		 *
+		 * The reference cache is checked for the leaf,
+		 * and if found we'll be able to drop any refs held by
+		 * the leaf without needing to read it in.
+		 */
+		ref = btrfs_lookup_leaf_ref(root, bytenr);
+		if (ref && ref->generation != ptr_gen) {
+			btrfs_free_leaf_ref(root, ref);
+			ref = NULL;
+		}
+		if (ref) {
+			ret = cache_drop_leaf_ref(trans, root, ref);
+			BUG_ON(ret);
+			btrfs_remove_leaf_ref(root, ref);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			/*
+			 * the leaf wasn't in the reference cache, so
+			 * we have to read it.
+			 */
+			leaf = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
+			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+			BUG_ON(ret);
+			free_extent_buffer(leaf);
+		}
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+
+	/*
+	 * run through the loop again to free the refs on the leaves.
+	 * This is faster than doing it in the loop above because
+	 * the leaves are likely to be clustered together. We end up
+	 * working in nice chunks on the extent allocation tree.
+	 */
+	for (i = 0; i < refi; i++) {
+		bytenr = sorted[i].bytenr;
+		ret = __btrfs_free_extent(trans, root, bytenr,
+					  blocksize, eb->start,
+					  root_owner, root_gen, 0, 1);
+		BUG_ON(ret);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+out:
+	kfree(sorted);
+
+	/*
+	 * update the path to show we've processed the entire level 1
+	 * node. This will get saved into the root's drop_snapshot_progress
+	 * field so these drops are not repeated again if this transaction
+	 * commits.
+	 */
+	path->slots[1] = nritems;
+	return 0;
+}
+
+/*
 * helper function for drop_snapshot, this walks down the tree dropping ref
 * counts as it goes.
 */
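drop_level_one_refs leans on the same idea at a larger scale: the extent allocation tree is keyed by byte number, so walking the leaf pointers in sorted order keeps the updates clustered instead of scattering them across that tree. The toy program below (plain userspace C with invented block pointer values, not btrfs code) just measures the total distance between consecutive updates before and after sorting, to make the effect the comments describe visible.

```c
/* Toy illustration of why the leaf pointers are sorted first: the sum of
 * distances between consecutive updates collapses once they are walked in
 * ascending bytenr order.  The values are made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int cmp_u64(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a;
	uint64_t y = *(const uint64_t *)b;

	if (x < y)
		return -1;
	if (x > y)
		return 1;
	return 0;
}

/* sum of |next - prev| over the update sequence: a rough stand-in for seeks */
static uint64_t total_span(const uint64_t *bytenrs, int n)
{
	uint64_t span = 0;
	int i;

	for (i = 1; i < n; i++)
		span += bytenrs[i] > bytenrs[i - 1] ?
			bytenrs[i] - bytenrs[i - 1] :
			bytenrs[i - 1] - bytenrs[i];
	return span;
}

int main(void)
{
	/* pretend these are the block pointers found in one level 1 node */
	uint64_t ptrs[] = { 912261120, 12845056, 530579456, 13107200,
			    911212544, 531628032 };
	int n = sizeof(ptrs) / sizeof(ptrs[0]);

	printf("unsorted span: %llu\n",
	       (unsigned long long)total_span(ptrs, n));

	qsort(ptrs, n, sizeof(ptrs[0]), cmp_u64);
	printf("sorted span:   %llu\n",
	       (unsigned long long)total_span(ptrs, n));
	return 0;
}
```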
@@ -3511,7 +3787,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
-	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -3538,17 +3813,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
 			break;
+
+		/* the new code goes down to level 1 and does all the
+		 * leaves pointed to that node in bulk. So, this check
+		 * for level 0 will always be false.
+		 *
+		 * But, the disk format allows the drop_snapshot_progress
+		 * field in the root to leave things in a state where
+		 * a leaf will need cleaning up here. If someone crashes
+		 * with the old code and then boots with the new code,
+		 * we might find a leaf here.
+		 */
 		if (*level == 0) {
 			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
+
+		/*
+		 * once we get to level one, process the whole node
+		 * at once, including everything below it.
+		 */
+		if (*level == 1) {
+			ret = drop_level_one_refs(trans, root, path);
+			BUG_ON(ret);
+			break;
+		}
+
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
+
+		/*
+		 * if there is more than one reference, we don't need
+		 * to read that node to drop any references it has. We
+		 * just drop the ref we hold on that node and move on to the
+		 * next slot in this level.
+		 */
 		if (refs != 1) {
 			parent = path->nodes[*level];
 			root_owner = btrfs_header_owner(parent);
@@ -3567,46 +3871,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 			continue;
 		}
+
 		/*
-		 * at this point, we have a single ref, and since the
-		 * only place referencing this extent is a dead root
-		 * the reference count should never go higher.
-		 * So, we don't need to check it again
+		 * we need to keep freeing things in the next level down.
+		 * read the block and loop around to process it
 		 */
-		if (*level == 1) {
-			ref = btrfs_lookup_leaf_ref(root, bytenr);
-			if (ref && ref->generation != ptr_gen) {
-				btrfs_free_leaf_ref(root, ref);
-				ref = NULL;
-			}
-			if (ref) {
-				ret = cache_drop_leaf_ref(trans, root, ref);
-				BUG_ON(ret);
-				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(root, ref);
-				*level = 0;
-				break;
-			}
-		}
-		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-			free_extent_buffer(next);
-
-			next = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			cond_resched();
-#if 0
-			/*
-			 * this is a debugging check and can go away
-			 * the ref should never go all the way down to 1
-			 * at this point
-			 */
-			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-						&refs);
-			BUG_ON(ret);
-			WARN_ON(refs != 1);
-#endif
-		}
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +3901,16 @@ out:
 		root_owner = btrfs_header_owner(parent);
 		root_gen = btrfs_header_generation(parent);
 
+		/*
+		 * cleanup and free the reference on the last node
+		 * we processed
+		 */
 		ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 					  parent->start, root_owner, root_gen,
 					  *level, 1);
 		free_extent_buffer(path->nodes[*level]);
 		path->nodes[*level] = NULL;
+
 		*level += 1;
 		BUG_ON(ret);
 
@@ -3687,6 +3962,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
 
 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
 					      &refs);
@@ -3754,6 +4030,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
 			struct btrfs_disk_key disk_key;
+
+			/*
+			 * there is more work to do in this level.
+			 * Update the drop_progress marker to reflect
+			 * the work we've done so far, and then bump
+			 * the slot number
+			 */
 			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
@@ -3765,6 +4048,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			struct extent_buffer *parent;
+
+			/*
+			 * this whole node is done, free our reference
+			 * on it and go up one level
+			 */
 			if (path->nodes[*level] == root->node)
 				parent = path->nodes[*level];
 			else
@@ -4444,7 +4732,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	u64 lock_end = 0;
 	u64 num_bytes;
 	u64 ext_offset;
-	u64 first_pos;
+	u64 search_end = (u64)-1;
 	u32 nritems;
 	int nr_scaned = 0;
 	int extent_locked = 0;
@@ -4452,7 +4740,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	memcpy(&key, leaf_key, sizeof(key));
-	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
 		if (key.objectid < ref_path->owner_objectid ||
 		    (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +4788,7 @@ next:
 			if ((key.objectid > ref_path->owner_objectid) ||
 			    (key.objectid == ref_path->owner_objectid &&
 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
-			    (key.offset >= first_pos + extent_key->offset))
+			    key.offset >= search_end)
 				break;
 		}
 
@@ -4534,8 +4821,10 @@ next:
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 		ext_offset = btrfs_file_extent_offset(leaf, fi);
 
-		if (first_pos > key.offset - ext_offset)
-			first_pos = key.offset - ext_offset;
+		if (search_end == (u64)-1) {
+			search_end = key.offset - ext_offset +
+				btrfs_file_extent_ram_bytes(leaf, fi);
+		}
 
 		if (!extent_locked) {
 			lock_start = key.offset;
@@ -4724,7 +5013,7 @@ next:
 		}
skip:
 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
-		    key.offset >= first_pos + extent_key->offset)
+		    key.offset >= search_end)
 			break;
 
 		cond_resched();
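The replace_one_extent hunks above swap the old first_pos bound for search_end, derived from the first file extent item found that references the extent being relocated: the extent's data starts at file offset key.offset - ext_offset and covers ram_bytes from there, so no later item pointing into it can begin at or past that end offset (using ram_bytes rather than the on-disk length presumably keeps the bound correct for compressed extents, whose on-disk size is smaller than the data they expand to). A worked example with made-up numbers:

```c
/* Worked example (invented values) of the new search_end calculation. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t key_offset = 1048576;	/* item starts at 1MiB in the file */
	uint64_t ext_offset = 262144;	/* item skips 256KiB into the extent */
	uint64_t ram_bytes = 1048576;	/* extent covers 1MiB of file data */
	uint64_t search_end;

	/* same formula the patch applies once the first extent item is seen */
	search_end = key_offset - ext_offset + ram_bytes;

	/* 1MiB - 256KiB + 1MiB = 1792KiB = 1835008 */
	printf("search_end = %llu\n", (unsigned long long)search_end);
	return 0;
}
```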
@@ -4778,6 +5067,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 	ref->bytenr = buf->start;
 	ref->owner = btrfs_header_owner(buf);
 	ref->generation = btrfs_header_generation(buf);
+
 	ret = btrfs_add_leaf_ref(root, ref, 0);
 	WARN_ON(ret);
 	btrfs_free_leaf_ref(root, ref);
@@ -5957,9 +6247,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	btrfs_remove_free_space_cache(block_group);
+	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	btrfs_remove_free_space_cache(block_group);
 	down_write(&block_group->space_info->groups_sem);
 	list_del(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
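The last hunk reorders btrfs_remove_block_group so the group is erased from the block group cache rbtree, under block_group_cache_lock, before its free space cache is torn down: unpublish first, then dismantle, so concurrent lookups cannot find a half-destroyed group. A generic sketch of that ordering, using a hypothetical pthread-protected index rather than btrfs structures:

```c
/* Generic unpublish-then-teardown sketch (hypothetical cache type), showing
 * the ordering the patch moves to: remove the entry from the shared index
 * under the lock first, then free its internals outside the lock.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct cache_entry {
	char *data;			/* per-entry payload */
	struct cache_entry *next;	/* linkage in the shared index */
};

static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cache_entry *index_head;	/* shared lookup structure */

static void remove_entry(struct cache_entry *entry)
{
	struct cache_entry **p;

	/* step 1: unpublish while holding the index lock, so concurrent
	 * lookups either see the entry fully alive or not at all */
	pthread_mutex_lock(&index_lock);
	for (p = &index_head; *p; p = &(*p)->next) {
		if (*p == entry) {
			*p = entry->next;
			break;
		}
	}
	pthread_mutex_unlock(&index_lock);

	/* step 2: tear down the payload after the entry is unreachable */
	free(entry->data);
	free(entry);
}

int main(void)
{
	struct cache_entry *e = calloc(1, sizeof(*e));

	if (!e)
		return 1;
	e->data = strdup("payload");
	e->next = index_head;
	index_head = e;		/* publish */

	remove_entry(e);	/* unpublish under the lock, then tear down */
	return 0;
}
```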