path: root/fs/btrfs/extent-tree.c
author     Chris Mason <chris.mason@oracle.com>  2009-04-03 09:47:43 -0400
committer  Chris Mason <chris.mason@oracle.com>  2009-04-03 09:47:43 -0400
commit     fa9c0d795f7b57c76560b7fac703f5d341210e28 (patch)
tree       74d9d9846e21ce5b99738f3cc13b855fb63d1eba /fs/btrfs/extent-tree.c
parent     8e73f275011b3264a87339fd9f1690e944e381c9 (diff)
Btrfs: rework allocation clustering
Because btrfs is copy-on-write, we end up picking new locations for blocks
very often. This makes it fairly difficult to maintain perfect read patterns
over time, but we can at least do some optimizations for writes.

This is done today by remembering the last place we allocated and trying to
find a free space hole big enough to hold more than just one allocation. The
end result is that we tend to write sequentially to the drive.

This happens all the time for metadata and it happens for data when mounted
-o ssd. But, the way we record it is fairly racy and it tends to fragment the
free space over time because we are trying to allocate fairly large areas at
once.

This commit gets rid of the races by adding a free space cluster object with
dedicated locking to make sure that only one process at a time is out
replacing the cluster.

The free space fragmentation is somewhat solved by allowing a cluster to be
made up of smaller free space extents. This part definitely adds some CPU
time to the cluster allocations, but it allows the allocator to consume the
small holes left behind by cow.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
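For reference, the cluster object that last_ptr now points to is defined on
the header side of this commit, which is outside this file's diff. Based on
the fields the hunks below touch (lock, refill_lock, window_start,
block_group, cluster_list), it looks roughly like this sketch; the rb_root of
small free extents and the max_size field are assumptions about the parts not
shown here:

/*
 * Approximate shape of the free space cluster object (the header side of
 * this commit is not part of this diffstat, so treat this as a sketch).
 */
struct btrfs_free_cluster {
        spinlock_t lock;                /* protects window_start/block_group */
        spinlock_t refill_lock;         /* serializes replacing the cluster */

        /* free extents claimed by this cluster, possibly several small
         * holes rather than one large one (assumed representation)
         */
        struct rb_root root;

        u64 max_size;                   /* largest extent in the cluster (assumed) */
        u64 window_start;               /* start of the current allocation window */

        /* block group this cluster currently points into */
        struct btrfs_block_group_cache *block_group;

        /* entry on block_group->cluster_list, so clusters can be returned
         * before the block group itself is freed
         */
        struct list_head block_group_list;
};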
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c  181
1 file changed, 128 insertions(+), 53 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15d62a9214b..178df4c67de 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "free-space-cache.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -324,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
         return cache;
 }
 
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
         if (atomic_dec_and_test(&cache->count))
                 kfree(cache);
@@ -397,12 +398,12 @@ again:
                             div_factor(cache->key.offset, factor)) {
                                 group_start = cache->key.objectid;
                                 spin_unlock(&cache->lock);
-                                put_block_group(cache);
+                                btrfs_put_block_group(cache);
                                 goto found;
                         }
                 }
                 spin_unlock(&cache->lock);
-                put_block_group(cache);
+                btrfs_put_block_group(cache);
                 cond_resched();
         }
         if (!wrapped) {
@@ -1592,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
         if (!block_group || block_group->ro)
                 readonly = 1;
         if (block_group)
-                put_block_group(block_group);
+                btrfs_put_block_group(block_group);
         return readonly;
 }
 
@@ -2016,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                                 WARN_ON(ret);
                         }
                 }
-                put_block_group(cache);
+                btrfs_put_block_group(cache);
                 total -= num_bytes;
                 bytenr += num_bytes;
         }
@@ -2033,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
                 return 0;
 
         bytenr = cache->key.objectid;
-        put_block_group(cache);
+        btrfs_put_block_group(cache);
 
         return bytenr;
 }
@@ -2077,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                         if (cache->cached)
                                 btrfs_add_free_space(cache, bytenr, len);
                 }
-                put_block_group(cache);
+                btrfs_put_block_group(cache);
                 bytenr += len;
                 num -= len;
         }
@@ -2108,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
                 }
                 spin_unlock(&cache->lock);
                 spin_unlock(&cache->space_info->lock);
-                put_block_group(cache);
+                btrfs_put_block_group(cache);
                 bytenr += len;
                 num -= len;
         }
@@ -2543,12 +2544,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 {
         int ret = 0;
         struct btrfs_root *root = orig_root->fs_info->extent_root;
-        u64 *last_ptr = NULL;
+        struct btrfs_free_cluster *last_ptr = NULL;
         struct btrfs_block_group_cache *block_group = NULL;
         int empty_cluster = 2 * 1024 * 1024;
         int allowed_chunk_alloc = 0;
-        int using_hint = 0;
         struct btrfs_space_info *space_info;
+        int last_ptr_loop = 0;
+        int loop = 0;
 
         WARN_ON(num_bytes < root->sectorsize);
         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -2561,38 +2563,39 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                 allowed_chunk_alloc = 1;
 
         if (data & BTRFS_BLOCK_GROUP_METADATA) {
-                last_ptr = &root->fs_info->last_alloc;
+                last_ptr = &root->fs_info->meta_alloc_cluster;
                 if (!btrfs_test_opt(root, SSD))
                         empty_cluster = 64 * 1024;
         }
 
-        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-                last_ptr = &root->fs_info->last_data_alloc;
+        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+                last_ptr = &root->fs_info->data_alloc_cluster;
+        }
 
         if (last_ptr) {
-                if (*last_ptr)
-                        hint_byte = *last_ptr;
-                else
-                        empty_size += empty_cluster;
-        } else {
-                empty_cluster = 0;
+                spin_lock(&last_ptr->lock);
+                if (last_ptr->block_group)
+                        hint_byte = last_ptr->window_start;
+                spin_unlock(&last_ptr->lock);
         }
 
         search_start = max(search_start, first_logical_byte(root, 0));
         search_start = max(search_start, hint_byte);
 
+        if (!last_ptr) {
+                empty_cluster = 0;
+                loop = 1;
+        }
+
         if (search_start == hint_byte) {
-                using_hint = 1;
                 block_group = btrfs_lookup_block_group(root->fs_info,
                                                        search_start);
                 if (block_group && block_group_bits(block_group, data)) {
                         down_read(&space_info->groups_sem);
                         goto have_block_group;
                 } else if (block_group) {
-                        put_block_group(block_group);
+                        btrfs_put_block_group(block_group);
                 }
-
-                empty_size += empty_cluster;
-                using_hint = 0;
         }
 
 search:
@@ -2609,7 +2612,7 @@ have_block_group:
                         ret = cache_block_group(root, block_group);
                         mutex_unlock(&block_group->cache_mutex);
                         if (ret) {
-                                put_block_group(block_group);
+                                btrfs_put_block_group(block_group);
                                 break;
                         }
                 }
@@ -2617,11 +2620,88 @@ have_block_group:
                 if (unlikely(block_group->ro))
                         goto loop;
 
+                if (last_ptr) {
+                        /*
+                         * the refill lock keeps out other
+                         * people trying to start a new cluster
+                         */
+                        spin_lock(&last_ptr->refill_lock);
+                        offset = btrfs_alloc_from_cluster(block_group, last_ptr,
+                                                          num_bytes, search_start);
+                        if (offset) {
+                                /* we have a block, we're done */
+                                spin_unlock(&last_ptr->refill_lock);
+                                goto checks;
+                        }
+
+                        spin_lock(&last_ptr->lock);
+                        /*
+                         * whoops, this cluster doesn't actually point to
+                         * this block group.  Get a ref on the block
+                         * group it does point to and try again
+                         */
+                        if (!last_ptr_loop && last_ptr->block_group &&
+                            last_ptr->block_group != block_group) {
+
+                                btrfs_put_block_group(block_group);
+                                block_group = last_ptr->block_group;
+                                atomic_inc(&block_group->count);
+                                spin_unlock(&last_ptr->lock);
+                                spin_unlock(&last_ptr->refill_lock);
+
+                                last_ptr_loop = 1;
+                                search_start = block_group->key.objectid;
+                                goto have_block_group;
+                        }
+                        spin_unlock(&last_ptr->lock);
+
+                        /*
+                         * this cluster didn't work out, free it and
+                         * start over
+                         */
+                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+                        last_ptr_loop = 0;
+
+                        /* allocate a cluster in this block group */
+                        ret = btrfs_find_space_cluster(trans,
+                                                       block_group, last_ptr,
+                                                       offset, num_bytes,
+                                                       empty_cluster + empty_size);
+                        if (ret == 0) {
+                                /*
+                                 * now pull our allocation out of this
+                                 * cluster
+                                 */
+                                offset = btrfs_alloc_from_cluster(block_group,
+                                                                  last_ptr, num_bytes,
+                                                                  search_start);
+                                if (offset) {
+                                        /* we found one, proceed */
+                                        spin_unlock(&last_ptr->refill_lock);
+                                        goto checks;
+                                }
+                        }
+                        /*
+                         * at this point we either didn't find a cluster
+                         * or we weren't able to allocate a block from our
+                         * cluster.  Free the cluster we've been trying
+                         * to use, and go to the next block group
+                         */
+                        if (loop < 2) {
+                                btrfs_return_cluster_to_free_space(NULL,
+                                                                   last_ptr);
+                                spin_unlock(&last_ptr->refill_lock);
+                                goto loop;
+                        }
+                        spin_unlock(&last_ptr->refill_lock);
+                }
+
                 offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                     num_bytes, empty_size);
                 if (!offset)
                         goto loop;
+checks:
                 search_start = stripe_align(root, offset);
 
                 /* move on to the next group */
@@ -2637,11 +2717,6 @@ have_block_group:
                         goto loop;
                 }
 
-                if (using_hint && search_start > hint_byte) {
-                        btrfs_add_free_space(block_group, offset, num_bytes);
-                        goto loop;
-                }
-
                 if (exclude_nr > 0 &&
                     (search_start + num_bytes > exclude_start &&
                      search_start < exclude_start + exclude_nr)) {
@@ -2670,33 +2745,33 @@ have_block_group:
                 /* we are all good, lets return */
                 break;
loop:
-                put_block_group(block_group);
-                if (using_hint) {
-                        empty_size += empty_cluster;
-                        using_hint = 0;
-                        up_read(&space_info->groups_sem);
-                        goto search;
-                }
+                btrfs_put_block_group(block_group);
         }
         up_read(&space_info->groups_sem);
 
-        if (!ins->objectid && (empty_size || allowed_chunk_alloc)) {
-                int try_again = empty_size;
-
-                empty_size = 0;
+        /* loop == 0, try to find a clustered alloc in every block group
+         * loop == 1, try again after forcing a chunk allocation
+         * loop == 2, set empty_size and empty_cluster to 0 and try again
+         */
+        if (!ins->objectid && loop < 3 &&
+            (empty_size || empty_cluster || allowed_chunk_alloc)) {
+                if (loop >= 2) {
+                        empty_size = 0;
+                        empty_cluster = 0;
+                }
 
                 if (allowed_chunk_alloc) {
                         ret = do_chunk_alloc(trans, root, num_bytes +
                                              2 * 1024 * 1024, data, 1);
-                        if (!ret)
-                                try_again = 1;
                         allowed_chunk_alloc = 0;
                 } else {
                         space_info->force_alloc = 1;
                 }
 
-                if (try_again)
+                if (loop < 3) {
+                        loop++;
                         goto search;
+                }
                 ret = -ENOSPC;
         } else if (!ins->objectid) {
                 ret = -ENOSPC;
@@ -2707,9 +2782,7 @@ loop:
                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
                         trans->block_group = block_group->key.objectid;
 
-                if (last_ptr)
-                        *last_ptr = ins->objectid + ins->offset;
-                put_block_group(block_group);
+                btrfs_put_block_group(block_group);
                 ret = 0;
         }
 
@@ -2817,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
         ret = btrfs_discard_extent(root, start, len);
 
         btrfs_add_free_space(cache, start, len);
-        put_block_group(cache);
+        btrfs_put_block_group(cache);
         update_reserved_extents(root, start, len, 0);
 
         return ret;
@@ -2955,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
         ret = btrfs_remove_free_space(block_group, ins->objectid,
                                       ins->offset);
         BUG_ON(ret);
-        put_block_group(block_group);
+        btrfs_put_block_group(block_group);
         ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
                                             ref_generation, owner, ins, 1);
         return ret;
@@ -5644,7 +5717,7 @@ next:
         WARN_ON(block_group->reserved > 0);
         WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
         spin_unlock(&block_group->lock);
-        put_block_group(block_group);
+        btrfs_put_block_group(block_group);
         ret = 0;
out:
         btrfs_free_path(path);
@@ -5774,6 +5847,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                 spin_lock_init(&cache->tree_lock);
                 mutex_init(&cache->cache_mutex);
                 INIT_LIST_HEAD(&cache->list);
+                INIT_LIST_HEAD(&cache->cluster_list);
                 read_extent_buffer(leaf, &cache->item,
                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
                                    sizeof(cache->item));
@@ -5830,6 +5904,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
         spin_lock_init(&cache->tree_lock);
         mutex_init(&cache->cache_mutex);
         INIT_LIST_HEAD(&cache->list);
+        INIT_LIST_HEAD(&cache->cluster_list);
 
         btrfs_set_block_group_used(&cache->item, bytes_used);
         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5889,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         spin_unlock(&block_group->space_info->lock);
         block_group->space_info->full = 0;
 
-        put_block_group(block_group);
-        put_block_group(block_group);
+        btrfs_put_block_group(block_group);
+        btrfs_put_block_group(block_group);
 
         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
         if (ret > 0)
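Condensed, the cluster path that the find_free_extent hunks above add works
like this (a simplified sketch, not drop-in kernel code: error paths, the hop
to the cluster's current block group, and the loop-counter plumbing are
elided; the calls and arguments follow the diff):

/*
 * One pass of the clustered allocation attempt for a block group.
 * refill_lock is what guarantees only one task at a time replaces
 * the cluster.
 */
spin_lock(&last_ptr->refill_lock);
offset = btrfs_alloc_from_cluster(block_group, last_ptr,
                                  num_bytes, search_start);
if (!offset) {
        /* hand the old cluster back, then carve a fresh one out of this
         * block group's free space, possibly stitched together from
         * several smaller free extents
         */
        btrfs_return_cluster_to_free_space(NULL, last_ptr);
        ret = btrfs_find_space_cluster(trans, block_group, last_ptr,
                                       offset, num_bytes,
                                       empty_cluster + empty_size);
        if (ret == 0)
                offset = btrfs_alloc_from_cluster(block_group, last_ptr,
                                                  num_bytes, search_start);
}
spin_unlock(&last_ptr->refill_lock);

if (!offset)    /* no usable cluster: plain free-space search instead */
        offset = btrfs_find_space_for_alloc(block_group, search_start,
                                            num_bytes, empty_size);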