aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2009-04-03 09:47:43 -0400
committerChris Mason <chris.mason@oracle.com>2009-04-03 09:47:43 -0400
commitfa9c0d795f7b57c76560b7fac703f5d341210e28 (patch)
tree74d9d9846e21ce5b99738f3cc13b855fb63d1eba
parent8e73f275011b3264a87339fd9f1690e944e381c9 (diff)
Btrfs: rework allocation clustering
Because btrfs is copy-on-write, we end up picking new locations for blocks very often. This makes it fairly difficult to maintain perfect read patterns over time, but we can at least do some optimizations for writes. This is done today by remembering the last place we allocated and trying to find a free space hole big enough to hold more than just one allocation. The end result is that we tend to write sequentially to the drive. This happens all the time for metadata and it happens for data when mounted -o ssd. But, the way we record it is fairly racey and it tends to fragment the free space over time because we are trying to allocate fairly large areas at once. This commit gets rid of the races by adding a free space cluster object with dedicated locking to make sure that only one process at a time is out replacing the cluster. The free space fragmentation is somewhat solved by allowing a cluster to be comprised of smaller free space extents. This part definitely adds some CPU time to the cluster allocations, but it allows the allocator to consume the small holes left behind by cow. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ctree.h54
-rw-r--r--fs/btrfs/disk-io.c5
-rw-r--r--fs/btrfs/extent-tree.c181
-rw-r--r--fs/btrfs/free-space-cache.c297
-rw-r--r--fs/btrfs/free-space-cache.h44
-rw-r--r--fs/btrfs/transaction.c2
6 files changed, 509 insertions, 74 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aaa049b8e134..b82931f97ef3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -633,11 +633,29 @@ struct btrfs_space_info {
633 struct rw_semaphore groups_sem; 633 struct rw_semaphore groups_sem;
634}; 634};
635 635
636struct btrfs_free_space { 636/*
637 struct rb_node bytes_index; 637 * free clusters are used to claim free space in relatively large chunks,
638 struct rb_node offset_index; 638 * allowing us to do less seeky writes. They are used for all metadata
639 u64 offset; 639 * allocations and data allocations in ssd mode.
640 u64 bytes; 640 */
641struct btrfs_free_cluster {
642 spinlock_t lock;
643 spinlock_t refill_lock;
644 struct rb_root root;
645
646 /* largest extent in this cluster */
647 u64 max_size;
648
649 /* first extent starting offset */
650 u64 window_start;
651
652 struct btrfs_block_group_cache *block_group;
653 /*
654 * when a cluster is allocated from a block group, we put the
655 * cluster onto a list in the block group so that it can
656 * be freed before the block group is freed.
657 */
658 struct list_head block_group_list;
641}; 659};
642 660
643struct btrfs_block_group_cache { 661struct btrfs_block_group_cache {
@@ -667,6 +685,11 @@ struct btrfs_block_group_cache {
667 685
668 /* usage count */ 686 /* usage count */
669 atomic_t count; 687 atomic_t count;
688
689 /* List of struct btrfs_free_clusters for this block group.
690 * Today it will only have one thing on it, but that may change
691 */
692 struct list_head cluster_list;
670}; 693};
671 694
672struct btrfs_leaf_ref_tree { 695struct btrfs_leaf_ref_tree {
@@ -838,8 +861,12 @@ struct btrfs_fs_info {
838 spinlock_t delalloc_lock; 861 spinlock_t delalloc_lock;
839 spinlock_t new_trans_lock; 862 spinlock_t new_trans_lock;
840 u64 delalloc_bytes; 863 u64 delalloc_bytes;
841 u64 last_alloc; 864
842 u64 last_data_alloc; 865 /* data_alloc_cluster is only used in ssd mode */
866 struct btrfs_free_cluster data_alloc_cluster;
867
868 /* all metadata allocations go through this cluster */
869 struct btrfs_free_cluster meta_alloc_cluster;
843 870
844 spinlock_t ref_cache_lock; 871 spinlock_t ref_cache_lock;
845 u64 total_ref_cache_size; 872 u64 total_ref_cache_size;
@@ -1747,6 +1774,7 @@ static inline struct dentry *fdentry(struct file *file)
1747} 1774}
1748 1775
1749/* extent-tree.c */ 1776/* extent-tree.c */
1777void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1750int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1778int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, unsigned long count); 1779 struct btrfs_root *root, unsigned long count);
1752int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1780int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -2173,16 +2201,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
2173int btrfs_init_acl(struct inode *inode, struct inode *dir); 2201int btrfs_init_acl(struct inode *inode, struct inode *dir);
2174int btrfs_acl_chmod(struct inode *inode); 2202int btrfs_acl_chmod(struct inode *inode);
2175 2203
2176/* free-space-cache.c */
2177int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2178 u64 bytenr, u64 size);
2179int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2180 u64 bytenr, u64 size);
2181void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2182 *block_group);
2183u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2184 u64 offset, u64 bytes, u64 empty_size);
2185void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2186 u64 bytes);
2187u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2188#endif 2204#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ea59ebfa505d..e68ef7b456b0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h" 39#include "ref-cache.h"
40#include "tree-log.h" 40#include "tree-log.h"
41#include "free-space-cache.h"
41 42
42static struct extent_io_ops btree_extent_io_ops; 43static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
@@ -1652,6 +1653,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1652 mutex_init(&fs_info->cleaner_mutex); 1653 mutex_init(&fs_info->cleaner_mutex);
1653 mutex_init(&fs_info->volume_mutex); 1654 mutex_init(&fs_info->volume_mutex);
1654 mutex_init(&fs_info->tree_reloc_mutex); 1655 mutex_init(&fs_info->tree_reloc_mutex);
1656
1657 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1658 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1659
1655 init_waitqueue_head(&fs_info->transaction_throttle); 1660 init_waitqueue_head(&fs_info->transaction_throttle);
1656 init_waitqueue_head(&fs_info->transaction_wait); 1661 init_waitqueue_head(&fs_info->transaction_wait);
1657 init_waitqueue_head(&fs_info->async_submit_wait); 1662 init_waitqueue_head(&fs_info->async_submit_wait);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15d62a9214b7..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
34 35
35#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -324,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
324 return cache; 325 return cache;
325} 326}
326 327
327static inline void put_block_group(struct btrfs_block_group_cache *cache) 328void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
328{ 329{
329 if (atomic_dec_and_test(&cache->count)) 330 if (atomic_dec_and_test(&cache->count))
330 kfree(cache); 331 kfree(cache);
@@ -397,12 +398,12 @@ again:
397 div_factor(cache->key.offset, factor)) { 398 div_factor(cache->key.offset, factor)) {
398 group_start = cache->key.objectid; 399 group_start = cache->key.objectid;
399 spin_unlock(&cache->lock); 400 spin_unlock(&cache->lock);
400 put_block_group(cache); 401 btrfs_put_block_group(cache);
401 goto found; 402 goto found;
402 } 403 }
403 } 404 }
404 spin_unlock(&cache->lock); 405 spin_unlock(&cache->lock);
405 put_block_group(cache); 406 btrfs_put_block_group(cache);
406 cond_resched(); 407 cond_resched();
407 } 408 }
408 if (!wrapped) { 409 if (!wrapped) {
@@ -1592,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1592 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1593 readonly = 1; 1594 readonly = 1;
1594 if (block_group) 1595 if (block_group)
1595 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1596 return readonly; 1597 return readonly;
1597} 1598}
1598 1599
@@ -2016,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2016 WARN_ON(ret); 2017 WARN_ON(ret);
2017 } 2018 }
2018 } 2019 }
2019 put_block_group(cache); 2020 btrfs_put_block_group(cache);
2020 total -= num_bytes; 2021 total -= num_bytes;
2021 bytenr += num_bytes; 2022 bytenr += num_bytes;
2022 } 2023 }
@@ -2033,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2033 return 0; 2034 return 0;
2034 2035
2035 bytenr = cache->key.objectid; 2036 bytenr = cache->key.objectid;
2036 put_block_group(cache); 2037 btrfs_put_block_group(cache);
2037 2038
2038 return bytenr; 2039 return bytenr;
2039} 2040}
@@ -2077,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2077 if (cache->cached) 2078 if (cache->cached)
2078 btrfs_add_free_space(cache, bytenr, len); 2079 btrfs_add_free_space(cache, bytenr, len);
2079 } 2080 }
2080 put_block_group(cache); 2081 btrfs_put_block_group(cache);
2081 bytenr += len; 2082 bytenr += len;
2082 num -= len; 2083 num -= len;
2083 } 2084 }
@@ -2108,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2108 } 2109 }
2109 spin_unlock(&cache->lock); 2110 spin_unlock(&cache->lock);
2110 spin_unlock(&cache->space_info->lock); 2111 spin_unlock(&cache->space_info->lock);
2111 put_block_group(cache); 2112 btrfs_put_block_group(cache);
2112 bytenr += len; 2113 bytenr += len;
2113 num -= len; 2114 num -= len;
2114 } 2115 }
@@ -2543,12 +2544,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2543{ 2544{
2544 int ret = 0; 2545 int ret = 0;
2545 struct btrfs_root *root = orig_root->fs_info->extent_root; 2546 struct btrfs_root *root = orig_root->fs_info->extent_root;
2546 u64 *last_ptr = NULL; 2547 struct btrfs_free_cluster *last_ptr = NULL;
2547 struct btrfs_block_group_cache *block_group = NULL; 2548 struct btrfs_block_group_cache *block_group = NULL;
2548 int empty_cluster = 2 * 1024 * 1024; 2549 int empty_cluster = 2 * 1024 * 1024;
2549 int allowed_chunk_alloc = 0; 2550 int allowed_chunk_alloc = 0;
2550 int using_hint = 0;
2551 struct btrfs_space_info *space_info; 2551 struct btrfs_space_info *space_info;
2552 int last_ptr_loop = 0;
2553 int loop = 0;
2552 2554
2553 WARN_ON(num_bytes < root->sectorsize); 2555 WARN_ON(num_bytes < root->sectorsize);
2554 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2556 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -2561,38 +2563,39 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2561 allowed_chunk_alloc = 1; 2563 allowed_chunk_alloc = 1;
2562 2564
2563 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2565 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2564 last_ptr = &root->fs_info->last_alloc; 2566 last_ptr = &root->fs_info->meta_alloc_cluster;
2565 if (!btrfs_test_opt(root, SSD)) 2567 if (!btrfs_test_opt(root, SSD))
2566 empty_cluster = 64 * 1024; 2568 empty_cluster = 64 * 1024;
2567 } 2569 }
2568 2570
2569 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2571 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
2570 last_ptr = &root->fs_info->last_data_alloc; 2572 last_ptr = &root->fs_info->data_alloc_cluster;
2573 }
2571 2574
2572 if (last_ptr) { 2575 if (last_ptr) {
2573 if (*last_ptr) 2576 spin_lock(&last_ptr->lock);
2574 hint_byte = *last_ptr; 2577 if (last_ptr->block_group)
2575 else 2578 hint_byte = last_ptr->window_start;
2576 empty_size += empty_cluster; 2579 spin_unlock(&last_ptr->lock);
2577 } else {
2578 empty_cluster = 0;
2579 } 2580 }
2581
2580 search_start = max(search_start, first_logical_byte(root, 0)); 2582 search_start = max(search_start, first_logical_byte(root, 0));
2581 search_start = max(search_start, hint_byte); 2583 search_start = max(search_start, hint_byte);
2582 2584
2585 if (!last_ptr) {
2586 empty_cluster = 0;
2587 loop = 1;
2588 }
2589
2583 if (search_start == hint_byte) { 2590 if (search_start == hint_byte) {
2584 using_hint = 1;
2585 block_group = btrfs_lookup_block_group(root->fs_info, 2591 block_group = btrfs_lookup_block_group(root->fs_info,
2586 search_start); 2592 search_start);
2587 if (block_group && block_group_bits(block_group, data)) { 2593 if (block_group && block_group_bits(block_group, data)) {
2588 down_read(&space_info->groups_sem); 2594 down_read(&space_info->groups_sem);
2589 goto have_block_group; 2595 goto have_block_group;
2590 } else if (block_group) { 2596 } else if (block_group) {
2591 put_block_group(block_group); 2597 btrfs_put_block_group(block_group);
2592 } 2598 }
2593
2594 empty_size += empty_cluster;
2595 using_hint = 0;
2596 } 2599 }
2597 2600
2598search: 2601search:
@@ -2609,7 +2612,7 @@ have_block_group:
2609 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
2610 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
2611 if (ret) { 2614 if (ret) {
2612 put_block_group(block_group); 2615 btrfs_put_block_group(block_group);
2613 break; 2616 break;
2614 } 2617 }
2615 } 2618 }
@@ -2617,11 +2620,88 @@ have_block_group:
2617 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
2618 goto loop; 2621 goto loop;
2619 2622
2623 if (last_ptr) {
2624 /*
2625 * the refill lock keeps out other
2626 * people trying to start a new cluster
2627 */
2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
2636
2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group is does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
2657
2658 /*
2659 * this cluster didn't work out, free it and
2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2663
2664 last_ptr_loop = 0;
2665
2666 /* allocate a cluster in this block group */
2667 ret = btrfs_find_space_cluster(trans,
2668 block_group, last_ptr,
2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
2672 /*
2673 * now pull our allocation out of this
2674 * cluster
2675 */
2676 offset = btrfs_alloc_from_cluster(block_group,
2677 last_ptr, num_bytes,
2678 search_start);
2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2683 }
2684 }
2685 /*
2686 * at this point we either didn't find a cluster
2687 * or we weren't able to allocate a block from our
2688 * cluster. Free the cluster we've been trying
2689 * to use, and go to the next block group
2690 */
2691 if (loop < 2) {
2692 btrfs_return_cluster_to_free_space(NULL,
2693 last_ptr);
2694 spin_unlock(&last_ptr->refill_lock);
2695 goto loop;
2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
2699
2620 offset = btrfs_find_space_for_alloc(block_group, search_start, 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
2621 num_bytes, empty_size); 2701 num_bytes, empty_size);
2622 if (!offset) 2702 if (!offset)
2623 goto loop; 2703 goto loop;
2624 2704checks:
2625 search_start = stripe_align(root, offset); 2705 search_start = stripe_align(root, offset);
2626 2706
2627 /* move on to the next group */ 2707 /* move on to the next group */
@@ -2637,11 +2717,6 @@ have_block_group:
2637 goto loop; 2717 goto loop;
2638 } 2718 }
2639 2719
2640 if (using_hint && search_start > hint_byte) {
2641 btrfs_add_free_space(block_group, offset, num_bytes);
2642 goto loop;
2643 }
2644
2645 if (exclude_nr > 0 && 2720 if (exclude_nr > 0 &&
2646 (search_start + num_bytes > exclude_start && 2721 (search_start + num_bytes > exclude_start &&
2647 search_start < exclude_start + exclude_nr)) { 2722 search_start < exclude_start + exclude_nr)) {
@@ -2670,33 +2745,33 @@ have_block_group:
2670 /* we are all good, lets return */ 2745 /* we are all good, lets return */
2671 break; 2746 break;
2672loop: 2747loop:
2673 put_block_group(block_group); 2748 btrfs_put_block_group(block_group);
2674 if (using_hint) {
2675 empty_size += empty_cluster;
2676 using_hint = 0;
2677 up_read(&space_info->groups_sem);
2678 goto search;
2679 }
2680 } 2749 }
2681 up_read(&space_info->groups_sem); 2750 up_read(&space_info->groups_sem);
2682 2751
2683 if (!ins->objectid && (empty_size || allowed_chunk_alloc)) { 2752 /* loop == 0, try to find a clustered alloc in every block group
2684 int try_again = empty_size; 2753 * loop == 1, try again after forcing a chunk allocation
2685 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
2686 empty_size = 0; 2755 */
2756 if (!ins->objectid && loop < 3 &&
2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
2758 if (loop >= 2) {
2759 empty_size = 0;
2760 empty_cluster = 0;
2761 }
2687 2762
2688 if (allowed_chunk_alloc) { 2763 if (allowed_chunk_alloc) {
2689 ret = do_chunk_alloc(trans, root, num_bytes + 2764 ret = do_chunk_alloc(trans, root, num_bytes +
2690 2 * 1024 * 1024, data, 1); 2765 2 * 1024 * 1024, data, 1);
2691 if (!ret)
2692 try_again = 1;
2693 allowed_chunk_alloc = 0; 2766 allowed_chunk_alloc = 0;
2694 } else { 2767 } else {
2695 space_info->force_alloc = 1; 2768 space_info->force_alloc = 1;
2696 } 2769 }
2697 2770
2698 if (try_again) 2771 if (loop < 3) {
2772 loop++;
2699 goto search; 2773 goto search;
2774 }
2700 ret = -ENOSPC; 2775 ret = -ENOSPC;
2701 } else if (!ins->objectid) { 2776 } else if (!ins->objectid) {
2702 ret = -ENOSPC; 2777 ret = -ENOSPC;
@@ -2707,9 +2782,7 @@ loop:
2707 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2708 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
2709 2784
2710 if (last_ptr) 2785 btrfs_put_block_group(block_group);
2711 *last_ptr = ins->objectid + ins->offset;
2712 put_block_group(block_group);
2713 ret = 0; 2786 ret = 0;
2714 } 2787 }
2715 2788
@@ -2817,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2817 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
2818 2891
2819 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
2820 put_block_group(cache); 2893 btrfs_put_block_group(cache);
2821 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
2822 2895
2823 return ret; 2896 return ret;
@@ -2955,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
2955 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
2956 ins->offset); 3029 ins->offset);
2957 BUG_ON(ret); 3030 BUG_ON(ret);
2958 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
2959 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
2960 ref_generation, owner, ins, 1); 3033 ref_generation, owner, ins, 1);
2961 return ret; 3034 return ret;
@@ -5644,7 +5717,7 @@ next:
5644 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
5645 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5646 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
5647 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
5648 ret = 0; 5721 ret = 0;
5649out: 5722out:
5650 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -5774,6 +5847,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5774 spin_lock_init(&cache->tree_lock); 5847 spin_lock_init(&cache->tree_lock);
5775 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
5776 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
5777 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
5778 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
5779 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -5830,6 +5904,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5830 spin_lock_init(&cache->tree_lock); 5904 spin_lock_init(&cache->tree_lock);
5831 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
5832 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
5833 5908
5834 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
5835 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5889,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5889 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
5890 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
5891 5966
5892 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
5893 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
5894 5969
5895 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5896 if (ret > 0) 5971 if (ret > 0)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index df19b60eef61..3fdadd28e935 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -371,12 +380,58 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
371 return ret; 380 return ret;
372} 381}
373 382
383/*
384 * for a given cluster, put all of its extents back into the free
385 * space cache. If the block group passed doesn't match the block group
386 * pointed to by the cluster, someone else raced in and freed the
387 * cluster already. In that case, we just return without changing anything
388 */
389static int
390__btrfs_return_cluster_to_free_space(
391 struct btrfs_block_group_cache *block_group,
392 struct btrfs_free_cluster *cluster)
393{
394 struct btrfs_free_space *entry;
395 struct rb_node *node;
396
397 spin_lock(&cluster->lock);
398 if (cluster->block_group != block_group)
399 goto out;
400
401 cluster->window_start = 0;
402 node = rb_first(&cluster->root);
403 while(node) {
404 entry = rb_entry(node, struct btrfs_free_space, offset_index);
405 node = rb_next(&entry->offset_index);
406 rb_erase(&entry->offset_index, &cluster->root);
407 link_free_space(block_group, entry);
408 }
409 list_del_init(&cluster->block_group_list);
410
411 btrfs_put_block_group(cluster->block_group);
412 cluster->block_group = NULL;
413 cluster->root.rb_node = NULL;
414out:
415 spin_unlock(&cluster->lock);
416 return 0;
417}
418
374void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 419void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
375{ 420{
376 struct btrfs_free_space *info; 421 struct btrfs_free_space *info;
377 struct rb_node *node; 422 struct rb_node *node;
423 struct btrfs_free_cluster *cluster;
424 struct btrfs_free_cluster *safe;
378 425
379 spin_lock(&block_group->tree_lock); 426 spin_lock(&block_group->tree_lock);
427
428 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
429 block_group_list) {
430
431 WARN_ON(cluster->block_group != block_group);
432 __btrfs_return_cluster_to_free_space(block_group, cluster);
433 }
434
380 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 435 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
381 info = rb_entry(node, struct btrfs_free_space, bytes_index); 436 info = rb_entry(node, struct btrfs_free_space, bytes_index);
382 unlink_free_space(block_group, info); 437 unlink_free_space(block_group, info);
@@ -417,3 +472,245 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
417 472
418 return ret; 473 return ret;
419} 474}
475
476/*
477 * given a cluster, put all of its extents back into the free space
478 * cache. If a block group is passed, this function will only free
479 * a cluster that belongs to the passed block group.
480 *
481 * Otherwise, it'll get a reference on the block group pointed to by the
482 * cluster and remove the cluster from it.
483 */
484int btrfs_return_cluster_to_free_space(
485 struct btrfs_block_group_cache *block_group,
486 struct btrfs_free_cluster *cluster)
487{
488 int ret;
489
490 /* first, get a safe pointer to the block group */
491 spin_lock(&cluster->lock);
492 if (!block_group) {
493 block_group = cluster->block_group;
494 if (!block_group) {
495 spin_unlock(&cluster->lock);
496 return 0;
497 }
498 } else if (cluster->block_group != block_group) {
499 /* someone else has already freed it don't redo their work */
500 spin_unlock(&cluster->lock);
501 return 0;
502 }
503 atomic_inc(&block_group->count);
504 spin_unlock(&cluster->lock);
505
506 /* now return any extents the cluster had on it */
507 spin_lock(&block_group->tree_lock);
508 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
509 spin_unlock(&block_group->tree_lock);
510
511 /* finally drop our ref */
512 btrfs_put_block_group(block_group);
513 return ret;
514}
515
516/*
517 * given a cluster, try to allocate 'bytes' from it, returns 0
518 * if it couldn't find anything suitably large, or a logical disk offset
519 * if things worked out
520 */
521u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
522 struct btrfs_free_cluster *cluster, u64 bytes,
523 u64 min_start)
524{
525 struct btrfs_free_space *entry = NULL;
526 struct rb_node *node;
527 u64 ret = 0;
528
529 spin_lock(&cluster->lock);
530 if (bytes > cluster->max_size)
531 goto out;
532
533 if (cluster->block_group != block_group)
534 goto out;
535
536 node = rb_first(&cluster->root);
537 if (!node)
538 goto out;
539
540 entry = rb_entry(node, struct btrfs_free_space, offset_index);
541
542 while(1) {
543 if (entry->bytes < bytes || entry->offset < min_start) {
544 struct rb_node *node;
545
546 node = rb_next(&entry->offset_index);
547 if (!node)
548 break;
549 entry = rb_entry(node, struct btrfs_free_space,
550 offset_index);
551 continue;
552 }
553 ret = entry->offset;
554
555 entry->offset += bytes;
556 entry->bytes -= bytes;
557
558 if (entry->bytes == 0) {
559 rb_erase(&entry->offset_index, &cluster->root);
560 kfree(entry);
561 }
562 break;
563 }
564out:
565 spin_unlock(&cluster->lock);
566 return ret;
567}
568
569/*
570 * here we try to find a cluster of blocks in a block group. The goal
571 * is to find at least bytes free and up to empty_size + bytes free.
572 * We might not find them all in one contiguous area.
573 *
574 * returns zero and sets up cluster if things worked out, otherwise
575 * it returns -enospc
576 */
577int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
578 struct btrfs_block_group_cache *block_group,
579 struct btrfs_free_cluster *cluster,
580 u64 offset, u64 bytes, u64 empty_size)
581{
582 struct btrfs_free_space *entry = NULL;
583 struct rb_node *node;
584 struct btrfs_free_space *next;
585 struct btrfs_free_space *last;
586 u64 min_bytes;
587 u64 window_start;
588 u64 window_free;
589 u64 max_extent = 0;
590 int total_retries = 0;
591 int ret;
592
593 /* for metadata, allow allocates with more holes */
594 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
595 /*
596 * we want to do larger allocations when we are
597 * flushing out the delayed refs, it helps prevent
598 * making more work as we go along.
599 */
600 if (trans->transaction->delayed_refs.flushing)
601 min_bytes = max(bytes, (bytes + empty_size) >> 1);
602 else
603 min_bytes = max(bytes, (bytes + empty_size) >> 4);
604 } else
605 min_bytes = max(bytes, (bytes + empty_size) >> 2);
606
607 spin_lock(&block_group->tree_lock);
608 spin_lock(&cluster->lock);
609
610 /* someone already found a cluster, hooray */
611 if (cluster->block_group) {
612 ret = 0;
613 goto out;
614 }
615again:
616 min_bytes = min(min_bytes, bytes + empty_size);
617 entry = tree_search_bytes(&block_group->free_space_bytes,
618 offset, min_bytes);
619 if (!entry) {
620 ret = -ENOSPC;
621 goto out;
622 }
623 window_start = entry->offset;
624 window_free = entry->bytes;
625 last = entry;
626 max_extent = entry->bytes;
627
628 while(1) {
629 /* out window is just right, lets fill it */
630 if (window_free >= bytes + empty_size)
631 break;
632
633 node = rb_next(&last->offset_index);
634 if (!node) {
635 ret = -ENOSPC;
636 goto out;
637 }
638 next = rb_entry(node, struct btrfs_free_space, offset_index);
639
640 /*
641 * we haven't filled the empty size and the window is
642 * very large. reset and try again
643 */
644 if (next->offset - window_start > (bytes + empty_size) * 2) {
645 entry = next;
646 window_start = entry->offset;
647 window_free = entry->bytes;
648 last = entry;
649 max_extent = 0;
650 total_retries++;
651 if (total_retries % 256 == 0) {
652 if (min_bytes >= (bytes + empty_size)) {
653 ret = -ENOSPC;
654 goto out;
655 }
656 /*
657 * grow our allocation a bit, we're not having
658 * much luck
659 */
660 min_bytes *= 2;
661 goto again;
662 }
663 } else {
664 last = next;
665 window_free += next->bytes;
666 if (entry->bytes > max_extent)
667 max_extent = entry->bytes;
668 }
669 }
670
671 cluster->window_start = entry->offset;
672
673 /*
674 * now we've found our entries, pull them out of the free space
675 * cache and put them into the cluster rbtree
676 *
677 * The cluster includes an rbtree, but only uses the offset index
678 * of each free space cache entry.
679 */
680 while(1) {
681 node = rb_next(&entry->offset_index);
682 unlink_free_space(block_group, entry);
683 ret = tree_insert_offset(&cluster->root, entry->offset,
684 &entry->offset_index);
685 BUG_ON(ret);
686
687 if (!node || entry == last)
688 break;
689
690 entry = rb_entry(node, struct btrfs_free_space, offset_index);
691 }
692 ret = 0;
693 cluster->max_size = max_extent;
694 atomic_inc(&block_group->count);
695 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
696 cluster->block_group = block_group;
697out:
698 spin_unlock(&cluster->lock);
699 spin_unlock(&block_group->tree_lock);
700
701 return ret;
702}
703
704/*
705 * simple code to zero out a cluster
706 */
707void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
708{
709 spin_lock_init(&cluster->lock);
710 spin_lock_init(&cluster->refill_lock);
711 cluster->root.rb_node = NULL;
712 cluster->max_size = 0;
713 INIT_LIST_HEAD(&cluster->block_group_list);
714 cluster->block_group = NULL;
715}
716
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 664782c6a2df..3e8225de4e9d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;