author		Chris Mason <chris.mason@oracle.com>	2009-04-03 09:47:43 -0400
committer	Chris Mason <chris.mason@oracle.com>	2009-04-03 09:47:43 -0400
commit		fa9c0d795f7b57c76560b7fac703f5d341210e28 (patch)
tree		74d9d9846e21ce5b99738f3cc13b855fb63d1eba /fs/btrfs/free-space-cache.c
parent		8e73f275011b3264a87339fd9f1690e944e381c9 (diff)
Btrfs: rework allocation clustering
Because btrfs is copy-on-write, we end up picking new locations for blocks very often. This makes it fairly difficult to maintain perfect read patterns over time, but we can at least do some optimizations for writes.

This is done today by remembering the last place we allocated and trying to find a free space hole big enough to hold more than just one allocation. The end result is that we tend to write sequentially to the drive.

This happens all the time for metadata and it happens for data when mounted -o ssd. But, the way we record it is fairly racy and it tends to fragment the free space over time because we are trying to allocate fairly large areas at once.

This commit gets rid of the races by adding a free space cluster object with dedicated locking to make sure that only one process at a time is out replacing the cluster.

The free space fragmentation is somewhat solved by allowing a cluster to be comprised of smaller free space extents. This part definitely adds some CPU time to the cluster allocations, but it allows the allocator to consume the small holes left behind by cow.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
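The cluster object itself, struct btrfs_free_cluster, lives in free-space-cache.h and is not part of this diff. As a rough sketch, inferred only from the fields the functions below touch (the real header is authoritative and may differ):

/* sketch only -- fields inferred from their usage in this patch */
struct btrfs_free_cluster {
	spinlock_t lock;		/* protects the rbtree and block_group pointer */
	spinlock_t refill_lock;		/* lets only one task refill/replace the cluster */
	struct rb_root root;		/* free space entries owned by the cluster, indexed by offset */
	u64 max_size;			/* largest single extent currently in the cluster */
	u64 window_start;		/* logical start of the allocation window */
	struct btrfs_block_group_cache *block_group;	/* block group the cluster was carved from */
	struct list_head block_group_list;	/* entry in block_group->cluster_list */
};

The split between lock and refill_lock is what removes the old races: lock guards the cluster's contents, while refill_lock (initialized in btrfs_init_free_cluster below) is what callers take so only one process at a time rebuilds the window.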
Diffstat (limited to 'fs/btrfs/free-space-cache.c')
-rw-r--r--	fs/btrfs/free-space-cache.c	297
1 file changed, 297 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index df19b60eef61..3fdadd28e935 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
 
 #include <linux/sched.h>
 #include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+
+struct btrfs_free_space {
+	struct rb_node bytes_index;
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
+};
 
 static int tree_insert_offset(struct rb_root *root, u64 offset,
 			      struct rb_node *node)
@@ -371,12 +380,58 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
 	return ret;
 }
 
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache. If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already. In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster)
+{
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+
+	spin_lock(&cluster->lock);
+	if (cluster->block_group != block_group)
+		goto out;
+
+	cluster->window_start = 0;
+	node = rb_first(&cluster->root);
+	while(node) {
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		node = rb_next(&entry->offset_index);
+		rb_erase(&entry->offset_index, &cluster->root);
+		link_free_space(block_group, entry);
+	}
+	list_del_init(&cluster->block_group_list);
+
+	btrfs_put_block_group(cluster->block_group);
+	cluster->block_group = NULL;
+	cluster->root.rb_node = NULL;
+out:
+	spin_unlock(&cluster->lock);
+	return 0;
+}
+
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 {
 	struct btrfs_free_space *info;
 	struct rb_node *node;
+	struct btrfs_free_cluster *cluster;
+	struct btrfs_free_cluster *safe;
 
 	spin_lock(&block_group->tree_lock);
+
+	list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+				 block_group_list) {
+
+		WARN_ON(cluster->block_group != block_group);
+		__btrfs_return_cluster_to_free_space(block_group, cluster);
+	}
+
 	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, bytes_index);
 		unlink_free_space(block_group, info);
@@ -417,3 +472,245 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 
 	return ret;
 }
+
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache. If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+			       struct btrfs_block_group_cache *block_group,
+			       struct btrfs_free_cluster *cluster)
+{
+	int ret;
+
+	/* first, get a safe pointer to the block group */
+	spin_lock(&cluster->lock);
+	if (!block_group) {
+		block_group = cluster->block_group;
+		if (!block_group) {
+			spin_unlock(&cluster->lock);
+			return 0;
+		}
+	} else if (cluster->block_group != block_group) {
+		/* someone else has already freed it don't redo their work */
+		spin_unlock(&cluster->lock);
+		return 0;
+	}
+	atomic_inc(&block_group->count);
+	spin_unlock(&cluster->lock);
+
+	/* now return any extents the cluster had on it */
+	spin_lock(&block_group->tree_lock);
+	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&block_group->tree_lock);
+
+	/* finally drop our ref */
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster, u64 bytes,
+			     u64 min_start)
+{
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	u64 ret = 0;
+
+	spin_lock(&cluster->lock);
+	if (bytes > cluster->max_size)
+		goto out;
+
+	if (cluster->block_group != block_group)
+		goto out;
+
+	node = rb_first(&cluster->root);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+	while(1) {
+		if (entry->bytes < bytes || entry->offset < min_start) {
+			struct rb_node *node;
+
+			node = rb_next(&entry->offset_index);
+			if (!node)
+				break;
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+			continue;
+		}
+		ret = entry->offset;
+
+		entry->offset += bytes;
+		entry->bytes -= bytes;
+
+		if (entry->bytes == 0) {
+			rb_erase(&entry->offset_index, &cluster->root);
+			kfree(entry);
+		}
+		break;
+	}
+out:
+	spin_unlock(&cluster->lock);
+	return ret;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster,
+			     u64 offset, u64 bytes, u64 empty_size)
+{
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	struct btrfs_free_space *next;
+	struct btrfs_free_space *last;
+	u64 min_bytes;
+	u64 window_start;
+	u64 window_free;
+	u64 max_extent = 0;
+	int total_retries = 0;
+	int ret;
+
+	/* for metadata, allow allocates with more holes */
+	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		/*
+		 * we want to do larger allocations when we are
+		 * flushing out the delayed refs, it helps prevent
+		 * making more work as we go along.
+		 */
+		if (trans->transaction->delayed_refs.flushing)
+			min_bytes = max(bytes, (bytes + empty_size) >> 1);
+		else
+			min_bytes = max(bytes, (bytes + empty_size) >> 4);
+	} else
+		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+	spin_lock(&block_group->tree_lock);
+	spin_lock(&cluster->lock);
+
+	/* someone already found a cluster, hooray */
+	if (cluster->block_group) {
+		ret = 0;
+		goto out;
+	}
+again:
+	min_bytes = min(min_bytes, bytes + empty_size);
+	entry = tree_search_bytes(&block_group->free_space_bytes,
+				  offset, min_bytes);
+	if (!entry) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	window_start = entry->offset;
+	window_free = entry->bytes;
+	last = entry;
+	max_extent = entry->bytes;
+
+	while(1) {
+		/* out window is just right, lets fill it */
+		if (window_free >= bytes + empty_size)
+			break;
+
+		node = rb_next(&last->offset_index);
+		if (!node) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		next = rb_entry(node, struct btrfs_free_space, offset_index);
+
+		/*
+		 * we haven't filled the empty size and the window is
+		 * very large. reset and try again
+		 */
+		if (next->offset - window_start > (bytes + empty_size) * 2) {
+			entry = next;
+			window_start = entry->offset;
+			window_free = entry->bytes;
+			last = entry;
+			max_extent = 0;
+			total_retries++;
+			if (total_retries % 256 == 0) {
+				if (min_bytes >= (bytes + empty_size)) {
+					ret = -ENOSPC;
+					goto out;
+				}
+				/*
+				 * grow our allocation a bit, we're not having
+				 * much luck
+				 */
+				min_bytes *= 2;
+				goto again;
+			}
+		} else {
+			last = next;
+			window_free += next->bytes;
+			if (entry->bytes > max_extent)
+				max_extent = entry->bytes;
+		}
+	}
+
+	cluster->window_start = entry->offset;
+
+	/*
+	 * now we've found our entries, pull them out of the free space
+	 * cache and put them into the cluster rbtree
+	 *
+	 * The cluster includes an rbtree, but only uses the offset index
+	 * of each free space cache entry.
+	 */
+	while(1) {
+		node = rb_next(&entry->offset_index);
+		unlink_free_space(block_group, entry);
+		ret = tree_insert_offset(&cluster->root, entry->offset,
+					 &entry->offset_index);
+		BUG_ON(ret);
+
+		if (!node || entry == last)
+			break;
+
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+	}
+	ret = 0;
+	cluster->max_size = max_extent;
+	atomic_inc(&block_group->count);
+	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
+	cluster->block_group = block_group;
+out:
+	spin_unlock(&cluster->lock);
+	spin_unlock(&block_group->tree_lock);
+
+	return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+	spin_lock_init(&cluster->lock);
+	spin_lock_init(&cluster->refill_lock);
+	cluster->root.rb_node = NULL;
+	cluster->max_size = 0;
+	INIT_LIST_HEAD(&cluster->block_group_list);
+	cluster->block_group = NULL;
+}
+
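For readers following the new API, here is a minimal sketch of how an allocator path might drive it, assuming btrfs_init_free_cluster() was called once on the cluster at setup. The wrapper function, its name, and its error handling are hypothetical and only illustrate call order; the four cluster functions and refill_lock come from this patch:

/* hypothetical caller -- a sketch of the intended call order, not kernel source */
static u64 alloc_with_cluster(struct btrfs_trans_handle *trans,
			      struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_cluster *cluster,
			      u64 search_start, u64 num_bytes, u64 empty_size)
{
	u64 start;
	int ret;

	/* refill_lock: only one task at a time gets to replace the cluster */
	spin_lock(&cluster->refill_lock);

	/* first try to carve num_bytes out of what the cluster already holds */
	start = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
					 search_start);
	if (!start) {
		/* cluster empty or too small: build a new window of free extents */
		ret = btrfs_find_space_cluster(trans, block_group, cluster,
					       search_start, num_bytes,
					       empty_size);
		if (!ret)
			start = btrfs_alloc_from_cluster(block_group, cluster,
							 num_bytes, search_start);
	}
	spin_unlock(&cluster->refill_lock);

	/* on failure, hand the cluster's extents back to the free space cache */
	if (!start)
		btrfs_return_cluster_to_free_space(block_group, cluster);
	return start;
}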