aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fusionio.com>2012-08-17 13:14:17 -0400
committerChris Mason <chris.mason@fusionio.com>2012-10-01 15:19:03 -0400
commit5dc562c541e1026df9d43913c2f6b91156e22d32 (patch)
treea7768100e81b756f2a3edbfcaf99ad77ca7ed605 /fs/btrfs/tree-log.c
parent224ecce517af3a952321202cdf304c12e138caca (diff)
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worse yet, you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. Thanks, Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c220
1 file changed, 214 insertions, 6 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..f2ff02c55130 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
@@ -550,7 +551,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 551
551 saved_nbytes = inode_get_bytes(inode); 552 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 553 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 554 ret = btrfs_drop_extents(trans, root, inode, start, extent_end,
554 &alloc_hint, 1); 555 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
@@ -2803,6 +2804,194 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 2804 return ret;
2804} 2805}
2805 2806
2807static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
2808{
2809 struct extent_map *em1, *em2;
2810
2811 em1 = list_entry(a, struct extent_map, list);
2812 em2 = list_entry(b, struct extent_map, list);
2813
2814 if (em1->start < em2->start)
2815 return -1;
2816 else if (em1->start > em2->start)
2817 return 1;
2818 return 0;
2819}
2820
2821struct log_args {
2822 struct extent_buffer *src;
2823 u64 next_offset;
2824 int start_slot;
2825 int nr;
2826};
2827
2828static int log_one_extent(struct btrfs_trans_handle *trans,
2829 struct inode *inode, struct btrfs_root *root,
2830 struct extent_map *em, struct btrfs_path *path,
2831 struct btrfs_path *dst_path, struct log_args *args)
2832{
2833 struct btrfs_root *log = root->log_root;
2834 struct btrfs_file_extent_item *fi;
2835 struct btrfs_key key;
2836 u64 start = em->start;
2837 u64 len = em->len;
2838 u64 num_bytes;
2839 int nritems;
2840 int ret;
2841
2842 if (BTRFS_I(inode)->logged_trans == trans->transid) {
2843 u64 tmp;
2844 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
2845 start + len, &tmp, 0);
2846 if (ret)
2847 return ret;
2848 }
2849
2850 while (len) {
2851 if (args->nr)
2852 goto next_slot;
2853 key.objectid = btrfs_ino(inode);
2854 key.type = BTRFS_EXTENT_DATA_KEY;
2855 key.offset = start;
2856
2857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2858 if (ret < 0)
2859 return ret;
2860 if (ret) {
2861 /*
2862 * This shouldn't happen, but it might so warn and
2863 * return an error.
2864 */
2865 WARN_ON(1);
2866 return -ENOENT;
2867 }
2868 args->src = path->nodes[0];
2869next_slot:
2870 fi = btrfs_item_ptr(args->src, path->slots[0],
2871 struct btrfs_file_extent_item);
2872 if (args->nr &&
2873 args->start_slot + args->nr == path->slots[0]) {
2874 args->nr++;
2875 } else if (args->nr) {
2876 ret = copy_items(trans, log, dst_path, args->src,
2877 args->start_slot, args->nr,
2878 LOG_INODE_ALL);
2879 if (ret)
2880 return ret;
2881 args->nr = 1;
2882 args->start_slot = path->slots[0];
2883 } else if (!args->nr) {
2884 args->nr = 1;
2885 args->start_slot = path->slots[0];
2886 }
2887 nritems = btrfs_header_nritems(path->nodes[0]);
2888 path->slots[0]++;
2889 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
2890 if (len < num_bytes) {
2891 /* I _think_ this is ok, envision we write to a
2892 * preallocated space that is adjacent to a previously
2893 * written preallocated space that gets merged when we
2894 * mark this preallocated space written. If we do not
2895 * have the adjacent extent in cache then when we copy
2896 * this extent it could end up being larger than our EM
2897 * thinks it is, which is a-ok, so just set len to 0.
2898 */
2899 len = 0;
2900 } else {
2901 len -= num_bytes;
2902 }
2903 start += btrfs_file_extent_num_bytes(args->src, fi);
2904 args->next_offset = start;
2905
2906 if (path->slots[0] < nritems) {
2907 if (len)
2908 goto next_slot;
2909 break;
2910 }
2911
2912 if (args->nr) {
2913 ret = copy_items(trans, log, dst_path, args->src,
2914 args->start_slot, args->nr,
2915 LOG_INODE_ALL);
2916 if (ret)
2917 return ret;
2918 args->nr = 0;
2919 btrfs_release_path(path);
2920 }
2921 }
2922
2923 return 0;
2924}
2925
2926static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
2927 struct btrfs_root *root,
2928 struct inode *inode,
2929 struct btrfs_path *path,
2930 struct btrfs_path *dst_path)
2931{
2932 struct log_args args;
2933 struct btrfs_root *log = root->log_root;
2934 struct extent_map *em, *n;
2935 struct list_head extents;
2936 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
2937 u64 test_gen;
2938 int ret = 0;
2939
2940 INIT_LIST_HEAD(&extents);
2941
2942 memset(&args, 0, sizeof(args));
2943
2944 write_lock(&tree->lock);
2945 test_gen = root->fs_info->last_trans_committed;
2946
2947 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
2948 list_del_init(&em->list);
2949 if (em->generation <= test_gen)
2950 continue;
2951 list_add_tail(&em->list, &extents);
2952 }
2953
2954 list_sort(NULL, &extents, extent_cmp);
2955
2956 while (!list_empty(&extents)) {
2957 em = list_entry(extents.next, struct extent_map, list);
2958
2959 list_del_init(&em->list);
2960
2961 /*
2962 * If we had an error we just need to delete everybody from our
2963 * private list.
2964 */
2965 if (ret)
2966 continue;
2967
2968 /*
2969 * If the previous EM and the last extent we left off on aren't
2970 * sequential then we need to copy the items we have and redo
2971 * our search
2972 */
2973 if (args.nr && em->start != args.next_offset) {
2974 ret = copy_items(trans, log, dst_path, args.src,
2975 args.start_slot, args.nr,
2976 LOG_INODE_ALL);
2977 if (ret)
2978 continue;
2979 btrfs_release_path(path);
2980 args.nr = 0;
2981 }
2982
2983 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
2984 }
2985
2986 if (!ret && args.nr)
2987 ret = copy_items(trans, log, dst_path, args.src,
2988 args.start_slot, args.nr, LOG_INODE_ALL);
2989 btrfs_release_path(path);
2990 WARN_ON(!list_empty(&extents));
2991 write_unlock(&tree->lock);
2992 return ret;
2993}
2994
2806/* log a single inode in the tree log. 2995/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 2996 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 2997 * or be logged already.
@@ -2832,6 +3021,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3021 int nritems;
2833 int ins_start_slot = 0; 3022 int ins_start_slot = 0;
2834 int ins_nr; 3023 int ins_nr;
3024 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3025 u64 ino = btrfs_ino(inode);
2836 3026
2837 log = root->log_root; 3027 log = root->log_root;
@@ -2851,10 +3041,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3041
2852 max_key.objectid = ino; 3042 max_key.objectid = ino;
2853 3043
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3044
3045 /* today the code can only do partial logging of directories */
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3046 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3047 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3048 else
@@ -2881,7 +3069,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3069 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3070 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3071 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3072 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3073 &BTRFS_I(inode)->runtime_flags)) {
3074 ret = btrfs_truncate_inode_items(trans, log,
3075 inode, 0, 0);
3076 } else {
3077 fast_search = true;
3078 max_key.type = BTRFS_XATTR_ITEM_KEY;
3079 ret = drop_objectid_items(trans, log, path, ino,
3080 BTRFS_XATTR_ITEM_KEY);
3081 }
2885 } 3082 }
2886 if (ret) { 3083 if (ret) {
2887 err = ret; 3084 err = ret;
@@ -2960,7 +3157,18 @@ next_slot:
2960 } 3157 }
2961 ins_nr = 0; 3158 ins_nr = 0;
2962 } 3159 }
2963 WARN_ON(ins_nr); 3160
3161 if (fast_search) {
3162 btrfs_release_path(path);
3163 btrfs_release_path(dst_path);
3164 ret = btrfs_log_changed_extents(trans, root, inode, path,
3165 dst_path);
3166 if (ret) {
3167 err = ret;
3168 goto out_unlock;
3169 }
3170 }
3171
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3172 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3173 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3174 btrfs_release_path(dst_path);