aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fusionio.com>2012-08-17 13:14:17 -0400
committerChris Mason <chris.mason@fusionio.com>2012-10-01 15:19:03 -0400
commit5dc562c541e1026df9d43913c2f6b91156e22d32 (patch)
treea7768100e81b756f2a3edbfcaf99ad77ca7ed605 /fs/btrfs/tree-log.c
parent224ecce517af3a952321202cdf304c12e138caca (diff)
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worse yet, you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. Thanks, Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c220
1 file changed, 214 insertions, 6 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..f2ff02c55130 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
@@ -550,7 +551,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 551
551 saved_nbytes = inode_get_bytes(inode); 552 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 553 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 554 ret = btrfs_drop_extents(trans, root, inode, start, extent_end,
554 &alloc_hint, 1); 555 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
@@ -2803,6 +2804,194 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 2804 return ret;
2804} 2805}
2805 2806
2807static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
2808{
2809 struct extent_map *em1, *em2;
2810
2811 em1 = list_entry(a, struct extent_map, list);
2812 em2 = list_entry(b, struct extent_map, list);
2813
2814 if (em1->start < em2->start)
2815 return -1;
2816 else if (em1->start > em2->start)
2817 return 1;
2818 return 0;
2819}
2820
2821struct log_args {
2822 struct extent_buffer *src;
2823 u64 next_offset;
2824 int start_slot;
2825 int nr;
2826};
2827
2828static int log_one_extent(struct btrfs_trans_handle *trans,
2829 struct inode *inode, struct btrfs_root *root,
2830 struct extent_map *em, struct btrfs_path *path,
2831 struct btrfs_path *dst_path, struct log_args *args)
2832{
2833 struct btrfs_root *log = root->log_root;
2834 struct btrfs_file_extent_item *fi;
2835 struct btrfs_key key;
2836 u64 start = em->start;
2837 u64 len = em->len;
2838 u64 num_bytes;
2839 int nritems;
2840 int ret;
2841
2842 if (BTRFS_I(inode)->logged_trans == trans->transid) {
2843 u64 tmp;
2844 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
2845 start + len, &tmp, 0);
2846 if (ret)
2847 return ret;
2848 }
2849
2850 while (len) {
2851 if (args->nr)
2852 goto next_slot;
2853 key.objectid = btrfs_ino(inode);
2854 key.type = BTRFS_EXTENT_DATA_KEY;
2855 key.offset = start;
2856
2857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2858 if (ret < 0)
2859 return ret;
2860 if (ret) {
2861 /*
2862 * This shouldn't happen, but it might so warn and
2863 * return an error.
2864 */
2865 WARN_ON(1);
2866 return -ENOENT;
2867 }
2868 args->src = path->nodes[0];
2869next_slot:
2870 fi = btrfs_item_ptr(args->src, path->slots[0],
2871 struct btrfs_file_extent_item);
2872 if (args->nr &&
2873 args->start_slot + args->nr == path->slots[0]) {
2874 args->nr++;
2875 } else if (args->nr) {
2876 ret = copy_items(trans, log, dst_path, args->src,
2877 args->start_slot, args->nr,
2878 LOG_INODE_ALL);
2879 if (ret)
2880 return ret;
2881 args->nr = 1;
2882 args->start_slot = path->slots[0];
2883 } else if (!args->nr) {
2884 args->nr = 1;
2885 args->start_slot = path->slots[0];
2886 }
2887 nritems = btrfs_header_nritems(path->nodes[0]);
2888 path->slots[0]++;
2889 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
2890 if (len < num_bytes) {
2891 /* I _think_ this is ok, envision we write to a
2892 * preallocated space that is adjacent to a previously
2893 * written preallocated space that gets merged when we
2894 * mark this preallocated space written. If we do not
2895 * have the adjacent extent in cache then when we copy
2896 * this extent it could end up being larger than our EM
2897 * thinks it is, which is a-ok, so just set len to 0.
2898 */
2899 len = 0;
2900 } else {
2901 len -= num_bytes;
2902 }
2903 start += btrfs_file_extent_num_bytes(args->src, fi);
2904 args->next_offset = start;
2905
2906 if (path->slots[0] < nritems) {
2907 if (len)
2908 goto next_slot;
2909 break;
2910 }
2911
2912 if (args->nr) {
2913 ret = copy_items(trans, log, dst_path, args->src,
2914 args->start_slot, args->nr,
2915 LOG_INODE_ALL);
2916 if (ret)
2917 return ret;
2918 args->nr = 0;
2919 btrfs_release_path(path);
2920 }
2921 }
2922
2923 return 0;
2924}
2925
2926static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
2927 struct btrfs_root *root,
2928 struct inode *inode,
2929 struct btrfs_path *path,
2930 struct btrfs_path *dst_path)
2931{
2932 struct log_args args;
2933 struct btrfs_root *log = root->log_root;
2934 struct extent_map *em, *n;
2935 struct list_head extents;
2936 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
2937 u64 test_gen;
2938 int ret = 0;
2939
2940 INIT_LIST_HEAD(&extents);
2941
2942 memset(&args, 0, sizeof(args));
2943
2944 write_lock(&tree->lock);
2945 test_gen = root->fs_info->last_trans_committed;
2946
2947 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
2948 list_del_init(&em->list);
2949 if (em->generation <= test_gen)
2950 continue;
2951 list_add_tail(&em->list, &extents);
2952 }
2953
2954 list_sort(NULL, &extents, extent_cmp);
2955
2956 while (!list_empty(&extents)) {
2957 em = list_entry(extents.next, struct extent_map, list);
2958
2959 list_del_init(&em->list);
2960
2961 /*
2962 * If we had an error we just need to delete everybody from our
2963 * private list.
2964 */
2965 if (ret)
2966 continue;
2967
2968 /*
2969 * If the previous EM and the last extent we left off on aren't
2970 * sequential then we need to copy the items we have and redo
2971 * our search
2972 */
2973 if (args.nr && em->start != args.next_offset) {
2974 ret = copy_items(trans, log, dst_path, args.src,
2975 args.start_slot, args.nr,
2976 LOG_INODE_ALL);
2977 if (ret)
2978 continue;
2979 btrfs_release_path(path);
2980 args.nr = 0;
2981 }
2982
2983 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
2984 }
2985
2986 if (!ret && args.nr)
2987 ret = copy_items(trans, log, dst_path, args.src,
2988 args.start_slot, args.nr, LOG_INODE_ALL);
2989 btrfs_release_path(path);
2990 WARN_ON(!list_empty(&extents));
2991 write_unlock(&tree->lock);
2992 return ret;
2993}
2994
2806/* log a single inode in the tree log. 2995/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 2996 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 2997 * or be logged already.
@@ -2832,6 +3021,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3021 int nritems;
2833 int ins_start_slot = 0; 3022 int ins_start_slot = 0;
2834 int ins_nr; 3023 int ins_nr;
3024 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3025 u64 ino = btrfs_ino(inode);
2836 3026
2837 log = root->log_root; 3027 log = root->log_root;
@@ -2851,10 +3041,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3041
2852 max_key.objectid = ino; 3042 max_key.objectid = ino;
2853 3043
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3044
3045 /* today the code can only do partial logging of directories */
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3046 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3047 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3048 else
@@ -2881,7 +3069,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3069 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3070 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3071 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3072 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3073 &BTRFS_I(inode)->runtime_flags)) {
3074 ret = btrfs_truncate_inode_items(trans, log,
3075 inode, 0, 0);
3076 } else {
3077 fast_search = true;
3078 max_key.type = BTRFS_XATTR_ITEM_KEY;
3079 ret = drop_objectid_items(trans, log, path, ino,
3080 BTRFS_XATTR_ITEM_KEY);
3081 }
2885 } 3082 }
2886 if (ret) { 3083 if (ret) {
2887 err = ret; 3084 err = ret;
@@ -2960,7 +3157,18 @@ next_slot:
2960 } 3157 }
2961 ins_nr = 0; 3158 ins_nr = 0;
2962 } 3159 }
2963 WARN_ON(ins_nr); 3160
3161 if (fast_search) {
3162 btrfs_release_path(path);
3163 btrfs_release_path(dst_path);
3164 ret = btrfs_log_changed_extents(trans, root, inode, path,
3165 dst_path);
3166 if (ret) {
3167 err = ret;
3168 goto out_unlock;
3169 }
3170 }
3171
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3172 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3173 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3174 btrfs_release_path(dst_path);