author		Josef Bacik <jbacik@fusionio.com>	2012-08-17 13:14:17 -0400
committer	Chris Mason <chris.mason@fusionio.com>	2012-10-01 15:19:03 -0400
commit		5dc562c541e1026df9d43913c2f6b91156e22d32 (patch)
tree		a7768100e81b756f2a3edbfcaf99ad77ca7ed605 /fs/btrfs/extent_map.c
parent		224ecce517af3a952321202cdf304c12e138caca (diff)
Btrfs: turbo charge fsync
At least for the VM workload. Currently on fsync we will:

1) Truncate all items in the log tree for the given inode if they exist, and
2) Copy all items for a given inode into the log.

The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worse yet you may have modified only a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them, and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that already exist in the log. Here are some numbers from a 50 MB fio job that does random writes and fsync()s after every write:

			Original	Patched
	SATA drive	82KB/s		140KB/s
	Fusion drive	431KB/s		2532KB/s

So around 2-6 times faster, depending on your hardware. There are a few corner cases; for example, if you truncate at all we have to do it the old way, since there is no way to be sure that what is in the log is ok. This could probably be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All of this work is in RAM, of course, so if your inode gets evicted from cache and you read it in and fsync it, we'll do it the slow way if we are still in the same transaction that we last modified the inode in.

The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch, crash, and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester, as well as with xfstests, and everything comes back fine. Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
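To make the mechanism concrete, here is a minimal userspace sketch of the bookkeeping described above (illustrative only: the names are invented and this is not the btrfs code). Each cached extent remembers the transaction generation that last modified it, and the tree keeps a list of modified extents, so fsync only has to walk the extents dirtied in the running transaction instead of copying every item for the inode:

/*
 * Illustrative sketch, not btrfs code: extents carry the transid that
 * last modified them, and fsync logs only the listed extents.
 */
#include <stdio.h>

struct extent {
	unsigned long long start, len;
	unsigned long long generation;	/* transid that last modified us */
	struct extent *next;		/* link on the modified list */
};

struct extent_tree {
	struct extent *modified;	/* extents dirtied since the last log */
};

/* Called once an extent has been written out in transaction 'gen'. */
static void mark_modified(struct extent_tree *tree, struct extent *em,
			  unsigned long long gen)
{
	em->generation = gen;
	em->next = tree->modified;
	tree->modified = em;
}

/* fsync walks only the modified list, not every item of the inode. */
static void log_modified(struct extent_tree *tree,
			 unsigned long long transid)
{
	struct extent *em;

	for (em = tree->modified; em; em = em->next)
		if (em->generation == transid)
			printf("log extent [%llu, %llu) gen %llu\n",
			       em->start, em->start + em->len,
			       em->generation);
	tree->modified = NULL;	/* the log is now up to date */
}

int main(void)
{
	struct extent_tree tree = { 0 };
	struct extent a = { .start = 0, .len = 4096 };
	struct extent b = { .start = 8192, .len = 4096 };

	mark_modified(&tree, &a, 7);
	mark_modified(&tree, &b, 7);
	log_modified(&tree, 7);	/* logs just these two extents */
	return 0;
}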
Diffstat (limited to 'fs/btrfs/extent_map.c')
 fs/btrfs/extent_map.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)
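The hunks below rely on three <linux/list.h> primitives: INIT_LIST_HEAD(), list_move(), and list_del_init(). As a rough userspace sketch of their semantics (a simplified reimplementation, not the kernel's):

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

/* An empty list head is linked to itself. */
static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;
}

static int list_empty(const struct list_head *h)
{
	return h->next == h;
}

static void __list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_add(struct list_head *e, struct list_head *head)
{
	e->next = head->next;
	e->prev = head;
	head->next->prev = e;
	head->next = e;
}

/* Unlink and re-self-link, so list_empty(e) is true afterwards. */
static void list_del_init(struct list_head *e)
{
	__list_del(e);
	INIT_LIST_HEAD(e);
}

/* Unlink the entry from wherever it is and splice it onto 'head'. */
static void list_move(struct list_head *e, struct list_head *head)
{
	__list_del(e);
	list_add(e, head);
}

int main(void)
{
	struct list_head modified, em;

	INIT_LIST_HEAD(&modified);	/* as in extent_map_tree_init() */
	INIT_LIST_HEAD(&em);		/* as in alloc_extent_map() */
	list_move(&em, &modified);	/* as in unpin_extent_cache() */
	assert(!list_empty(&modified));
	list_del_init(&em);		/* as in remove_extent_mapping() */
	assert(list_empty(&em));	/* free_extent_map()'s WARN_ON holds */
	return 0;
}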
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..1fe82cfc1d93 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -35,6 +35,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
 	tree->map = RB_ROOT;
+	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
 
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
 	em->in_tree = 0;
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
+	em->generation = 0;
 	atomic_set(&em->refs, 1);
+	INIT_LIST_HEAD(&em->list);
 	return em;
 }
 
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
+		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
@@ -198,6 +202,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
+			if (merge->generation > em->generation) {
+				em->generation = merge->generation;
+				list_move(&em->list, &tree->modified_extents);
+			}
+
+			list_del_init(&merge->list);
 			rb_erase(&merge->rb_node, &tree->map);
 			free_extent_map(merge);
 		}
@@ -211,11 +221,28 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
+		if (merge->generation > em->generation) {
+			em->generation = merge->generation;
+			list_move(&em->list, &tree->modified_extents);
+		}
+		list_del_init(&merge->list);
 		free_extent_map(merge);
 	}
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree:	tree to unpin the extent in
+ * @start:	logical offset in the file
+ * @len:	length of the extent
+ * @gen:	generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+		       u64 gen)
 {
 	int ret = 0;
 	struct extent_map *em;
@@ -228,10 +255,11 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 	if (!em)
 		goto out;
 
+	list_move(&em->list, &tree->modified_extents);
+	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	try_merge_map(tree, em);
-
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -358,6 +386,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	rb_erase(&em->rb_node, &tree->map);
+	list_del_init(&em->list);
 	em->in_tree = 0;
 	return ret;
 }
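One subtlety in the try_merge_map() hunks above: when two extent maps merge, the surviving map must take the newer of the two generations, otherwise fsync could skip an extent that was modified in the still-running transaction. A small illustrative sketch of that rule (invented names, not kernel code):

#include <stdio.h>

struct em { unsigned long long start, len, gen; };

/* Fold 'merge' into 'em', keeping the newest generation, as the
 * try_merge_map() hunks above do. */
static void absorb(struct em *em, const struct em *merge)
{
	em->len += merge->len;
	if (merge->gen > em->gen)
		em->gen = merge->gen;
}

int main(void)
{
	struct em a = { .start = 0,    .len = 4096, .gen = 5 };
	struct em b = { .start = 4096, .len = 4096, .gen = 7 };

	absorb(&a, &b);
	/* prints gen 7: the merged extent is still logged by an
	 * fsync in transaction 7 */
	printf("merged extent gen = %llu\n", a.gen);
	return 0;
}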