author	Josef Bacik <jbacik@fusionio.com>	2012-08-17 13:14:17 -0400
committer	Chris Mason <chris.mason@fusionio.com>	2012-10-01 15:19:03 -0400
commit	5dc562c541e1026df9d43913c2f6b91156e22d32 (patch)
tree	a7768100e81b756f2a3edbfcaf99ad77ca7ed605 /fs/btrfs/inode.c
parent	224ecce517af3a952321202cdf304c12e138caca (diff)
Btrfs: turbo charge fsync
At least for the vm workload.  Currently on fsync we will

1) Truncate all items in the log tree for the given inode if they exist

and

2) Copy all items for a given inode into the log

The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing.  This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them.  We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already.

Here are some numbers of a 50 meg fio job that does random writes and
fsync()s after every write:

			Original	Patched
	SATA drive	82KB/s		140KB/s
	Fusion drive	431KB/s		2532KB/s

So around 2-6 times faster depending on your hardware.  There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok.  This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get.  All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.

The biggest cool part of this is that it requires no changes to the
recovery code, so if you fsync with this patch and crash and load an old
kernel, it will run the recovery and be a-ok.  I have tested this pretty
thoroughly with an fsync tester and everything comes back fine, as well as
xfstests.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
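The mechanism described above is easy to model outside the kernel: each cached
extent remembers the transaction that last touched it, and fsync copies only the
extents whose generation matches the running transaction, unless the inode
carries a full-sync flag (set on truncate, eviction/re-read in the same
transaction, and new inodes).  The following is a minimal userspace sketch of
that bookkeeping; the struct names and the fsync_model()/log_extent() helpers
are made up for illustration and are not the kernel API.

/*
 * Userspace model of the fsync bookkeeping this patch introduces
 * (illustrative only; names loosely mirror the kernel code).
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_EXTENTS 8

struct extent {
	unsigned long long start;
	unsigned long long len;
	unsigned long long generation;	/* transid that last modified it */
};

struct inode_model {
	struct extent extents[MAX_EXTENTS];
	int nr_extents;
	bool needs_full_sync;		/* truncate, eviction, new inode */
};

static void log_extent(const struct extent *e)
{
	printf("log extent [%llu, %llu)\n", e->start, e->start + e->len);
}

static void fsync_model(struct inode_model *ino, unsigned long long cur_transid)
{
	for (int i = 0; i < ino->nr_extents; i++) {
		const struct extent *e = &ino->extents[i];

		/* Fast path: skip extents not touched in this transaction. */
		if (ino->needs_full_sync || e->generation == cur_transid)
			log_extent(e);
	}
	ino->needs_full_sync = false;
}

int main(void)
{
	struct inode_model ino = {
		.extents = {
			{ .start = 0,    .len = 4096, .generation = 7 },
			{ .start = 4096, .len = 4096, .generation = 9 },
			{ .start = 8192, .len = 4096, .generation = 9 },
		},
		.nr_extents = 3,
		.needs_full_sync = false,
	};

	/* Only the extents touched in transaction 9 get copied to the log. */
	fsync_model(&ino, 9);
	return 0;
}

Built with any C99 compiler, this prints only the two extents tagged with
transaction 9, which mirrors the fast path the patch adds for unmodified
extents.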
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--	fs/btrfs/inode.c	120
1 file changed, 110 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6971bac66d9d..1b99fe8a129d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -247,7 +247,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
+	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end,
 				 &hint_byte, 1);
 	if (ret)
 		return ret;
@@ -1803,7 +1803,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+	ret = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes,
 				 &hint, 0);
 	if (ret)
 		goto out;
@@ -1929,11 +1930,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
-		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-				   ordered_extent->file_offset,
-				   ordered_extent->len);
 	}
-
+	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+			   ordered_extent->file_offset, ordered_extent->len,
+			   trans->transid);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_unlock;
@@ -2592,6 +2592,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+	/*
+	 * If we were modified in the current generation and evicted from memory
+	 * and then re-read we need to do a full sync since we don't have any
+	 * idea about which extents were modified before we were evicted from
+	 * cache.
+	 */
+	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
@@ -3269,8 +3281,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = -1;
 
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
 	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
 
 	/*
 	 * This function is also used to drop the items in the log tree before
@@ -3579,6 +3596,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (oldsize + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3615,6 +3633,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			struct extent_map *hole_em;
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
@@ -3624,7 +3643,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			err = btrfs_drop_extents(trans, inode, cur_offset,
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
 						 cur_offset + hole_size,
 						 &hint_byte, 1);
 			if (err) {
@@ -3643,9 +3663,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			btrfs_drop_extent_cache(inode, hole_start,
-					last_byte - 1, 0);
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + hole_size - 1, 0);
+			hole_em = alloc_extent_map();
+			if (!hole_em) {
+				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					&BTRFS_I(inode)->runtime_flags);
+				goto next;
+			}
+			hole_em->start = cur_offset;
+			hole_em->len = hole_size;
+			hole_em->orig_start = cur_offset;
+
+			hole_em->block_start = EXTENT_MAP_HOLE;
+			hole_em->block_len = 0;
+			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->compress_type = BTRFS_COMPRESS_NONE;
+			hole_em->generation = trans->transid;
 
+			while (1) {
+				write_lock(&em_tree->lock);
+				err = add_extent_mapping(em_tree, hole_em);
+				if (!err)
+					list_move(&hole_em->list,
+						  &em_tree->modified_extents);
+				write_unlock(&em_tree->lock);
+				if (err != -EEXIST)
+					break;
+				btrfs_drop_extent_cache(inode, cur_offset,
+							cur_offset +
+							hole_size - 1, 0);
+			}
+			free_extent_map(hole_em);
+next:
 			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
@@ -4673,6 +4723,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
 
+	/*
+	 * We could have gotten an inode number from somebody who was fsynced
+	 * and then removed in this same transaction, so let's just set full
+	 * sync since it will be a full sync anyway and this will blow away the
+	 * old info in the log.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	if (S_ISDIR(mode))
 		owner = 0;
 	else
@@ -6839,6 +6897,15 @@ static int btrfs_truncate(struct inode *inode)
 		     &BTRFS_I(inode)->runtime_flags))
 		btrfs_add_ordered_operation(trans, root, inode);
 
+	/*
+	 * So if we truncate and then write and fsync we normally would just
+	 * write the extents that changed, which is a problem if we need to
+	 * first truncate that entire inode.  So set this flag so we write out
+	 * all of the extents in the inode to the sync log so we're completely
+	 * safe.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	while (1) {
 		ret = btrfs_block_rsv_refill(root, rsv, min_size);
 		if (ret) {
@@ -7510,6 +7577,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 				      loff_t actual_len, u64 *alloc_hint,
 				      struct btrfs_trans_handle *trans)
 {
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
@@ -7550,6 +7619,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
 
+		em = alloc_extent_map();
+		if (!em) {
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+			goto next;
+		}
+
+		em->start = cur_offset;
+		em->orig_start = cur_offset;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->generation = trans->transid;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST)
+				break;
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + ins.offset - 1,
+						0);
+		}
+		free_extent_map(em);
+next:
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		*alloc_hint = ins.objectid + ins.offset;