diff options
author | Josef Bacik <jbacik@fusionio.com> | 2012-08-17 13:14:17 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@fusionio.com> | 2012-10-01 15:19:03 -0400 |
commit | 5dc562c541e1026df9d43913c2f6b91156e22d32 (patch) | |
tree | a7768100e81b756f2a3edbfcaf99ad77ca7ed605 /fs/btrfs/inode.c | |
parent | 224ecce517af3a952321202cdf304c12e138caca (diff) |
Btrfs: turbo charge fsync
At least for the VM workload. Currently on fsync we will:
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
                Original    Patched
SATA drive      82KB/s      140KB/s
Fusion drive    431KB/s     2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases; for example, if you truncate at all we have to do it the old
way, since there is no way to be sure that what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course, so if your
inode gets evicted from cache and you read it in and fsync it, we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 120 |
1 files changed, 110 insertions, 10 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6971bac66d9d..1b99fe8a129d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -247,7 +247,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
247 | return 1; | 247 | return 1; |
248 | } | 248 | } |
249 | 249 | ||
250 | ret = btrfs_drop_extents(trans, inode, start, aligned_end, | 250 | ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, |
251 | &hint_byte, 1); | 251 | &hint_byte, 1); |
252 | if (ret) | 252 | if (ret) |
253 | return ret; | 253 | return ret; |
@@ -1803,7 +1803,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
1803 | * the caller is expected to unpin it and allow it to be merged | 1803 | * the caller is expected to unpin it and allow it to be merged |
1804 | * with the others. | 1804 | * with the others. |
1805 | */ | 1805 | */ |
1806 | ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, | 1806 | ret = btrfs_drop_extents(trans, root, inode, file_pos, |
1807 | file_pos + num_bytes, | ||
1807 | &hint, 0); | 1808 | &hint, 0); |
1808 | if (ret) | 1809 | if (ret) |
1809 | goto out; | 1810 | goto out; |
@@ -1929,11 +1930,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
1929 | ordered_extent->len, | 1930 | ordered_extent->len, |
1930 | compress_type, 0, 0, | 1931 | compress_type, 0, 0, |
1931 | BTRFS_FILE_EXTENT_REG); | 1932 | BTRFS_FILE_EXTENT_REG); |
1932 | unpin_extent_cache(&BTRFS_I(inode)->extent_tree, | ||
1933 | ordered_extent->file_offset, | ||
1934 | ordered_extent->len); | ||
1935 | } | 1933 | } |
1936 | 1934 | unpin_extent_cache(&BTRFS_I(inode)->extent_tree, | |
1935 | ordered_extent->file_offset, ordered_extent->len, | ||
1936 | trans->transid); | ||
1937 | if (ret < 0) { | 1937 | if (ret < 0) { |
1938 | btrfs_abort_transaction(trans, root, ret); | 1938 | btrfs_abort_transaction(trans, root, ret); |
1939 | goto out_unlock; | 1939 | goto out_unlock; |
@@ -2592,6 +2592,18 @@ static void btrfs_read_locked_inode(struct inode *inode) | |||
2592 | 2592 | ||
2593 | inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); | 2593 | inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); |
2594 | BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); | 2594 | BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); |
2595 | BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); | ||
2596 | |||
2597 | /* | ||
2598 | * If we were modified in the current generation and evicted from memory | ||
2599 | * and then re-read we need to do a full sync since we don't have any | ||
2600 | * idea about which extents were modified before we were evicted from | ||
2601 | * cache. | ||
2602 | */ | ||
2603 | if (BTRFS_I(inode)->last_trans == root->fs_info->generation) | ||
2604 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | ||
2605 | &BTRFS_I(inode)->runtime_flags); | ||
2606 | |||
2595 | inode->i_version = btrfs_inode_sequence(leaf, inode_item); | 2607 | inode->i_version = btrfs_inode_sequence(leaf, inode_item); |
2596 | inode->i_generation = BTRFS_I(inode)->generation; | 2608 | inode->i_generation = BTRFS_I(inode)->generation; |
2597 | inode->i_rdev = 0; | 2609 | inode->i_rdev = 0; |
@@ -3269,8 +3281,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
3269 | return -ENOMEM; | 3281 | return -ENOMEM; |
3270 | path->reada = -1; | 3282 | path->reada = -1; |
3271 | 3283 | ||
3284 | /* | ||
3285 | * We want to drop from the next block forward in case this new size is | ||
3286 | * not block aligned since we will be keeping the last block of the | ||
3287 | * extent just the way it is. | ||
3288 | */ | ||
3272 | if (root->ref_cows || root == root->fs_info->tree_root) | 3289 | if (root->ref_cows || root == root->fs_info->tree_root) |
3273 | btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); | 3290 | btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); |
3274 | 3291 | ||
3275 | /* | 3292 | /* |
3276 | * This function is also used to drop the items in the log tree before | 3293 | * This function is also used to drop the items in the log tree before |
@@ -3579,6 +3596,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3579 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 3596 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
3580 | struct extent_map *em = NULL; | 3597 | struct extent_map *em = NULL; |
3581 | struct extent_state *cached_state = NULL; | 3598 | struct extent_state *cached_state = NULL; |
3599 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
3582 | u64 mask = root->sectorsize - 1; | 3600 | u64 mask = root->sectorsize - 1; |
3583 | u64 hole_start = (oldsize + mask) & ~mask; | 3601 | u64 hole_start = (oldsize + mask) & ~mask; |
3584 | u64 block_end = (size + mask) & ~mask; | 3602 | u64 block_end = (size + mask) & ~mask; |
@@ -3615,6 +3633,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3615 | last_byte = min(extent_map_end(em), block_end); | 3633 | last_byte = min(extent_map_end(em), block_end); |
3616 | last_byte = (last_byte + mask) & ~mask; | 3634 | last_byte = (last_byte + mask) & ~mask; |
3617 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 3635 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
3636 | struct extent_map *hole_em; | ||
3618 | u64 hint_byte = 0; | 3637 | u64 hint_byte = 0; |
3619 | hole_size = last_byte - cur_offset; | 3638 | hole_size = last_byte - cur_offset; |
3620 | 3639 | ||
@@ -3624,7 +3643,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3624 | break; | 3643 | break; |
3625 | } | 3644 | } |
3626 | 3645 | ||
3627 | err = btrfs_drop_extents(trans, inode, cur_offset, | 3646 | err = btrfs_drop_extents(trans, root, inode, |
3647 | cur_offset, | ||
3628 | cur_offset + hole_size, | 3648 | cur_offset + hole_size, |
3629 | &hint_byte, 1); | 3649 | &hint_byte, 1); |
3630 | if (err) { | 3650 | if (err) { |
@@ -3643,9 +3663,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3643 | break; | 3663 | break; |
3644 | } | 3664 | } |
3645 | 3665 | ||
3646 | btrfs_drop_extent_cache(inode, hole_start, | 3666 | btrfs_drop_extent_cache(inode, cur_offset, |
3647 | last_byte - 1, 0); | 3667 | cur_offset + hole_size - 1, 0); |
3668 | hole_em = alloc_extent_map(); | ||
3669 | if (!hole_em) { | ||
3670 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | ||
3671 | &BTRFS_I(inode)->runtime_flags); | ||
3672 | goto next; | ||
3673 | } | ||
3674 | hole_em->start = cur_offset; | ||
3675 | hole_em->len = hole_size; | ||
3676 | hole_em->orig_start = cur_offset; | ||
3677 | |||
3678 | hole_em->block_start = EXTENT_MAP_HOLE; | ||
3679 | hole_em->block_len = 0; | ||
3680 | hole_em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
3681 | hole_em->compress_type = BTRFS_COMPRESS_NONE; | ||
3682 | hole_em->generation = trans->transid; | ||
3648 | 3683 | ||
3684 | while (1) { | ||
3685 | write_lock(&em_tree->lock); | ||
3686 | err = add_extent_mapping(em_tree, hole_em); | ||
3687 | if (!err) | ||
3688 | list_move(&hole_em->list, | ||
3689 | &em_tree->modified_extents); | ||
3690 | write_unlock(&em_tree->lock); | ||
3691 | if (err != -EEXIST) | ||
3692 | break; | ||
3693 | btrfs_drop_extent_cache(inode, cur_offset, | ||
3694 | cur_offset + | ||
3695 | hole_size - 1, 0); | ||
3696 | } | ||
3697 | free_extent_map(hole_em); | ||
3698 | next: | ||
3649 | btrfs_update_inode(trans, root, inode); | 3699 | btrfs_update_inode(trans, root, inode); |
3650 | btrfs_end_transaction(trans, root); | 3700 | btrfs_end_transaction(trans, root); |
3651 | } | 3701 | } |
@@ -4673,6 +4723,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
4673 | BTRFS_I(inode)->generation = trans->transid; | 4723 | BTRFS_I(inode)->generation = trans->transid; |
4674 | inode->i_generation = BTRFS_I(inode)->generation; | 4724 | inode->i_generation = BTRFS_I(inode)->generation; |
4675 | 4725 | ||
4726 | /* | ||
4727 | * We could have gotten an inode number from somebody who was fsynced | ||
4728 | * and then removed in this same transaction, so let's just set full | ||
4729 | * sync since it will be a full sync anyway and this will blow away the | ||
4730 | * old info in the log. | ||
4731 | */ | ||
4732 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); | ||
4733 | |||
4676 | if (S_ISDIR(mode)) | 4734 | if (S_ISDIR(mode)) |
4677 | owner = 0; | 4735 | owner = 0; |
4678 | else | 4736 | else |
@@ -6839,6 +6897,15 @@ static int btrfs_truncate(struct inode *inode) | |||
6839 | &BTRFS_I(inode)->runtime_flags)) | 6897 | &BTRFS_I(inode)->runtime_flags)) |
6840 | btrfs_add_ordered_operation(trans, root, inode); | 6898 | btrfs_add_ordered_operation(trans, root, inode); |
6841 | 6899 | ||
6900 | /* | ||
6901 | * So if we truncate and then write and fsync we normally would just | ||
6902 | * write the extents that changed, which is a problem if we need to | ||
6903 | * first truncate that entire inode. So set this flag so we write out | ||
6904 | * all of the extents in the inode to the sync log so we're completely | ||
6905 | * safe. | ||
6906 | */ | ||
6907 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); | ||
6908 | |||
6842 | while (1) { | 6909 | while (1) { |
6843 | ret = btrfs_block_rsv_refill(root, rsv, min_size); | 6910 | ret = btrfs_block_rsv_refill(root, rsv, min_size); |
6844 | if (ret) { | 6911 | if (ret) { |
@@ -7510,6 +7577,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | |||
7510 | loff_t actual_len, u64 *alloc_hint, | 7577 | loff_t actual_len, u64 *alloc_hint, |
7511 | struct btrfs_trans_handle *trans) | 7578 | struct btrfs_trans_handle *trans) |
7512 | { | 7579 | { |
7580 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
7581 | struct extent_map *em; | ||
7513 | struct btrfs_root *root = BTRFS_I(inode)->root; | 7582 | struct btrfs_root *root = BTRFS_I(inode)->root; |
7514 | struct btrfs_key ins; | 7583 | struct btrfs_key ins; |
7515 | u64 cur_offset = start; | 7584 | u64 cur_offset = start; |
@@ -7550,6 +7619,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | |||
7550 | btrfs_drop_extent_cache(inode, cur_offset, | 7619 | btrfs_drop_extent_cache(inode, cur_offset, |
7551 | cur_offset + ins.offset -1, 0); | 7620 | cur_offset + ins.offset -1, 0); |
7552 | 7621 | ||
7622 | em = alloc_extent_map(); | ||
7623 | if (!em) { | ||
7624 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | ||
7625 | &BTRFS_I(inode)->runtime_flags); | ||
7626 | goto next; | ||
7627 | } | ||
7628 | |||
7629 | em->start = cur_offset; | ||
7630 | em->orig_start = cur_offset; | ||
7631 | em->len = ins.offset; | ||
7632 | em->block_start = ins.objectid; | ||
7633 | em->block_len = ins.offset; | ||
7634 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
7635 | set_bit(EXTENT_FLAG_PREALLOC, &em->flags); | ||
7636 | em->generation = trans->transid; | ||
7637 | |||
7638 | while (1) { | ||
7639 | write_lock(&em_tree->lock); | ||
7640 | ret = add_extent_mapping(em_tree, em); | ||
7641 | if (!ret) | ||
7642 | list_move(&em->list, | ||
7643 | &em_tree->modified_extents); | ||
7644 | write_unlock(&em_tree->lock); | ||
7645 | if (ret != -EEXIST) | ||
7646 | break; | ||
7647 | btrfs_drop_extent_cache(inode, cur_offset, | ||
7648 | cur_offset + ins.offset - 1, | ||
7649 | 0); | ||
7650 | } | ||
7651 | free_extent_map(em); | ||
7652 | next: | ||
7553 | num_bytes -= ins.offset; | 7653 | num_bytes -= ins.offset; |
7554 | cur_offset += ins.offset; | 7654 | cur_offset += ins.offset; |
7555 | *alloc_hint = ins.objectid + ins.offset; | 7655 | *alloc_hint = ins.objectid + ins.offset; |