author     Josef Bacik <jbacik@fusionio.com>       2012-08-17 13:14:17 -0400
committer  Chris Mason <chris.mason@fusionio.com>  2012-10-01 15:19:03 -0400
commit     5dc562c541e1026df9d43913c2f6b91156e22d32 (patch)
tree       a7768100e81b756f2a3edbfcaf99ad77ca7ed605 /fs/btrfs/file.c
parent     224ecce517af3a952321202cdf304c12e138caca (diff)
Btrfs: turbo charge fsync
At least for the VM workload. Currently on fsync we will:

1) Truncate all items in the log tree for the given inode if they exist, and
2) Copy all items for the given inode into the log.

The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that already exist in the log. Here are some
numbers from a 50 MB fio job that does random writes and fsync()s after
every write:

                Original    Patched
SATA drive      82KB/s      140KB/s
Fusion drive    431KB/s     2532KB/s

So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This could
probably be done smarter, but if you write-fsync-truncate-write-fsync you
deserve what you get. All this work is in RAM of course, so if your inode
gets evicted from cache and you read it in and fsync it, we'll do it the
slow way if we are still in the same transaction that we last modified the
inode in.

The biggest cool part of this is that it requires no changes to the
recovery code, so if you fsync with this patch and crash and load an old
kernel, it will run the recovery and be a-ok. I have tested this pretty
thoroughly with an fsync tester and everything comes back fine, as well as
xfstests. Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
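The core idea is easier to see outside the kernel. Below is a minimal,
standalone C sketch of the mechanism described above; it is not btrfs code,
and the names (struct extent, fsync_fast, log_extent, last_committed) are
made up for illustration. Each cached extent remembers the generation
(transid) of the transaction that last modified it, and a fast fsync
collects only the extents whose generation is newer than the last committed
transaction, sorts them by file offset, and logs just those.

/*
 * Illustrative sketch only -- not btrfs code.  Models the commit's idea:
 * every cached extent remembers which transaction generation last
 * modified it, so fsync can log just the extents touched since the last
 * committed transaction instead of copying every item for the inode.
 */
#include <stdio.h>
#include <stdlib.h>

struct extent {
        unsigned long long start;       /* byte offset in the file */
        unsigned long long len;         /* length in bytes */
        unsigned long long generation;  /* transid that last modified it */
};

static int cmp_start(const void *a, const void *b)
{
        const struct extent *x = a, *y = b;

        if (x->start < y->start)
                return -1;
        return x->start > y->start;
}

/* Stand-in for writing one extent into the log tree: just print it. */
static void log_extent(const struct extent *em)
{
        printf("log extent: start=%llu len=%llu gen=%llu\n",
               em->start, em->len, em->generation);
}

/*
 * Fast-fsync path: skip extents already covered by a committed
 * transaction (generation <= last_committed), sort the rest by file
 * offset and log them.
 */
static void fsync_fast(const struct extent *cache, size_t nr,
                       unsigned long long last_committed)
{
        struct extent *modified = calloc(nr, sizeof(*modified));
        size_t i, n = 0;

        if (!modified)
                return;
        for (i = 0; i < nr; i++)
                if (cache[i].generation > last_committed)
                        modified[n++] = cache[i];

        qsort(modified, n, sizeof(*modified), cmp_start);
        for (i = 0; i < n; i++)
                log_extent(&modified[i]);
        free(modified);
}

int main(void)
{
        /* Only the extents with generation 7 were modified after the
         * last committed transaction (6), so only they get logged. */
        struct extent cache[] = {
                { 65536, 4096, 7 },
                { 0,     4096, 5 },
                { 8192,  4096, 7 },
        };

        fsync_fast(cache, sizeof(cache) / sizeof(cache[0]), 6);
        return 0;
}

In the patch itself the analogous pieces are the generation field on the
extent maps and the em_tree->modified_extents list seen in the hunks below;
the BTRFS_INODE_NEEDS_FULL_SYNC flag (cleared in the btrfs_sync_file hunk at
the end) is what falls back to the old full-copy path in the corner cases
the message describes, such as a truncate.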
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--  fs/btrfs/file.c  62
1 file changed, 43 insertions(+), 19 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b7c885c8423f..399f9d71a926 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -459,13 +459,14 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  * [start, end]. Existing extents are split as required.
  */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                             int skip_pinned)
 {
         struct extent_map *em;
         struct extent_map *split = NULL;
         struct extent_map *split2 = NULL;
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
         u64 len = end - start + 1;
+        u64 gen;
         int ret;
         int testend = 1;
         unsigned long flags;
@@ -490,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                         break;
                 }
                 flags = em->flags;
+                gen = em->generation;
                 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
                         if (testend && em->start + em->len >= start + len) {
                                 free_extent_map(em);
@@ -518,12 +520,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                                 split->block_len = em->block_len;
                         else
                                 split->block_len = split->len;
-
+                        split->generation = gen;
                         split->bdev = em->bdev;
                         split->flags = flags;
                         split->compress_type = em->compress_type;
                         ret = add_extent_mapping(em_tree, split);
                         BUG_ON(ret); /* Logic error */
+                        list_move(&split->list, &em_tree->modified_extents);
                         free_extent_map(split);
                         split = split2;
                         split2 = NULL;
@@ -537,6 +540,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                         split->bdev = em->bdev;
                         split->flags = flags;
                         split->compress_type = em->compress_type;
+                        split->generation = gen;
 
                         if (compressed) {
                                 split->block_len = em->block_len;
@@ -550,6 +554,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
                         ret = add_extent_mapping(em_tree, split);
                         BUG_ON(ret); /* Logic error */
+                        list_move(&split->list, &em_tree->modified_extents);
                         free_extent_map(split);
                         split = NULL;
                 }
@@ -576,13 +581,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
  * it is either truncated or split. Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-                       u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root, struct inode *inode,
+                         struct btrfs_path *path, u64 start, u64 end,
+                         u64 *hint_byte, int drop_cache)
 {
-        struct btrfs_root *root = BTRFS_I(inode)->root;
         struct extent_buffer *leaf;
         struct btrfs_file_extent_item *fi;
-        struct btrfs_path *path;
         struct btrfs_key key;
         struct btrfs_key new_key;
         u64 ino = btrfs_ino(inode);
@@ -597,14 +602,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
         int recow;
         int ret;
         int modify_tree = -1;
+        int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 
         if (drop_cache)
                 btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-
         if (start >= BTRFS_I(inode)->disk_i_size)
                 modify_tree = 0;
 
@@ -707,7 +709,7 @@ next_slot:
                                                 extent_end - start);
                         btrfs_mark_buffer_dirty(leaf);
 
-                        if (disk_bytenr > 0) {
+                        if (update_refs && disk_bytenr > 0) {
                                 ret = btrfs_inc_extent_ref(trans, root,
                                                 disk_bytenr, num_bytes, 0,
                                                 root->root_key.objectid,
@@ -734,7 +736,7 @@ next_slot:
                         btrfs_set_file_extent_num_bytes(leaf, fi,
                                                         extent_end - end);
                         btrfs_mark_buffer_dirty(leaf);
-                        if (disk_bytenr > 0) {
+                        if (update_refs && disk_bytenr > 0) {
                                 inode_sub_bytes(inode, end - key.offset);
                                 *hint_byte = disk_bytenr;
                         }
@@ -753,7 +755,7 @@ next_slot:
                         btrfs_set_file_extent_num_bytes(leaf, fi,
                                                         start - key.offset);
                         btrfs_mark_buffer_dirty(leaf);
-                        if (disk_bytenr > 0) {
+                        if (update_refs && disk_bytenr > 0) {
                                 inode_sub_bytes(inode, extent_end - start);
                                 *hint_byte = disk_bytenr;
                         }
@@ -777,12 +779,13 @@ next_slot:
                                 del_nr++;
                         }
 
-                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                        if (update_refs &&
+                            extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                 inode_sub_bytes(inode,
                                                 extent_end - key.offset);
                                 extent_end = ALIGN(extent_end,
                                                    root->sectorsize);
-                        } else if (disk_bytenr > 0) {
+                        } else if (update_refs && disk_bytenr > 0) {
                                 ret = btrfs_free_extent(trans, root,
                                                 disk_bytenr, num_bytes, 0,
                                                 root->root_key.objectid,
@@ -806,7 +809,7 @@ next_slot:
                                                         del_nr);
                                 if (ret) {
                                         btrfs_abort_transaction(trans, root, ret);
-                                        goto out;
+                                        break;
                                 }
 
                                 del_nr = 0;
@@ -825,7 +828,22 @@ next_slot:
                         btrfs_abort_transaction(trans, root, ret);
         }
 
-out:
+        btrfs_release_path(path);
+        return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct inode *inode, u64 start,
+                       u64 end, u64 *hint_byte, int drop_cache)
+{
+        struct btrfs_path *path;
+        int ret;
+
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
+                                   hint_byte, drop_cache);
         btrfs_free_path(path);
         return ret;
 }
@@ -892,8 +910,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
         int ret;
         u64 ino = btrfs_ino(inode);
 
-        btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
         path = btrfs_alloc_path();
         if (!path)
                 return -ENOMEM;
@@ -1556,6 +1572,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
             BTRFS_I(inode)->last_trans <=
             root->fs_info->last_trans_committed) {
                 BTRFS_I(inode)->last_trans = 0;
+
+                /*
+                 * We've had everything committed since the last time we
+                 * were modified, so clear this flag in case it was set for
+                 * whatever reason; it's no longer relevant.
+                 */
+                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                          &BTRFS_I(inode)->runtime_flags);
                 mutex_unlock(&inode->i_mutex);
                 goto out;
         }