author	Miao Xie <miaox@cn.fujitsu.com>	2011-04-22 06:12:22 -0400
committer	Chris Mason <chris.mason@oracle.com>	2011-05-21 09:30:56 -0400
commit	16cdcec736cd214350cdb591bf1091f8beedefa0 (patch)
tree	5598d4561660c4d7a1d4de8b3703d6dd3cc7f9e7 /fs/btrfs/disk-io.c
parent	61c4f2c81c61f73549928dfd9f3e8f26aa36a8cf (diff)
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix OOM when the memory load is high, by storing the delayed nodes into the
  root's radix tree and letting the btrfs inodes go.

Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, spotted by Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix the deadlock between readdir() and memory fault, reported by Itaru Kitayama.

Changelog V3 -> V4:
- Fix nested lock, reported by Itaru Kitayama, by updating the space cache
  inode in time.

Changelog V2 -> V3:
- Fix the race between the delayed worker and the task doing the delayed items
  balance, reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comments.
- Fix the bug of the CPU recursion spinlock, reported by Chris Mason.

Changelog V1 -> V2:
- Break up the global rb-tree; use a list to manage the delayed nodes, which
  are created for every directory and file and are used to manage the delayed
  directory name index items and the delayed inode item.
- Introduce a worker to deal with the delayed nodes.

Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as the inode item, the directory name item, the directory name index and
so on. If we can delay some of these b+ tree insertions or deletions, we can
improve the performance, so we made this patch which implements delayed
directory name index insertion/deletion and delayed inode update.

Implementation:
- Introduce a delayed root object into the filesystem, which uses two lists to
  manage the delayed nodes created for every file/directory. One list manages
  all the delayed nodes that have delayed items; the other manages the delayed
  nodes that are waiting to be dealt with by the work thread.
- Every delayed node has two rb-trees: one manages the directory name index
  items that are going to be inserted into the b+ tree, and the other manages
  the directory name index items that are going to be deleted from the b+ tree.
- Introduce a worker to deal with the delayed operations. This worker handles
  the delayed directory name index item insertions and deletions and the
  delayed inode updates. When the number of delayed items exceeds the lower
  limit, we create work items for some delayed nodes, insert them into the
  worker's queue, and then go back. When the number of delayed items exceeds
  the upper bound, we create work items for all the delayed nodes that haven't
  been dealt with, insert them into the worker's queue, and then wait until
  the number of untreated items falls below some threshold value.
- When we want to insert a directory name index into the b+ tree, we just add
  the information into the delayed inserting rb-tree, then check the number of
  delayed items and do the delayed items balance. (The balance policy is
  described above.)
- When we want to delete a directory name index from the b+ tree, we first
  search for it in the inserting rb-tree. If we find it there, we just drop it.
  If not, we add its key into the delayed deleting rb-tree. As with the delayed
  inserting rb-tree, we also check the number of delayed items and do the
  delayed items balance.
- When we want to update the metadata of some inode, we cache the inode data
  in the delayed node. The worker will flush it into the b+ tree after dealing
  with the delayed insertions and deletions.
- We move the delayed node to the tail of the list after we access it. This
  way, we can cache more delayed items and merge more inode updates.
- If we want to commit the transaction, we deal with all the delayed nodes.
- The delayed node is freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
  and the delayed inode update.

I did a quick test with the benchmark tool [1] and found that we can improve
the performance of file creation by ~15%, and file deletion by ~20%.

Before applying this patch:
Create files:
        Total files: 50000
        Total time: 1.096108
        Average time: 0.000022
Delete files:
        Total files: 50000
        Total time: 1.510403
        Average time: 0.000030

After applying this patch:
Create files:
        Total files: 50000
        Total time: 0.932899
        Average time: 0.000019
Delete files:
        Total files: 50000
        Total time: 1.215732
        Average time: 0.000024

[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3

Many thanks for Kitayama-san's help!

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
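To make the description above easier to follow, here is a minimal, hypothetical C sketch of the objects and of the insert/balance path the message describes. The struct and field names, the limits, and the helper are illustrative assumptions only, not the actual definitions added in fs/btrfs/delayed-inode.h.

	/*
	 * Illustrative sketch only -- names and limits are assumptions,
	 * not the exact definitions from fs/btrfs/delayed-inode.h.
	 */
	#include <linux/list.h>
	#include <linux/rbtree.h>
	#include <linux/spinlock.h>
	#include <linux/mutex.h>
	#include <linux/atomic.h>
	#include <linux/types.h>

	struct btrfs_delayed_root {
		spinlock_t lock;
		struct list_head node_list;	/* all delayed nodes with pending items */
		struct list_head prepare_list;	/* nodes already handed to the worker */
		atomic_t items;			/* total number of pending delayed items */
	};

	struct btrfs_delayed_node {
		u64 inode_id;
		struct list_head n_list;	/* link into delayed_root->node_list */
		struct list_head p_list;	/* link into delayed_root->prepare_list */
		struct rb_root ins_root;	/* dir index items waiting to be inserted */
		struct rb_root del_root;	/* dir index keys waiting to be deleted */
		struct mutex mutex;
		/* cached delayed inode update would also live here */
	};

	/* made-up limits, just to show the shape of the balance policy */
	#define DELAYED_LOWER_LIMIT	2048
	#define DELAYED_UPPER_BOUND	4096

	static void delayed_items_balance_sketch(struct btrfs_delayed_root *droot)
	{
		int items = atomic_read(&droot->items);

		if (items < DELAYED_LOWER_LIMIT)
			return;		/* few pending items: do nothing */
		if (items < DELAYED_UPPER_BOUND) {
			/* queue work for a few nodes and return immediately */
			return;
		}
		/* queue work for every untreated node and wait for the backlog to drain */
	}

The same check runs on both the inserting and the deleting path, so a heavily loaded writer is throttled once the backlog grows, while light loads never block on the worker.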
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--	fs/btrfs/disk-io.c	50
1 file changed, 45 insertions, 5 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 228cf36ece83..22c3c9586049 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1058,6 +1058,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree = RB_ROOT;
+	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
 	root->block_rsv = NULL;
 	root->orphan_block_rsv = NULL;
 
@@ -1693,6 +1694,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
 	spin_lock_init(&fs_info->ordered_extent_lock);
+	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+					GFP_NOFS);
+	if (!fs_info->delayed_root) {
+		err = -ENOMEM;
+		goto fail_iput;
+	}
+	btrfs_init_delayed_root(fs_info->delayed_root);
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
@@ -1760,7 +1768,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh) {
 		err = -EINVAL;
-		goto fail_iput;
+		goto fail_alloc;
 	}
 
 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
@@ -1772,7 +1780,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
-		goto fail_iput;
+		goto fail_alloc;
 
 	/* check FS state, whether FS is broken. */
 	fs_info->fs_state |= btrfs_super_flags(disk_super);
@@ -1788,7 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
 		err = ret;
-		goto fail_iput;
+		goto fail_alloc;
 	}
 
 	features = btrfs_super_incompat_flags(disk_super) &
@@ -1798,7 +1806,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1798 "unsupported optional features (%Lx).\n", 1806 "unsupported optional features (%Lx).\n",
1799 (unsigned long long)features); 1807 (unsigned long long)features);
1800 err = -EINVAL; 1808 err = -EINVAL;
1801 goto fail_iput; 1809 goto fail_alloc;
1802 } 1810 }
1803 1811
1804 features = btrfs_super_incompat_flags(disk_super); 1812 features = btrfs_super_incompat_flags(disk_super);
@@ -1814,7 +1822,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1814 "unsupported option features (%Lx).\n", 1822 "unsupported option features (%Lx).\n",
1815 (unsigned long long)features); 1823 (unsigned long long)features);
1816 err = -EINVAL; 1824 err = -EINVAL;
1817 goto fail_iput; 1825 goto fail_alloc;
1818 } 1826 }
1819 1827
1820 btrfs_init_workers(&fs_info->generic_worker, 1828 btrfs_init_workers(&fs_info->generic_worker,
@@ -1861,6 +1869,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
 			   1, &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1882,6 +1893,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
+	btrfs_start_workers(&fs_info->delayed_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2138,6 +2150,9 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->delayed_workers);
+fail_alloc:
+	kfree(fs_info->delayed_root);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2578,6 +2593,7 @@ int close_ctree(struct btrfs_root *root)
 	del_fs_roots(fs_info);
 
 	iput(fs_info->btree_inode);
+	kfree(fs_info->delayed_root);
 
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2589,6 +2605,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->delayed_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2665,6 +2682,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	if (current->flags & PF_MEMALLOC)
 		return;
 
+	btrfs_balance_delayed_items(root);
+
+	num_dirty = root->fs_info->dirty_metadata_bytes;
+
+	if (num_dirty > thresh) {
+		balance_dirty_pages_ratelimited_nr(
+				   root->fs_info->btree_inode->i_mapping, 1);
+	}
+	return;
+}
+
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+	/*
+	 * looks as though older kernels can get into trouble with
+	 * this code, they end up stuck in balance_dirty_pages forever
+	 */
+	u64 num_dirty;
+	unsigned long thresh = 32 * 1024 * 1024;
+
+	if (current->flags & PF_MEMALLOC)
+		return;
+
 	num_dirty = root->fs_info->dirty_metadata_bytes;
 
 	if (num_dirty > thresh) {