author     Liu Bo <bo.liu@linux.alibaba.com>      2018-08-21 17:54:37 -0400
committer  David Sterba <dsterba@suse.com>        2018-10-15 11:23:38 -0400
commit     523983401644ebeb331c923c28c9591c07430a7d (patch)
tree       242e5f93f3ed599f93882c14b8a4c420ebb26840
parent     9b142115ed3593480b813a9331593e9f199da340 (diff)
Btrfs: kill btrfs_clear_path_blocking
Btrfs's btree locking has two modes, spinning mode and blocking mode.
While searching the btree, locking is always acquired in spinning mode
and then converted to blocking mode if necessary, and in some hot paths
we may switch the locking back to spinning mode by
btrfs_clear_path_blocking().

When acquiring locks, both readers and writers need to wait for any
blocking readers and writers to complete before doing
read_lock()/write_lock().

The problem is that btrfs_clear_path_blocking() first needs to switch
the nodes in the path to blocking mode (by btrfs_set_path_blocking())
to keep lockdep happy, before doing its actual job of clearing the
blocking state.

Switching from spinning mode to blocking mode consists of

step 1) bumping up the blocking readers counter, and
step 2) read_unlock()/write_unlock().

This causes a serious ping-pong effect when there is a great number of
concurrent readers/writers, as waiters are woken up only to go back to
sleep immediately.

1) Killing this kind of ping-pong results in a big improvement in my
1600k-file creation script:

MNT=/mnt/btrfs
mkfs.btrfs -f /dev/sdf
mount /dev/sdf $MNT
time fsmark -D 10000 -S0 -n 100000 -s 0 -L 1 -l /tmp/fs_log.txt \
	-d $MNT/0  -d $MNT/1 \
	-d $MNT/2  -d $MNT/3 \
	-d $MNT/4  -d $MNT/5 \
	-d $MNT/6  -d $MNT/7 \
	-d $MNT/8  -d $MNT/9 \
	-d $MNT/10 -d $MNT/11 \
	-d $MNT/12 -d $MNT/13 \
	-d $MNT/14 -d $MNT/15

w/o patch:
real    2m27.307s
user    0m12.839s
sys     13m42.831s

w/ patch:
real    1m2.273s
user    0m15.802s
sys     8m16.495s

1.1) Latency histogram from funclatency[1]

Overall, with the patch there are ~50% fewer write lock acquisitions,
and the 95% max latency of taking the write lock also drops to ~100ms
from >500ms.

--------------------------------------------
w/o patch:
--------------------------------------------
Function = btrfs_tree_lock
  msecs               : count     distribution
      0 -> 1          : 2385222  |****************************************|
      2 -> 3          : 37147    |                                        |
      4 -> 7          : 20452    |                                        |
      8 -> 15         : 13131    |                                        |
     16 -> 31         : 3877     |                                        |
     32 -> 63         : 3900     |                                        |
     64 -> 127        : 2612     |                                        |
    128 -> 255        : 974      |                                        |
    256 -> 511        : 165      |                                        |
    512 -> 1023       : 13       |                                        |

Function = btrfs_tree_read_lock
  msecs               : count     distribution
      0 -> 1          : 6743860  |****************************************|
      2 -> 3          : 2146     |                                        |
      4 -> 7          : 190      |                                        |
      8 -> 15         : 38       |                                        |
     16 -> 31         : 4        |                                        |

--------------------------------------------
w/ patch:
--------------------------------------------
Function = btrfs_tree_lock
  msecs               : count     distribution
      0 -> 1          : 1318454  |****************************************|
      2 -> 3          : 6800     |                                        |
      4 -> 7          : 3664     |                                        |
      8 -> 15         : 2145     |                                        |
     16 -> 31         : 809      |                                        |
     32 -> 63         : 219     |                                        |
     64 -> 127        : 10       |                                        |

Function = btrfs_tree_read_lock
  msecs               : count     distribution
      0 -> 1          : 6854317  |****************************************|
      2 -> 3          : 2383     |                                        |
      4 -> 7          : 601      |                                        |
      8 -> 15         : 92       |                                        |

2) dbench also shows the improvement:

dbench -t 120 -D /mnt/btrfs 16

w/o patch: Throughput 158.363 MB/sec
w/  patch: Throughput 449.52 MB/sec

3) xfstests didn't show any additional failures.

One thing to note is that callers may set path->leave_spinning to have
all nodes in the path stay in spinning mode, which means callers are
prepared to not sleep before releasing the path; leaving the locks in
blocking mode instead does not cause problems for such callers.

[1]: https://github.com/iovisor/bcc/blob/master/tools/funclatency.py

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: David Sterba <dsterba@suse.com>
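The ping-pong described above can be reproduced outside the kernel with
a minimal pthread sketch. This is only an illustration under
simplifying assumptions, not btrfs code: the toy_* names are invented,
and the kernel's spinning/blocking primitives are reduced to a counter
plus a condition variable.

/* Toy model of the blocking/spinning ping-pong; build with -lpthread. */
#include <pthread.h>
#include <stdio.h>

struct toy_lock {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int blocking;		/* analogue of the blocking counter */
	int done;
};

/* Going blocking: step 1) bump the counter, step 2) drop the lock. */
static void toy_set_blocking(struct toy_lock *t)
{
	pthread_mutex_lock(&t->lock);
	t->blocking++;
	pthread_mutex_unlock(&t->lock);
}

/* Clearing blocking wakes every waiter up... */
static void toy_clear_blocking(struct toy_lock *t)
{
	pthread_mutex_lock(&t->lock);
	t->blocking--;
	pthread_cond_broadcast(&t->cond);
	pthread_mutex_unlock(&t->lock);
}

/* ...but a waiter that finds the counter raised again goes straight
 * back to sleep. */
static void *toy_waiter(void *arg)
{
	struct toy_lock *t = arg;
	long futile = 0;

	pthread_mutex_lock(&t->lock);
	while (!t->done) {
		pthread_cond_wait(&t->cond, &t->lock);
		if (t->blocking)
			futile++;	/* woken up only to sleep again */
	}
	pthread_mutex_unlock(&t->lock);
	printf("futile wakeups: %ld\n", futile);
	return NULL;
}

int main(void)
{
	struct toy_lock t = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t waiter;
	int i;

	pthread_create(&waiter, NULL, toy_waiter, &t);
	for (i = 0; i < 100000; i++) {	/* a "hot path" toggling modes */
		toy_set_blocking(&t);
		toy_clear_blocking(&t);
	}
	pthread_mutex_lock(&t.lock);
	t.done = 1;
	pthread_cond_broadcast(&t.cond);
	pthread_mutex_unlock(&t.lock);
	pthread_join(waiter, NULL);
	return 0;
}

With many threads toggling modes concurrently, most wakeups end up
futile in this way; the patch removes exactly these spinning<->blocking
round trips from the search path.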
-rw-r--r--  fs/btrfs/ctree.c          57
-rw-r--r--  fs/btrfs/ctree.h           2
-rw-r--r--  fs/btrfs/delayed-inode.c   3
3 files changed, 4 insertions(+), 58 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0a6c645fab0a..2ee43b6a4f09 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 	}
 }
 
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path.  You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held, int held_rw)
-{
-	int i;
-
-	if (held) {
-		btrfs_set_lock_blocking_rw(held, held_rw);
-		if (held_rw == BTRFS_WRITE_LOCK)
-			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
-		else if (held_rw == BTRFS_READ_LOCK)
-			held_rw = BTRFS_READ_LOCK_BLOCKING;
-	}
-	btrfs_set_path_blocking(p);
-
-	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i]) {
-			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
-			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_WRITE_LOCK;
-			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_READ_LOCK;
-		}
-	}
-
-	if (held)
-		btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
@@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 		}
 	}
 
-	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
 	btrfs_tree_read_unlock_blocking(eb);
 	free_extent_buffer(eb);
 
@@ -2482,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		btrfs_set_path_blocking(p);
 		reada_for_balance(fs_info, p, level);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -2503,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		btrfs_set_path_blocking(p);
 		reada_for_balance(fs_info, p, level);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
@@ -2788,7 +2749,10 @@ again:
 		}
 cow_done:
 		p->nodes[level] = b;
-		btrfs_clear_path_blocking(p, NULL, 0);
+		/*
+		 * Leave path with blocking locks to avoid massive
+		 * lock context switch, this is made on purpose.
+		 */
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -2870,8 +2834,6 @@ cow_done:
 				if (!err) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p, b,
-							  BTRFS_WRITE_LOCK);
 				}
 				p->locks[level] = BTRFS_WRITE_LOCK;
 			} else {
@@ -2879,8 +2841,6 @@ cow_done:
 				if (!err) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_read_lock(b);
-					btrfs_clear_path_blocking(p, b,
-							  BTRFS_READ_LOCK);
 				}
 				p->locks[level] = BTRFS_READ_LOCK;
 			}
@@ -2899,7 +2859,6 @@ cow_done:
 				btrfs_set_path_blocking(p);
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p, NULL, 0);
 
 				BUG_ON(err > 0);
 				if (err) {
@@ -2970,7 +2929,6 @@ again:
 	while (b) {
 		level = btrfs_header_level(b);
 		p->nodes[level] = b;
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -3016,8 +2974,6 @@ again:
 			if (!err) {
 				btrfs_set_path_blocking(p);
 				btrfs_tree_read_lock(b);
-				btrfs_clear_path_blocking(p, b,
-						  BTRFS_READ_LOCK);
 			}
 			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
 			if (!b) {
@@ -5201,7 +5157,6 @@ find_next_key:
 			path->locks[level - 1] = BTRFS_READ_LOCK;
 			path->nodes[level - 1] = cur;
 			unlock_up(path, level, 1, 0, NULL);
-			btrfs_clear_path_blocking(path, NULL, 0);
 		}
 out:
 	path->keep_locks = keep_locks;
@@ -5786,8 +5741,6 @@ again:
 			if (!ret) {
 				btrfs_set_path_blocking(path);
 				btrfs_tree_read_lock(next);
-				btrfs_clear_path_blocking(path, next,
-							  BTRFS_READ_LOCK);
 			}
 			next_rw_lock = BTRFS_READ_LOCK;
 		}
@@ -5823,8 +5776,6 @@ again:
 			if (!ret) {
 				btrfs_set_path_blocking(path);
 				btrfs_tree_read_lock(next);
-				btrfs_clear_path_blocking(path, next,
-							  BTRFS_READ_LOCK);
 			}
 			next_rw_lock = BTRFS_READ_LOCK;
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e8244c0b0597..15c659f23411 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2868,8 +2868,6 @@ void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
-			       struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 15dcbd6ef531..c669f250d4a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -765,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
 		i++;
 	}
 
-	/* reset all the locked nodes in the patch to spinning locks. */
-	btrfs_clear_path_blocking(path, NULL, 0);
-
 	/* insert the keys of the items */
 	setup_items_for_insert(root, path, keys, data_size,
 			       total_data_size, total_size, nitems);