diff options
author | Chris Mason <chris.mason@oracle.com> | 2009-03-13 10:17:05 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2009-03-24 16:14:26 -0400 |
commit | c3e69d58e86c3917ae4e9e31b4acf490a7cafe60 (patch) | |
tree | bd4f1e62446a208bdae26f0c36d67e3afbc1cd1d /fs/btrfs/extent-tree.c | |
parent | 1887be66dcc3140a81d1299958a41fc0eedfa64f (diff) |
Btrfs: process the delayed reference queue in clusters
The delayed reference queue maintains pending operations that need to
be done to the extent allocation tree. These are processed by
finding records in the tree that are not currently being processed one at
a time.
This is slow because it uses lots of time searching through the rbtree
and because it creates lock contention on the extent allocation tree
when lots of different procs are running delayed refs at the same time.
This commit changes things to grab a cluster of refs for processing,
using a cursor into the rbtree as the starting point of the next search.
This way we walk smoothly through the rbtree.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 149 |
1 files changed, 98 insertions, 51 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8471c79b0877..3b8b6c212701 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -926,52 +926,41 @@ again: | |||
926 | return NULL; | 926 | return NULL; |
927 | } | 927 | } |
928 | 928 | ||
929 | /* | 929 | static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, |
930 | * this starts processing the delayed reference count updates and | 930 | struct btrfs_root *root, |
931 | * extent insertions we have queued up so far. count can be | 931 | struct list_head *cluster) |
932 | * 0, which means to process everything in the tree at the start | ||
933 | * of the run (but not newly added entries), or it can be some target | ||
934 | * number you'd like to process. | ||
935 | */ | ||
936 | int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | ||
937 | struct btrfs_root *root, unsigned long count) | ||
938 | { | 932 | { |
939 | struct rb_node *node; | ||
940 | struct btrfs_delayed_ref_root *delayed_refs; | 933 | struct btrfs_delayed_ref_root *delayed_refs; |
941 | struct btrfs_delayed_ref_node *ref; | 934 | struct btrfs_delayed_ref_node *ref; |
942 | struct btrfs_delayed_ref_head *locked_ref = NULL; | 935 | struct btrfs_delayed_ref_head *locked_ref = NULL; |
943 | int ret; | 936 | int ret; |
937 | int count = 0; | ||
944 | int must_insert_reserved = 0; | 938 | int must_insert_reserved = 0; |
945 | int run_all = count == (unsigned long)-1; | ||
946 | |||
947 | if (root == root->fs_info->extent_root) | ||
948 | root = root->fs_info->tree_root; | ||
949 | 939 | ||
950 | delayed_refs = &trans->transaction->delayed_refs; | 940 | delayed_refs = &trans->transaction->delayed_refs; |
951 | again: | ||
952 | spin_lock(&delayed_refs->lock); | ||
953 | if (count == 0) | ||
954 | count = delayed_refs->num_entries; | ||
955 | while (1) { | 941 | while (1) { |
956 | if (!locked_ref) { | 942 | if (!locked_ref) { |
957 | /* | 943 | /* pick a new head ref from the cluster list */ |
958 | * no locked ref, go find something we can | 944 | if (list_empty(cluster)) |
959 | * process in the rbtree. We start at | ||
960 | * the beginning of the tree, there may be less | ||
961 | * lock contention if we do something smarter here. | ||
962 | */ | ||
963 | node = rb_first(&delayed_refs->root); | ||
964 | if (!node) { | ||
965 | spin_unlock(&delayed_refs->lock); | ||
966 | break; | 945 | break; |
967 | } | ||
968 | 946 | ||
969 | ref = rb_entry(node, struct btrfs_delayed_ref_node, | 947 | locked_ref = list_entry(cluster->next, |
970 | rb_node); | 948 | struct btrfs_delayed_ref_head, cluster); |
971 | ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref); | 949 | |
972 | if (ret) { | 950 | /* grab the lock that says we are going to process |
973 | spin_unlock(&delayed_refs->lock); | 951 | * all the refs for this head */ |
974 | break; | 952 | ret = btrfs_delayed_ref_lock(trans, locked_ref); |
953 | |||
954 | /* | ||
955 | * we may have dropped the spin lock to get the head | ||
956 | * mutex lock, and that might have given someone else | ||
957 | * time to free the head. If that's true, it has been | ||
958 | * removed from our list and we can move on. | ||
959 | */ | ||
960 | if (ret == -EAGAIN) { | ||
961 | locked_ref = NULL; | ||
962 | count++; | ||
963 | continue; | ||
975 | } | 964 | } |
976 | } | 965 | } |
977 | 966 | ||
@@ -986,7 +975,6 @@ again: | |||
986 | * locked_ref is the head node, so we have to go one | 975 | * locked_ref is the head node, so we have to go one |
987 | * node back for any delayed ref updates | 976 | * node back for any delayed ref updates |
988 | */ | 977 | */ |
989 | |||
990 | ref = select_delayed_ref(locked_ref); | 978 | ref = select_delayed_ref(locked_ref); |
991 | if (!ref) { | 979 | if (!ref) { |
992 | /* All delayed refs have been processed, Go ahead | 980 | /* All delayed refs have been processed, Go ahead |
@@ -994,6 +982,7 @@ again: | |||
994 | * so that any accounting fixes can happen | 982 | * so that any accounting fixes can happen |
995 | */ | 983 | */ |
996 | ref = &locked_ref->node; | 984 | ref = &locked_ref->node; |
985 | list_del_init(&locked_ref->cluster); | ||
997 | locked_ref = NULL; | 986 | locked_ref = NULL; |
998 | } | 987 | } |
999 | 988 | ||
@@ -1007,30 +996,72 @@ again: | |||
1007 | BUG_ON(ret); | 996 | BUG_ON(ret); |
1008 | btrfs_put_delayed_ref(ref); | 997 | btrfs_put_delayed_ref(ref); |
1009 | 998 | ||
1010 | /* once we lock the head ref, we have to process all the | 999 | count++; |
1011 | * entries for it. So, we might end up doing more entries | 1000 | cond_resched(); |
1012 | * that count was asking us to do. | 1001 | spin_lock(&delayed_refs->lock); |
1013 | */ | 1002 | } |
1014 | if (count > 0) | 1003 | return count; |
1015 | count--; | 1004 | } |
1005 | |||
1006 | /* | ||
1007 | * this starts processing the delayed reference count updates and | ||
1008 | * extent insertions we have queued up so far. count can be | ||
1009 | * 0, which means to process everything in the tree at the start | ||
1010 | * of the run (but not newly added entries), or it can be some target | ||
1011 | * number you'd like to process. | ||
1012 | */ | ||
1013 | int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | ||
1014 | struct btrfs_root *root, unsigned long count) | ||
1015 | { | ||
1016 | struct rb_node *node; | ||
1017 | struct btrfs_delayed_ref_root *delayed_refs; | ||
1018 | struct btrfs_delayed_ref_node *ref; | ||
1019 | struct list_head cluster; | ||
1020 | int ret; | ||
1021 | int run_all = count == (unsigned long)-1; | ||
1022 | int run_most = 0; | ||
1023 | |||
1024 | if (root == root->fs_info->extent_root) | ||
1025 | root = root->fs_info->tree_root; | ||
1026 | |||
1027 | delayed_refs = &trans->transaction->delayed_refs; | ||
1028 | INIT_LIST_HEAD(&cluster); | ||
1029 | again: | ||
1030 | spin_lock(&delayed_refs->lock); | ||
1031 | if (count == 0) { | ||
1032 | count = delayed_refs->num_entries * 2; | ||
1033 | run_most = 1; | ||
1034 | } | ||
1035 | while (1) { | ||
1036 | if (!(run_all || run_most) && | ||
1037 | delayed_refs->num_heads_ready < 64) | ||
1038 | break; | ||
1016 | 1039 | ||
1017 | /* | 1040 | /* |
1018 | * we set locked_ref to null above if we're all done | 1041 | * go find something we can process in the rbtree. We start at |
1019 | * with this bytenr | 1042 | * the beginning of the tree, and then build a cluster |
1043 | * of refs to process starting at the first one we are able to | ||
1044 | * lock | ||
1020 | */ | 1045 | */ |
1021 | if (!locked_ref && count == 0) | 1046 | ret = btrfs_find_ref_cluster(trans, &cluster, |
1047 | delayed_refs->run_delayed_start); | ||
1048 | if (ret) | ||
1022 | break; | 1049 | break; |
1023 | 1050 | ||
1024 | cond_resched(); | 1051 | ret = run_clustered_refs(trans, root, &cluster); |
1025 | spin_lock(&delayed_refs->lock); | 1052 | BUG_ON(ret < 0); |
1053 | |||
1054 | count -= min_t(unsigned long, ret, count); | ||
1055 | |||
1056 | if (count == 0) | ||
1057 | break; | ||
1026 | } | 1058 | } |
1059 | |||
1027 | if (run_all) { | 1060 | if (run_all) { |
1028 | spin_lock(&delayed_refs->lock); | ||
1029 | node = rb_first(&delayed_refs->root); | 1061 | node = rb_first(&delayed_refs->root); |
1030 | if (!node) { | 1062 | if (!node) |
1031 | spin_unlock(&delayed_refs->lock); | ||
1032 | goto out; | 1063 | goto out; |
1033 | } | 1064 | count = (unsigned long)-1; |
1034 | 1065 | ||
1035 | while (node) { | 1066 | while (node) { |
1036 | ref = rb_entry(node, struct btrfs_delayed_ref_node, | 1067 | ref = rb_entry(node, struct btrfs_delayed_ref_node, |
@@ -1052,11 +1083,11 @@ again: | |||
1052 | node = rb_next(node); | 1083 | node = rb_next(node); |
1053 | } | 1084 | } |
1054 | spin_unlock(&delayed_refs->lock); | 1085 | spin_unlock(&delayed_refs->lock); |
1055 | count = (unsigned long)-1; | ||
1056 | schedule_timeout(1); | 1086 | schedule_timeout(1); |
1057 | goto again; | 1087 | goto again; |
1058 | } | 1088 | } |
1059 | out: | 1089 | out: |
1090 | spin_unlock(&delayed_refs->lock); | ||
1060 | return 0; | 1091 | return 0; |
1061 | } | 1092 | } |
1062 | 1093 | ||
@@ -2407,12 +2438,18 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
2407 | */ | 2438 | */ |
2408 | head->node.in_tree = 0; | 2439 | head->node.in_tree = 0; |
2409 | rb_erase(&head->node.rb_node, &delayed_refs->root); | 2440 | rb_erase(&head->node.rb_node, &delayed_refs->root); |
2441 | |||
2410 | delayed_refs->num_entries--; | 2442 | delayed_refs->num_entries--; |
2411 | 2443 | ||
2412 | /* | 2444 | /* |
2413 | * we don't take a ref on the node because we're removing it from the | 2445 | * we don't take a ref on the node because we're removing it from the |
2414 | * tree, so we just steal the ref the tree was holding. | 2446 | * tree, so we just steal the ref the tree was holding. |
2415 | */ | 2447 | */ |
2448 | delayed_refs->num_heads--; | ||
2449 | if (list_empty(&head->cluster)) | ||
2450 | delayed_refs->num_heads_ready--; | ||
2451 | |||
2452 | list_del_init(&head->cluster); | ||
2416 | spin_unlock(&delayed_refs->lock); | 2453 | spin_unlock(&delayed_refs->lock); |
2417 | 2454 | ||
2418 | ret = run_one_delayed_ref(trans, root->fs_info->tree_root, | 2455 | ret = run_one_delayed_ref(trans, root->fs_info->tree_root, |
@@ -3705,6 +3742,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3705 | struct btrfs_path *path; | 3742 | struct btrfs_path *path; |
3706 | int i; | 3743 | int i; |
3707 | int orig_level; | 3744 | int orig_level; |
3745 | int update_count; | ||
3708 | struct btrfs_root_item *root_item = &root->root_item; | 3746 | struct btrfs_root_item *root_item = &root->root_item; |
3709 | 3747 | ||
3710 | WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); | 3748 | WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); |
@@ -3746,6 +3784,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3746 | } | 3784 | } |
3747 | } | 3785 | } |
3748 | while (1) { | 3786 | while (1) { |
3787 | unsigned long update; | ||
3749 | wret = walk_down_tree(trans, root, path, &level); | 3788 | wret = walk_down_tree(trans, root, path, &level); |
3750 | if (wret > 0) | 3789 | if (wret > 0) |
3751 | break; | 3790 | break; |
@@ -3764,6 +3803,14 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3764 | } | 3803 | } |
3765 | atomic_inc(&root->fs_info->throttle_gen); | 3804 | atomic_inc(&root->fs_info->throttle_gen); |
3766 | wake_up(&root->fs_info->transaction_throttle); | 3805 | wake_up(&root->fs_info->transaction_throttle); |
3806 | for (update_count = 0; update_count < 16; update_count++) { | ||
3807 | update = trans->delayed_ref_updates; | ||
3808 | trans->delayed_ref_updates = 0; | ||
3809 | if (update) | ||
3810 | btrfs_run_delayed_refs(trans, root, update); | ||
3811 | else | ||
3812 | break; | ||
3813 | } | ||
3767 | } | 3814 | } |
3768 | for (i = 0; i <= orig_level; i++) { | 3815 | for (i = 0; i <= orig_level; i++) { |
3769 | if (path->nodes[i]) { | 3816 | if (path->nodes[i]) { |