author | Kent Overstreet <kmo@daterainc.com> | 2014-03-17 20:15:53 -0400
committer | Kent Overstreet <kmo@daterainc.com> | 2014-03-18 15:23:35 -0400
commit | 0a63b66db566cffdf90182eb6e66fdd4d0479e63 (patch)
tree | d1284e5008b668befb8179de30aeb50d4e789177 /drivers
parent | 56b30770b27d54d68ad51eccc6d888282b568cee (diff)
bcache: Rework btree cache reserve handling
This changes the bucket allocation reserves to use _real_ reserves - separate
freelists - instead of watermarks, which if nothing else makes the current code
saner to reason about, and which will matter when we add support for multiple
btrees.
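
(For context, a rough sketch of the two schemes. These are simplified stand-ins,
not the real kernel structures - only the watermark[] array deleted below and
ca->free[RESERVE_BTREE] appear in this patch; the other reserve names are assumed.)

```c
/* Simplified stand-in for bcache's bucket freelist FIFO. */
struct bucket_fifo {
	size_t front, back, size;
	long *data;
};

/* Before: one shared freelist, with per-purpose watermarks deciding who
 * may keep allocating once it runs low (see the #if 0'd watermark setup
 * removed from alloc.c below). */
struct cache_before {
	struct bucket_fifo free;
	unsigned watermark[4];		/* PRIO, METADATA, MOVINGGC, NONE */
};

/* After: each purpose gets its own freelist, so btree node allocation
 * can only draw from buckets explicitly set aside for it. */
enum alloc_reserve {
	RESERVE_BTREE,			/* the only reserve this patch references */
	RESERVE_NR,			/* further reserves assumed, not shown in this diff */
};

struct cache_after {
	struct bucket_fifo free[RESERVE_NR];
};
```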
It also adds btree_check_reserve(), which checks (and locks) the reserves for
both bucket allocation and memory allocation for btree nodes. The old code more
or less assumed that, because it held the root locked (e.g. for btree node
splits), no other thread could be drawing from the same reserve. That was
technically fine for the memory allocation side - the btree node cache is
preallocated and doubles as the memory reserve - but multiple btrees will mean
that locking the root is no longer sufficient, and for the bucket allocation
reserve it was technically possible for the old code to deadlock.
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
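
Taken together, the reserve check described above reads as follows after this
patch - assembled from the btree.c hunks further down, with explanatory comments
added here (the code itself is as in the diff):

```c
static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
	struct task_struct *old;

	/* Only one thread at a time may cannibalize nodes from the btree
	 * cache; the lock is just a task pointer claimed with cmpxchg(). */
	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old && old != current) {
		if (op)
			prepare_to_wait(&c->btree_cache_wait, &op->wait,
					TASK_UNINTERRUPTIBLE);
		return -EINTR;	/* caller unwinds and retries from the root */
	}

	return 0;
}

static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
	struct cache_set *c = b->c;
	struct cache *ca;
	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;

	mutex_lock(&c->bucket_lock);

	/* Bucket reserve: every cache device must have enough free buckets
	 * in its RESERVE_BTREE freelist for the nodes a split could need. */
	for_each_cache(ca, c, i)
		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
			if (op)
				prepare_to_wait(&c->btree_cache_wait, &op->wait,
						TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->bucket_lock);
			return -EINTR;
		}

	mutex_unlock(&c->bucket_lock);

	/* Memory reserve: taking the cannibalize lock guarantees we can
	 * always reap a node from the btree cache if allocation fails. */
	return mca_cannibalize_lock(b->c, op);
}
```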
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/bcache/alloc.c | 23
-rw-r--r-- | drivers/md/bcache/bcache.h | 15
-rw-r--r-- | drivers/md/bcache/btree.c | 225
-rw-r--r-- | drivers/md/bcache/btree.h | 5
-rw-r--r-- | drivers/md/bcache/super.c | 13
-rw-r--r-- | drivers/md/bcache/sysfs.c | 3 |
6 files changed, 145 insertions, 139 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 5ba4eaea57f4..a59ef6147fc7 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -375,6 +375,7 @@ static int bch_allocator_thread(void *arg) | |||
375 | } | 375 | } |
376 | 376 | ||
377 | allocator_wait(ca, bch_allocator_push(ca, bucket)); | 377 | allocator_wait(ca, bch_allocator_push(ca, bucket)); |
378 | wake_up(&ca->set->btree_cache_wait); | ||
378 | wake_up(&ca->set->bucket_wait); | 379 | wake_up(&ca->set->bucket_wait); |
379 | } | 380 | } |
380 | 381 | ||
@@ -717,25 +718,3 @@ int bch_cache_allocator_start(struct cache *ca) | |||
717 | ca->alloc_thread = k; | 718 | ca->alloc_thread = k; |
718 | return 0; | 719 | return 0; |
719 | } | 720 | } |
720 | |||
721 | int bch_cache_allocator_init(struct cache *ca) | ||
722 | { | ||
723 | /* | ||
724 | * Reserve: | ||
725 | * Prio/gen writes first | ||
726 | * Then 8 for btree allocations | ||
727 | * Then half for the moving garbage collector | ||
728 | */ | ||
729 | #if 0 | ||
730 | ca->watermark[WATERMARK_PRIO] = 0; | ||
731 | |||
732 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | ||
733 | |||
734 | ca->watermark[WATERMARK_MOVINGGC] = 8 + | ||
735 | ca->watermark[WATERMARK_METADATA]; | ||
736 | |||
737 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | ||
738 | ca->watermark[WATERMARK_MOVINGGC]; | ||
739 | #endif | ||
740 | return 0; | ||
741 | } | ||
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 15d26236caf9..171cda89cb6b 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -562,19 +562,16 @@ struct cache_set { | |||
562 | struct list_head btree_cache_freed; | 562 | struct list_head btree_cache_freed; |
563 | 563 | ||
564 | /* Number of elements in btree_cache + btree_cache_freeable lists */ | 564 | /* Number of elements in btree_cache + btree_cache_freeable lists */ |
565 | unsigned bucket_cache_used; | 565 | unsigned btree_cache_used; |
566 | 566 | ||
567 | /* | 567 | /* |
568 | * If we need to allocate memory for a new btree node and that | 568 | * If we need to allocate memory for a new btree node and that |
569 | * allocation fails, we can cannibalize another node in the btree cache | 569 | * allocation fails, we can cannibalize another node in the btree cache |
570 | * to satisfy the allocation. However, only one thread can be doing this | 570 | * to satisfy the allocation - lock to guarantee only one thread does |
571 | * at a time, for obvious reasons - try_harder and try_wait are | 571 | * this at a time: |
572 | * basically a lock for this that we can wait on asynchronously. The | ||
573 | * btree_root() macro releases the lock when it returns. | ||
574 | */ | 572 | */ |
575 | struct task_struct *try_harder; | 573 | wait_queue_head_t btree_cache_wait; |
576 | wait_queue_head_t try_wait; | 574 | struct task_struct *btree_cache_alloc_lock; |
577 | uint64_t try_harder_start; | ||
578 | 575 | ||
579 | /* | 576 | /* |
580 | * When we free a btree node, we increment the gen of the bucket the | 577 | * When we free a btree node, we increment the gen of the bucket the |
@@ -669,7 +666,6 @@ struct cache_set { | |||
669 | struct time_stats btree_gc_time; | 666 | struct time_stats btree_gc_time; |
670 | struct time_stats btree_split_time; | 667 | struct time_stats btree_split_time; |
671 | struct time_stats btree_read_time; | 668 | struct time_stats btree_read_time; |
672 | struct time_stats try_harder_time; | ||
673 | 669 | ||
674 | atomic_long_t cache_read_races; | 670 | atomic_long_t cache_read_races; |
675 | atomic_long_t writeback_keys_done; | 671 | atomic_long_t writeback_keys_done; |
@@ -956,7 +952,6 @@ int bch_open_buckets_alloc(struct cache_set *); | |||
956 | void bch_open_buckets_free(struct cache_set *); | 952 | void bch_open_buckets_free(struct cache_set *); |
957 | 953 | ||
958 | int bch_cache_allocator_start(struct cache *ca); | 954 | int bch_cache_allocator_start(struct cache *ca); |
959 | int bch_cache_allocator_init(struct cache *ca); | ||
960 | 955 | ||
961 | void bch_debug_exit(void); | 956 | void bch_debug_exit(void); |
962 | int bch_debug_init(struct kobject *); | 957 | int bch_debug_init(struct kobject *); |
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index beb32551da78..be90596a9e2a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,7 +117,7 @@ | |||
117 | ({ \ | 117 | ({ \ |
118 | int _r, l = (b)->level - 1; \ | 118 | int _r, l = (b)->level - 1; \ |
119 | bool _w = l <= (op)->lock; \ | 119 | bool _w = l <= (op)->lock; \ |
120 | struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ | 120 | struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ |
121 | if (!IS_ERR(_child)) { \ | 121 | if (!IS_ERR(_child)) { \ |
122 | _child->parent = (b); \ | 122 | _child->parent = (b); \ |
123 | _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ | 123 | _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ |
@@ -146,17 +146,12 @@ | |||
146 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | 146 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ |
147 | } \ | 147 | } \ |
148 | rw_unlock(_w, _b); \ | 148 | rw_unlock(_w, _b); \ |
149 | bch_cannibalize_unlock(c); \ | ||
149 | if (_r == -EINTR) \ | 150 | if (_r == -EINTR) \ |
150 | schedule(); \ | 151 | schedule(); \ |
151 | bch_cannibalize_unlock(c); \ | ||
152 | if (_r == -ENOSPC) { \ | ||
153 | wait_event((c)->try_wait, \ | ||
154 | !(c)->try_harder); \ | ||
155 | _r = -EINTR; \ | ||
156 | } \ | ||
157 | } while (_r == -EINTR); \ | 152 | } while (_r == -EINTR); \ |
158 | \ | 153 | \ |
159 | finish_wait(&(c)->bucket_wait, &(op)->wait); \ | 154 | finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ |
160 | _r; \ | 155 | _r; \ |
161 | }) | 156 | }) |
162 | 157 | ||
@@ -563,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | |||
563 | #define mca_reserve(c) (((c->root && c->root->level) \ | 558 | #define mca_reserve(c) (((c->root && c->root->level) \ |
564 | ? c->root->level : 1) * 8 + 16) | 559 | ? c->root->level : 1) * 8 + 16) |
565 | #define mca_can_free(c) \ | 560 | #define mca_can_free(c) \ |
566 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) | 561 | max_t(int, 0, c->btree_cache_used - mca_reserve(c)) |
567 | 562 | ||
568 | static void mca_data_free(struct btree *b) | 563 | static void mca_data_free(struct btree *b) |
569 | { | 564 | { |
@@ -571,7 +566,7 @@ static void mca_data_free(struct btree *b) | |||
571 | 566 | ||
572 | bch_btree_keys_free(&b->keys); | 567 | bch_btree_keys_free(&b->keys); |
573 | 568 | ||
574 | b->c->bucket_cache_used--; | 569 | b->c->btree_cache_used--; |
575 | list_move(&b->list, &b->c->btree_cache_freed); | 570 | list_move(&b->list, &b->c->btree_cache_freed); |
576 | } | 571 | } |
577 | 572 | ||
@@ -596,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | |||
596 | ilog2(b->c->btree_pages), | 591 | ilog2(b->c->btree_pages), |
597 | btree_order(k)), | 592 | btree_order(k)), |
598 | gfp)) { | 593 | gfp)) { |
599 | b->c->bucket_cache_used++; | 594 | b->c->btree_cache_used++; |
600 | list_move(&b->list, &b->c->btree_cache); | 595 | list_move(&b->list, &b->c->btree_cache); |
601 | } else { | 596 | } else { |
602 | list_move(&b->list, &b->c->btree_cache_freed); | 597 | list_move(&b->list, &b->c->btree_cache_freed); |
@@ -675,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
675 | if (c->shrinker_disabled) | 670 | if (c->shrinker_disabled) |
676 | return SHRINK_STOP; | 671 | return SHRINK_STOP; |
677 | 672 | ||
678 | if (c->try_harder) | 673 | if (c->btree_cache_alloc_lock) |
679 | return SHRINK_STOP; | 674 | return SHRINK_STOP; |
680 | 675 | ||
681 | /* Return -1 if we can't do anything right now */ | 676 | /* Return -1 if we can't do anything right now */ |
@@ -707,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
707 | } | 702 | } |
708 | } | 703 | } |
709 | 704 | ||
710 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { | 705 | for (i = 0; (nr--) && i < c->btree_cache_used; i++) { |
711 | if (list_empty(&c->btree_cache)) | 706 | if (list_empty(&c->btree_cache)) |
712 | goto out; | 707 | goto out; |
713 | 708 | ||
@@ -736,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, | |||
736 | if (c->shrinker_disabled) | 731 | if (c->shrinker_disabled) |
737 | return 0; | 732 | return 0; |
738 | 733 | ||
739 | if (c->try_harder) | 734 | if (c->btree_cache_alloc_lock) |
740 | return 0; | 735 | return 0; |
741 | 736 | ||
742 | return mca_can_free(c) * c->btree_pages; | 737 | return mca_can_free(c) * c->btree_pages; |
@@ -840,17 +835,30 @@ out: | |||
840 | return b; | 835 | return b; |
841 | } | 836 | } |
842 | 837 | ||
843 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) | 838 | static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) |
839 | { | ||
840 | struct task_struct *old; | ||
841 | |||
842 | old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); | ||
843 | if (old && old != current) { | ||
844 | if (op) | ||
845 | prepare_to_wait(&c->btree_cache_wait, &op->wait, | ||
846 | TASK_UNINTERRUPTIBLE); | ||
847 | return -EINTR; | ||
848 | } | ||
849 | |||
850 | return 0; | ||
851 | } | ||
852 | |||
853 | static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, | ||
854 | struct bkey *k) | ||
844 | { | 855 | { |
845 | struct btree *b; | 856 | struct btree *b; |
846 | 857 | ||
847 | trace_bcache_btree_cache_cannibalize(c); | 858 | trace_bcache_btree_cache_cannibalize(c); |
848 | 859 | ||
849 | if (!c->try_harder) { | 860 | if (mca_cannibalize_lock(c, op)) |
850 | c->try_harder = current; | 861 | return ERR_PTR(-EINTR); |
851 | c->try_harder_start = local_clock(); | ||
852 | } else if (c->try_harder != current) | ||
853 | return ERR_PTR(-ENOSPC); | ||
854 | 862 | ||
855 | list_for_each_entry_reverse(b, &c->btree_cache, list) | 863 | list_for_each_entry_reverse(b, &c->btree_cache, list) |
856 | if (!mca_reap(b, btree_order(k), false)) | 864 | if (!mca_reap(b, btree_order(k), false)) |
@@ -860,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) | |||
860 | if (!mca_reap(b, btree_order(k), true)) | 868 | if (!mca_reap(b, btree_order(k), true)) |
861 | return b; | 869 | return b; |
862 | 870 | ||
871 | WARN(1, "btree cache cannibalize failed\n"); | ||
863 | return ERR_PTR(-ENOMEM); | 872 | return ERR_PTR(-ENOMEM); |
864 | } | 873 | } |
865 | 874 | ||
@@ -871,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) | |||
871 | */ | 880 | */ |
872 | static void bch_cannibalize_unlock(struct cache_set *c) | 881 | static void bch_cannibalize_unlock(struct cache_set *c) |
873 | { | 882 | { |
874 | if (c->try_harder == current) { | 883 | if (c->btree_cache_alloc_lock == current) { |
875 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | 884 | c->btree_cache_alloc_lock = NULL; |
876 | c->try_harder = NULL; | 885 | wake_up(&c->btree_cache_wait); |
877 | wake_up(&c->try_wait); | ||
878 | } | 886 | } |
879 | } | 887 | } |
880 | 888 | ||
881 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) | 889 | static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, |
890 | struct bkey *k, int level) | ||
882 | { | 891 | { |
883 | struct btree *b; | 892 | struct btree *b; |
884 | 893 | ||
@@ -941,7 +950,7 @@ err: | |||
941 | if (b) | 950 | if (b) |
942 | rw_unlock(true, b); | 951 | rw_unlock(true, b); |
943 | 952 | ||
944 | b = mca_cannibalize(c, k); | 953 | b = mca_cannibalize(c, op, k); |
945 | if (!IS_ERR(b)) | 954 | if (!IS_ERR(b)) |
946 | goto out; | 955 | goto out; |
947 | 956 | ||
@@ -957,8 +966,8 @@ err: | |||
957 | * The btree node will have either a read or a write lock held, depending on | 966 | * The btree node will have either a read or a write lock held, depending on |
958 | * level and op->lock. | 967 | * level and op->lock. |
959 | */ | 968 | */ |
960 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | 969 | struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, |
961 | int level, bool write) | 970 | struct bkey *k, int level, bool write) |
962 | { | 971 | { |
963 | int i = 0; | 972 | int i = 0; |
964 | struct btree *b; | 973 | struct btree *b; |
@@ -972,7 +981,7 @@ retry: | |||
972 | return ERR_PTR(-EAGAIN); | 981 | return ERR_PTR(-EAGAIN); |
973 | 982 | ||
974 | mutex_lock(&c->bucket_lock); | 983 | mutex_lock(&c->bucket_lock); |
975 | b = mca_alloc(c, k, level); | 984 | b = mca_alloc(c, op, k, level); |
976 | mutex_unlock(&c->bucket_lock); | 985 | mutex_unlock(&c->bucket_lock); |
977 | 986 | ||
978 | if (!b) | 987 | if (!b) |
@@ -1018,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
1018 | struct btree *b; | 1027 | struct btree *b; |
1019 | 1028 | ||
1020 | mutex_lock(&c->bucket_lock); | 1029 | mutex_lock(&c->bucket_lock); |
1021 | b = mca_alloc(c, k, level); | 1030 | b = mca_alloc(c, NULL, k, level); |
1022 | mutex_unlock(&c->bucket_lock); | 1031 | mutex_unlock(&c->bucket_lock); |
1023 | 1032 | ||
1024 | if (!IS_ERR_OR_NULL(b)) { | 1033 | if (!IS_ERR_OR_NULL(b)) { |
@@ -1051,20 +1060,21 @@ static void btree_node_free(struct btree *b) | |||
1051 | mutex_unlock(&b->c->bucket_lock); | 1060 | mutex_unlock(&b->c->bucket_lock); |
1052 | } | 1061 | } |
1053 | 1062 | ||
1054 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) | 1063 | struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, |
1064 | int level) | ||
1055 | { | 1065 | { |
1056 | BKEY_PADDED(key) k; | 1066 | BKEY_PADDED(key) k; |
1057 | struct btree *b = ERR_PTR(-EAGAIN); | 1067 | struct btree *b = ERR_PTR(-EAGAIN); |
1058 | 1068 | ||
1059 | mutex_lock(&c->bucket_lock); | 1069 | mutex_lock(&c->bucket_lock); |
1060 | retry: | 1070 | retry: |
1061 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) | 1071 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) |
1062 | goto err; | 1072 | goto err; |
1063 | 1073 | ||
1064 | bkey_put(c, &k.key); | 1074 | bkey_put(c, &k.key); |
1065 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | 1075 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); |
1066 | 1076 | ||
1067 | b = mca_alloc(c, &k.key, level); | 1077 | b = mca_alloc(c, op, &k.key, level); |
1068 | if (IS_ERR(b)) | 1078 | if (IS_ERR(b)) |
1069 | goto err_free; | 1079 | goto err_free; |
1070 | 1080 | ||
@@ -1090,9 +1100,10 @@ err: | |||
1090 | return b; | 1100 | return b; |
1091 | } | 1101 | } |
1092 | 1102 | ||
1093 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) | 1103 | static struct btree *btree_node_alloc_replacement(struct btree *b, |
1104 | struct btree_op *op) | ||
1094 | { | 1105 | { |
1095 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); | 1106 | struct btree *n = bch_btree_node_alloc(b->c, op, b->level); |
1096 | if (!IS_ERR_OR_NULL(n)) { | 1107 | if (!IS_ERR_OR_NULL(n)) { |
1097 | mutex_lock(&n->write_lock); | 1108 | mutex_lock(&n->write_lock); |
1098 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); | 1109 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); |
@@ -1126,22 +1137,22 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op) | |||
1126 | { | 1137 | { |
1127 | struct cache_set *c = b->c; | 1138 | struct cache_set *c = b->c; |
1128 | struct cache *ca; | 1139 | struct cache *ca; |
1129 | unsigned i, reserve = c->root->level * 2 + 1; | 1140 | unsigned i, reserve = (c->root->level - b->level) * 2 + 1; |
1130 | int ret = 0; | ||
1131 | 1141 | ||
1132 | mutex_lock(&c->bucket_lock); | 1142 | mutex_lock(&c->bucket_lock); |
1133 | 1143 | ||
1134 | for_each_cache(ca, c, i) | 1144 | for_each_cache(ca, c, i) |
1135 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { | 1145 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { |
1136 | if (op) | 1146 | if (op) |
1137 | prepare_to_wait(&c->bucket_wait, &op->wait, | 1147 | prepare_to_wait(&c->btree_cache_wait, &op->wait, |
1138 | TASK_UNINTERRUPTIBLE); | 1148 | TASK_UNINTERRUPTIBLE); |
1139 | ret = -EINTR; | 1149 | mutex_unlock(&c->bucket_lock); |
1140 | break; | 1150 | return -EINTR; |
1141 | } | 1151 | } |
1142 | 1152 | ||
1143 | mutex_unlock(&c->bucket_lock); | 1153 | mutex_unlock(&c->bucket_lock); |
1144 | return ret; | 1154 | |
1155 | return mca_cannibalize_lock(b->c, op); | ||
1145 | } | 1156 | } |
1146 | 1157 | ||
1147 | /* Garbage collection */ | 1158 | /* Garbage collection */ |
@@ -1273,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *, | |||
1273 | struct keylist *, atomic_t *, struct bkey *); | 1284 | struct keylist *, atomic_t *, struct bkey *); |
1274 | 1285 | ||
1275 | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | 1286 | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, |
1276 | struct keylist *keylist, struct gc_stat *gc, | 1287 | struct gc_stat *gc, struct gc_merge_info *r) |
1277 | struct gc_merge_info *r) | ||
1278 | { | 1288 | { |
1279 | unsigned i, nodes = 0, keys = 0, blocks; | 1289 | unsigned i, nodes = 0, keys = 0, blocks; |
1280 | struct btree *new_nodes[GC_MERGE_NODES]; | 1290 | struct btree *new_nodes[GC_MERGE_NODES]; |
1291 | struct keylist keylist; | ||
1281 | struct closure cl; | 1292 | struct closure cl; |
1282 | struct bkey *k; | 1293 | struct bkey *k; |
1283 | 1294 | ||
1295 | bch_keylist_init(&keylist); | ||
1296 | |||
1297 | if (btree_check_reserve(b, NULL)) | ||
1298 | return 0; | ||
1299 | |||
1284 | memset(new_nodes, 0, sizeof(new_nodes)); | 1300 | memset(new_nodes, 0, sizeof(new_nodes)); |
1285 | closure_init_stack(&cl); | 1301 | closure_init_stack(&cl); |
1286 | 1302 | ||
@@ -1295,11 +1311,20 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1295 | return 0; | 1311 | return 0; |
1296 | 1312 | ||
1297 | for (i = 0; i < nodes; i++) { | 1313 | for (i = 0; i < nodes; i++) { |
1298 | new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); | 1314 | new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); |
1299 | if (IS_ERR_OR_NULL(new_nodes[i])) | 1315 | if (IS_ERR_OR_NULL(new_nodes[i])) |
1300 | goto out_nocoalesce; | 1316 | goto out_nocoalesce; |
1301 | } | 1317 | } |
1302 | 1318 | ||
1319 | /* | ||
1320 | * We have to check the reserve here, after we've allocated our new | ||
1321 | * nodes, to make sure the insert below will succeed - we also check | ||
1322 | * before as an optimization to potentially avoid a bunch of expensive | ||
1323 | * allocs/sorts | ||
1324 | */ | ||
1325 | if (btree_check_reserve(b, NULL)) | ||
1326 | goto out_nocoalesce; | ||
1327 | |||
1303 | for (i = 0; i < nodes; i++) | 1328 | for (i = 0; i < nodes; i++) |
1304 | mutex_lock(&new_nodes[i]->write_lock); | 1329 | mutex_lock(&new_nodes[i]->write_lock); |
1305 | 1330 | ||
@@ -1361,12 +1386,12 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1361 | 1386 | ||
1362 | n2->keys -= keys; | 1387 | n2->keys -= keys; |
1363 | 1388 | ||
1364 | if (__bch_keylist_realloc(keylist, | 1389 | if (__bch_keylist_realloc(&keylist, |
1365 | bkey_u64s(&new_nodes[i]->key))) | 1390 | bkey_u64s(&new_nodes[i]->key))) |
1366 | goto out_nocoalesce; | 1391 | goto out_nocoalesce; |
1367 | 1392 | ||
1368 | bch_btree_node_write(new_nodes[i], &cl); | 1393 | bch_btree_node_write(new_nodes[i], &cl); |
1369 | bch_keylist_add(keylist, &new_nodes[i]->key); | 1394 | bch_keylist_add(&keylist, &new_nodes[i]->key); |
1370 | } | 1395 | } |
1371 | 1396 | ||
1372 | for (i = 0; i < nodes; i++) | 1397 | for (i = 0; i < nodes; i++) |
@@ -1380,15 +1405,15 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1380 | rw_unlock(true, new_nodes[0]); | 1405 | rw_unlock(true, new_nodes[0]); |
1381 | 1406 | ||
1382 | for (i = 0; i < nodes; i++) { | 1407 | for (i = 0; i < nodes; i++) { |
1383 | if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) | 1408 | if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) |
1384 | goto out_nocoalesce; | 1409 | goto out_nocoalesce; |
1385 | 1410 | ||
1386 | make_btree_freeing_key(r[i].b, keylist->top); | 1411 | make_btree_freeing_key(r[i].b, keylist.top); |
1387 | bch_keylist_push(keylist); | 1412 | bch_keylist_push(&keylist); |
1388 | } | 1413 | } |
1389 | 1414 | ||
1390 | bch_btree_insert_node(b, op, keylist, NULL, NULL); | 1415 | bch_btree_insert_node(b, op, &keylist, NULL, NULL); |
1391 | BUG_ON(!bch_keylist_empty(keylist)); | 1416 | BUG_ON(!bch_keylist_empty(&keylist)); |
1392 | 1417 | ||
1393 | for (i = 0; i < nodes; i++) { | 1418 | for (i = 0; i < nodes; i++) { |
1394 | btree_node_free(r[i].b); | 1419 | btree_node_free(r[i].b); |
@@ -1403,13 +1428,16 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1403 | trace_bcache_btree_gc_coalesce(nodes); | 1428 | trace_bcache_btree_gc_coalesce(nodes); |
1404 | gc->nodes--; | 1429 | gc->nodes--; |
1405 | 1430 | ||
1431 | bch_keylist_free(&keylist); | ||
1432 | |||
1406 | /* Invalidated our iterator */ | 1433 | /* Invalidated our iterator */ |
1407 | return -EINTR; | 1434 | return -EINTR; |
1408 | 1435 | ||
1409 | out_nocoalesce: | 1436 | out_nocoalesce: |
1410 | closure_sync(&cl); | 1437 | closure_sync(&cl); |
1438 | bch_keylist_free(&keylist); | ||
1411 | 1439 | ||
1412 | while ((k = bch_keylist_pop(keylist))) | 1440 | while ((k = bch_keylist_pop(&keylist))) |
1413 | if (!bkey_cmp(k, &ZERO_KEY)) | 1441 | if (!bkey_cmp(k, &ZERO_KEY)) |
1414 | atomic_dec(&b->c->prio_blocked); | 1442 | atomic_dec(&b->c->prio_blocked); |
1415 | 1443 | ||
@@ -1421,6 +1449,42 @@ out_nocoalesce: | |||
1421 | return 0; | 1449 | return 0; |
1422 | } | 1450 | } |
1423 | 1451 | ||
1452 | static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, | ||
1453 | struct btree *replace) | ||
1454 | { | ||
1455 | struct keylist keys; | ||
1456 | struct btree *n; | ||
1457 | |||
1458 | if (btree_check_reserve(b, NULL)) | ||
1459 | return 0; | ||
1460 | |||
1461 | n = btree_node_alloc_replacement(replace, NULL); | ||
1462 | |||
1463 | /* recheck reserve after allocating replacement node */ | ||
1464 | if (btree_check_reserve(b, NULL)) { | ||
1465 | btree_node_free(n); | ||
1466 | rw_unlock(true, n); | ||
1467 | return 0; | ||
1468 | } | ||
1469 | |||
1470 | bch_btree_node_write_sync(n); | ||
1471 | |||
1472 | bch_keylist_init(&keys); | ||
1473 | bch_keylist_add(&keys, &n->key); | ||
1474 | |||
1475 | make_btree_freeing_key(replace, keys.top); | ||
1476 | bch_keylist_push(&keys); | ||
1477 | |||
1478 | bch_btree_insert_node(b, op, &keys, NULL, NULL); | ||
1479 | BUG_ON(!bch_keylist_empty(&keys)); | ||
1480 | |||
1481 | btree_node_free(replace); | ||
1482 | rw_unlock(true, n); | ||
1483 | |||
1484 | /* Invalidated our iterator */ | ||
1485 | return -EINTR; | ||
1486 | } | ||
1487 | |||
1424 | static unsigned btree_gc_count_keys(struct btree *b) | 1488 | static unsigned btree_gc_count_keys(struct btree *b) |
1425 | { | 1489 | { |
1426 | struct bkey *k; | 1490 | struct bkey *k; |
@@ -1438,14 +1502,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1438 | { | 1502 | { |
1439 | int ret = 0; | 1503 | int ret = 0; |
1440 | bool should_rewrite; | 1504 | bool should_rewrite; |
1441 | struct btree *n; | ||
1442 | struct bkey *k; | 1505 | struct bkey *k; |
1443 | struct keylist keys; | ||
1444 | struct btree_iter iter; | 1506 | struct btree_iter iter; |
1445 | struct gc_merge_info r[GC_MERGE_NODES]; | 1507 | struct gc_merge_info r[GC_MERGE_NODES]; |
1446 | struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; | 1508 | struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; |
1447 | 1509 | ||
1448 | bch_keylist_init(&keys); | ||
1449 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); | 1510 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); |
1450 | 1511 | ||
1451 | for (i = r; i < r + ARRAY_SIZE(r); i++) | 1512 | for (i = r; i < r + ARRAY_SIZE(r); i++) |
@@ -1454,7 +1515,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1454 | while (1) { | 1515 | while (1) { |
1455 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); | 1516 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); |
1456 | if (k) { | 1517 | if (k) { |
1457 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); | 1518 | r->b = bch_btree_node_get(b->c, op, k, b->level - 1, |
1519 | true); | ||
1458 | if (IS_ERR(r->b)) { | 1520 | if (IS_ERR(r->b)) { |
1459 | ret = PTR_ERR(r->b); | 1521 | ret = PTR_ERR(r->b); |
1460 | break; | 1522 | break; |
@@ -1462,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1462 | 1524 | ||
1463 | r->keys = btree_gc_count_keys(r->b); | 1525 | r->keys = btree_gc_count_keys(r->b); |
1464 | 1526 | ||
1465 | ret = btree_gc_coalesce(b, op, &keys, gc, r); | 1527 | ret = btree_gc_coalesce(b, op, gc, r); |
1466 | if (ret) | 1528 | if (ret) |
1467 | break; | 1529 | break; |
1468 | } | 1530 | } |
@@ -1472,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1472 | 1534 | ||
1473 | if (!IS_ERR(last->b)) { | 1535 | if (!IS_ERR(last->b)) { |
1474 | should_rewrite = btree_gc_mark_node(last->b, gc); | 1536 | should_rewrite = btree_gc_mark_node(last->b, gc); |
1475 | if (should_rewrite && | 1537 | if (should_rewrite) { |
1476 | !btree_check_reserve(b, NULL)) { | 1538 | ret = btree_gc_rewrite_node(b, op, last->b); |
1477 | n = btree_node_alloc_replacement(last->b, | 1539 | if (ret) |
1478 | false); | ||
1479 | |||
1480 | if (!IS_ERR_OR_NULL(n)) { | ||
1481 | bch_btree_node_write_sync(n); | ||
1482 | |||
1483 | bch_keylist_add(&keys, &n->key); | ||
1484 | |||
1485 | make_btree_freeing_key(last->b, | ||
1486 | keys.top); | ||
1487 | bch_keylist_push(&keys); | ||
1488 | |||
1489 | bch_btree_insert_node(b, op, &keys, | ||
1490 | NULL, NULL); | ||
1491 | BUG_ON(!bch_keylist_empty(&keys)); | ||
1492 | |||
1493 | btree_node_free(last->b); | ||
1494 | rw_unlock(true, last->b); | ||
1495 | last->b = n; | ||
1496 | |||
1497 | /* Invalidated our iterator */ | ||
1498 | ret = -EINTR; | ||
1499 | break; | 1540 | break; |
1500 | } | ||
1501 | } | 1541 | } |
1502 | 1542 | ||
1503 | if (last->b->level) { | 1543 | if (last->b->level) { |
@@ -1537,8 +1577,6 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1537 | rw_unlock(true, i->b); | 1577 | rw_unlock(true, i->b); |
1538 | } | 1578 | } |
1539 | 1579 | ||
1540 | bch_keylist_free(&keys); | ||
1541 | |||
1542 | return ret; | 1580 | return ret; |
1543 | } | 1581 | } |
1544 | 1582 | ||
@@ -1551,7 +1589,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |||
1551 | 1589 | ||
1552 | should_rewrite = btree_gc_mark_node(b, gc); | 1590 | should_rewrite = btree_gc_mark_node(b, gc); |
1553 | if (should_rewrite) { | 1591 | if (should_rewrite) { |
1554 | n = btree_node_alloc_replacement(b, false); | 1592 | n = btree_node_alloc_replacement(b, NULL); |
1555 | 1593 | ||
1556 | if (!IS_ERR_OR_NULL(n)) { | 1594 | if (!IS_ERR_OR_NULL(n)) { |
1557 | bch_btree_node_write_sync(n); | 1595 | bch_btree_node_write_sync(n); |
@@ -1887,11 +1925,14 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
1887 | closure_init_stack(&cl); | 1925 | closure_init_stack(&cl); |
1888 | bch_keylist_init(&parent_keys); | 1926 | bch_keylist_init(&parent_keys); |
1889 | 1927 | ||
1890 | if (!b->level && | 1928 | if (btree_check_reserve(b, op)) { |
1891 | btree_check_reserve(b, op)) | 1929 | if (!b->level) |
1892 | return -EINTR; | 1930 | return -EINTR; |
1931 | else | ||
1932 | WARN(1, "insufficient reserve for split\n"); | ||
1933 | } | ||
1893 | 1934 | ||
1894 | n1 = btree_node_alloc_replacement(b, true); | 1935 | n1 = btree_node_alloc_replacement(b, op); |
1895 | if (IS_ERR(n1)) | 1936 | if (IS_ERR(n1)) |
1896 | goto err; | 1937 | goto err; |
1897 | 1938 | ||
@@ -1903,12 +1944,12 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
1903 | 1944 | ||
1904 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); | 1945 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); |
1905 | 1946 | ||
1906 | n2 = bch_btree_node_alloc(b->c, b->level, true); | 1947 | n2 = bch_btree_node_alloc(b->c, op, b->level); |
1907 | if (IS_ERR(n2)) | 1948 | if (IS_ERR(n2)) |
1908 | goto err_free1; | 1949 | goto err_free1; |
1909 | 1950 | ||
1910 | if (!b->parent) { | 1951 | if (!b->parent) { |
1911 | n3 = bch_btree_node_alloc(b->c, b->level + 1, true); | 1952 | n3 = bch_btree_node_alloc(b->c, op, b->level + 1); |
1912 | if (IS_ERR(n3)) | 1953 | if (IS_ERR(n3)) |
1913 | goto err_free2; | 1954 | goto err_free2; |
1914 | } | 1955 | } |
@@ -1995,7 +2036,7 @@ err_free1: | |||
1995 | btree_node_free(n1); | 2036 | btree_node_free(n1); |
1996 | rw_unlock(true, n1); | 2037 | rw_unlock(true, n1); |
1997 | err: | 2038 | err: |
1998 | WARN(1, "bcache: btree split failed"); | 2039 | WARN(1, "bcache: btree split failed (level %u)", b->level); |
1999 | 2040 | ||
2000 | if (n3 == ERR_PTR(-EAGAIN) || | 2041 | if (n3 == ERR_PTR(-EAGAIN) || |
2001 | n2 == ERR_PTR(-EAGAIN) || | 2042 | n2 == ERR_PTR(-EAGAIN) || |
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index acebf26809cc..3ce371fa7f98 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -242,8 +242,9 @@ void __bch_btree_node_write(struct btree *, struct closure *); | |||
242 | void bch_btree_node_write(struct btree *, struct closure *); | 242 | void bch_btree_node_write(struct btree *, struct closure *); |
243 | 243 | ||
244 | void bch_btree_set_root(struct btree *); | 244 | void bch_btree_set_root(struct btree *); |
245 | struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); | 245 | struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); |
246 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); | 246 | struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, |
247 | struct bkey *, int, bool); | ||
247 | 248 | ||
248 | int bch_btree_insert_check_key(struct btree *, struct btree_op *, | 249 | int bch_btree_insert_check_key(struct btree *, struct btree_op *, |
249 | struct bkey *); | 250 | struct bkey *); |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 307fe378ea43..2d4a56219ec7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1495,14 +1495,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1495 | 1495 | ||
1496 | sema_init(&c->sb_write_mutex, 1); | 1496 | sema_init(&c->sb_write_mutex, 1); |
1497 | mutex_init(&c->bucket_lock); | 1497 | mutex_init(&c->bucket_lock); |
1498 | init_waitqueue_head(&c->try_wait); | 1498 | init_waitqueue_head(&c->btree_cache_wait); |
1499 | init_waitqueue_head(&c->bucket_wait); | 1499 | init_waitqueue_head(&c->bucket_wait); |
1500 | sema_init(&c->uuid_write_mutex, 1); | 1500 | sema_init(&c->uuid_write_mutex, 1); |
1501 | 1501 | ||
1502 | spin_lock_init(&c->btree_gc_time.lock); | 1502 | spin_lock_init(&c->btree_gc_time.lock); |
1503 | spin_lock_init(&c->btree_split_time.lock); | 1503 | spin_lock_init(&c->btree_split_time.lock); |
1504 | spin_lock_init(&c->btree_read_time.lock); | 1504 | spin_lock_init(&c->btree_read_time.lock); |
1505 | spin_lock_init(&c->try_harder_time.lock); | ||
1506 | 1505 | ||
1507 | bch_moving_init_cache_set(c); | 1506 | bch_moving_init_cache_set(c); |
1508 | 1507 | ||
@@ -1591,7 +1590,7 @@ static void run_cache_set(struct cache_set *c) | |||
1591 | goto err; | 1590 | goto err; |
1592 | 1591 | ||
1593 | err = "error reading btree root"; | 1592 | err = "error reading btree root"; |
1594 | c->root = bch_btree_node_get(c, k, j->btree_level, true); | 1593 | c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); |
1595 | if (IS_ERR_OR_NULL(c->root)) | 1594 | if (IS_ERR_OR_NULL(c->root)) |
1596 | goto err; | 1595 | goto err; |
1597 | 1596 | ||
@@ -1666,7 +1665,7 @@ static void run_cache_set(struct cache_set *c) | |||
1666 | goto err; | 1665 | goto err; |
1667 | 1666 | ||
1668 | err = "cannot allocate new btree root"; | 1667 | err = "cannot allocate new btree root"; |
1669 | c->root = bch_btree_node_alloc(c, 0, true); | 1668 | c->root = bch_btree_node_alloc(c, NULL, 0); |
1670 | if (IS_ERR_OR_NULL(c->root)) | 1669 | if (IS_ERR_OR_NULL(c->root)) |
1671 | goto err; | 1670 | goto err; |
1672 | 1671 | ||
@@ -1847,13 +1846,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1847 | for_each_bucket(b, ca) | 1846 | for_each_bucket(b, ca) |
1848 | atomic_set(&b->pin, 0); | 1847 | atomic_set(&b->pin, 0); |
1849 | 1848 | ||
1850 | if (bch_cache_allocator_init(ca)) | ||
1851 | goto err; | ||
1852 | |||
1853 | return 0; | 1849 | return 0; |
1854 | err: | ||
1855 | kobject_put(&ca->kobj); | ||
1856 | return -ENOMEM; | ||
1857 | } | 1850 | } |
1858 | 1851 | ||
1859 | static void register_cache(struct cache_sb *sb, struct page *sb_page, | 1852 | static void register_cache(struct cache_sb *sb, struct page *sb_page, |
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 662b9484ed5e..89aaa2ef38f9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms); | |||
54 | sysfs_time_stats_attribute(btree_split, sec, us); | 54 | sysfs_time_stats_attribute(btree_split, sec, us); |
55 | sysfs_time_stats_attribute(btree_sort, ms, us); | 55 | sysfs_time_stats_attribute(btree_sort, ms, us); |
56 | sysfs_time_stats_attribute(btree_read, ms, us); | 56 | sysfs_time_stats_attribute(btree_read, ms, us); |
57 | sysfs_time_stats_attribute(try_harder, ms, us); | ||
58 | 57 | ||
59 | read_attribute(btree_nodes); | 58 | read_attribute(btree_nodes); |
60 | read_attribute(btree_used_percent); | 59 | read_attribute(btree_used_percent); |
@@ -534,7 +533,6 @@ lock_root: | |||
534 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); | 533 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); |
535 | sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); | 534 | sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); |
536 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); | 535 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); |
537 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); | ||
538 | 536 | ||
539 | sysfs_print(btree_used_percent, btree_used(c)); | 537 | sysfs_print(btree_used_percent, btree_used(c)); |
540 | sysfs_print(btree_nodes, c->gc_stats.nodes); | 538 | sysfs_print(btree_nodes, c->gc_stats.nodes); |
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = { | |||
709 | sysfs_time_stats_attribute_list(btree_split, sec, us) | 707 | sysfs_time_stats_attribute_list(btree_split, sec, us) |
710 | sysfs_time_stats_attribute_list(btree_sort, ms, us) | 708 | sysfs_time_stats_attribute_list(btree_sort, ms, us) |
711 | sysfs_time_stats_attribute_list(btree_read, ms, us) | 709 | sysfs_time_stats_attribute_list(btree_read, ms, us) |
712 | sysfs_time_stats_attribute_list(try_harder, ms, us) | ||
713 | 710 | ||
714 | &sysfs_btree_nodes, | 711 | &sysfs_btree_nodes, |
715 | &sysfs_btree_used_percent, | 712 | &sysfs_btree_used_percent, |