author     Kent Overstreet <kmo@daterainc.com>    2014-03-17 20:15:53 -0400
committer  Kent Overstreet <kmo@daterainc.com>    2014-03-18 15:23:35 -0400
commit     0a63b66db566cffdf90182eb6e66fdd4d0479e63 (patch)
tree       d1284e5008b668befb8179de30aeb50d4e789177 /drivers/md
parent     56b30770b27d54d68ad51eccc6d888282b568cee (diff)
bcache: Rework btree cache reserve handling
This changes the bucket allocation reserves to use _real_ reserves - separate freelists - instead of watermarks, which if nothing else makes the current code saner to reason about and is going to be important in the future when we add support for multiple btrees.

It also adds btree_check_reserve(), which checks (and locks) the reserves for both bucket allocation and memory allocation for btree nodes; the old code just kinda sorta assumed that since (e.g. for btree node splits) it had the root locked and that meant no other threads could try to make use of the same reserve; this technically should have been ok for memory allocation (we should always have a reserve for memory allocation (the btree node cache is used as a reserve and we preallocate it)), but multiple btrees will mean that locking the root won't be sufficient anymore, and for the bucket allocation reserve it was technically possible for the old code to deadlock.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
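[Editor's note] The central mechanism the patch introduces is a cmpxchg()-based single-owner lock on btree cache cannibalization, replacing the old try_harder/try_wait pair (see mca_cannibalize_lock() in the btree.c hunk below). As a minimal sketch of that pattern - a userspace model using C11 atomics, with illustrative identifiers and integer thread ids rather than the kernel's task_struct/wait-queue API:

/*
 * Userspace sketch of the "btree cache cannibalize lock": only one thread may
 * own the lock; a second thread is told to back off and retry, and re-taking
 * the lock while already owning it succeeds (as in mca_cannibalize_lock()).
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* 0 means "no owner"; otherwise the id of the thread holding the lock. */
static _Atomic uintptr_t cannibalize_owner;

static int cannibalize_trylock(uintptr_t self)
{
	uintptr_t expected = 0;

	if (atomic_compare_exchange_strong(&cannibalize_owner, &expected, self))
		return 0;	/* acquired */
	if (expected == self)
		return 0;	/* we already hold it */
	return -1;		/* held by another thread: caller backs off and retries */
}

static void cannibalize_unlock(uintptr_t self)
{
	uintptr_t expected = self;

	/* Only the owner clears the lock; the kernel code also wakes waiters here. */
	atomic_compare_exchange_strong(&cannibalize_owner, &expected, 0);
}

int main(void)
{
	printf("thread 1 trylock: %d\n", cannibalize_trylock(1));	/* 0  */
	printf("thread 2 trylock: %d\n", cannibalize_trylock(2));	/* -1 */
	cannibalize_unlock(1);
	printf("thread 2 trylock: %d\n", cannibalize_trylock(2));	/* 0  */
	return 0;
}

In the patch itself the owner is a task_struct pointer, blocked callers are parked on btree_cache_wait via prepare_to_wait(), and the lock is dropped by bch_cannibalize_unlock() in the btree_root() retry loop.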
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c    23
-rw-r--r--  drivers/md/bcache/bcache.h   15
-rw-r--r--  drivers/md/bcache/btree.c   225
-rw-r--r--  drivers/md/bcache/btree.h     5
-rw-r--r--  drivers/md/bcache/super.c    13
-rw-r--r--  drivers/md/bcache/sysfs.c     3
6 files changed, 145 insertions, 139 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 5ba4eaea57f4..a59ef6147fc7 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -375,6 +375,7 @@ static int bch_allocator_thread(void *arg)
 			}
 
 			allocator_wait(ca, bch_allocator_push(ca, bucket));
+			wake_up(&ca->set->btree_cache_wait);
 			wake_up(&ca->set->bucket_wait);
 		}
 
@@ -717,25 +718,3 @@ int bch_cache_allocator_start(struct cache *ca)
 	ca->alloc_thread = k;
 	return 0;
 }
-
-int bch_cache_allocator_init(struct cache *ca)
-{
-	/*
-	 * Reserve:
-	 * Prio/gen writes first
-	 * Then 8 for btree allocations
-	 * Then half for the moving garbage collector
-	 */
-#if 0
-	ca->watermark[WATERMARK_PRIO] = 0;
-
-	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
-	ca->watermark[WATERMARK_MOVINGGC] = 8 +
-		ca->watermark[WATERMARK_METADATA];
-
-	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
-		ca->watermark[WATERMARK_MOVINGGC];
-#endif
-	return 0;
-}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 15d26236caf9..171cda89cb6b 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -562,19 +562,16 @@ struct cache_set {
 	struct list_head	btree_cache_freed;
 
 	/* Number of elements in btree_cache + btree_cache_freeable lists */
-	unsigned		bucket_cache_used;
+	unsigned		btree_cache_used;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
 	 * allocation fails, we can cannibalize another node in the btree cache
-	 * to satisfy the allocation. However, only one thread can be doing this
-	 * at a time, for obvious reasons - try_harder and try_wait are
-	 * basically a lock for this that we can wait on asynchronously. The
-	 * btree_root() macro releases the lock when it returns.
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
 	 */
-	struct task_struct	*try_harder;
-	wait_queue_head_t	try_wait;
-	uint64_t		try_harder_start;
+	wait_queue_head_t	btree_cache_wait;
+	struct task_struct	*btree_cache_alloc_lock;
 
 	/*
 	 * When we free a btree node, we increment the gen of the bucket the
@@ -669,7 +666,6 @@ struct cache_set {
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
 	struct time_stats	btree_read_time;
-	struct time_stats	try_harder_time;
 
 	atomic_long_t		cache_read_races;
 	atomic_long_t		writeback_keys_done;
@@ -956,7 +952,6 @@ int bch_open_buckets_alloc(struct cache_set *);
 void bch_open_buckets_free(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
 int bch_debug_init(struct kobject *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index beb32551da78..be90596a9e2a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,7 +117,7 @@
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
 	if (!IS_ERR(_child)) {						\
 		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
@@ -146,17 +146,12 @@
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
 		if (_r == -EINTR)					\
 			schedule();					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -ENOSPC) {					\
-			wait_event((c)->try_wait,			\
-				   !(c)->try_harder);			\
-			_r = -EINTR;					\
-		}							\
 	} while (_r == -EINTR);						\
 									\
-	finish_wait(&(c)->bucket_wait, &(op)->wait);			\
+	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
 	_r;								\
 })
 
@@ -563,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 #define mca_reserve(c)	(((c->root && c->root->level)		\
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
-	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
 
 static void mca_data_free(struct btree *b)
 {
@@ -571,7 +566,7 @@ static void mca_data_free(struct btree *b)
 
 	bch_btree_keys_free(&b->keys);
 
-	b->c->bucket_cache_used--;
+	b->c->btree_cache_used--;
 	list_move(&b->list, &b->c->btree_cache_freed);
 }
 
@@ -596,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 				     ilog2(b->c->btree_pages),
 				     btree_order(k)),
 			  gfp)) {
-		b->c->bucket_cache_used++;
+		b->c->btree_cache_used++;
 		list_move(&b->list, &b->c->btree_cache);
 	} else {
 		list_move(&b->list, &b->c->btree_cache_freed);
@@ -675,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return SHRINK_STOP;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return SHRINK_STOP;
 
 	/* Return -1 if we can't do anything right now */
@@ -707,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 		}
 	}
 
-	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
 		if (list_empty(&c->btree_cache))
 			goto out;
 
@@ -736,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return 0;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return 0;
 
 	return mca_can_free(c) * c->btree_pages;
@@ -840,17 +835,30 @@ out:
 	return b;
 }
 
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+	struct task_struct *old;
+
+	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+	if (old && old != current) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+				     struct bkey *k)
 {
 	struct btree *b;
 
 	trace_bcache_btree_cache_cannibalize(c);
 
-	if (!c->try_harder) {
-		c->try_harder = current;
-		c->try_harder_start = local_clock();
-	} else if (c->try_harder != current)
-		return ERR_PTR(-ENOSPC);
+	if (mca_cannibalize_lock(c, op))
+		return ERR_PTR(-EINTR);
 
 	list_for_each_entry_reverse(b, &c->btree_cache, list)
 		if (!mca_reap(b, btree_order(k), false))
@@ -860,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
 		if (!mca_reap(b, btree_order(k), true))
 			return b;
 
+	WARN(1, "btree cache cannibalize failed\n");
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -871,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  */
 static void bch_cannibalize_unlock(struct cache_set *c)
 {
-	if (c->try_harder == current) {
-		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
-		c->try_harder = NULL;
-		wake_up(&c->try_wait);
+	if (c->btree_cache_alloc_lock == current) {
+		c->btree_cache_alloc_lock = NULL;
+		wake_up(&c->btree_cache_wait);
 	}
 }
 
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+			       struct bkey *k, int level)
 {
 	struct btree *b;
 
@@ -941,7 +950,7 @@ err:
 	if (b)
 		rw_unlock(true, b);
 
-	b = mca_cannibalize(c, k);
+	b = mca_cannibalize(c, op, k);
 	if (!IS_ERR(b))
 		goto out;
 
@@ -957,8 +966,8 @@ err:
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
  */
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
-				 int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+				 struct bkey *k, int level, bool write)
 {
 	int i = 0;
 	struct btree *b;
@@ -972,7 +981,7 @@ retry:
 		return ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level);
+	b = mca_alloc(c, op, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!b)
@@ -1018,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	struct btree *b;
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level);
+	b = mca_alloc(c, NULL, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
@@ -1051,20 +1060,21 @@ static void btree_node_free(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				   int level)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
 		goto err;
 
 	bkey_put(c, &k.key);
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
-	b = mca_alloc(c, &k.key, level);
+	b = mca_alloc(c, op, &k.key, level);
 	if (IS_ERR(b))
 		goto err_free;
 
@@ -1090,9 +1100,10 @@ err:
 	return b;
 }
 
-static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+						  struct btree_op *op)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
+	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
 	if (!IS_ERR_OR_NULL(n)) {
 		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1126,22 +1137,22 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
 	struct cache *ca;
-	unsigned i, reserve = c->root->level * 2 + 1;
-	int ret = 0;
+	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
 
 	mutex_lock(&c->bucket_lock);
 
 	for_each_cache(ca, c, i)
 		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
 			if (op)
-				prepare_to_wait(&c->bucket_wait, &op->wait,
+				prepare_to_wait(&c->btree_cache_wait, &op->wait,
 						TASK_UNINTERRUPTIBLE);
-			ret = -EINTR;
-			break;
+			mutex_unlock(&c->bucket_lock);
+			return -EINTR;
 		}
 
 	mutex_unlock(&c->bucket_lock);
-	return ret;
+
+	return mca_cannibalize_lock(b->c, op);
 }
 
 /* Garbage collection */
@@ -1273,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
 				  struct keylist *, atomic_t *, struct bkey *);
 
 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
-			     struct keylist *keylist, struct gc_stat *gc,
-			     struct gc_merge_info *r)
+			     struct gc_stat *gc, struct gc_merge_info *r)
 {
 	unsigned i, nodes = 0, keys = 0, blocks;
 	struct btree *new_nodes[GC_MERGE_NODES];
+	struct keylist keylist;
 	struct closure cl;
 	struct bkey *k;
 
+	bch_keylist_init(&keylist);
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
 	memset(new_nodes, 0, sizeof(new_nodes));
 	closure_init_stack(&cl);
 
@@ -1295,11 +1311,20 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
-		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
 		if (IS_ERR_OR_NULL(new_nodes[i]))
 			goto out_nocoalesce;
 	}
 
+	/*
+	 * We have to check the reserve here, after we've allocated our new
+	 * nodes, to make sure the insert below will succeed - we also check
+	 * before as an optimization to potentially avoid a bunch of expensive
+	 * allocs/sorts
+	 */
+	if (btree_check_reserve(b, NULL))
+		goto out_nocoalesce;
+
 	for (i = 0; i < nodes; i++)
 		mutex_lock(&new_nodes[i]->write_lock);
 
@@ -1361,12 +1386,12 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
 		n2->keys -= keys;
 
-		if (__bch_keylist_realloc(keylist,
+		if (__bch_keylist_realloc(&keylist,
 					  bkey_u64s(&new_nodes[i]->key)))
 			goto out_nocoalesce;
 
 		bch_btree_node_write(new_nodes[i], &cl);
-		bch_keylist_add(keylist, &new_nodes[i]->key);
+		bch_keylist_add(&keylist, &new_nodes[i]->key);
 	}
 
 	for (i = 0; i < nodes; i++)
@@ -1380,15 +1405,15 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	rw_unlock(true, new_nodes[0]);
 
 	for (i = 0; i < nodes; i++) {
-		if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
+		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
 			goto out_nocoalesce;
 
-		make_btree_freeing_key(r[i].b, keylist->top);
-		bch_keylist_push(keylist);
+		make_btree_freeing_key(r[i].b, keylist.top);
+		bch_keylist_push(&keylist);
 	}
 
-	bch_btree_insert_node(b, op, keylist, NULL, NULL);
-	BUG_ON(!bch_keylist_empty(keylist));
+	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keylist));
 
 	for (i = 0; i < nodes; i++) {
 		btree_node_free(r[i].b);
@@ -1403,13 +1428,16 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	trace_bcache_btree_gc_coalesce(nodes);
 	gc->nodes--;
 
+	bch_keylist_free(&keylist);
+
 	/* Invalidated our iterator */
 	return -EINTR;
 
 out_nocoalesce:
 	closure_sync(&cl);
+	bch_keylist_free(&keylist);
 
-	while ((k = bch_keylist_pop(keylist)))
+	while ((k = bch_keylist_pop(&keylist)))
 		if (!bkey_cmp(k, &ZERO_KEY))
 			atomic_dec(&b->c->prio_blocked);
 
@@ -1421,6 +1449,42 @@ out_nocoalesce:
 	return 0;
 }
 
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+				 struct btree *replace)
+{
+	struct keylist keys;
+	struct btree *n;
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
+	n = btree_node_alloc_replacement(replace, NULL);
+
+	/* recheck reserve after allocating replacement node */
+	if (btree_check_reserve(b, NULL)) {
+		btree_node_free(n);
+		rw_unlock(true, n);
+		return 0;
+	}
+
+	bch_btree_node_write_sync(n);
+
+	bch_keylist_init(&keys);
+	bch_keylist_add(&keys, &n->key);
+
+	make_btree_freeing_key(replace, keys.top);
+	bch_keylist_push(&keys);
+
+	bch_btree_insert_node(b, op, &keys, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keys));
+
+	btree_node_free(replace);
+	rw_unlock(true, n);
+
+	/* Invalidated our iterator */
+	return -EINTR;
+}
+
 static unsigned btree_gc_count_keys(struct btree *b)
 {
 	struct bkey *k;
@@ -1438,14 +1502,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 {
 	int ret = 0;
 	bool should_rewrite;
-	struct btree *n;
 	struct bkey *k;
-	struct keylist keys;
 	struct btree_iter iter;
 	struct gc_merge_info r[GC_MERGE_NODES];
 	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
 
-	bch_keylist_init(&keys);
 	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
 
 	for (i = r; i < r + ARRAY_SIZE(r); i++)
@@ -1454,7 +1515,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	while (1) {
 		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
 		if (k) {
-			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+						  true);
 			if (IS_ERR(r->b)) {
 				ret = PTR_ERR(r->b);
 				break;
@@ -1462,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
 			r->keys = btree_gc_count_keys(r->b);
 
-			ret = btree_gc_coalesce(b, op, &keys, gc, r);
+			ret = btree_gc_coalesce(b, op, gc, r);
 			if (ret)
 				break;
 		}
@@ -1472,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
 		if (!IS_ERR(last->b)) {
 			should_rewrite = btree_gc_mark_node(last->b, gc);
-			if (should_rewrite &&
-			    !btree_check_reserve(b, NULL)) {
-				n = btree_node_alloc_replacement(last->b,
-								 false);
-
-				if (!IS_ERR_OR_NULL(n)) {
-					bch_btree_node_write_sync(n);
-
-					bch_keylist_add(&keys, &n->key);
-
-					make_btree_freeing_key(last->b,
-							       keys.top);
-					bch_keylist_push(&keys);
-
-					bch_btree_insert_node(b, op, &keys,
-							      NULL, NULL);
-					BUG_ON(!bch_keylist_empty(&keys));
-
-					btree_node_free(last->b);
-					rw_unlock(true, last->b);
-					last->b = n;
-
-					/* Invalidated our iterator */
-					ret = -EINTR;
+			if (should_rewrite) {
+				ret = btree_gc_rewrite_node(b, op, last->b);
+				if (ret)
 					break;
-				}
 			}
 
 		if (last->b->level) {
@@ -1537,8 +1577,6 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			rw_unlock(true, i->b);
 		}
 
-	bch_keylist_free(&keys);
-
 	return ret;
 }
 
@@ -1551,7 +1589,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 
 	should_rewrite = btree_gc_mark_node(b, gc);
 	if (should_rewrite) {
-		n = btree_node_alloc_replacement(b, false);
+		n = btree_node_alloc_replacement(b, NULL);
 
 		if (!IS_ERR_OR_NULL(n)) {
 			bch_btree_node_write_sync(n);
@@ -1887,11 +1925,14 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	closure_init_stack(&cl);
 	bch_keylist_init(&parent_keys);
 
-	if (!b->level &&
-	    btree_check_reserve(b, op))
-		return -EINTR;
+	if (btree_check_reserve(b, op)) {
+		if (!b->level)
+			return -EINTR;
+		else
+			WARN(1, "insufficient reserve for split\n");
+	}
 
-	n1 = btree_node_alloc_replacement(b, true);
+	n1 = btree_node_alloc_replacement(b, op);
 	if (IS_ERR(n1))
 		goto err;
 
@@ -1903,12 +1944,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-		n2 = bch_btree_node_alloc(b->c, b->level, true);
+		n2 = bch_btree_node_alloc(b->c, op, b->level);
 		if (IS_ERR(n2))
 			goto err_free1;
 
 		if (!b->parent) {
-			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
+			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
@@ -1995,7 +2036,7 @@ err_free1:
 	btree_node_free(n1);
 	rw_unlock(true, n1);
 err:
-	WARN(1, "bcache: btree split failed");
+	WARN(1, "bcache: btree split failed (level %u)", b->level);
 
 	if (n3 == ERR_PTR(-EAGAIN) ||
 	    n2 == ERR_PTR(-EAGAIN) ||
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index acebf26809cc..3ce371fa7f98 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -242,8 +242,9 @@ void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
-struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
+struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+				 struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 307fe378ea43..2d4a56219ec7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1495,14 +1495,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	sema_init(&c->sb_write_mutex, 1);
 	mutex_init(&c->bucket_lock);
-	init_waitqueue_head(&c->try_wait);
+	init_waitqueue_head(&c->btree_cache_wait);
 	init_waitqueue_head(&c->bucket_wait);
 	sema_init(&c->uuid_write_mutex, 1);
 
 	spin_lock_init(&c->btree_gc_time.lock);
 	spin_lock_init(&c->btree_split_time.lock);
 	spin_lock_init(&c->btree_read_time.lock);
-	spin_lock_init(&c->try_harder_time.lock);
 
 	bch_moving_init_cache_set(c);
 
@@ -1591,7 +1590,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "error reading btree root";
-		c->root = bch_btree_node_get(c, k, j->btree_level, true);
+		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
@@ -1666,7 +1665,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "cannot allocate new btree root";
-		c->root = bch_btree_node_alloc(c, 0, true);
+		c->root = bch_btree_node_alloc(c, NULL, 0);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
@@ -1847,13 +1846,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	for_each_bucket(b, ca)
 		atomic_set(&b->pin, 0);
 
-	if (bch_cache_allocator_init(ca))
-		goto err;
-
 	return 0;
-err:
-	kobject_put(&ca->kobj);
-	return -ENOMEM;
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 662b9484ed5e..89aaa2ef38f9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms);
 sysfs_time_stats_attribute(btree_split, sec, us);
 sysfs_time_stats_attribute(btree_sort, ms, us);
 sysfs_time_stats_attribute(btree_read, ms, us);
-sysfs_time_stats_attribute(try_harder, ms, us);
 
 read_attribute(btree_nodes);
 read_attribute(btree_used_percent);
@@ -534,7 +533,6 @@ lock_root:
 	sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
 	sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
 	sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
-	sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
 
 	sysfs_print(btree_used_percent, btree_used(c));
 	sysfs_print(btree_nodes, c->gc_stats.nodes);
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	sysfs_time_stats_attribute_list(btree_split, sec, us)
 	sysfs_time_stats_attribute_list(btree_sort, ms, us)
 	sysfs_time_stats_attribute_list(btree_read, ms, us)
-	sysfs_time_stats_attribute_list(try_harder, ms, us)
 
 	&sysfs_btree_nodes,
 	&sysfs_btree_used_percent,