 drivers/md/bcache/alloc.c  |  23 +-
 drivers/md/bcache/bcache.h |  15 +-
 drivers/md/bcache/btree.c  | 225 ++++++++++++++++++++++++++++++++++++++++++++++----------------------------
 drivers/md/bcache/btree.h  |   5 +-
 drivers/md/bcache/super.c  |  13 +-
 drivers/md/bcache/sysfs.c  |   3 -
 6 files changed, 145 insertions(+), 139 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 5ba4eaea57f4..a59ef6147fc7 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -375,6 +375,7 @@ static int bch_allocator_thread(void *arg)
 		}
 
 		allocator_wait(ca, bch_allocator_push(ca, bucket));
+		wake_up(&ca->set->btree_cache_wait);
 		wake_up(&ca->set->bucket_wait);
 	}
 
@@ -717,25 +718,3 @@ int bch_cache_allocator_start(struct cache *ca)
 	ca->alloc_thread = k;
 	return 0;
 }
-
-int bch_cache_allocator_init(struct cache *ca)
-{
-	/*
-	 * Reserve:
-	 * Prio/gen writes first
-	 * Then 8 for btree allocations
-	 * Then half for the moving garbage collector
-	 */
-#if 0
-	ca->watermark[WATERMARK_PRIO] = 0;
-
-	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
-	ca->watermark[WATERMARK_MOVINGGC] = 8 +
-		ca->watermark[WATERMARK_METADATA];
-
-	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
-		ca->watermark[WATERMARK_MOVINGGC];
-#endif
-	return 0;
-}
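
The one functional change in alloc.c is the added wake_up(): after the allocator thread pushes a freshly invalidated bucket onto the freelist, it now also wakes threads parked on the new btree_cache_wait queue (see btree_check_reserve() and mca_cannibalize_lock() in btree.c below). A minimal user-space sketch of this producer-side wakeup, with a pthread condition variable standing in for the kernel wait queue (illustrative names, not kernel API):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  btree_cache_wait = PTHREAD_COND_INITIALIZER;
static unsigned int free_buckets;

/* Allocator side: publish a bucket, then wake every waiter. */
static void push_bucket(void)
{
	pthread_mutex_lock(&lock);
	free_buckets++;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&btree_cache_wait);	/* the added wake_up() */
}

/* Waiter side: sleep until the allocator has produced a bucket. */
static void take_bucket(void)
{
	pthread_mutex_lock(&lock);
	while (free_buckets == 0)
		pthread_cond_wait(&btree_cache_wait, &lock);
	free_buckets--;
	pthread_mutex_unlock(&lock);
}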
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 15d26236caf9..171cda89cb6b 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -562,19 +562,16 @@ struct cache_set {
 	struct list_head	btree_cache_freed;
 
 	/* Number of elements in btree_cache + btree_cache_freeable lists */
-	unsigned		bucket_cache_used;
+	unsigned		btree_cache_used;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
 	 * allocation fails, we can cannibalize another node in the btree cache
-	 * to satisfy the allocation. However, only one thread can be doing this
-	 * at a time, for obvious reasons - try_harder and try_wait are
-	 * basically a lock for this that we can wait on asynchronously. The
-	 * btree_root() macro releases the lock when it returns.
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
 	 */
-	struct task_struct	*try_harder;
-	wait_queue_head_t	try_wait;
-	uint64_t		try_harder_start;
+	wait_queue_head_t	btree_cache_wait;
+	struct task_struct	*btree_cache_alloc_lock;
 
 	/*
 	 * When we free a btree node, we increment the gen of the bucket the
@@ -669,7 +666,6 @@ struct cache_set {
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
 	struct time_stats	btree_read_time;
-	struct time_stats	try_harder_time;
 
 	atomic_long_t		cache_read_races;
 	atomic_long_t		writeback_keys_done;
@@ -956,7 +952,6 @@ int bch_open_buckets_alloc(struct cache_set *);
 void bch_open_buckets_free(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
 int bch_debug_init(struct kobject *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index beb32551da78..be90596a9e2a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,7 +117,7 @@
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
 	if (!IS_ERR(_child)) {						\
 		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
@@ -146,17 +146,12 @@
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
 		if (_r == -EINTR)					\
 			schedule();					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -ENOSPC) {					\
-			wait_event((c)->try_wait,			\
-				   !(c)->try_harder);			\
-			_r = -EINTR;					\
-		}							\
 	} while (_r == -EINTR);						\
 									\
-	finish_wait(&(c)->bucket_wait, &(op)->wait);			\
+	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
 	_r;								\
 })
 
@@ -563,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 #define mca_reserve(c)	(((c->root && c->root->level)		\
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
-	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
 
 static void mca_data_free(struct btree *b)
 {
@@ -571,7 +566,7 @@ static void mca_data_free(struct btree *b)
 
 	bch_btree_keys_free(&b->keys);
 
-	b->c->bucket_cache_used--;
+	b->c->btree_cache_used--;
 	list_move(&b->list, &b->c->btree_cache_freed);
 }
 
@@ -596,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 				  ilog2(b->c->btree_pages),
 				  btree_order(k)),
 		  gfp)) {
-		b->c->bucket_cache_used++;
+		b->c->btree_cache_used++;
 		list_move(&b->list, &b->c->btree_cache);
 	} else {
 		list_move(&b->list, &b->c->btree_cache_freed);
@@ -675,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return SHRINK_STOP;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return SHRINK_STOP;
 
 	/* Return -1 if we can't do anything right now */
@@ -707,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 		}
 	}
 
-	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
 		if (list_empty(&c->btree_cache))
 			goto out;
 
@@ -736,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return 0;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return 0;
 
 	return mca_can_free(c) * c->btree_pages;
@@ -840,17 +835,30 @@ out:
 	return b;
 }
 
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+	struct task_struct *old;
+
+	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+	if (old && old != current) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+				     struct bkey *k)
 {
 	struct btree *b;
 
 	trace_bcache_btree_cache_cannibalize(c);
 
-	if (!c->try_harder) {
-		c->try_harder = current;
-		c->try_harder_start = local_clock();
-	} else if (c->try_harder != current)
-		return ERR_PTR(-ENOSPC);
+	if (mca_cannibalize_lock(c, op))
+		return ERR_PTR(-EINTR);
 
 	list_for_each_entry_reverse(b, &c->btree_cache, list)
 		if (!mca_reap(b, btree_order(k), false))
@@ -860,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
 		if (!mca_reap(b, btree_order(k), true))
 			return b;
 
+	WARN(1, "btree cache cannibalize failed\n");
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -871,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  */
 static void bch_cannibalize_unlock(struct cache_set *c)
 {
-	if (c->try_harder == current) {
-		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
-		c->try_harder = NULL;
-		wake_up(&c->try_wait);
+	if (c->btree_cache_alloc_lock == current) {
+		c->btree_cache_alloc_lock = NULL;
+		wake_up(&c->btree_cache_wait);
 	}
 }
 
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+			       struct bkey *k, int level)
 {
 	struct btree *b;
 
@@ -941,7 +950,7 @@ err:
 	if (b)
 		rw_unlock(true, b);
 
-	b = mca_cannibalize(c, k);
+	b = mca_cannibalize(c, op, k);
 	if (!IS_ERR(b))
 		goto out;
 
@@ -957,8 +966,8 @@ err:
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
  */
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
-				 int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+				 struct bkey *k, int level, bool write)
 {
 	int i = 0;
 	struct btree *b;
@@ -972,7 +981,7 @@ retry:
 		return ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level);
+	b = mca_alloc(c, op, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!b)
@@ -1018,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	struct btree *b;
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level);
+	b = mca_alloc(c, NULL, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
@@ -1051,20 +1060,21 @@ static void btree_node_free(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				   int level)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
 		goto err;
 
 	bkey_put(c, &k.key);
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
-	b = mca_alloc(c, &k.key, level);
+	b = mca_alloc(c, op, &k.key, level);
 	if (IS_ERR(b))
 		goto err_free;
 
@@ -1090,9 +1100,10 @@ err:
 	return b;
 }
 
-static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+						  struct btree_op *op)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
+	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
 	if (!IS_ERR_OR_NULL(n)) {
 		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1126,22 +1137,22 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
 	struct cache *ca;
-	unsigned i, reserve = c->root->level * 2 + 1;
-	int ret = 0;
+	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
 
 	mutex_lock(&c->bucket_lock);
 
 	for_each_cache(ca, c, i)
 		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
 			if (op)
-				prepare_to_wait(&c->bucket_wait, &op->wait,
+				prepare_to_wait(&c->btree_cache_wait, &op->wait,
 						TASK_UNINTERRUPTIBLE);
-			ret = -EINTR;
-			break;
+			mutex_unlock(&c->bucket_lock);
+			return -EINTR;
 		}
 
 	mutex_unlock(&c->bucket_lock);
-	return ret;
+
+	return mca_cannibalize_lock(b->c, op);
 }
 
 /* Garbage collection */
@@ -1273,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
 			     struct keylist *, atomic_t *, struct bkey *);
 
 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
-			     struct keylist *keylist, struct gc_stat *gc,
-			     struct gc_merge_info *r)
+			     struct gc_stat *gc, struct gc_merge_info *r)
 {
 	unsigned i, nodes = 0, keys = 0, blocks;
 	struct btree *new_nodes[GC_MERGE_NODES];
+	struct keylist keylist;
 	struct closure cl;
 	struct bkey *k;
 
+	bch_keylist_init(&keylist);
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
 	memset(new_nodes, 0, sizeof(new_nodes));
 	closure_init_stack(&cl);
 
@@ -1295,11 +1311,20 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
-		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
 		if (IS_ERR_OR_NULL(new_nodes[i]))
 			goto out_nocoalesce;
 	}
 
+	/*
+	 * We have to check the reserve here, after we've allocated our new
+	 * nodes, to make sure the insert below will succeed - we also check
+	 * before as an optimization to potentially avoid a bunch of expensive
+	 * allocs/sorts
+	 */
+	if (btree_check_reserve(b, NULL))
+		goto out_nocoalesce;
+
 	for (i = 0; i < nodes; i++)
 		mutex_lock(&new_nodes[i]->write_lock);
 
@@ -1361,12 +1386,12 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
 			n2->keys -= keys;
 
-			if (__bch_keylist_realloc(keylist,
+			if (__bch_keylist_realloc(&keylist,
 						  bkey_u64s(&new_nodes[i]->key)))
 				goto out_nocoalesce;
 
 			bch_btree_node_write(new_nodes[i], &cl);
-			bch_keylist_add(keylist, &new_nodes[i]->key);
+			bch_keylist_add(&keylist, &new_nodes[i]->key);
 		}
 
 	for (i = 0; i < nodes; i++)
@@ -1380,15 +1405,15 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	rw_unlock(true, new_nodes[0]);
 
 	for (i = 0; i < nodes; i++) {
-		if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
+		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
 			goto out_nocoalesce;
 
-		make_btree_freeing_key(r[i].b, keylist->top);
-		bch_keylist_push(keylist);
+		make_btree_freeing_key(r[i].b, keylist.top);
+		bch_keylist_push(&keylist);
 	}
 
-	bch_btree_insert_node(b, op, keylist, NULL, NULL);
-	BUG_ON(!bch_keylist_empty(keylist));
+	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keylist));
 
 	for (i = 0; i < nodes; i++) {
 		btree_node_free(r[i].b);
@@ -1403,13 +1428,16 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	trace_bcache_btree_gc_coalesce(nodes);
 	gc->nodes--;
 
+	bch_keylist_free(&keylist);
+
 	/* Invalidated our iterator */
 	return -EINTR;
 
 out_nocoalesce:
 	closure_sync(&cl);
+	bch_keylist_free(&keylist);
 
-	while ((k = bch_keylist_pop(keylist)))
+	while ((k = bch_keylist_pop(&keylist)))
 		if (!bkey_cmp(k, &ZERO_KEY))
 			atomic_dec(&b->c->prio_blocked);
 
@@ -1421,6 +1449,42 @@ out_nocoalesce:
 	return 0;
 }
 
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+				 struct btree *replace)
+{
+	struct keylist keys;
+	struct btree *n;
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
+	n = btree_node_alloc_replacement(replace, NULL);
+
+	/* recheck reserve after allocating replacement node */
+	if (btree_check_reserve(b, NULL)) {
+		btree_node_free(n);
+		rw_unlock(true, n);
+		return 0;
+	}
+
+	bch_btree_node_write_sync(n);
+
+	bch_keylist_init(&keys);
+	bch_keylist_add(&keys, &n->key);
+
+	make_btree_freeing_key(replace, keys.top);
+	bch_keylist_push(&keys);
+
+	bch_btree_insert_node(b, op, &keys, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keys));
+
+	btree_node_free(replace);
+	rw_unlock(true, n);
+
+	/* Invalidated our iterator */
+	return -EINTR;
+}
+
 static unsigned btree_gc_count_keys(struct btree *b)
 {
 	struct bkey *k;
@@ -1438,14 +1502,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 {
 	int ret = 0;
 	bool should_rewrite;
-	struct btree *n;
 	struct bkey *k;
-	struct keylist keys;
 	struct btree_iter iter;
 	struct gc_merge_info r[GC_MERGE_NODES];
 	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
 
-	bch_keylist_init(&keys);
 	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
 
 	for (i = r; i < r + ARRAY_SIZE(r); i++)
@@ -1454,7 +1515,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	while (1) {
 		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
 		if (k) {
-			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+						  true);
 			if (IS_ERR(r->b)) {
 				ret = PTR_ERR(r->b);
 				break;
@@ -1462,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
 			r->keys = btree_gc_count_keys(r->b);
 
-			ret = btree_gc_coalesce(b, op, &keys, gc, r);
+			ret = btree_gc_coalesce(b, op, gc, r);
 			if (ret)
 				break;
 		}
@@ -1472,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
 		if (!IS_ERR(last->b)) {
 			should_rewrite = btree_gc_mark_node(last->b, gc);
-			if (should_rewrite &&
-			    !btree_check_reserve(b, NULL)) {
-				n = btree_node_alloc_replacement(last->b,
-								 false);
-
-				if (!IS_ERR_OR_NULL(n)) {
-					bch_btree_node_write_sync(n);
-
-					bch_keylist_add(&keys, &n->key);
-
-					make_btree_freeing_key(last->b,
-							       keys.top);
-					bch_keylist_push(&keys);
-
-					bch_btree_insert_node(b, op, &keys,
-							      NULL, NULL);
-					BUG_ON(!bch_keylist_empty(&keys));
-
-					btree_node_free(last->b);
-					rw_unlock(true, last->b);
-					last->b = n;
-
-					/* Invalidated our iterator */
-					ret = -EINTR;
+			if (should_rewrite) {
+				ret = btree_gc_rewrite_node(b, op, last->b);
+				if (ret)
 					break;
-				}
 			}
 		}
 
 		if (last->b->level) {
@@ -1537,8 +1577,6 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			rw_unlock(true, i->b);
 		}
 
-	bch_keylist_free(&keys);
-
 	return ret;
 }
 
@@ -1551,7 +1589,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 
 	should_rewrite = btree_gc_mark_node(b, gc);
 	if (should_rewrite) {
-		n = btree_node_alloc_replacement(b, false);
+		n = btree_node_alloc_replacement(b, NULL);
 
 		if (!IS_ERR_OR_NULL(n)) {
 			bch_btree_node_write_sync(n);
@@ -1887,11 +1925,14 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	closure_init_stack(&cl);
 	bch_keylist_init(&parent_keys);
 
-	if (!b->level &&
-	    btree_check_reserve(b, op))
-		return -EINTR;
+	if (btree_check_reserve(b, op)) {
+		if (!b->level)
+			return -EINTR;
+		else
+			WARN(1, "insufficient reserve for split\n");
+	}
 
-	n1 = btree_node_alloc_replacement(b, true);
+	n1 = btree_node_alloc_replacement(b, op);
 	if (IS_ERR(n1))
 		goto err;
 
@@ -1903,12 +1944,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 	trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-	n2 = bch_btree_node_alloc(b->c, b->level, true);
+	n2 = bch_btree_node_alloc(b->c, op, b->level);
 	if (IS_ERR(n2))
 		goto err_free1;
 
 	if (!b->parent) {
-		n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
+		n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
 		if (IS_ERR(n3))
 			goto err_free2;
 	}
@@ -1995,7 +2036,7 @@ err_free1:
 	btree_node_free(n1);
 	rw_unlock(true, n1);
 err:
-	WARN(1, "bcache: btree split failed");
+	WARN(1, "bcache: btree split failed (level %u)", b->level);
 
 	if (n3 == ERR_PTR(-EAGAIN) ||
 	    n2 == ERR_PTR(-EAGAIN) ||
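
The pivot of the rework is mca_cannibalize_lock() above: the try_harder/try_wait pair collapses into a single owner pointer claimed with cmpxchg(). The claim is re-entrant for the current owner, and a thread that loses the race is put on btree_cache_wait before it returns -EINTR, so the wake_up() in bch_cannibalize_unlock() cannot be lost while the loser unwinds its node locks and retries from the top (btree_root() now also drops the lock on every loop iteration). A self-contained C11 analogue of the claim/release half of that protocol - thread identity pointers stand in for task_struct, and the wait-queue side is elided - as a sketch, not the kernel code:

#include <stdatomic.h>
#include <pthread.h>
#include <stddef.h>

static _Atomic(pthread_t *) cache_alloc_lock;	/* NULL == unlocked */

/*
 * Claim the cannibalize lock for *self; re-entrant for the owner.
 * Returns 0 on success, -1 if another thread holds it (the caller
 * would then park on the wait queue and retry, as btree_root() does).
 */
static int cannibalize_trylock(pthread_t *self)
{
	pthread_t *expected = NULL;

	if (atomic_compare_exchange_strong(&cache_alloc_lock, &expected, self))
		return 0;		/* was free, now ours */
	if (expected == self)
		return 0;		/* already ours: recursive claim */
	return -1;			/* someone else owns it */
}

/* Only the owner releases; waiters would be woken after the store. */
static void cannibalize_unlock(pthread_t *self)
{
	pthread_t *expected = self;

	atomic_compare_exchange_strong(&cache_alloc_lock, &expected, NULL);
}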
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index acebf26809cc..3ce371fa7f98 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -242,8 +242,9 @@ void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
-struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
+struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+				 struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
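
The header hunk shows the new calling convention: both allocators take a struct btree_op * where they used to take a bool wait. A NULL op means fail fast (the GC, prefetch and startup call sites pass NULL); a non-NULL op lets the callee register the caller on btree_cache_wait before bailing out with -EINTR, and btree_root()'s finish_wait() tidies up after the retry loop. Registering before unwinding is what keeps a racing wake-up from being lost. A compact user-space sketch of that park-then-retry discipline, using a sequence counter in place of prepare_to_wait() (hypothetical names, assumed semantics):

#include <pthread.h>
#include <errno.h>
#include <stdbool.h>

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_wait = PTHREAD_COND_INITIALIZER;
static unsigned reserve;	/* free buckets in the btree reserve */
static unsigned seq;		/* bumped on every wake-up */

struct op_ctx { unsigned seen_seq; bool may_wait; };

/* Check the reserve; on failure with a waitable op, snapshot the wake-up
 * sequence *now* (the prepare_to_wait() analogue) and report -EINTR so the
 * caller can unwind its locks before sleeping. */
static int check_reserve(struct op_ctx *op)
{
	int ret = 0;

	pthread_mutex_lock(&q_lock);
	if (reserve == 0) {
		if (op && op->may_wait)
			op->seen_seq = seq;	/* registered before unwinding */
		ret = -EINTR;
	}
	pthread_mutex_unlock(&q_lock);
	return ret;
}

/* Caller side: unwind, then sleep only until something has happened since
 * we registered - a wake-up between check_reserve() and here is not lost. */
static void wait_for_reserve(struct op_ctx *op)
{
	pthread_mutex_lock(&q_lock);
	while (seq == op->seen_seq)
		pthread_cond_wait(&q_wait, &q_lock);
	pthread_mutex_unlock(&q_lock);
}

/* Allocator side: refill the reserve and wake all registered waiters. */
static void refill_reserve(unsigned n)
{
	pthread_mutex_lock(&q_lock);
	reserve += n;
	seq++;
	pthread_cond_broadcast(&q_wait);
	pthread_mutex_unlock(&q_lock);
}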
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 307fe378ea43..2d4a56219ec7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1495,14 +1495,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	sema_init(&c->sb_write_mutex, 1);
 	mutex_init(&c->bucket_lock);
-	init_waitqueue_head(&c->try_wait);
+	init_waitqueue_head(&c->btree_cache_wait);
 	init_waitqueue_head(&c->bucket_wait);
 	sema_init(&c->uuid_write_mutex, 1);
 
 	spin_lock_init(&c->btree_gc_time.lock);
 	spin_lock_init(&c->btree_split_time.lock);
 	spin_lock_init(&c->btree_read_time.lock);
-	spin_lock_init(&c->try_harder_time.lock);
 
 	bch_moving_init_cache_set(c);
 
@@ -1591,7 +1590,7 @@ static void run_cache_set(struct cache_set *c)
 		goto err;
 
 	err = "error reading btree root";
-	c->root = bch_btree_node_get(c, k, j->btree_level, true);
+	c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
 	if (IS_ERR_OR_NULL(c->root))
 		goto err;
 
@@ -1666,7 +1665,7 @@ static void run_cache_set(struct cache_set *c)
 		goto err;
 
 	err = "cannot allocate new btree root";
-	c->root = bch_btree_node_alloc(c, 0, true);
+	c->root = bch_btree_node_alloc(c, NULL, 0);
 	if (IS_ERR_OR_NULL(c->root))
 		goto err;
 
@@ -1847,13 +1846,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	for_each_bucket(b, ca)
 		atomic_set(&b->pin, 0);
 
-	if (bch_cache_allocator_init(ca))
-		goto err;
-
 	return 0;
-err:
-	kobject_put(&ca->kobj);
-	return -ENOMEM;
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 662b9484ed5e..89aaa2ef38f9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms);
 sysfs_time_stats_attribute(btree_split,	sec, us);
 sysfs_time_stats_attribute(btree_sort,		ms,  us);
 sysfs_time_stats_attribute(btree_read,		ms,  us);
-sysfs_time_stats_attribute(try_harder,		ms,  us);
 
 read_attribute(btree_nodes);
 read_attribute(btree_used_percent);
@@ -534,7 +533,6 @@ lock_root:
 	sysfs_print_time_stats(&c->btree_split_time,	btree_split, sec, us);
 	sysfs_print_time_stats(&c->sort.time,		btree_sort,  ms,  us);
 	sysfs_print_time_stats(&c->btree_read_time,	btree_read,  ms,  us);
-	sysfs_print_time_stats(&c->try_harder_time,	try_harder,  ms,  us);
 
 	sysfs_print(btree_used_percent,	btree_used(c));
 	sysfs_print(btree_nodes,	c->gc_stats.nodes);
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	sysfs_time_stats_attribute_list(btree_split, sec, us)
 	sysfs_time_stats_attribute_list(btree_sort, ms, us)
 	sysfs_time_stats_attribute_list(btree_read, ms, us)
-	sysfs_time_stats_attribute_list(try_harder, ms, us)
 
 	&sysfs_btree_nodes,
 	&sysfs_btree_used_percent,