path: root/drivers/md/bcache/btree.c
Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r--  drivers/md/bcache/btree.c  451
1 file changed, 212 insertions(+), 239 deletions(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..ee372884c405 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
+#include "writeback.h"
 
 #include <linux/slab.h>
 #include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 	return crc ^ 0xffffffffffffffffULL;
 }
 
-static void btree_bio_endio(struct bio *bio, int error)
+static void bch_btree_node_read_done(struct btree *b)
 {
-	struct closure *cl = bio->bi_private;
-	struct btree *b = container_of(cl, struct btree, io.cl);
-
-	if (error)
-		set_btree_node_io_error(b);
-
-	bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
-				 ? "writing btree" : "reading btree");
-	closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
-	BUG_ON(b->bio);
-	b->bio = bch_bbio_alloc(b->c);
-
-	b->bio->bi_end_io = btree_bio_endio;
-	b->bio->bi_private = &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
-	struct btree *b = container_of(cl, struct btree, io.cl);
-	struct bset *i = b->sets[0].data;
-	struct btree_iter *iter = b->c->fill_iter;
 	const char *err = "bad btree header";
-	BUG_ON(b->nsets || b->written);
-
-	bch_bbio_free(b->bio, b->c);
-	b->bio = NULL;
+	struct bset *i = b->sets[0].data;
+	struct btree_iter *iter;
 
-	mutex_lock(&b->c->fill_lock);
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
-	if (btree_node_io_error(b) ||
-	    !i->seq)
+	if (!i->seq)
 		goto err;
 
 	for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(b);
 out:
-
-	mutex_unlock(&b->c->fill_lock);
-
-	spin_lock(&b->c->btree_read_time_lock);
-	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
-	spin_unlock(&b->c->btree_read_time_lock);
-
-	smp_wmb(); /* read_done is our write lock */
-	set_btree_node_read_done(b);
-
-	closure_return(cl);
+	mempool_free(iter, b->c->fill_iter);
+	return;
 err:
 	set_btree_node_io_error(b);
 	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
 	goto out;
 }
 
-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	closure_put(cl);
+}
+
+void bch_btree_node_read(struct btree *b)
 {
-	BUG_ON(b->nsets || b->written);
+	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct bio *bio;
+
+	trace_bcache_btree_read(b);
+
+	closure_init_stack(&cl);
+
+	bio = bch_bbio_alloc(b->c);
+	bio->bi_rw = REQ_META|READ_SYNC;
+	bio->bi_size = KEY_SIZE(&b->key) << 9;
+	bio->bi_end_io = btree_node_read_endio;
+	bio->bi_private = &cl;
+
+	bch_bio_map(bio, b->sets[0].data);
+
+	bch_submit_bbio(bio, b->c, &b->key, 0);
+	closure_sync(&cl);
 
-	if (!closure_trylock(&b->io.cl, &b->c->cl))
-		BUG();
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_btree_node_io_error(b);
 
-	b->io_start_time = local_clock();
+	bch_bbio_free(bio, b->c);
 
-	btree_bio_init(b);
-	b->bio->bi_rw = REQ_META|READ_SYNC;
-	b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+	if (btree_node_io_error(b))
+		goto err;
 
-	bch_bio_map(b->bio, b->sets[0].data);
+	bch_btree_node_read_done(b);
 
-	pr_debug("%s", pbtree(b));
-	trace_bcache_btree_read(b->bio);
-	bch_submit_bbio(b->bio, b->c, &b->key, 0);
+	spin_lock(&b->c->btree_read_time_lock);
+	bch_time_stats_update(&b->c->btree_read_time, start_time);
+	spin_unlock(&b->c->btree_read_time_lock);
 
-	continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+	return;
+err:
+	bch_cache_set_error(b->c, "io error reading bucket %lu",
+			    PTR_BUCKET_NR(b->c, &b->key, 0));
 }
 
 static void btree_complete_write(struct btree *b, struct btree_write *w)
 {
 	if (w->prio_blocked &&
 	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
-		wake_up(&b->c->alloc_wait);
+		wake_up_allocators(b->c);
 
 	if (w->journal) {
 		atomic_dec_bug(w->journal);
 		__closure_wake_up(&b->c->journal.wait);
 	}
 
-	if (w->owner)
-		closure_put(w->owner);
-
 	w->prio_blocked = 0;
 	w->journal = NULL;
-	w->owner = NULL;
 }
 
-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
 	closure_return(cl);
 }
 
-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
 	__bio_for_each_segment(bv, b->bio, n, 0)
 		__free_page(bv->bv_page);
 
-	__btree_write_done(cl);
+	__btree_node_write_done(cl);
 }
 
-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct btree *b = container_of(cl, struct btree, io.cl);
+
+	if (error)
+		set_btree_node_io_error(b);
+
+	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+	closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
 {
 	struct closure *cl = &b->io.cl;
 	struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
 	i->version = BCACHE_BSET_VERSION;
 	i->csum = btree_csum_set(b, i);
 
-	btree_bio_init(b);
-	b->bio->bi_rw = REQ_META|WRITE_SYNC;
-	b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+	BUG_ON(b->bio);
+	b->bio = bch_bbio_alloc(b->c);
+
+	b->bio->bi_end_io = btree_node_write_endio;
+	b->bio->bi_private = &b->io.cl;
+	b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
+	b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
 	bch_bio_map(b->bio, i);
 
+	/*
+	 * If we're appending to a leaf node, we don't technically need FUA -
+	 * this write just needs to be persisted before the next journal write,
+	 * which will be marked FLUSH|FUA.
+	 *
+	 * Similarly if we're writing a new btree root - the pointer is going to
+	 * be in the next journal entry.
+	 *
+	 * But if we're writing a new btree node (that isn't a root) or
+	 * appending to a non leaf btree node, we need either FUA or a flush
+	 * when we write the parent with the new pointer. FUA is cheaper than a
+	 * flush, and writes appending to leaf nodes aren't blocking anything so
+	 * just make all btree node writes FUA to keep things sane.
+	 */
+
 	bkey_copy(&k.key, &b->key);
 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
 
-	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
 		int j;
 		struct bio_vec *bv;
 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
 			memcpy(page_address(bv->bv_page),
 			       base + j * PAGE_SIZE, PAGE_SIZE);
 
-		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
-		continue_at(cl, btree_write_done, NULL);
+		continue_at(cl, btree_node_write_done, NULL);
 	} else {
 		b->bio->bi_vcnt = 0;
 		bch_bio_map(b->bio, i);
 
-		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
 		closure_sync(cl);
-		__btree_write_done(cl);
+		__btree_node_write_done(cl);
 	}
 }
 
-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = b->sets[b->nsets].data;
 
+	trace_bcache_btree_write(b);
+
 	BUG_ON(current->bio_list);
+	BUG_ON(b->written >= btree_blocks(b));
+	BUG_ON(b->written && !i->keys);
+	BUG_ON(b->sets->data->seq != i->seq);
+	bch_check_key_order(b, i);
 
-	closure_lock(&b->io, &b->c->cl);
 	cancel_delayed_work(&b->work);
 
+	/* If caller isn't waiting for write, parent refcount is cache set */
+	closure_lock(&b->io, parent ?: &b->c->cl);
+
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 	change_bit(BTREE_NODE_write_idx, &b->flags);
 
-	bch_check_key_order(b, i);
-	BUG_ON(b->written && !i->keys);
-
-	do_btree_write(b);
-
-	pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+	do_btree_node_write(b);
 
 	b->written += set_blocks(i, b->c);
 	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
 		bch_bset_init_next(b);
 }
 
-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-	down_write(&b->lock);
+	rw_lock(true, b, b->level);
 
 	if (btree_node_dirty(b))
-		__btree_write(b);
-	up_write(&b->lock);
+		bch_btree_node_write(b, NULL);
+	rw_unlock(true, b);
 }
 
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct btree_write *w = btree_current_write(b);
 
-	BUG_ON(b->written &&
-	       (b->written >= btree_blocks(b) ||
-		i->seq != b->sets[0].data->seq ||
-		!i->keys));
+	BUG_ON(!b->written);
+	BUG_ON(!i->keys);
 
-	if (!btree_node_dirty(b)) {
-		set_btree_node_dirty(b);
-		queue_delayed_work(btree_io_wq, &b->work,
-				   msecs_to_jiffies(30000));
-	}
+	if (!btree_node_dirty(b))
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
-	w->prio_blocked += b->prio_blocked;
-	b->prio_blocked = 0;
+	set_btree_node_dirty(b);
 
-	if (op && op->journal && !b->level) {
+	if (op && op->journal) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w, op)) {
 			atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
 		}
 	}
 
-	if (current->bio_list)
-		return;
-
 	/* Force write if set is too big */
-	if (now ||
-	    b->level ||
-	    set_bytes(i) > PAGE_SIZE - 48) {
-		if (op && now) {
-			/* Must wait on multiple writes */
-			BUG_ON(w->owner);
-			w->owner = &op->cl;
-			closure_get(&op->cl);
-		}
-
-		__btree_write(b);
-	}
-	BUG_ON(!b->written);
+	if (set_bytes(i) > PAGE_SIZE - 48 &&
+	    !current->bio_list)
+		bch_btree_node_write(b, NULL);
 }
 
 /*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 	init_rwsem(&b->lock);
 	lockdep_set_novalidate_class(&b->lock);
 	INIT_LIST_HEAD(&b->list);
-	INIT_DELAYED_WORK(&b->work, btree_write_work);
+	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 	b->c = c;
 	closure_init_unlocked(&b->io);
 
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
 	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
 
 	if (cl && btree_node_dirty(b))
-		bch_btree_write(b, true, NULL);
+		bch_btree_node_write(b, NULL);
 
 	if (cl)
 		closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	else if (!mutex_trylock(&c->bucket_lock))
 		return -1;
 
+	/*
+	 * It's _really_ critical that we don't free too many btree nodes - we
+	 * have to always leave ourselves a reserve. The reserve is how we
+	 * guarantee that allocating memory for a new btree node can always
+	 * succeed, so that inserting keys into the btree can always succeed and
+	 * IO can always make forward progress:
+	 */
 	nr /= c->btree_pages;
 	nr = min_t(unsigned long, nr, mca_can_free(c));
 
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
 	int ret = -ENOMEM;
 	struct btree *i;
 
+	trace_bcache_btree_cache_cannibalize(c);
+
 	if (!cl)
 		return ERR_PTR(-ENOMEM);
 
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
 		return ERR_PTR(-EAGAIN);
 	}
 
-	/* XXX: tracepoint */
 	c->try_harder = cl;
 	c->try_harder_start = local_clock();
 retry:
@@ -905,6 +912,9 @@ retry:
 	b = mca_find(c, k);
 
 	if (!b) {
+		if (current->bio_list)
+			return ERR_PTR(-EAGAIN);
+
 		mutex_lock(&c->bucket_lock);
 		b = mca_alloc(c, k, level, &op->cl);
 		mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
 		if (IS_ERR(b))
 			return b;
 
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 
 		if (!write)
 			downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
 	for (; i <= b->nsets; i++)
 		prefetch(b->sets[i].data);
 
-	if (!closure_wait_event(&b->io.wait, &op->cl,
-				btree_node_read_done(b))) {
-		rw_unlock(write, b);
-		b = ERR_PTR(-EAGAIN);
-	} else if (btree_node_io_error(b)) {
+	if (btree_node_io_error(b)) {
 		rw_unlock(write, b);
-		b = ERR_PTR(-EIO);
-	} else
-		BUG_ON(!b->written);
+		return ERR_PTR(-EIO);
+	}
+
+	BUG_ON(!b->written);
 
 	return b;
 }
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 		rw_unlock(true, b);
 	}
 }
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
 {
 	unsigned i;
 
+	trace_bcache_btree_node_free(b);
+
 	/*
 	 * The BUG_ON() in btree_node_get() implies that we must have a write
 	 * lock on parent to free or even invalidate a node
 	 */
 	BUG_ON(op->lock <= b->level);
 	BUG_ON(b == b->c->root);
-	pr_debug("bucket %s", pbtree(b));
 
 	if (btree_node_dirty(b))
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 
-	if (b->prio_blocked &&
-	    !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
-		wake_up(&b->c->alloc_wait);
-
-	b->prio_blocked = 0;
-
 	cancel_delayed_work(&b->work);
 
 	mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
 		goto retry;
 	}
 
-	set_btree_node_read_done(b);
 	b->accessed = 1;
 	bch_bset_init_next(b);
 
 	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc(b);
 	return b;
 err_free:
 	bch_bucket_free(c, &k.key);
 	__bkey_put(c, &k.key);
 err:
 	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc_fail(b);
 	return b;
 }
 
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 		gc->nkeys++;
 
 		gc->data += KEY_SIZE(k);
-		if (KEY_DIRTY(k)) {
+		if (KEY_DIRTY(k))
 			gc->dirty += KEY_SIZE(k);
-			if (d)
-				d->sectors_dirty_gc += KEY_SIZE(k);
-		}
 	}
 
 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
 
 	if (!IS_ERR_OR_NULL(n)) {
 		swap(b, n);
+		__bkey_put(b->c, &b->key);
 
 		memcpy(k->ptr, b->key.ptr,
 		       sizeof(uint64_t) * KEY_PTRS(&b->key));
 
-		__bkey_put(b->c, &b->key);
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-
 		btree_node_free(n, op);
 		up_write(&n->lock);
 	}
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	btree_node_free(r->b, op);
 	up_write(&r->b->lock);
 
-	pr_debug("coalesced %u nodes", nodes);
+	trace_bcache_btree_gc_coalesce(nodes);
 
 	gc->nodes--;
 	nodes--;
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	void write(struct btree *r)
 	{
 		if (!r->written)
-			bch_btree_write(r, true, op);
-		else if (btree_node_dirty(r)) {
-			BUG_ON(btree_current_write(r)->owner);
-			btree_current_write(r)->owner = writes;
-			closure_get(writes);
-
-			bch_btree_write(r, true, NULL);
-		}
+			bch_btree_node_write(r, &op->cl);
+		else if (btree_node_dirty(r))
+			bch_btree_node_write(r, writes);
 
 		up_write(&r->lock);
 	}
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	ret = btree_gc_recurse(b, op, writes, gc);
 
 	if (!b->written || btree_node_dirty(b)) {
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-		bch_btree_write(b, true, n ? op : NULL);
+		bch_btree_node_write(b, n ? &op->cl : NULL);
 	}
 
 	if (!IS_ERR_OR_NULL(n)) {
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	struct bcache_device **d;
 	unsigned i;
 
 	if (!c->gc_mark_valid)
@@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c)
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
 			b->gc_gen = b->gen;
-			if (!atomic_read(&b->pin))
+			if (!atomic_read(&b->pin)) {
 				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+				SET_GC_SECTORS_USED(b, 0);
+			}
 		}
 
-	for (d = c->devices;
-	     d < c->devices + c->nr_uuids;
-	     d++)
-		if (*d)
-			(*d)->sectors_dirty_gc = 0;
-
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 	size_t available = 0;
 	struct bucket *b;
 	struct cache *ca;
-	struct bcache_device **d;
 	unsigned i;
 
 	mutex_lock(&c->bucket_lock);
@@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 		}
 	}
 
-	for (d = c->devices;
-	     d < c->devices + c->nr_uuids;
-	     d++)
-		if (*d) {
-			unsigned long last =
-				atomic_long_read(&((*d)->sectors_dirty));
-			long difference = (*d)->sectors_dirty_gc - last;
-
-			pr_debug("sectors dirty off by %li", difference);
-
-			(*d)->sectors_dirty_last += difference;
-
-			atomic_long_set(&((*d)->sectors_dirty),
-					(*d)->sectors_dirty_gc);
-		}
-
 	mutex_unlock(&c->bucket_lock);
 	return available;
 }
@@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl)
 	struct gc_stat stats;
 	struct closure writes;
 	struct btree_op op;
-
 	uint64_t start_time = local_clock();
-	trace_bcache_gc_start(c->sb.set_uuid);
-	blktrace_msg_all(c, "Starting gc");
+
+	trace_bcache_gc_start(c);
 
 	memset(&stats, 0, sizeof(struct gc_stat));
 	closure_init_stack(&writes);
@@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl)
 
 	btree_gc_start(c);
 
+	atomic_inc(&c->prio_blocked);
+
 	ret = btree_root(gc_root, c, &op, &writes, &stats);
 	closure_sync(&op.cl);
 	closure_sync(&writes);
 
 	if (ret) {
-		blktrace_msg_all(c, "Stopped gc");
 		pr_warn("gc failed!");
-
 		continue_at(cl, bch_btree_gc, bch_gc_wq);
 	}
 
@@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl)
 
 	available = bch_btree_gc_finish(c);
 
+	atomic_dec(&c->prio_blocked);
+	wake_up_allocators(c);
+
 	bch_time_stats_update(&c->btree_gc_time, start_time);
 
 	stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl)
 	stats.data <<= 9;
 	stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
-	blktrace_msg_all(c, "Finished gc");
 
-	trace_bcache_gc_end(c->sb.set_uuid);
-	wake_up(&c->alloc_wait);
+	trace_bcache_gc_end(c);
 
 	continue_at(cl, bch_moving_gc, bch_gc_wq);
 }
@@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b,
 				    struct btree_iter *iter,
 				    struct btree_op *op)
 {
-	void subtract_dirty(struct bkey *k, int sectors)
+	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
 	{
-		struct bcache_device *d = b->c->devices[KEY_INODE(k)];
-
-		if (KEY_DIRTY(k) && d)
-			atomic_long_sub(sectors, &d->sectors_dirty);
+		if (KEY_DIRTY(k))
+			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+						     offset, -sectors);
 	}
 
+	uint64_t old_offset;
 	unsigned old_size, sectors_found = 0;
 
 	while (1) {
@@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b,
 		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
 			continue;
 
+		old_offset = KEY_START(k);
 		old_size = KEY_SIZE(k);
 
 		/*
@@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b,
 
 			struct bkey *top;
 
-			subtract_dirty(k, KEY_SIZE(insert));
+			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
 
 			if (bkey_written(b, k)) {
 				/*
@@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b,
 			}
 		}
 
-		subtract_dirty(k, old_size - KEY_SIZE(k));
+		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
 	}
 
 check_failed:
@@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct bkey *m, *prev;
-	const char *status = "insert";
+	unsigned status = BTREE_INSERT_STATUS_INSERT;
 
 	BUG_ON(bkey_cmp(k, &b->key) > 0);
 	BUG_ON(b->level && !KEY_PTRS(k));
@@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 			goto insert;
 
 		/* prev is in the tree, if we merge we're done */
-		status = "back merging";
+		status = BTREE_INSERT_STATUS_BACK_MERGE;
 		if (prev &&
 		    bch_bkey_try_merge(b, prev, k))
 			goto merged;
 
-		status = "overwrote front";
+		status = BTREE_INSERT_STATUS_OVERWROTE;
 		if (m != end(i) &&
 		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
 			goto copy;
 
-		status = "front merge";
+		status = BTREE_INSERT_STATUS_FRONT_MERGE;
 		if (m != end(i) &&
 		    bch_bkey_try_merge(b, k, m))
 			goto copy;
@@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 insert:	shift_keys(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
-	bch_check_keys(b, "%s for %s at %s: %s", status,
-		       op_type(op), pbtree(b), pkey(k));
-	bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
-				op_type(op), pbtree(b), pkey(k));
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+					     KEY_START(k), KEY_SIZE(k));
+
+	bch_check_keys(b, "%u for %s", status, op_type(op));
 
 	if (b->level && !KEY_OFFSET(k))
-		b->prio_blocked++;
+		btree_current_write(b)->prio_blocked++;
 
-	pr_debug("%s for %s at %s: %s", status,
-		 op_type(op), pbtree(b), pkey(k));
+	trace_bcache_btree_insert_key(b, k, op->type, status);
 
 	return true;
 }
 
-bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
 {
 	bool ret = false;
 	struct bkey *k;
@@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 	    should_split(b))
 		goto out;
 
-	op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
+	op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
 
 	SET_KEY_PTRS(&op->replace, 1);
 	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
 	BUG_ON(op->type != BTREE_INSERT);
 	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	bch_btree_write(b, false, NULL);
 	ret = true;
 out:
 	downgrade_write(&b->lock);
@@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
 
 	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
 
-	pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
-		 pbtree(b), n1->sets[0].data->keys);
-
 	if (split) {
 		unsigned keys = 0;
 
+		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+
 		n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
 		if (IS_ERR(n2))
 			goto err_free1;
@@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		bkey_copy_key(&n2->key, &b->key);
 
 		bch_keylist_add(&op->keys, &n2->key);
-		bch_btree_write(n2, true, op);
+		bch_btree_node_write(n2, &op->cl);
 		rw_unlock(true, n2);
-	} else
+	} else {
+		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+
 		bch_btree_insert_keys(n1, op);
+	}
 
 	bch_keylist_add(&op->keys, &n1->key);
-	bch_btree_write(n1, true, op);
+	bch_btree_node_write(n1, &op->cl);
 
 	if (n3) {
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op);
-		bch_btree_write(n3, true, op);
+		bch_btree_node_write(n3, &op->cl);
 
 		closure_sync(&op->cl);
 		bch_btree_set_root(n3);
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 
 		BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
-		if (bch_btree_insert_keys(b, op))
-			bch_btree_write(b, false, op);
+		if (bch_btree_insert_keys(b, op)) {
+			if (!b->level)
+				bch_btree_leaf_dirty(b, op);
+			else
+				bch_btree_node_write(b, &op->cl);
+		}
 	}
 
 	return 0;
@@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 void bch_btree_set_root(struct btree *b)
 {
 	unsigned i;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	trace_bcache_btree_set_root(b);
 
 	BUG_ON(!b->written);
 
@@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b)
 	b->c->root = b;
 	__bkey_put(b->c, &b->key);
 
-	bch_journal_meta(b->c, NULL);
-	pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
+	bch_journal_meta(b->c, &cl);
+	closure_sync(&cl);
 }
 
 /* Cache lookup */
@@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
 				  KEY_OFFSET(k) - bio->bi_sector);
 
 		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-		if (!n)
-			return -EAGAIN;
-
 		if (n == bio)
 			op->lookup_done = true;
 
@@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
 		n->bi_end_io = bch_cache_read_endio;
 		n->bi_private = &s->cl;
 
-		trace_bcache_cache_hit(n);
 		__bch_submit_bbio(n, b->c);
 	}
 
@@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
 	struct btree_iter iter;
 	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
 
-	pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
-		 (uint64_t) bio->bi_sector);
-
 	do {
 		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
 		if (!k) {
@@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 }
 
 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
-				   struct keybuf *buf, struct bkey *end)
+				   struct keybuf *buf, struct bkey *end,
+				   keybuf_pred_fn *pred)
 {
 	struct btree_iter iter;
 	bch_btree_iter_init(b, &iter, &buf->last_scanned);
@@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 		if (bkey_cmp(&buf->last_scanned, end) >= 0)
 			break;
 
-		if (buf->key_predicate(buf, k)) {
+		if (pred(buf, k)) {
 			struct keybuf_key *w;
 
-			pr_debug("%s", pkey(k));
-
 			spin_lock(&buf->lock);
 
 			w = array_alloc(&buf->freelist);
@@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 			if (!k)
 				break;
 
-			btree(refill_keybuf, k, b, op, buf, end);
+			btree(refill_keybuf, k, b, op, buf, end, pred);
 			/*
 			 * Might get an error here, but can't really do anything
 			 * and it'll get logged elsewhere. Just read what we
@@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 }
 
 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
-		       struct bkey *end)
+		       struct bkey *end, keybuf_pred_fn *pred)
 {
 	struct bkey start = buf->last_scanned;
 	struct btree_op op;
@@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 
 	cond_resched();
 
-	btree_root(refill_keybuf, c, &op, buf, end);
+	btree_root(refill_keybuf, c, &op, buf, end, pred);
 	closure_sync(&op.cl);
 
 	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
 
 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
 					  struct keybuf *buf,
-					  struct bkey *end)
+					  struct bkey *end,
+					  keybuf_pred_fn *pred)
 {
 	struct keybuf_key *ret;
 
@@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
 			break;
 		}
 
-		bch_refill_keybuf(c, buf, end);
+		bch_refill_keybuf(c, buf, end, pred);
 	}
 
 	return ret;
 }
 
-void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
+void bch_keybuf_init(struct keybuf *buf)
 {
-	buf->key_predicate = fn;
 	buf->last_scanned = MAX_KEY;
 	buf->keys = RB_ROOT;
 