path: root/drivers/md/bcache
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 14:40:10 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 14:40:10 -0500
commit    53d8ab29f8f6d67e37857b68189b38fa3d87dd8e (patch)
tree      3c770b58f0404c67b1b084f626dcafa8464c7512 /drivers/md/bcache
parent    f568849edac8611d603e00bd6cbbcfea09395ae6 (diff)
parent    14424be4dbfa127001ad623869f7ee4c7635e991 (diff)
Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
   Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer deference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--  drivers/md/bcache/Makefile      5
-rw-r--r--  drivers/md/bcache/alloc.c      89
-rw-r--r--  drivers/md/bcache/bcache.h     82
-rw-r--r--  drivers/md/bcache/bset.c      904
-rw-r--r--  drivers/md/bcache/bset.h      440
-rw-r--r--  drivers/md/bcache/btree.c     676
-rw-r--r--  drivers/md/bcache/btree.h      62
-rw-r--r--  drivers/md/bcache/closure.c    90
-rw-r--r--  drivers/md/bcache/closure.h   355
-rw-r--r--  drivers/md/bcache/debug.c     247
-rw-r--r--  drivers/md/bcache/debug.h      27
-rw-r--r--  drivers/md/bcache/extents.c   616
-rw-r--r--  drivers/md/bcache/extents.h    13
-rw-r--r--  drivers/md/bcache/journal.c    75
-rw-r--r--  drivers/md/bcache/journal.h     1
-rw-r--r--  drivers/md/bcache/movinggc.c    2
-rw-r--r--  drivers/md/bcache/request.c    72
-rw-r--r--  drivers/md/bcache/request.h    21
-rw-r--r--  drivers/md/bcache/super.c     103
-rw-r--r--  drivers/md/bcache/sysfs.c      79
-rw-r--r--  drivers/md/bcache/util.h        8
21 files changed, 2212 insertions, 1755 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@
1 1
2obj-$(CONFIG_BCACHE) += bcache.o 2obj-$(CONFIG_BCACHE) += bcache.o
3 3
4bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ 4bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
5 movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o 5 io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
6 util.o writeback.o
6 7
7CFLAGS_request.o += -Iblock 8CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
132{ 132{
133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); 133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
134 134
135 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && 135 if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
136 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) 136 unsigned i;
137 return false; 137
138 for (i = 0; i < RESERVE_NONE; i++)
139 if (!fifo_full(&ca->free[i]))
140 goto add;
138 141
142 return false;
143 }
144add:
139 b->prio = 0; 145 b->prio = 0;
140 146
141 if (can_inc_bucket_gen(b) && 147 if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
162 fifo_push(&ca->free_inc, b - ca->buckets); 168 fifo_push(&ca->free_inc, b - ca->buckets);
163} 169}
164 170
165#define bucket_prio(b) \ 171/*
166 (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) 172 * Determines what order we're going to reuse buckets, smallest bucket_prio()
173 * first: we also take into account the number of sectors of live data in that
174 * bucket, and in order for that multiply to make sense we have to scale bucket
175 *
176 * Thus, we scale the bucket priorities so that the bucket with the smallest
177 * prio is worth 1/8th of what INITIAL_PRIO is worth.
178 */
179
180#define bucket_prio(b) \
181({ \
182 unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
183 \
184 (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
185})
167 186
168#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 187#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
169#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 188#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
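
The effect of the new bucket_prio() is easier to see with concrete numbers. The following is a minimal standalone sketch with made-up prio values (not taken from a real cache set): adding the (INITIAL_PRIO - min_prio) / 8 offset keeps even the coldest bucket's priority above zero, so its count of live sectors still influences the heap ordering.

/* Standalone illustration of the reworked bucket_prio() scaling; all
 * numbers below are hypothetical. */
#include <stdio.h>

#define INITIAL_PRIO 32768U

static unsigned bucket_prio(unsigned set_min_prio, unsigned prio,
			    unsigned live_sectors)
{
	unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

	return (prio - set_min_prio + min_prio) * live_sectors;
}

int main(void)
{
	unsigned set_min_prio = 20000;

	/* Coldest possible bucket (prio == set minimum) holding lots of live data... */
	printf("cold, 100 sectors: %u\n", bucket_prio(set_min_prio, 20000, 100));
	/* ...versus a warmer bucket that is nearly empty. */
	printf("warm,  10 sectors: %u\n", bucket_prio(set_min_prio, 25000, 10));

	/* With the old macro the first value would have been 0 no matter how
	 * many live sectors the bucket held. */
	return 0;
}
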
@@ -304,6 +323,21 @@ do { \
304 __set_current_state(TASK_RUNNING); \ 323 __set_current_state(TASK_RUNNING); \
305} while (0) 324} while (0)
306 325
326static int bch_allocator_push(struct cache *ca, long bucket)
327{
328 unsigned i;
329
330 /* Prios/gens are actually the most important reserve */
331 if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
332 return true;
333
334 for (i = 0; i < RESERVE_NR; i++)
335 if (fifo_push(&ca->free[i], bucket))
336 return true;
337
338 return false;
339}
340
307static int bch_allocator_thread(void *arg) 341static int bch_allocator_thread(void *arg)
308{ 342{
309 struct cache *ca = arg; 343 struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
336 mutex_lock(&ca->set->bucket_lock); 370 mutex_lock(&ca->set->bucket_lock);
337 } 371 }
338 372
339 allocator_wait(ca, !fifo_full(&ca->free)); 373 allocator_wait(ca, bch_allocator_push(ca, bucket));
340
341 fifo_push(&ca->free, bucket);
342 wake_up(&ca->set->bucket_wait); 374 wake_up(&ca->set->bucket_wait);
343 } 375 }
344 376
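
Since the single free fifo is now split into one fifo per reserve, the allocator thread refills them through bch_allocator_push() above. Below is a rough user-space sketch of the same fallback pattern; the fifo type and its capacity are hypothetical stand-ins, not the kernel's fifo macros.

/* Sketch of the per-reserve fallback in bch_allocator_push(); the fifo
 * type and capacity here are hypothetical stand-ins. */
#include <stdbool.h>
#include <stdio.h>

enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC,
		     RESERVE_NONE, RESERVE_NR };

struct fifo { long data[8]; unsigned used; };

static bool fifo_push(struct fifo *f, long v)
{
	if (f->used == 8)
		return false;
	f->data[f->used++] = v;
	return true;
}

static bool allocator_push(struct fifo free_lists[RESERVE_NR], long bucket)
{
	unsigned i;

	/* Prios/gens are the most important reserve, so top it up first. */
	if (fifo_push(&free_lists[RESERVE_PRIO], bucket))
		return true;

	/* Otherwise hand the bucket to whichever reserve has room. */
	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&free_lists[i], bucket))
			return true;

	return false;	/* every reserve is full: allocator_wait() blocks */
}

int main(void)
{
	struct fifo free_lists[RESERVE_NR] = { { { 0 }, 0 } };

	printf("pushed: %d\n", allocator_push(free_lists, 42));
	return 0;
}
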
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
365 } 397 }
366} 398}
367 399
368long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) 400long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
369{ 401{
370 DEFINE_WAIT(w); 402 DEFINE_WAIT(w);
371 struct bucket *b; 403 struct bucket *b;
372 long r; 404 long r;
373 405
374 /* fastpath */ 406 /* fastpath */
375 if (fifo_used(&ca->free) > ca->watermark[watermark]) { 407 if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
376 fifo_pop(&ca->free, r); 408 fifo_pop(&ca->free[reserve], r))
377 goto out; 409 goto out;
378 }
379 410
380 if (!wait) 411 if (!wait)
381 return -1; 412 return -1;
382 413
383 while (1) { 414 do {
384 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385 fifo_pop(&ca->free, r);
386 break;
387 }
388
389 prepare_to_wait(&ca->set->bucket_wait, &w, 415 prepare_to_wait(&ca->set->bucket_wait, &w,
390 TASK_UNINTERRUPTIBLE); 416 TASK_UNINTERRUPTIBLE);
391 417
392 mutex_unlock(&ca->set->bucket_lock); 418 mutex_unlock(&ca->set->bucket_lock);
393 schedule(); 419 schedule();
394 mutex_lock(&ca->set->bucket_lock); 420 mutex_lock(&ca->set->bucket_lock);
395 } 421 } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
422 !fifo_pop(&ca->free[reserve], r));
396 423
397 finish_wait(&ca->set->bucket_wait, &w); 424 finish_wait(&ca->set->bucket_wait, &w);
398out: 425out:
@@ -401,12 +428,14 @@ out:
401 if (expensive_debug_checks(ca->set)) { 428 if (expensive_debug_checks(ca->set)) {
402 size_t iter; 429 size_t iter;
403 long i; 430 long i;
431 unsigned j;
404 432
405 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 433 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
406 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 434 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
407 435
408 fifo_for_each(i, &ca->free, iter) 436 for (j = 0; j < RESERVE_NR; j++)
409 BUG_ON(i == r); 437 fifo_for_each(i, &ca->free[j], iter)
438 BUG_ON(i == r);
410 fifo_for_each(i, &ca->free_inc, iter) 439 fifo_for_each(i, &ca->free_inc, iter)
411 BUG_ON(i == r); 440 BUG_ON(i == r);
412 fifo_for_each(i, &ca->unused, iter) 441 fifo_for_each(i, &ca->unused, iter)
@@ -419,7 +448,7 @@ out:
419 448
420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 449 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
421 450
422 if (watermark <= WATERMARK_METADATA) { 451 if (reserve <= RESERVE_PRIO) {
423 SET_GC_MARK(b, GC_MARK_METADATA); 452 SET_GC_MARK(b, GC_MARK_METADATA);
424 SET_GC_MOVE(b, 0); 453 SET_GC_MOVE(b, 0);
425 b->prio = BTREE_PRIO; 454 b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
445 } 474 }
446} 475}
447 476
448int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 477int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
449 struct bkey *k, int n, bool wait) 478 struct bkey *k, int n, bool wait)
450{ 479{
451 int i; 480 int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
459 488
460 for (i = 0; i < n; i++) { 489 for (i = 0; i < n; i++) {
461 struct cache *ca = c->cache_by_alloc[i]; 490 struct cache *ca = c->cache_by_alloc[i];
462 long b = bch_bucket_alloc(ca, watermark, wait); 491 long b = bch_bucket_alloc(ca, reserve, wait);
463 492
464 if (b == -1) 493 if (b == -1)
465 goto err; 494 goto err;
@@ -478,12 +507,12 @@ err:
478 return -1; 507 return -1;
479} 508}
480 509
481int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 510int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
482 struct bkey *k, int n, bool wait) 511 struct bkey *k, int n, bool wait)
483{ 512{
484 int ret; 513 int ret;
485 mutex_lock(&c->bucket_lock); 514 mutex_lock(&c->bucket_lock);
486 ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); 515 ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
487 mutex_unlock(&c->bucket_lock); 516 mutex_unlock(&c->bucket_lock);
488 return ret; 517 return ret;
489} 518}
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
573 602
574 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 603 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
575 unsigned watermark = write_prio 604 unsigned watermark = write_prio
576 ? WATERMARK_MOVINGGC 605 ? RESERVE_MOVINGGC
577 : WATERMARK_NONE; 606 : RESERVE_NONE;
578 607
579 spin_unlock(&c->data_bucket_lock); 608 spin_unlock(&c->data_bucket_lock);
580 609
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
689 * Then 8 for btree allocations 718 * Then 8 for btree allocations
690 * Then half for the moving garbage collector 719 * Then half for the moving garbage collector
691 */ 720 */
692 721#if 0
693 ca->watermark[WATERMARK_PRIO] = 0; 722 ca->watermark[WATERMARK_PRIO] = 0;
694 723
695 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); 724 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
699 728
700 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 729 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
701 ca->watermark[WATERMARK_MOVINGGC]; 730 ca->watermark[WATERMARK_MOVINGGC];
702 731#endif
703 return 0; 732 return 0;
704} 733}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dbdbca5a9591..0c707e4f4eaf 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@
187#include <linux/types.h> 187#include <linux/types.h>
188#include <linux/workqueue.h> 188#include <linux/workqueue.h>
189 189
190#include "bset.h"
190#include "util.h" 191#include "util.h"
191#include "closure.h" 192#include "closure.h"
192 193
@@ -309,7 +310,8 @@ struct cached_dev {
309 struct cache_sb sb; 310 struct cache_sb sb;
310 struct bio sb_bio; 311 struct bio sb_bio;
311 struct bio_vec sb_bv[1]; 312 struct bio_vec sb_bv[1];
312 struct closure_with_waitlist sb_write; 313 struct closure sb_write;
314 struct semaphore sb_write_mutex;
313 315
314 /* Refcount on the cache set. Always nonzero when we're caching. */ 316 /* Refcount on the cache set. Always nonzero when we're caching. */
315 atomic_t count; 317 atomic_t count;
@@ -382,12 +384,12 @@ struct cached_dev {
382 unsigned writeback_rate_p_term_inverse; 384 unsigned writeback_rate_p_term_inverse;
383}; 385};
384 386
385enum alloc_watermarks { 387enum alloc_reserve {
386 WATERMARK_PRIO, 388 RESERVE_BTREE,
387 WATERMARK_METADATA, 389 RESERVE_PRIO,
388 WATERMARK_MOVINGGC, 390 RESERVE_MOVINGGC,
389 WATERMARK_NONE, 391 RESERVE_NONE,
390 WATERMARK_MAX 392 RESERVE_NR,
391}; 393};
392 394
393struct cache { 395struct cache {
@@ -399,8 +401,6 @@ struct cache {
399 struct kobject kobj; 401 struct kobject kobj;
400 struct block_device *bdev; 402 struct block_device *bdev;
401 403
402 unsigned watermark[WATERMARK_MAX];
403
404 struct task_struct *alloc_thread; 404 struct task_struct *alloc_thread;
405 405
406 struct closure prio; 406 struct closure prio;
@@ -429,7 +429,7 @@ struct cache {
429 * because all the data they contained was overwritten), so we only 429 * because all the data they contained was overwritten), so we only
430 * need to discard them before they can be moved to the free list. 430 * need to discard them before they can be moved to the free list.
431 */ 431 */
432 DECLARE_FIFO(long, free); 432 DECLARE_FIFO(long, free)[RESERVE_NR];
433 DECLARE_FIFO(long, free_inc); 433 DECLARE_FIFO(long, free_inc);
434 DECLARE_FIFO(long, unused); 434 DECLARE_FIFO(long, unused);
435 435
@@ -514,7 +514,8 @@ struct cache_set {
514 uint64_t cached_dev_sectors; 514 uint64_t cached_dev_sectors;
515 struct closure caching; 515 struct closure caching;
516 516
517 struct closure_with_waitlist sb_write; 517 struct closure sb_write;
518 struct semaphore sb_write_mutex;
518 519
519 mempool_t *search; 520 mempool_t *search;
520 mempool_t *bio_meta; 521 mempool_t *bio_meta;
@@ -629,13 +630,15 @@ struct cache_set {
629 630
630#ifdef CONFIG_BCACHE_DEBUG 631#ifdef CONFIG_BCACHE_DEBUG
631 struct btree *verify_data; 632 struct btree *verify_data;
633 struct bset *verify_ondisk;
632 struct mutex verify_lock; 634 struct mutex verify_lock;
633#endif 635#endif
634 636
635 unsigned nr_uuids; 637 unsigned nr_uuids;
636 struct uuid_entry *uuids; 638 struct uuid_entry *uuids;
637 BKEY_PADDED(uuid_bucket); 639 BKEY_PADDED(uuid_bucket);
638 struct closure_with_waitlist uuid_write; 640 struct closure uuid_write;
641 struct semaphore uuid_write_mutex;
639 642
640 /* 643 /*
641 * A btree node on disk could have too many bsets for an iterator to fit 644 * A btree node on disk could have too many bsets for an iterator to fit
@@ -643,13 +646,7 @@ struct cache_set {
643 */ 646 */
644 mempool_t *fill_iter; 647 mempool_t *fill_iter;
645 648
646 /* 649 struct bset_sort_state sort;
647 * btree_sort() is a merge sort and requires temporary space - single
648 * element mempool
649 */
650 struct mutex sort_lock;
651 struct bset *sort;
652 unsigned sort_crit_factor;
653 650
654 /* List of buckets we're currently writing data to */ 651 /* List of buckets we're currently writing data to */
655 struct list_head data_buckets; 652 struct list_head data_buckets;
@@ -665,7 +662,6 @@ struct cache_set {
665 unsigned congested_read_threshold_us; 662 unsigned congested_read_threshold_us;
666 unsigned congested_write_threshold_us; 663 unsigned congested_write_threshold_us;
667 664
668 struct time_stats sort_time;
669 struct time_stats btree_gc_time; 665 struct time_stats btree_gc_time;
670 struct time_stats btree_split_time; 666 struct time_stats btree_split_time;
671 struct time_stats btree_read_time; 667 struct time_stats btree_read_time;
@@ -683,9 +679,9 @@ struct cache_set {
683 unsigned error_decay; 679 unsigned error_decay;
684 680
685 unsigned short journal_delay_ms; 681 unsigned short journal_delay_ms;
682 bool expensive_debug_checks;
686 unsigned verify:1; 683 unsigned verify:1;
687 unsigned key_merging_disabled:1; 684 unsigned key_merging_disabled:1;
688 unsigned expensive_debug_checks:1;
689 unsigned gc_always_rewrite:1; 685 unsigned gc_always_rewrite:1;
690 unsigned shrinker_disabled:1; 686 unsigned shrinker_disabled:1;
691 unsigned copy_gc_enabled:1; 687 unsigned copy_gc_enabled:1;
@@ -707,13 +703,8 @@ struct bbio {
707 struct bio bio; 703 struct bio bio;
708}; 704};
709 705
710static inline unsigned local_clock_us(void)
711{
712 return local_clock() >> 10;
713}
714
715#define BTREE_PRIO USHRT_MAX 706#define BTREE_PRIO USHRT_MAX
716#define INITIAL_PRIO 32768 707#define INITIAL_PRIO 32768U
717 708
718#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) 709#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
719#define btree_blocks(b) \ 710#define btree_blocks(b) \
@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void)
726#define bucket_bytes(c) ((c)->sb.bucket_size << 9) 717#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
727#define block_bytes(c) ((c)->sb.block_size << 9) 718#define block_bytes(c) ((c)->sb.block_size << 9)
728 719
729#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
730#define set_bytes(i) __set_bytes(i, i->keys)
731
732#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
733#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
734
735#define node(i, j) ((struct bkey *) ((i)->d + (j)))
736#define end(i) node(i, (i)->keys)
737
738#define index(i, b) \
739 ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
740 block_bytes(b->c)))
741
742#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
743
744#define prios_per_bucket(c) \ 720#define prios_per_bucket(c) \
745 ((bucket_bytes(c) - sizeof(struct prio_set)) / \ 721 ((bucket_bytes(c) - sizeof(struct prio_set)) / \
746 sizeof(struct bucket_disk)) 722 sizeof(struct bucket_disk))
@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
783 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); 759 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
784} 760}
785 761
786/* Btree key macros */ 762static inline uint8_t gen_after(uint8_t a, uint8_t b)
763{
764 uint8_t r = a - b;
765 return r > 128U ? 0 : r;
766}
787 767
788static inline void bkey_init(struct bkey *k) 768static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
769 unsigned i)
789{ 770{
790 *k = ZERO_KEY; 771 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
791} 772}
792 773
774static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
775 unsigned i)
776{
777 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
778}
779
780/* Btree key macros */
781
793/* 782/*
794 * This is used for various on disk data structures - cache_sb, prio_set, bset, 783 * This is used for various on disk data structures - cache_sb, prio_set, bset,
795 * jset: The checksum is _always_ the first 8 bytes of these structs 784 * jset: The checksum is _always_ the first 8 bytes of these structs
796 */ 785 */
797#define csum_set(i) \ 786#define csum_set(i) \
798 bch_crc64(((void *) (i)) + sizeof(uint64_t), \ 787 bch_crc64(((void *) (i)) + sizeof(uint64_t), \
799 ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) 788 ((void *) bset_bkey_last(i)) - \
789 (((void *) (i)) + sizeof(uint64_t)))
800 790
801/* Error handling macros */ 791/* Error handling macros */
802 792
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..4f6b5940e609 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8#include "bcache.h" 8#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
9#include "btree.h"
10#include "debug.h"
11 9
10#include "util.h"
11#include "bset.h"
12
13#include <linux/console.h>
12#include <linux/random.h> 14#include <linux/random.h>
13#include <linux/prefetch.h> 15#include <linux/prefetch.h>
14 16
17#ifdef CONFIG_BCACHE_DEBUG
18
19void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
20{
21 struct bkey *k, *next;
22
23 for (k = i->start; k < bset_bkey_last(i); k = next) {
24 next = bkey_next(k);
25
26 printk(KERN_ERR "block %u key %zi/%u: ", set,
27 (uint64_t *) k - i->d, i->keys);
28
29 if (b->ops->key_dump)
30 b->ops->key_dump(b, k);
31 else
32 printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
33
34 if (next < bset_bkey_last(i) &&
35 bkey_cmp(k, b->ops->is_extents ?
36 &START_KEY(next) : next) > 0)
37 printk(KERN_ERR "Key skipped backwards\n");
38 }
39}
40
41void bch_dump_bucket(struct btree_keys *b)
42{
43 unsigned i;
44
45 console_lock();
46 for (i = 0; i <= b->nsets; i++)
47 bch_dump_bset(b, b->set[i].data,
48 bset_sector_offset(b, b->set[i].data));
49 console_unlock();
50}
51
52int __bch_count_data(struct btree_keys *b)
53{
54 unsigned ret = 0;
55 struct btree_iter iter;
56 struct bkey *k;
57
58 if (b->ops->is_extents)
59 for_each_key(b, k, &iter)
60 ret += KEY_SIZE(k);
61 return ret;
62}
63
64void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
65{
66 va_list args;
67 struct bkey *k, *p = NULL;
68 struct btree_iter iter;
69 const char *err;
70
71 for_each_key(b, k, &iter) {
72 if (b->ops->is_extents) {
73 err = "Keys out of order";
74 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
75 goto bug;
76
77 if (bch_ptr_invalid(b, k))
78 continue;
79
80 err = "Overlapping keys";
81 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
82 goto bug;
83 } else {
84 if (bch_ptr_bad(b, k))
85 continue;
86
87 err = "Duplicate keys";
88 if (p && !bkey_cmp(p, k))
89 goto bug;
90 }
91 p = k;
92 }
93#if 0
94 err = "Key larger than btree node key";
95 if (p && bkey_cmp(p, &b->key) > 0)
96 goto bug;
97#endif
98 return;
99bug:
100 bch_dump_bucket(b);
101
102 va_start(args, fmt);
103 vprintk(fmt, args);
104 va_end(args);
105
106 panic("bch_check_keys error: %s:\n", err);
107}
108
109static void bch_btree_iter_next_check(struct btree_iter *iter)
110{
111 struct bkey *k = iter->data->k, *next = bkey_next(k);
112
113 if (next < iter->data->end &&
114 bkey_cmp(k, iter->b->ops->is_extents ?
115 &START_KEY(next) : next) > 0) {
116 bch_dump_bucket(iter->b);
117 panic("Key skipped backwards\n");
118 }
119}
120
121#else
122
123static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
124
125#endif
126
15/* Keylists */ 127/* Keylists */
16 128
17int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 129int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
18{ 130{
19 size_t oldsize = bch_keylist_nkeys(l); 131 size_t oldsize = bch_keylist_nkeys(l);
20 size_t newsize = oldsize + 2 + nptrs; 132 size_t newsize = oldsize + u64s;
21 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; 133 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22 uint64_t *new_keys; 134 uint64_t *new_keys;
23 135
24 /* The journalling code doesn't handle the case where the keys to insert
25 * is bigger than an empty write: If we just return -ENOMEM here,
26 * bio_insert() and bio_invalidate() will insert the keys created so far
27 * and finish the rest when the keylist is empty.
28 */
29 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
30 return -ENOMEM;
31
32 newsize = roundup_pow_of_two(newsize); 136 newsize = roundup_pow_of_two(newsize);
33 137
34 if (newsize <= KEYLIST_INLINE || 138 if (newsize <= KEYLIST_INLINE ||
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
71 bch_keylist_bytes(l)); 175 bch_keylist_bytes(l));
72} 176}
73 177
74/* Pointer validation */
75
76static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
77{
78 unsigned i;
79
80 for (i = 0; i < KEY_PTRS(k); i++)
81 if (ptr_available(c, k, i)) {
82 struct cache *ca = PTR_CACHE(c, k, i);
83 size_t bucket = PTR_BUCKET_NR(c, k, i);
84 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
85
86 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
87 bucket < ca->sb.first_bucket ||
88 bucket >= ca->sb.nbuckets)
89 return true;
90 }
91
92 return false;
93}
94
95bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
96{
97 char buf[80];
98
99 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
100 goto bad;
101
102 if (__ptr_invalid(c, k))
103 goto bad;
104
105 return false;
106bad:
107 bch_bkey_to_text(buf, sizeof(buf), k);
108 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
109 return true;
110}
111
112bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
113{
114 char buf[80];
115
116 if (!KEY_SIZE(k))
117 return true;
118
119 if (KEY_SIZE(k) > KEY_OFFSET(k))
120 goto bad;
121
122 if (__ptr_invalid(c, k))
123 goto bad;
124
125 return false;
126bad:
127 bch_bkey_to_text(buf, sizeof(buf), k);
128 cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
129 return true;
130}
131
132static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
133 unsigned ptr)
134{
135 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
136 char buf[80];
137
138 if (mutex_trylock(&b->c->bucket_lock)) {
139 if (b->level) {
140 if (KEY_DIRTY(k) ||
141 g->prio != BTREE_PRIO ||
142 (b->c->gc_mark_valid &&
143 GC_MARK(g) != GC_MARK_METADATA))
144 goto err;
145
146 } else {
147 if (g->prio == BTREE_PRIO)
148 goto err;
149
150 if (KEY_DIRTY(k) &&
151 b->c->gc_mark_valid &&
152 GC_MARK(g) != GC_MARK_DIRTY)
153 goto err;
154 }
155 mutex_unlock(&b->c->bucket_lock);
156 }
157
158 return false;
159err:
160 mutex_unlock(&b->c->bucket_lock);
161 bch_bkey_to_text(buf, sizeof(buf), k);
162 btree_bug(b,
163"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
164 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
165 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
166 return true;
167}
168
169bool bch_ptr_bad(struct btree *b, const struct bkey *k)
170{
171 struct bucket *g;
172 unsigned i, stale;
173
174 if (!bkey_cmp(k, &ZERO_KEY) ||
175 !KEY_PTRS(k) ||
176 bch_ptr_invalid(b, k))
177 return true;
178
179 for (i = 0; i < KEY_PTRS(k); i++) {
180 if (!ptr_available(b->c, k, i))
181 return true;
182
183 g = PTR_BUCKET(b->c, k, i);
184 stale = ptr_stale(b->c, k, i);
185
186 btree_bug_on(stale > 96, b,
187 "key too stale: %i, need_gc %u",
188 stale, b->c->need_gc);
189
190 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
191 b, "stale dirty pointer");
192
193 if (stale)
194 return true;
195
196 if (expensive_debug_checks(b->c) &&
197 ptr_bad_expensive_checks(b, k, i))
198 return true;
199 }
200
201 return false;
202}
203
204/* Key/pointer manipulation */ 178/* Key/pointer manipulation */
205 179
206void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, 180void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
255 return true; 229 return true;
256} 230}
257 231
258static uint64_t merge_chksums(struct bkey *l, struct bkey *r) 232/* Auxiliary search trees */
233
234/* 32 bits total: */
235#define BKEY_MID_BITS 3
236#define BKEY_EXPONENT_BITS 7
237#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
238#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
239
240struct bkey_float {
241 unsigned exponent:BKEY_EXPONENT_BITS;
242 unsigned m:BKEY_MID_BITS;
243 unsigned mantissa:BKEY_MANTISSA_BITS;
244} __packed;
245
246/*
247 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
248 * it used to be 64, but I realized the lookup code would touch slightly less
249 * memory if it was 128.
250 *
251 * It definites the number of bytes (in struct bset) per struct bkey_float in
252 * the auxiliar search tree - when we're done searching the bset_float tree we
253 * have this many bytes left that we do a linear search over.
254 *
255 * Since (after level 5) every level of the bset_tree is on a new cacheline,
256 * we're touching one fewer cacheline in the bset tree in exchange for one more
257 * cacheline in the linear search - but the linear search might stop before it
258 * gets to the second cacheline.
259 */
260
261#define BSET_CACHELINE 128
262
263/* Space required for the btree node keys */
264static inline size_t btree_keys_bytes(struct btree_keys *b)
259{ 265{
260 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & 266 return PAGE_SIZE << b->page_order;
261 ~((uint64_t)1 << 63);
262} 267}
263 268
264/* Tries to merge l and r: l should be lower than r 269static inline size_t btree_keys_cachelines(struct btree_keys *b)
265 * Returns true if we were able to merge. If we did merge, l will be the merged
266 * key, r will be untouched.
267 */
268bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
269{ 270{
270 unsigned i; 271 return btree_keys_bytes(b) / BSET_CACHELINE;
272}
271 273
272 if (key_merging_disabled(b->c)) 274/* Space required for the auxiliary search trees */
273 return false; 275static inline size_t bset_tree_bytes(struct btree_keys *b)
276{
277 return btree_keys_cachelines(b) * sizeof(struct bkey_float);
278}
274 279
275 if (KEY_PTRS(l) != KEY_PTRS(r) || 280/* Space required for the prev pointers */
276 KEY_DIRTY(l) != KEY_DIRTY(r) || 281static inline size_t bset_prev_bytes(struct btree_keys *b)
277 bkey_cmp(l, &START_KEY(r))) 282{
278 return false; 283 return btree_keys_cachelines(b) * sizeof(uint8_t);
284}
279 285
280 for (i = 0; i < KEY_PTRS(l); i++) 286/* Memory allocation */
281 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
282 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
283 return false;
284 287
285 /* Keys with no pointers aren't restricted to one bucket and could 288void bch_btree_keys_free(struct btree_keys *b)
286 * overflow KEY_SIZE 289{
287 */ 290 struct bset_tree *t = b->set;
288 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
289 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
290 SET_KEY_SIZE(l, USHRT_MAX);
291 291
292 bch_cut_front(l, r); 292 if (bset_prev_bytes(b) < PAGE_SIZE)
293 return false; 293 kfree(t->prev);
294 } 294 else
295 free_pages((unsigned long) t->prev,
296 get_order(bset_prev_bytes(b)));
295 297
296 if (KEY_CSUM(l)) { 298 if (bset_tree_bytes(b) < PAGE_SIZE)
297 if (KEY_CSUM(r)) 299 kfree(t->tree);
298 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); 300 else
299 else 301 free_pages((unsigned long) t->tree,
300 SET_KEY_CSUM(l, 0); 302 get_order(bset_tree_bytes(b)));
301 }
302 303
303 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); 304 free_pages((unsigned long) t->data, b->page_order);
304 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
305 305
306 return true; 306 t->prev = NULL;
307 t->tree = NULL;
308 t->data = NULL;
309}
310EXPORT_SYMBOL(bch_btree_keys_free);
311
312int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
313{
314 struct bset_tree *t = b->set;
315
316 BUG_ON(t->data);
317
318 b->page_order = page_order;
319
320 t->data = (void *) __get_free_pages(gfp, b->page_order);
321 if (!t->data)
322 goto err;
323
324 t->tree = bset_tree_bytes(b) < PAGE_SIZE
325 ? kmalloc(bset_tree_bytes(b), gfp)
326 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
327 if (!t->tree)
328 goto err;
329
330 t->prev = bset_prev_bytes(b) < PAGE_SIZE
331 ? kmalloc(bset_prev_bytes(b), gfp)
332 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
333 if (!t->prev)
334 goto err;
335
336 return 0;
337err:
338 bch_btree_keys_free(b);
339 return -ENOMEM;
307} 340}
341EXPORT_SYMBOL(bch_btree_keys_alloc);
342
343void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
344 bool *expensive_debug_checks)
345{
346 unsigned i;
347
348 b->ops = ops;
349 b->expensive_debug_checks = expensive_debug_checks;
350 b->nsets = 0;
351 b->last_set_unwritten = 0;
352
353 /* XXX: shouldn't be needed */
354 for (i = 0; i < MAX_BSETS; i++)
355 b->set[i].size = 0;
356 /*
357 * Second loop starts at 1 because b->keys[0]->data is the memory we
358 * allocated
359 */
360 for (i = 1; i < MAX_BSETS; i++)
361 b->set[i].data = NULL;
362}
363EXPORT_SYMBOL(bch_btree_keys_init);
308 364
309/* Binary tree stuff for auxiliary search trees */ 365/* Binary tree stuff for auxiliary search trees */
310 366
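
The memory cost described in the BSET_CACHELINE comment above falls out of btree_keys_bytes(), bset_tree_bytes() and bset_prev_bytes(). Here is a back-of-the-envelope sketch, assuming a 4K page and an illustrative page_order (struct bkey_float packs to 4 bytes on common ABIs):

/* Rough sizing of the auxiliary search tree per btree node; the page size
 * and page_order below are illustrative assumptions. */
#include <stdio.h>

#define BSET_CACHELINE	128
#define PAGE_SIZE_BYTES	4096UL

int main(void)
{
	unsigned page_order = 2;				/* e.g. a 16K btree node */

	size_t keys_bytes = PAGE_SIZE_BYTES << page_order;	/* btree_keys_bytes()      */
	size_t cachelines = keys_bytes / BSET_CACHELINE;	/* btree_keys_cachelines() */
	size_t tree_bytes = cachelines * 4;			/* 4-byte packed bkey_float per cacheline */
	size_t prev_bytes = cachelines * 1;			/* one uint8_t per cacheline */

	printf("%zu-byte node: %zu cachelines, %zu tree bytes, %zu prev bytes\n",
	       keys_bytes, cachelines, tree_bytes, prev_bytes);
	return 0;
}
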
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
455 return ((void *) k - (void *) t->data) / BSET_CACHELINE; 511 return ((void *) k - (void *) t->data) / BSET_CACHELINE;
456} 512}
457 513
458static unsigned bkey_to_cacheline_offset(struct bkey *k) 514static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
515 unsigned cacheline,
516 struct bkey *k)
459{ 517{
460 return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); 518 return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
461} 519}
462 520
463static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) 521static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
504 : tree_to_prev_bkey(t, j >> ffs(j)); 562 : tree_to_prev_bkey(t, j >> ffs(j));
505 563
506 struct bkey *r = is_power_of_2(j + 1) 564 struct bkey *r = is_power_of_2(j + 1)
507 ? node(t->data, t->data->keys - bkey_u64s(&t->end)) 565 ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
508 : tree_to_bkey(t, j >> (ffz(j) + 1)); 566 : tree_to_bkey(t, j >> (ffz(j) + 1));
509 567
510 BUG_ON(m < l || m > r); 568 BUG_ON(m < l || m > r);
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
528 f->exponent = 127; 586 f->exponent = 127;
529} 587}
530 588
531static void bset_alloc_tree(struct btree *b, struct bset_tree *t) 589static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
532{ 590{
533 if (t != b->sets) { 591 if (t != b->set) {
534 unsigned j = roundup(t[-1].size, 592 unsigned j = roundup(t[-1].size,
535 64 / sizeof(struct bkey_float)); 593 64 / sizeof(struct bkey_float));
536 594
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
538 t->prev = t[-1].prev + j; 596 t->prev = t[-1].prev + j;
539 } 597 }
540 598
541 while (t < b->sets + MAX_BSETS) 599 while (t < b->set + MAX_BSETS)
542 t++->size = 0; 600 t++->size = 0;
543} 601}
544 602
545static void bset_build_unwritten_tree(struct btree *b) 603static void bch_bset_build_unwritten_tree(struct btree_keys *b)
546{ 604{
547 struct bset_tree *t = b->sets + b->nsets; 605 struct bset_tree *t = bset_tree_last(b);
606
607 BUG_ON(b->last_set_unwritten);
608 b->last_set_unwritten = 1;
548 609
549 bset_alloc_tree(b, t); 610 bset_alloc_tree(b, t);
550 611
551 if (t->tree != b->sets->tree + bset_tree_space(b)) { 612 if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
552 t->prev[0] = bkey_to_cacheline_offset(t->data->start); 613 t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
553 t->size = 1; 614 t->size = 1;
554 } 615 }
555} 616}
556 617
557static void bset_build_written_tree(struct btree *b) 618void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
619{
620 if (i != b->set->data) {
621 b->set[++b->nsets].data = i;
622 i->seq = b->set->data->seq;
623 } else
624 get_random_bytes(&i->seq, sizeof(uint64_t));
625
626 i->magic = magic;
627 i->version = 0;
628 i->keys = 0;
629
630 bch_bset_build_unwritten_tree(b);
631}
632EXPORT_SYMBOL(bch_bset_init_next);
633
634void bch_bset_build_written_tree(struct btree_keys *b)
558{ 635{
559 struct bset_tree *t = b->sets + b->nsets; 636 struct bset_tree *t = bset_tree_last(b);
560 struct bkey *k = t->data->start; 637 struct bkey *prev = NULL, *k = t->data->start;
561 unsigned j, cacheline = 1; 638 unsigned j, cacheline = 1;
562 639
640 b->last_set_unwritten = 0;
641
563 bset_alloc_tree(b, t); 642 bset_alloc_tree(b, t);
564 643
565 t->size = min_t(unsigned, 644 t->size = min_t(unsigned,
566 bkey_to_cacheline(t, end(t->data)), 645 bkey_to_cacheline(t, bset_bkey_last(t->data)),
567 b->sets->tree + bset_tree_space(b) - t->tree); 646 b->set->tree + btree_keys_cachelines(b) - t->tree);
568 647
569 if (t->size < 2) { 648 if (t->size < 2) {
570 t->size = 0; 649 t->size = 0;
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
577 for (j = inorder_next(0, t->size); 656 for (j = inorder_next(0, t->size);
578 j; 657 j;
579 j = inorder_next(j, t->size)) { 658 j = inorder_next(j, t->size)) {
580 while (bkey_to_cacheline(t, k) != cacheline) 659 while (bkey_to_cacheline(t, k) < cacheline)
581 k = bkey_next(k); 660 prev = k, k = bkey_next(k);
582 661
583 t->prev[j] = bkey_u64s(k); 662 t->prev[j] = bkey_u64s(prev);
584 k = bkey_next(k); 663 t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
585 cacheline++;
586 t->tree[j].m = bkey_to_cacheline_offset(k);
587 } 664 }
588 665
589 while (bkey_next(k) != end(t->data)) 666 while (bkey_next(k) != bset_bkey_last(t->data))
590 k = bkey_next(k); 667 k = bkey_next(k);
591 668
592 t->end = *k; 669 t->end = *k;
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
597 j = inorder_next(j, t->size)) 674 j = inorder_next(j, t->size))
598 make_bfloat(t, j); 675 make_bfloat(t, j);
599} 676}
677EXPORT_SYMBOL(bch_bset_build_written_tree);
600 678
601void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) 679/* Insert */
680
681void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
602{ 682{
603 struct bset_tree *t; 683 struct bset_tree *t;
604 unsigned inorder, j = 1; 684 unsigned inorder, j = 1;
605 685
606 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 686 for (t = b->set; t <= bset_tree_last(b); t++)
607 if (k < end(t->data)) 687 if (k < bset_bkey_last(t->data))
608 goto found_set; 688 goto found_set;
609 689
610 BUG(); 690 BUG();
@@ -617,7 +697,7 @@ found_set:
617 if (k == t->data->start) 697 if (k == t->data->start)
618 goto fix_left; 698 goto fix_left;
619 699
620 if (bkey_next(k) == end(t->data)) { 700 if (bkey_next(k) == bset_bkey_last(t->data)) {
621 t->end = *k; 701 t->end = *k;
622 goto fix_right; 702 goto fix_right;
623 } 703 }
@@ -642,10 +722,12 @@ fix_right: do {
642 j = j * 2 + 1; 722 j = j * 2 + 1;
643 } while (j < t->size); 723 } while (j < t->size);
644} 724}
725EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
645 726
646void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) 727static void bch_bset_fix_lookup_table(struct btree_keys *b,
728 struct bset_tree *t,
729 struct bkey *k)
647{ 730{
648 struct bset_tree *t = &b->sets[b->nsets];
649 unsigned shift = bkey_u64s(k); 731 unsigned shift = bkey_u64s(k);
650 unsigned j = bkey_to_cacheline(t, k); 732 unsigned j = bkey_to_cacheline(t, k);
651 733
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
657 * lookup table for the first key that is strictly greater than k: 739 * lookup table for the first key that is strictly greater than k:
658 * it's either k's cacheline or the next one 740 * it's either k's cacheline or the next one
659 */ 741 */
660 if (j < t->size && 742 while (j < t->size &&
661 table_to_bkey(t, j) <= k) 743 table_to_bkey(t, j) <= k)
662 j++; 744 j++;
663 745
664 /* Adjust all the lookup table entries, and find a new key for any that 746 /* Adjust all the lookup table entries, and find a new key for any that
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
673 while (k < cacheline_to_bkey(t, j, 0)) 755 while (k < cacheline_to_bkey(t, j, 0))
674 k = bkey_next(k); 756 k = bkey_next(k);
675 757
676 t->prev[j] = bkey_to_cacheline_offset(k); 758 t->prev[j] = bkey_to_cacheline_offset(t, j, k);
677 } 759 }
678 } 760 }
679 761
680 if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) 762 if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
681 return; 763 return;
682 764
683 /* Possibly add a new entry to the end of the lookup table */ 765 /* Possibly add a new entry to the end of the lookup table */
684 766
685 for (k = table_to_bkey(t, t->size - 1); 767 for (k = table_to_bkey(t, t->size - 1);
686 k != end(t->data); 768 k != bset_bkey_last(t->data);
687 k = bkey_next(k)) 769 k = bkey_next(k))
688 if (t->size == bkey_to_cacheline(t, k)) { 770 if (t->size == bkey_to_cacheline(t, k)) {
689 t->prev[t->size] = bkey_to_cacheline_offset(k); 771 t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
690 t->size++; 772 t->size++;
691 } 773 }
692} 774}
693 775
694void bch_bset_init_next(struct btree *b) 776/*
777 * Tries to merge l and r: l should be lower than r
778 * Returns true if we were able to merge. If we did merge, l will be the merged
779 * key, r will be untouched.
780 */
781bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
695{ 782{
696 struct bset *i = write_block(b); 783 if (!b->ops->key_merge)
784 return false;
697 785
698 if (i != b->sets[0].data) { 786 /*
699 b->sets[++b->nsets].data = i; 787 * Generic header checks
700 i->seq = b->sets[0].data->seq; 788 * Assumes left and right are in order
701 } else 789 * Left and right must be exactly aligned
702 get_random_bytes(&i->seq, sizeof(uint64_t)); 790 */
791 if (!bch_bkey_equal_header(l, r) ||
792 bkey_cmp(l, &START_KEY(r)))
793 return false;
703 794
704 i->magic = bset_magic(&b->c->sb); 795 return b->ops->key_merge(b, l, r);
705 i->version = 0; 796}
706 i->keys = 0; 797EXPORT_SYMBOL(bch_bkey_try_merge);
707 798
708 bset_build_unwritten_tree(b); 799void bch_bset_insert(struct btree_keys *b, struct bkey *where,
800 struct bkey *insert)
801{
802 struct bset_tree *t = bset_tree_last(b);
803
804 BUG_ON(!b->last_set_unwritten);
805 BUG_ON(bset_byte_offset(b, t->data) +
806 __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
807 PAGE_SIZE << b->page_order);
808
809 memmove((uint64_t *) where + bkey_u64s(insert),
810 where,
811 (void *) bset_bkey_last(t->data) - (void *) where);
812
813 t->data->keys += bkey_u64s(insert);
814 bkey_copy(where, insert);
815 bch_bset_fix_lookup_table(b, t, where);
709} 816}
817EXPORT_SYMBOL(bch_bset_insert);
818
819unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
820 struct bkey *replace_key)
821{
822 unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
823 struct bset *i = bset_tree_last(b)->data;
824 struct bkey *m, *prev = NULL;
825 struct btree_iter iter;
826
827 BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
828
829 m = bch_btree_iter_init(b, &iter, b->ops->is_extents
830 ? PRECEDING_KEY(&START_KEY(k))
831 : PRECEDING_KEY(k));
832
833 if (b->ops->insert_fixup(b, k, &iter, replace_key))
834 return status;
835
836 status = BTREE_INSERT_STATUS_INSERT;
837
838 while (m != bset_bkey_last(i) &&
839 bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
840 prev = m, m = bkey_next(m);
841
842 /* prev is in the tree, if we merge we're done */
843 status = BTREE_INSERT_STATUS_BACK_MERGE;
844 if (prev &&
845 bch_bkey_try_merge(b, prev, k))
846 goto merged;
847#if 0
848 status = BTREE_INSERT_STATUS_OVERWROTE;
849 if (m != bset_bkey_last(i) &&
850 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
851 goto copy;
852#endif
853 status = BTREE_INSERT_STATUS_FRONT_MERGE;
854 if (m != bset_bkey_last(i) &&
855 bch_bkey_try_merge(b, k, m))
856 goto copy;
857
858 bch_bset_insert(b, m, k);
859copy: bkey_copy(m, k);
860merged:
861 return status;
862}
863EXPORT_SYMBOL(bch_btree_insert_key);
864
865/* Lookup */
710 866
711struct bset_search_iter { 867struct bset_search_iter {
712 struct bkey *l, *r; 868 struct bkey *l, *r;
713}; 869};
714 870
715static struct bset_search_iter bset_search_write_set(struct btree *b, 871static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
716 struct bset_tree *t,
717 const struct bkey *search) 872 const struct bkey *search)
718{ 873{
719 unsigned li = 0, ri = t->size; 874 unsigned li = 0, ri = t->size;
720 875
721 BUG_ON(!b->nsets &&
722 t->size < bkey_to_cacheline(t, end(t->data)));
723
724 while (li + 1 != ri) { 876 while (li + 1 != ri) {
725 unsigned m = (li + ri) >> 1; 877 unsigned m = (li + ri) >> 1;
726 878
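
The refactored bch_bkey_try_merge() earlier in this hunk does only the generic header and adjacency checks, then defers to the per-btree key_merge operation (extents provide one, btree-node keys do not). A hypothetical sketch of that dispatch follows; the structs and the header-equality check are stand-ins rather than the exact bset.h definitions.

/* Hypothetical sketch of the ops-based merge dispatch; the structs and the
 * header-equality check are stand-ins, not copied from bset.h. */
#include <stdbool.h>

struct bkey { unsigned ptrs, csum, dirty; unsigned long long offset, size; };

struct btree_keys;

struct btree_keys_ops {
	/* NULL for btree-node keys, which are never merged */
	bool (*key_merge)(struct btree_keys *, struct bkey *, struct bkey *);
};

struct btree_keys { const struct btree_keys_ops *ops; };

static bool bkey_equal_header(const struct bkey *l, const struct bkey *r)
{
	/* same pointer count and the same csum/dirty flags */
	return l->ptrs == r->ptrs && l->csum == r->csum && l->dirty == r->dirty;
}

static bool bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
	if (!b->ops->key_merge)
		return false;

	/* generic checks: matching headers, and l must end exactly where r starts */
	if (!bkey_equal_header(l, r) || l->offset != r->offset - r->size)
		return false;

	/* format-specific pointer/size checks live in ops->key_merge() */
	return b->ops->key_merge(b, l, r);
}

int main(void)
{
	(void) bkey_try_merge;
	return 0;
}
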
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
732 884
733 return (struct bset_search_iter) { 885 return (struct bset_search_iter) {
734 table_to_bkey(t, li), 886 table_to_bkey(t, li),
735 ri < t->size ? table_to_bkey(t, ri) : end(t->data) 887 ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
736 }; 888 };
737} 889}
738 890
739static struct bset_search_iter bset_search_tree(struct btree *b, 891static struct bset_search_iter bset_search_tree(struct bset_tree *t,
740 struct bset_tree *t,
741 const struct bkey *search) 892 const struct bkey *search)
742{ 893{
743 struct bkey *l, *r; 894 struct bkey *l, *r;
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
784 f = &t->tree[inorder_next(j, t->size)]; 935 f = &t->tree[inorder_next(j, t->size)];
785 r = cacheline_to_bkey(t, inorder, f->m); 936 r = cacheline_to_bkey(t, inorder, f->m);
786 } else 937 } else
787 r = end(t->data); 938 r = bset_bkey_last(t->data);
788 } else { 939 } else {
789 r = cacheline_to_bkey(t, inorder, f->m); 940 r = cacheline_to_bkey(t, inorder, f->m);
790 941
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
798 return (struct bset_search_iter) {l, r}; 949 return (struct bset_search_iter) {l, r};
799} 950}
800 951
801struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, 952struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
802 const struct bkey *search) 953 const struct bkey *search)
803{ 954{
804 struct bset_search_iter i; 955 struct bset_search_iter i;
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
820 971
821 if (unlikely(!t->size)) { 972 if (unlikely(!t->size)) {
822 i.l = t->data->start; 973 i.l = t->data->start;
823 i.r = end(t->data); 974 i.r = bset_bkey_last(t->data);
824 } else if (bset_written(b, t)) { 975 } else if (bset_written(b, t)) {
825 /* 976 /*
826 * Each node in the auxiliary search tree covers a certain range 977 * Each node in the auxiliary search tree covers a certain range
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
830 */ 981 */
831 982
832 if (unlikely(bkey_cmp(search, &t->end) >= 0)) 983 if (unlikely(bkey_cmp(search, &t->end) >= 0))
833 return end(t->data); 984 return bset_bkey_last(t->data);
834 985
835 if (unlikely(bkey_cmp(search, t->data->start) < 0)) 986 if (unlikely(bkey_cmp(search, t->data->start) < 0))
836 return t->data->start; 987 return t->data->start;
837 988
838 i = bset_search_tree(b, t, search); 989 i = bset_search_tree(t, search);
839 } else 990 } else {
840 i = bset_search_write_set(b, t, search); 991 BUG_ON(!b->nsets &&
992 t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
841 993
842 if (expensive_debug_checks(b->c)) { 994 i = bset_search_write_set(t, search);
995 }
996
997 if (btree_keys_expensive_checks(b)) {
843 BUG_ON(bset_written(b, t) && 998 BUG_ON(bset_written(b, t) &&
844 i.l != t->data->start && 999 i.l != t->data->start &&
845 bkey_cmp(tree_to_prev_bkey(t, 1000 bkey_cmp(tree_to_prev_bkey(t,
846 inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 1001 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
847 search) > 0); 1002 search) > 0);
848 1003
849 BUG_ON(i.r != end(t->data) && 1004 BUG_ON(i.r != bset_bkey_last(t->data) &&
850 bkey_cmp(i.r, search) <= 0); 1005 bkey_cmp(i.r, search) <= 0);
851 } 1006 }
852 1007
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
856 1011
857 return i.l; 1012 return i.l;
858} 1013}
1014EXPORT_SYMBOL(__bch_bset_search);
859 1015
860/* Btree iterator */ 1016/* Btree iterator */
861 1017
862/* 1018typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
863 * Returns true if l > r - unless l == r, in which case returns true if l is 1019 struct btree_iter_set);
864 * older than r. 1020
865 *
866 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
867 * equal in different sets, we have to process them newest to oldest.
868 */
869static inline bool btree_iter_cmp(struct btree_iter_set l, 1021static inline bool btree_iter_cmp(struct btree_iter_set l,
870 struct btree_iter_set r) 1022 struct btree_iter_set r)
871{ 1023{
872 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 1024 return bkey_cmp(l.k, r.k) > 0;
873
874 return c ? c > 0 : l.k < r.k;
875} 1025}
876 1026
877static inline bool btree_iter_end(struct btree_iter *iter) 1027static inline bool btree_iter_end(struct btree_iter *iter)
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
888 btree_iter_cmp)); 1038 btree_iter_cmp));
889} 1039}
890 1040
891struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 1041static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
892 struct bkey *search, struct bset_tree *start) 1042 struct btree_iter *iter,
1043 struct bkey *search,
1044 struct bset_tree *start)
893{ 1045{
894 struct bkey *ret = NULL; 1046 struct bkey *ret = NULL;
895 iter->size = ARRAY_SIZE(iter->data); 1047 iter->size = ARRAY_SIZE(iter->data);
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
899 iter->b = b; 1051 iter->b = b;
900#endif 1052#endif
901 1053
902 for (; start <= &b->sets[b->nsets]; start++) { 1054 for (; start <= bset_tree_last(b); start++) {
903 ret = bch_bset_search(b, start, search); 1055 ret = bch_bset_search(b, start, search);
904 bch_btree_iter_push(iter, ret, end(start->data)); 1056 bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
905 } 1057 }
906 1058
907 return ret; 1059 return ret;
908} 1060}
909 1061
910struct bkey *bch_btree_iter_next(struct btree_iter *iter) 1062struct bkey *bch_btree_iter_init(struct btree_keys *b,
1063 struct btree_iter *iter,
1064 struct bkey *search)
1065{
1066 return __bch_btree_iter_init(b, iter, search, b->set);
1067}
1068EXPORT_SYMBOL(bch_btree_iter_init);
1069
1070static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
1071 btree_iter_cmp_fn *cmp)
911{ 1072{
912 struct btree_iter_set unused; 1073 struct btree_iter_set unused;
913 struct bkey *ret = NULL; 1074 struct bkey *ret = NULL;
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
924 } 1085 }
925 1086
926 if (iter->data->k == iter->data->end) 1087 if (iter->data->k == iter->data->end)
927 heap_pop(iter, unused, btree_iter_cmp); 1088 heap_pop(iter, unused, cmp);
928 else 1089 else
929 heap_sift(iter, 0, btree_iter_cmp); 1090 heap_sift(iter, 0, cmp);
930 } 1091 }
931 1092
932 return ret; 1093 return ret;
933} 1094}
934 1095
1096struct bkey *bch_btree_iter_next(struct btree_iter *iter)
1097{
1098 return __bch_btree_iter_next(iter, btree_iter_cmp);
1099
1100}
1101EXPORT_SYMBOL(bch_btree_iter_next);
1102
935struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, 1103struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
936 struct btree *b, ptr_filter_fn fn) 1104 struct btree_keys *b, ptr_filter_fn fn)
937{ 1105{
938 struct bkey *ret; 1106 struct bkey *ret;
939 1107
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
946 1114
947/* Mergesort */ 1115/* Mergesort */
948 1116
949static void sort_key_next(struct btree_iter *iter, 1117void bch_bset_sort_state_free(struct bset_sort_state *state)
950 struct btree_iter_set *i)
951{ 1118{
952 i->k = bkey_next(i->k); 1119 if (state->pool)
953 1120 mempool_destroy(state->pool);
954 if (i->k == i->end)
955 *i = iter->data[--iter->used];
956} 1121}
957 1122
958static void btree_sort_fixup(struct btree_iter *iter) 1123int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
959{ 1124{
960 while (iter->used > 1) { 1125 spin_lock_init(&state->time.lock);
961 struct btree_iter_set *top = iter->data, *i = top + 1;
962 1126
963 if (iter->used > 2 && 1127 state->page_order = page_order;
964 btree_iter_cmp(i[0], i[1])) 1128 state->crit_factor = int_sqrt(1 << page_order);
965 i++;
966 1129
967 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) 1130 state->pool = mempool_create_page_pool(1, page_order);
968 break; 1131 if (!state->pool)
969 1132 return -ENOMEM;
970 if (!KEY_SIZE(i->k)) {
971 sort_key_next(iter, i);
972 heap_sift(iter, i - top, btree_iter_cmp);
973 continue;
974 }
975
976 if (top->k > i->k) {
977 if (bkey_cmp(top->k, i->k) >= 0)
978 sort_key_next(iter, i);
979 else
980 bch_cut_front(top->k, i->k);
981 1133
982 heap_sift(iter, i - top, btree_iter_cmp); 1134 return 0;
983 } else {
984 /* can't happen because of comparison func */
985 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
986 bch_cut_back(&START_KEY(i->k), top->k);
987 }
988 }
989} 1135}
1136EXPORT_SYMBOL(bch_bset_sort_state_init);
990 1137
991static void btree_mergesort(struct btree *b, struct bset *out, 1138static void btree_mergesort(struct btree_keys *b, struct bset *out,
992 struct btree_iter *iter, 1139 struct btree_iter *iter,
993 bool fixup, bool remove_stale) 1140 bool fixup, bool remove_stale)
994{ 1141{
1142 int i;
995 struct bkey *k, *last = NULL; 1143 struct bkey *k, *last = NULL;
996 bool (*bad)(struct btree *, const struct bkey *) = remove_stale 1144 BKEY_PADDED(k) tmp;
1145 bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
997 ? bch_ptr_bad 1146 ? bch_ptr_bad
998 : bch_ptr_invalid; 1147 : bch_ptr_invalid;
999 1148
1149 /* Heapify the iterator, using our comparison function */
1150 for (i = iter->used / 2 - 1; i >= 0; --i)
1151 heap_sift(iter, i, b->ops->sort_cmp);
1152
1000 while (!btree_iter_end(iter)) { 1153 while (!btree_iter_end(iter)) {
1001 if (fixup && !b->level) 1154 if (b->ops->sort_fixup && fixup)
1002 btree_sort_fixup(iter); 1155 k = b->ops->sort_fixup(iter, &tmp.k);
1156 else
1157 k = NULL;
1158
1159 if (!k)
1160 k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
1003 1161
1004 k = bch_btree_iter_next(iter);
1005 if (bad(b, k)) 1162 if (bad(b, k))
1006 continue; 1163 continue;
1007 1164
1008 if (!last) { 1165 if (!last) {
1009 last = out->start; 1166 last = out->start;
1010 bkey_copy(last, k); 1167 bkey_copy(last, k);
1011 } else if (b->level || 1168 } else if (!bch_bkey_try_merge(b, last, k)) {
1012 !bch_bkey_try_merge(b, last, k)) {
1013 last = bkey_next(last); 1169 last = bkey_next(last);
1014 bkey_copy(last, k); 1170 bkey_copy(last, k);
1015 } 1171 }
@@ -1020,27 +1176,27 @@ static void btree_mergesort(struct btree *b, struct bset *out,
1020 pr_debug("sorted %i keys", out->keys); 1176 pr_debug("sorted %i keys", out->keys);
1021} 1177}
1022 1178
1023static void __btree_sort(struct btree *b, struct btree_iter *iter, 1179static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
1024 unsigned start, unsigned order, bool fixup) 1180 unsigned start, unsigned order, bool fixup,
1181 struct bset_sort_state *state)
1025{ 1182{
1026 uint64_t start_time; 1183 uint64_t start_time;
1027 bool remove_stale = !b->written; 1184 bool used_mempool = false;
1028 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
1029 order); 1186 order);
1030 if (!out) { 1187 if (!out) {
1031 mutex_lock(&b->c->sort_lock); 1188 BUG_ON(order > state->page_order);
1032 out = b->c->sort; 1189
1033 order = ilog2(bucket_pages(b->c)); 1190 out = page_address(mempool_alloc(state->pool, GFP_NOIO));
1191 used_mempool = true;
1192 order = state->page_order;
1034 } 1193 }
1035 1194
1036 start_time = local_clock(); 1195 start_time = local_clock();
1037 1196
1038 btree_mergesort(b, out, iter, fixup, remove_stale); 1197 btree_mergesort(b, out, iter, fixup, false);
1039 b->nsets = start; 1198 b->nsets = start;
1040 1199
1041 if (!fixup && !start && b->written)
1042 bch_btree_verify(b, out);
1043
1044 if (!start && order == b->page_order) { 1200 if (!start && order == b->page_order) {
1045 /* 1201 /*
1046 * Our temporary buffer is the same size as the btree node's 1202 * Our temporary buffer is the same size as the btree node's
@@ -1048,84 +1204,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1048 * memcpy() 1204 * memcpy()
1049 */ 1205 */
1050 1206
1051 out->magic = bset_magic(&b->c->sb); 1207 out->magic = b->set->data->magic;
1052 out->seq = b->sets[0].data->seq; 1208 out->seq = b->set->data->seq;
1053 out->version = b->sets[0].data->version; 1209 out->version = b->set->data->version;
1054 swap(out, b->sets[0].data); 1210 swap(out, b->set->data);
1055
1056 if (b->c->sort == b->sets[0].data)
1057 b->c->sort = out;
1058 } else { 1211 } else {
1059 b->sets[start].data->keys = out->keys; 1212 b->set[start].data->keys = out->keys;
1060 memcpy(b->sets[start].data->start, out->start, 1213 memcpy(b->set[start].data->start, out->start,
1061 (void *) end(out) - (void *) out->start); 1214 (void *) bset_bkey_last(out) - (void *) out->start);
1062 } 1215 }
1063 1216
1064 if (out == b->c->sort) 1217 if (used_mempool)
1065 mutex_unlock(&b->c->sort_lock); 1218 mempool_free(virt_to_page(out), state->pool);
1066 else 1219 else
1067 free_pages((unsigned long) out, order); 1220 free_pages((unsigned long) out, order);
1068 1221
1069 if (b->written) 1222 bch_bset_build_written_tree(b);
1070 bset_build_written_tree(b);
1071 1223
1072 if (!start) 1224 if (!start)
1073 bch_time_stats_update(&b->c->sort_time, start_time); 1225 bch_time_stats_update(&state->time, start_time);
1074} 1226}
1075 1227
1076void bch_btree_sort_partial(struct btree *b, unsigned start) 1228void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
1229 struct bset_sort_state *state)
1077{ 1230{
1078 size_t order = b->page_order, keys = 0; 1231 size_t order = b->page_order, keys = 0;
1079 struct btree_iter iter; 1232 struct btree_iter iter;
1080 int oldsize = bch_count_data(b); 1233 int oldsize = bch_count_data(b);
1081 1234
1082 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1235 __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
1083
1084 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1085 (b->sets[b->nsets].size || b->nsets));
1086
1087 1236
1088 if (start) { 1237 if (start) {
1089 unsigned i; 1238 unsigned i;
1090 1239
1091 for (i = start; i <= b->nsets; i++) 1240 for (i = start; i <= b->nsets; i++)
1092 keys += b->sets[i].data->keys; 1241 keys += b->set[i].data->keys;
1093 1242
1094 order = roundup_pow_of_two(__set_bytes(b->sets->data, 1243 order = get_order(__set_bytes(b->set->data, keys));
1095 keys)) / PAGE_SIZE;
1096 if (order)
1097 order = ilog2(order);
1098 } 1244 }
1099 1245
1100 __btree_sort(b, &iter, start, order, false); 1246 __btree_sort(b, &iter, start, order, false, state);
1101 1247
1102 EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); 1248 EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
1103} 1249}
1250EXPORT_SYMBOL(bch_btree_sort_partial);
1104 1251
1105void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1252void bch_btree_sort_and_fix_extents(struct btree_keys *b,
1253 struct btree_iter *iter,
1254 struct bset_sort_state *state)
1106{ 1255{
1107 BUG_ON(!b->written); 1256 __btree_sort(b, iter, 0, b->page_order, true, state);
1108 __btree_sort(b, iter, 0, b->page_order, true);
1109} 1257}
1110 1258
1111void bch_btree_sort_into(struct btree *b, struct btree *new) 1259void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
1260 struct bset_sort_state *state)
1112{ 1261{
1113 uint64_t start_time = local_clock(); 1262 uint64_t start_time = local_clock();
1114 1263
1115 struct btree_iter iter; 1264 struct btree_iter iter;
1116 bch_btree_iter_init(b, &iter, NULL); 1265 bch_btree_iter_init(b, &iter, NULL);
1117 1266
1118 btree_mergesort(b, new->sets->data, &iter, false, true); 1267 btree_mergesort(b, new->set->data, &iter, false, true);
1119 1268
1120 bch_time_stats_update(&b->c->sort_time, start_time); 1269 bch_time_stats_update(&state->time, start_time);
1121 1270
1122 bkey_copy_key(&new->key, &b->key); 1271 new->set->size = 0; // XXX: why?
1123 new->sets->size = 0;
1124} 1272}
1125 1273
1126#define SORT_CRIT (4096 / sizeof(uint64_t)) 1274#define SORT_CRIT (4096 / sizeof(uint64_t))
1127 1275
1128void bch_btree_sort_lazy(struct btree *b) 1276void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
1129{ 1277{
1130 unsigned crit = SORT_CRIT; 1278 unsigned crit = SORT_CRIT;
1131 int i; 1279 int i;
@@ -1134,50 +1282,32 @@ void bch_btree_sort_lazy(struct btree *b)
1134 if (!b->nsets) 1282 if (!b->nsets)
1135 goto out; 1283 goto out;
1136 1284
1137 /* If not a leaf node, always sort */
1138 if (b->level) {
1139 bch_btree_sort(b);
1140 return;
1141 }
1142
1143 for (i = b->nsets - 1; i >= 0; --i) { 1285 for (i = b->nsets - 1; i >= 0; --i) {
1144 crit *= b->c->sort_crit_factor; 1286 crit *= state->crit_factor;
1145 1287
1146 if (b->sets[i].data->keys < crit) { 1288 if (b->set[i].data->keys < crit) {
1147 bch_btree_sort_partial(b, i); 1289 bch_btree_sort_partial(b, i, state);
1148 return; 1290 return;
1149 } 1291 }
1150 } 1292 }
1151 1293
1152 /* Sort if we'd overflow */ 1294 /* Sort if we'd overflow */
1153 if (b->nsets + 1 == MAX_BSETS) { 1295 if (b->nsets + 1 == MAX_BSETS) {
1154 bch_btree_sort(b); 1296 bch_btree_sort(b, state);
1155 return; 1297 return;
1156 } 1298 }
1157 1299
1158out: 1300out:
1159 bset_build_written_tree(b); 1301 bch_bset_build_written_tree(b);
1160} 1302}
1303EXPORT_SYMBOL(bch_btree_sort_lazy);
1161 1304
1162/* Sysfs stuff */ 1305void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
1163
1164struct bset_stats {
1165 struct btree_op op;
1166 size_t nodes;
1167 size_t sets_written, sets_unwritten;
1168 size_t bytes_written, bytes_unwritten;
1169 size_t floats, failed;
1170};
1171
1172static int btree_bset_stats(struct btree_op *op, struct btree *b)
1173{ 1306{
1174 struct bset_stats *stats = container_of(op, struct bset_stats, op);
1175 unsigned i; 1307 unsigned i;
1176 1308
1177 stats->nodes++;
1178
1179 for (i = 0; i <= b->nsets; i++) { 1309 for (i = 0; i <= b->nsets; i++) {
1180 struct bset_tree *t = &b->sets[i]; 1310 struct bset_tree *t = &b->set[i];
1181 size_t bytes = t->data->keys * sizeof(uint64_t); 1311 size_t bytes = t->data->keys * sizeof(uint64_t);
1182 size_t j; 1312 size_t j;
1183 1313
@@ -1195,32 +1325,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
1195 stats->bytes_unwritten += bytes; 1325 stats->bytes_unwritten += bytes;
1196 } 1326 }
1197 } 1327 }
1198
1199 return MAP_CONTINUE;
1200}
1201
1202int bch_bset_print_stats(struct cache_set *c, char *buf)
1203{
1204 struct bset_stats t;
1205 int ret;
1206
1207 memset(&t, 0, sizeof(struct bset_stats));
1208 bch_btree_op_init(&t.op, -1);
1209
1210 ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1211 if (ret < 0)
1212 return ret;
1213
1214 return snprintf(buf, PAGE_SIZE,
1215 "btree nodes: %zu\n"
1216 "written sets: %zu\n"
1217 "unwritten sets: %zu\n"
1218 "written key bytes: %zu\n"
1219 "unwritten key bytes: %zu\n"
1220 "floats: %zu\n"
1221 "failed: %zu\n",
1222 t.nodes,
1223 t.sets_written, t.sets_unwritten,
1224 t.bytes_written, t.bytes_unwritten,
1225 t.floats, t.failed);
1226} 1328}
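
Editor's aside: the bset.c hunks above replace the old cache_set-wide sort buffer (c->sort guarded by sort_lock) with a per-call allocation that falls back to a mempool owned by struct bset_sort_state. A minimal sketch of that allocate-or-fall-back pattern, using only the kernel calls already visible in the hunk; the helper names sort_buf_get() and sort_buf_put() are illustrative and not part of the patch:

static struct bset *sort_buf_get(struct bset_sort_state *state,
                                 unsigned *order, bool *used_mempool)
{
        /* Opportunistic allocation; may fail under memory pressure. */
        struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
                                                     *order);
        if (out) {
                *used_mempool = false;
                return out;
        }

        /* Guaranteed fallback: the preallocated page_order mempool. */
        BUG_ON(*order > state->page_order);
        *order = state->page_order;
        *used_mempool = true;
        return page_address(mempool_alloc(state->pool, GFP_NOIO));
}

static void sort_buf_put(struct bset_sort_state *state, struct bset *out,
                         unsigned order, bool used_mempool)
{
        if (used_mempool)
                mempool_free(virt_to_page(out), state->pool);
        else
                free_pages((unsigned long) out, order);
}

Because the mempool is sized for a full node (state->page_order), __btree_sort() can always make forward progress even when the opportunistic allocation fails.
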
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1d3c24f9fa0e..003260f4ddf6 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,7 +1,11 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h> 4#include <linux/bcache.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7
8#include "util.h" /* for time_stats */
5 9
6/* 10/*
7 * BKEYS: 11 * BKEYS:
@@ -142,20 +146,13 @@
142 * first key in that range of bytes again. 146 * first key in that range of bytes again.
143 */ 147 */
144 148
145/* Btree key comparison/iteration */ 149struct btree_keys;
150struct btree_iter;
151struct btree_iter_set;
152struct bkey_float;
146 153
147#define MAX_BSETS 4U 154#define MAX_BSETS 4U
148 155
149struct btree_iter {
150 size_t size, used;
151#ifdef CONFIG_BCACHE_DEBUG
152 struct btree *b;
153#endif
154 struct btree_iter_set {
155 struct bkey *k, *end;
156 } data[MAX_BSETS];
157};
158
159struct bset_tree { 156struct bset_tree {
160 /* 157 /*
161 * We construct a binary tree in an array as if the array 158 * We construct a binary tree in an array as if the array
@@ -165,14 +162,14 @@ struct bset_tree {
165 */ 162 */
166 163
167 /* size of the binary tree and prev array */ 164 /* size of the binary tree and prev array */
168 unsigned size; 165 unsigned size;
169 166
170 /* function of size - precalculated for to_inorder() */ 167 /* function of size - precalculated for to_inorder() */
171 unsigned extra; 168 unsigned extra;
172 169
173 /* copy of the last key in the set */ 170 /* copy of the last key in the set */
174 struct bkey end; 171 struct bkey end;
175 struct bkey_float *tree; 172 struct bkey_float *tree;
176 173
177 /* 174 /*
178 * The nodes in the bset tree point to specific keys - this 175 * The nodes in the bset tree point to specific keys - this
@@ -182,12 +179,219 @@ struct bset_tree {
182 * to keep bkey_float to 4 bytes and prev isn't used in the fast 179 * to keep bkey_float to 4 bytes and prev isn't used in the fast
183 * path. 180 * path.
184 */ 181 */
185 uint8_t *prev; 182 uint8_t *prev;
186 183
187 /* The actual btree node, with pointers to each sorted set */ 184 /* The actual btree node, with pointers to each sorted set */
188 struct bset *data; 185 struct bset *data;
186};
187
188struct btree_keys_ops {
189 bool (*sort_cmp)(struct btree_iter_set,
190 struct btree_iter_set);
191 struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
192 bool (*insert_fixup)(struct btree_keys *, struct bkey *,
193 struct btree_iter *, struct bkey *);
194 bool (*key_invalid)(struct btree_keys *,
195 const struct bkey *);
196 bool (*key_bad)(struct btree_keys *, const struct bkey *);
197 bool (*key_merge)(struct btree_keys *,
198 struct bkey *, struct bkey *);
199 void (*key_to_text)(char *, size_t, const struct bkey *);
200 void (*key_dump)(struct btree_keys *, const struct bkey *);
201
202 /*
203 * Only used for deciding whether to use START_KEY(k) or just the key
204 * itself in a couple places
205 */
206 bool is_extents;
207};
208
209struct btree_keys {
210 const struct btree_keys_ops *ops;
211 uint8_t page_order;
212 uint8_t nsets;
213 unsigned last_set_unwritten:1;
214 bool *expensive_debug_checks;
215
216 /*
217 * Sets of sorted keys - the real btree node - plus a binary search tree
218 *
219 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
220 * to the memory we have allocated for this btree node. Additionally,
221 * set[0]->data points to the entire btree node as it exists on disk.
222 */
223 struct bset_tree set[MAX_BSETS];
224};
225
226static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
227{
228 return b->set + b->nsets;
229}
230
231static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
232{
233 return t <= b->set + b->nsets - b->last_set_unwritten;
234}
235
236static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
237{
238 return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
239}
240
241static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
242{
243 return ((size_t) i) - ((size_t) b->set->data);
244}
245
246static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
247{
248 return bset_byte_offset(b, i) >> 9;
249}
250
251#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
252#define set_bytes(i) __set_bytes(i, i->keys)
253
254#define __set_blocks(i, k, block_bytes) \
255 DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
256#define set_blocks(i, block_bytes) \
257 __set_blocks(i, (i)->keys, block_bytes)
258
259static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
260{
261 struct bset_tree *t = bset_tree_last(b);
262
263 BUG_ON((PAGE_SIZE << b->page_order) <
264 (bset_byte_offset(b, t->data) + set_bytes(t->data)));
265
266 if (!b->last_set_unwritten)
267 return 0;
268
269 return ((PAGE_SIZE << b->page_order) -
270 (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
271 sizeof(u64);
272}
273
274static inline struct bset *bset_next_set(struct btree_keys *b,
275 unsigned block_bytes)
276{
277 struct bset *i = bset_tree_last(b)->data;
278
279 return ((void *) i) + roundup(set_bytes(i), block_bytes);
280}
281
282void bch_btree_keys_free(struct btree_keys *);
283int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
284void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
285 bool *);
286
287void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
288void bch_bset_build_written_tree(struct btree_keys *);
289void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
290bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
291void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
292unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
293 struct bkey *);
294
295enum {
296 BTREE_INSERT_STATUS_NO_INSERT = 0,
297 BTREE_INSERT_STATUS_INSERT,
298 BTREE_INSERT_STATUS_BACK_MERGE,
299 BTREE_INSERT_STATUS_OVERWROTE,
300 BTREE_INSERT_STATUS_FRONT_MERGE,
189}; 301};
190 302
303/* Btree key iteration */
304
305struct btree_iter {
306 size_t size, used;
307#ifdef CONFIG_BCACHE_DEBUG
308 struct btree_keys *b;
309#endif
310 struct btree_iter_set {
311 struct bkey *k, *end;
312 } data[MAX_BSETS];
313};
314
315typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
316
317struct bkey *bch_btree_iter_next(struct btree_iter *);
318struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
319 struct btree_keys *, ptr_filter_fn);
320
321void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
322struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
323 struct bkey *);
324
325struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
326 const struct bkey *);
327
328/*
329 * Returns the first key that is strictly greater than search
330 */
331static inline struct bkey *bch_bset_search(struct btree_keys *b,
332 struct bset_tree *t,
333 const struct bkey *search)
334{
335 return search ? __bch_bset_search(b, t, search) : t->data->start;
336}
337
338#define for_each_key_filter(b, k, iter, filter) \
339 for (bch_btree_iter_init((b), (iter), NULL); \
340 ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
341
342#define for_each_key(b, k, iter) \
343 for (bch_btree_iter_init((b), (iter), NULL); \
344 ((k) = bch_btree_iter_next(iter));)
345
346/* Sorting */
347
348struct bset_sort_state {
349 mempool_t *pool;
350
351 unsigned page_order;
352 unsigned crit_factor;
353
354 struct time_stats time;
355};
356
357void bch_bset_sort_state_free(struct bset_sort_state *);
358int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
359void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
360void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
361 struct bset_sort_state *);
362void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
363 struct bset_sort_state *);
364void bch_btree_sort_partial(struct btree_keys *, unsigned,
365 struct bset_sort_state *);
366
367static inline void bch_btree_sort(struct btree_keys *b,
368 struct bset_sort_state *state)
369{
370 bch_btree_sort_partial(b, 0, state);
371}
372
373struct bset_stats {
374 size_t sets_written, sets_unwritten;
375 size_t bytes_written, bytes_unwritten;
376 size_t floats, failed;
377};
378
379void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
380
381/* Bkey utility code */
382
383#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
384
385static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
386{
387 return bkey_idx(i->start, idx);
388}
389
390static inline void bkey_init(struct bkey *k)
391{
392 *k = ZERO_KEY;
393}
394
191static __always_inline int64_t bkey_cmp(const struct bkey *l, 395static __always_inline int64_t bkey_cmp(const struct bkey *l,
192 const struct bkey *r) 396 const struct bkey *r)
193{ 397{
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 400 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
197} 401}
198 402
403void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
404 unsigned);
405bool __bch_cut_front(const struct bkey *, struct bkey *);
406bool __bch_cut_back(const struct bkey *, struct bkey *);
407
408static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
409{
410 BUG_ON(bkey_cmp(where, k) > 0);
411 return __bch_cut_front(where, k);
412}
413
414static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
415{
416 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
417 return __bch_cut_back(where, k);
418}
419
420#define PRECEDING_KEY(_k) \
421({ \
422 struct bkey *_ret = NULL; \
423 \
424 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
425 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
426 \
427 if (!_ret->low) \
428 _ret->high--; \
429 _ret->low--; \
430 } \
431 \
432 _ret; \
433})
434
435static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
436{
437 return b->ops->key_invalid(b, k);
438}
439
440static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
441{
442 return b->ops->key_bad(b, k);
443}
444
445static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
446 size_t size, const struct bkey *k)
447{
448 return b->ops->key_to_text(buf, size, k);
449}
450
451static inline bool bch_bkey_equal_header(const struct bkey *l,
452 const struct bkey *r)
453{
454 return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
455 KEY_PTRS(l) == KEY_PTRS(r) &&
 456		KEY_CSUM(l) == KEY_CSUM(r));
457}
458
199/* Keylists */ 459/* Keylists */
200 460
201struct keylist { 461struct keylist {
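
Editor's aside: bch_bkey_equal_header() above exists so merge decisions stay generic: two keys whose headers disagree (dirty bit, number of pointers, checksum flag) must never be merged, regardless of what the per-format key_merge hook would report. A hedged sketch of such a guard around the ops hook; try_merge_guarded() is an illustrative name, the in-tree equivalent is bch_bkey_try_merge() declared in this header:

/* Illustrative only: a header-compatibility guard around ops->key_merge. */
static bool try_merge_guarded(struct btree_keys *b, struct bkey *l,
                              struct bkey *r)
{
        if (!b->ops->key_merge)
                return false;

        /* Keys with different headers are never merge candidates. */
        if (!bch_bkey_equal_header(l, r))
                return false;

        return b->ops->key_merge(b, l, r);
}
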
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
257 517
258struct bkey *bch_keylist_pop(struct keylist *); 518struct bkey *bch_keylist_pop(struct keylist *);
259void bch_keylist_pop_front(struct keylist *); 519void bch_keylist_pop_front(struct keylist *);
260int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 520int __bch_keylist_realloc(struct keylist *, unsigned);
261
262void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
263 unsigned);
264bool __bch_cut_front(const struct bkey *, struct bkey *);
265bool __bch_cut_back(const struct bkey *, struct bkey *);
266 521
267static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 522/* Debug stuff */
268{
269 BUG_ON(bkey_cmp(where, k) > 0);
270 return __bch_cut_front(where, k);
271}
272 523
273static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 524#ifdef CONFIG_BCACHE_DEBUG
274{
275 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
276 return __bch_cut_back(where, k);
277}
278
279const char *bch_ptr_status(struct cache_set *, const struct bkey *);
280bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
281bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
282
283bool bch_ptr_bad(struct btree *, const struct bkey *);
284
285static inline uint8_t gen_after(uint8_t a, uint8_t b)
286{
287 uint8_t r = a - b;
288 return r > 128U ? 0 : r;
289}
290
291static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
292 unsigned i)
293{
294 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
295}
296
297static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
298 unsigned i)
299{
300 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
301}
302
303
304typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
305
306struct bkey *bch_btree_iter_next(struct btree_iter *);
307struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
308 struct btree *, ptr_filter_fn);
309
310void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
311struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
312 struct bkey *, struct bset_tree *);
313
314/* 32 bits total: */
315#define BKEY_MID_BITS 3
316#define BKEY_EXPONENT_BITS 7
317#define BKEY_MANTISSA_BITS 22
318#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
319
320struct bkey_float {
321 unsigned exponent:BKEY_EXPONENT_BITS;
322 unsigned m:BKEY_MID_BITS;
323 unsigned mantissa:BKEY_MANTISSA_BITS;
324} __packed;
325
326/*
327 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
328 * it used to be 64, but I realized the lookup code would touch slightly less
329 * memory if it was 128.
330 *
331 * It definites the number of bytes (in struct bset) per struct bkey_float in
332 * the auxiliar search tree - when we're done searching the bset_float tree we
333 * have this many bytes left that we do a linear search over.
334 *
335 * Since (after level 5) every level of the bset_tree is on a new cacheline,
336 * we're touching one fewer cacheline in the bset tree in exchange for one more
337 * cacheline in the linear search - but the linear search might stop before it
338 * gets to the second cacheline.
339 */
340
341#define BSET_CACHELINE 128
342#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
343 525
344#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) 526int __bch_count_data(struct btree_keys *);
345#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) 527void __bch_check_keys(struct btree_keys *, const char *, ...);
528void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
529void bch_dump_bucket(struct btree_keys *);
346 530
347void bch_bset_init_next(struct btree *); 531#else
348 532
349void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); 533static inline int __bch_count_data(struct btree_keys *b) { return -1; }
350void bch_bset_fix_lookup_table(struct btree *, struct bkey *); 534static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
535static inline void bch_dump_bucket(struct btree_keys *b) {}
536void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
351 537
352struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 538#endif
353 const struct bkey *);
354 539
355/* 540static inline bool btree_keys_expensive_checks(struct btree_keys *b)
356 * Returns the first key that is strictly greater than search
357 */
358static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
359 const struct bkey *search)
360{ 541{
361 return search ? __bch_bset_search(b, t, search) : t->data->start; 542#ifdef CONFIG_BCACHE_DEBUG
543 return *b->expensive_debug_checks;
544#else
545 return false;
546#endif
362} 547}
363 548
364#define PRECEDING_KEY(_k) \ 549static inline int bch_count_data(struct btree_keys *b)
365({ \
366 struct bkey *_ret = NULL; \
367 \
368 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
369 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
370 \
371 if (!_ret->low) \
372 _ret->high--; \
373 _ret->low--; \
374 } \
375 \
376 _ret; \
377})
378
379bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
380void bch_btree_sort_lazy(struct btree *);
381void bch_btree_sort_into(struct btree *, struct btree *);
382void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
383void bch_btree_sort_partial(struct btree *, unsigned);
384
385static inline void bch_btree_sort(struct btree *b)
386{ 550{
387 bch_btree_sort_partial(b, 0); 551 return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
388} 552}
389 553
390int bch_bset_print_stats(struct cache_set *, char *); 554#define bch_check_keys(b, ...) \
555do { \
556 if (btree_keys_expensive_checks(b)) \
557 __bch_check_keys(b, __VA_ARGS__); \
558} while (0)
391 559
392#endif 560#endif
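
Editor's aside: taken together, the new header turns struct btree_keys into a small vtable-driven object: callers allocate it, pick an ops table, and from then on only use the generic iteration and sort helpers. A hedged sketch of that wiring, mirroring the mca_alloc() hunk further down; init_node_keys() and its leaf flag are illustrative, while bch_extent_keys_ops and bch_btree_keys_ops are the two ops tables provided elsewhere in this series:

/* Illustrative wiring only; error handling trimmed. */
static int init_node_keys(struct btree_keys *keys, unsigned page_order,
                          bool leaf, bool *debug_checks)
{
        int ret = bch_btree_keys_alloc(keys, page_order, GFP_KERNEL);
        if (ret)
                return ret;

        bch_btree_keys_init(keys,
                            leaf ? &bch_extent_keys_ops : &bch_btree_keys_ops,
                            debug_checks);
        return 0;
}

After that, walking every live key no longer depends on the key format:

        struct btree_iter iter;
        struct bkey *k;

        for_each_key_filter(keys, k, &iter, bch_ptr_bad)
                /* use k */;
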
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 946ecd3b048b..98cc0a810a36 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,7 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "writeback.h" 26#include "extents.h"
27 27
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/bitops.h> 29#include <linux/bitops.h>
@@ -89,13 +89,6 @@
89 * Test module load/unload 89 * Test module load/unload
90 */ 90 */
91 91
92enum {
93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
97};
98
99#define MAX_NEED_GC 64 92#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72 93#define MAX_SAVE_PRIO 72
101 94
@@ -106,14 +99,6 @@ enum {
106 99
107static struct workqueue_struct *btree_io_wq; 100static struct workqueue_struct *btree_io_wq;
108 101
109static inline bool should_split(struct btree *b)
110{
111 struct bset *i = write_block(b);
112 return b->written >= btree_blocks(b) ||
113 (b->written + __set_blocks(i, i->keys + 15, b->c)
114 > btree_blocks(b));
115}
116
117#define insert_lock(s, b) ((b)->level <= (s)->lock) 102#define insert_lock(s, b) ((b)->level <= (s)->lock)
118 103
119/* 104/*
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 152 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \ 153 } \
169 rw_unlock(_w, _b); \ 154 rw_unlock(_w, _b); \
155 if (_r == -EINTR) \
156 schedule(); \
170 bch_cannibalize_unlock(c); \ 157 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \ 158 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \ 159 wait_event((c)->try_wait, \
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
175 } \ 162 } \
176 } while (_r == -EINTR); \ 163 } while (_r == -EINTR); \
177 \ 164 \
165 finish_wait(&(c)->bucket_wait, &(op)->wait); \
178 _r; \ 166 _r; \
179}) 167})
180 168
169static inline struct bset *write_block(struct btree *b)
170{
171 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
172}
173
181/* Btree key manipulation */ 174/* Btree key manipulation */
182 175
183void bkey_put(struct cache_set *c, struct bkey *k) 176void bkey_put(struct cache_set *c, struct bkey *k)
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
194static uint64_t btree_csum_set(struct btree *b, struct bset *i) 187static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195{ 188{
196 uint64_t crc = b->key.ptr[0]; 189 uint64_t crc = b->key.ptr[0];
197 void *data = (void *) i + 8, *end = end(i); 190 void *data = (void *) i + 8, *end = bset_bkey_last(i);
198 191
199 crc = bch_crc64_update(crc, data, end - data); 192 crc = bch_crc64_update(crc, data, end - data);
200 return crc ^ 0xffffffffffffffffULL; 193 return crc ^ 0xffffffffffffffffULL;
201} 194}
202 195
203static void bch_btree_node_read_done(struct btree *b) 196void bch_btree_node_read_done(struct btree *b)
204{ 197{
205 const char *err = "bad btree header"; 198 const char *err = "bad btree header";
206 struct bset *i = b->sets[0].data; 199 struct bset *i = btree_bset_first(b);
207 struct btree_iter *iter; 200 struct btree_iter *iter;
208 201
209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 202 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
211 iter->used = 0; 204 iter->used = 0;
212 205
213#ifdef CONFIG_BCACHE_DEBUG 206#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b; 207 iter->b = &b->keys;
215#endif 208#endif
216 209
217 if (!i->seq) 210 if (!i->seq)
218 goto err; 211 goto err;
219 212
220 for (; 213 for (;
221 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 214 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
222 i = write_block(b)) { 215 i = write_block(b)) {
223 err = "unsupported bset version"; 216 err = "unsupported bset version";
224 if (i->version > BCACHE_BSET_VERSION) 217 if (i->version > BCACHE_BSET_VERSION)
225 goto err; 218 goto err;
226 219
227 err = "bad btree header"; 220 err = "bad btree header";
228 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 221 if (b->written + set_blocks(i, block_bytes(b->c)) >
222 btree_blocks(b))
229 goto err; 223 goto err;
230 224
231 err = "bad magic"; 225 err = "bad magic";
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
245 } 239 }
246 240
247 err = "empty set"; 241 err = "empty set";
248 if (i != b->sets[0].data && !i->keys) 242 if (i != b->keys.set[0].data && !i->keys)
249 goto err; 243 goto err;
250 244
251 bch_btree_iter_push(iter, i->start, end(i)); 245 bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
252 246
253 b->written += set_blocks(i, b->c); 247 b->written += set_blocks(i, block_bytes(b->c));
254 } 248 }
255 249
256 err = "corrupted btree"; 250 err = "corrupted btree";
257 for (i = write_block(b); 251 for (i = write_block(b);
258 index(i, b) < btree_blocks(b); 252 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
259 i = ((void *) i) + block_bytes(b->c)) 253 i = ((void *) i) + block_bytes(b->c))
260 if (i->seq == b->sets[0].data->seq) 254 if (i->seq == b->keys.set[0].data->seq)
261 goto err; 255 goto err;
262 256
263 bch_btree_sort_and_fix_extents(b, iter); 257 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
264 258
265 i = b->sets[0].data; 259 i = b->keys.set[0].data;
266 err = "short btree key"; 260 err = "short btree key";
267 if (b->sets[0].size && 261 if (b->keys.set[0].size &&
268 bkey_cmp(&b->key, &b->sets[0].end) < 0) 262 bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
269 goto err; 263 goto err;
270 264
271 if (b->written < btree_blocks(b)) 265 if (b->written < btree_blocks(b))
272 bch_bset_init_next(b); 266 bch_bset_init_next(&b->keys, write_block(b),
267 bset_magic(&b->c->sb));
273out: 268out:
274 mempool_free(iter, b->c->fill_iter); 269 mempool_free(iter, b->c->fill_iter);
275 return; 270 return;
276err: 271err:
277 set_btree_node_io_error(b); 272 set_btree_node_io_error(b);
278 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 273 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
279 err, PTR_BUCKET_NR(b->c, &b->key, 0), 274 err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 index(i, b), i->keys); 275 bset_block_offset(b, i), i->keys);
281 goto out; 276 goto out;
282} 277}
283 278
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
287 closure_put(cl); 282 closure_put(cl);
288} 283}
289 284
290void bch_btree_node_read(struct btree *b) 285static void bch_btree_node_read(struct btree *b)
291{ 286{
292 uint64_t start_time = local_clock(); 287 uint64_t start_time = local_clock();
293 struct closure cl; 288 struct closure cl;
@@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b)
303 bio->bi_end_io = btree_node_read_endio; 298 bio->bi_end_io = btree_node_read_endio;
304 bio->bi_private = &cl; 299 bio->bi_private = &cl;
305 300
306 bch_bio_map(bio, b->sets[0].data); 301 bch_bio_map(bio, b->keys.set[0].data);
307 302
308 bch_submit_bbio(bio, b->c, &b->key, 0); 303 bch_submit_bbio(bio, b->c, &b->key, 0);
309 closure_sync(&cl); 304 closure_sync(&cl);
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
340 w->journal = NULL; 335 w->journal = NULL;
341} 336}
342 337
338static void btree_node_write_unlock(struct closure *cl)
339{
340 struct btree *b = container_of(cl, struct btree, io);
341
342 up(&b->io_mutex);
343}
344
343static void __btree_node_write_done(struct closure *cl) 345static void __btree_node_write_done(struct closure *cl)
344{ 346{
345 struct btree *b = container_of(cl, struct btree, io.cl); 347 struct btree *b = container_of(cl, struct btree, io);
346 struct btree_write *w = btree_prev_write(b); 348 struct btree_write *w = btree_prev_write(b);
347 349
348 bch_bbio_free(b->bio, b->c); 350 bch_bbio_free(b->bio, b->c);
@@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl)
353 queue_delayed_work(btree_io_wq, &b->work, 355 queue_delayed_work(btree_io_wq, &b->work,
354 msecs_to_jiffies(30000)); 356 msecs_to_jiffies(30000));
355 357
356 closure_return(cl); 358 closure_return_with_destructor(cl, btree_node_write_unlock);
357} 359}
358 360
359static void btree_node_write_done(struct closure *cl) 361static void btree_node_write_done(struct closure *cl)
360{ 362{
361 struct btree *b = container_of(cl, struct btree, io.cl); 363 struct btree *b = container_of(cl, struct btree, io);
362 struct bio_vec *bv; 364 struct bio_vec *bv;
363 int n; 365 int n;
364 366
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
371static void btree_node_write_endio(struct bio *bio, int error) 373static void btree_node_write_endio(struct bio *bio, int error)
372{ 374{
373 struct closure *cl = bio->bi_private; 375 struct closure *cl = bio->bi_private;
374 struct btree *b = container_of(cl, struct btree, io.cl); 376 struct btree *b = container_of(cl, struct btree, io);
375 377
376 if (error) 378 if (error)
377 set_btree_node_io_error(b); 379 set_btree_node_io_error(b);
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
382 384
383static void do_btree_node_write(struct btree *b) 385static void do_btree_node_write(struct btree *b)
384{ 386{
385 struct closure *cl = &b->io.cl; 387 struct closure *cl = &b->io;
386 struct bset *i = b->sets[b->nsets].data; 388 struct bset *i = btree_bset_last(b);
387 BKEY_PADDED(key) k; 389 BKEY_PADDED(key) k;
388 390
389 i->version = BCACHE_BSET_VERSION; 391 i->version = BCACHE_BSET_VERSION;
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
395 b->bio->bi_end_io = btree_node_write_endio; 397 b->bio->bi_end_io = btree_node_write_endio;
396 b->bio->bi_private = cl; 398 b->bio->bi_private = cl;
397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 399 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
398 b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); 400 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
399 bch_bio_map(b->bio, i); 401 bch_bio_map(b->bio, i);
400 402
401 /* 403 /*
@@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b)
414 */ 416 */
415 417
416 bkey_copy(&k.key, &b->key); 418 bkey_copy(&k.key, &b->key);
417 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i));
418 421
419 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 422 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 int j; 423 int j;
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
435 bch_submit_bbio(b->bio, b->c, &k.key, 0); 438 bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 439
437 closure_sync(cl); 440 closure_sync(cl);
438 __btree_node_write_done(cl); 441 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
439 } 442 }
440} 443}
441 444
442void bch_btree_node_write(struct btree *b, struct closure *parent) 445void bch_btree_node_write(struct btree *b, struct closure *parent)
443{ 446{
444 struct bset *i = b->sets[b->nsets].data; 447 struct bset *i = btree_bset_last(b);
445 448
446 trace_bcache_btree_write(b); 449 trace_bcache_btree_write(b);
447 450
448 BUG_ON(current->bio_list); 451 BUG_ON(current->bio_list);
449 BUG_ON(b->written >= btree_blocks(b)); 452 BUG_ON(b->written >= btree_blocks(b));
450 BUG_ON(b->written && !i->keys); 453 BUG_ON(b->written && !i->keys);
451 BUG_ON(b->sets->data->seq != i->seq); 454 BUG_ON(btree_bset_first(b)->seq != i->seq);
452 bch_check_keys(b, "writing"); 455 bch_check_keys(&b->keys, "writing");
453 456
454 cancel_delayed_work(&b->work); 457 cancel_delayed_work(&b->work);
455 458
456 /* If caller isn't waiting for write, parent refcount is cache set */ 459 /* If caller isn't waiting for write, parent refcount is cache set */
457 closure_lock(&b->io, parent ?: &b->c->cl); 460 down(&b->io_mutex);
461 closure_init(&b->io, parent ?: &b->c->cl);
458 462
459 clear_bit(BTREE_NODE_dirty, &b->flags); 463 clear_bit(BTREE_NODE_dirty, &b->flags);
460 change_bit(BTREE_NODE_write_idx, &b->flags); 464 change_bit(BTREE_NODE_write_idx, &b->flags);
461 465
462 do_btree_node_write(b); 466 do_btree_node_write(b);
463 467
464 b->written += set_blocks(i, b->c); 468 atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
465 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 469 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 470
468 bch_btree_sort_lazy(b); 471 b->written += set_blocks(i, block_bytes(b->c));
472
473 /* If not a leaf node, always sort */
474 if (b->level && b->keys.nsets)
475 bch_btree_sort(&b->keys, &b->c->sort);
476 else
477 bch_btree_sort_lazy(&b->keys, &b->c->sort);
478
479 /*
480 * do verify if there was more than one set initially (i.e. we did a
481 * sort) and we sorted down to a single set:
482 */
483 if (i != b->keys.set->data && !b->keys.nsets)
484 bch_btree_verify(b);
469 485
470 if (b->written < btree_blocks(b)) 486 if (b->written < btree_blocks(b))
471 bch_bset_init_next(b); 487 bch_bset_init_next(&b->keys, write_block(b),
488 bset_magic(&b->c->sb));
472} 489}
473 490
474static void bch_btree_node_write_sync(struct btree *b) 491static void bch_btree_node_write_sync(struct btree *b)
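
Editor's aside: the write path above also swaps the old embedded closure lock for a plain semaphore: sema_init(&b->io_mutex, 1) at allocation, down(&b->io_mutex) before starting a write in bch_btree_node_write(), and up() from the btree_node_write_unlock() destructor once the write completes. That turns "is a write in flight?" into a simple trylock probe, exactly as mca_reap() does further down. A hedged restatement of that probe; btree_write_in_flight() is an illustrative name, not part of the patch:

/* Illustrative only: mirrors the down_trylock()/up() probe in mca_reap(). */
static bool btree_write_in_flight(struct btree *b)
{
        if (down_trylock(&b->io_mutex))
                return true;    /* a writer holds io_mutex: I/O in flight */

        up(&b->io_mutex);       /* nothing in flight; drop the probe */
        return false;
}
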
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
493 510
494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 511static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495{ 512{
496 struct bset *i = b->sets[b->nsets].data; 513 struct bset *i = btree_bset_last(b);
497 struct btree_write *w = btree_current_write(b); 514 struct btree_write *w = btree_current_write(b);
498 515
499 BUG_ON(!b->written); 516 BUG_ON(!b->written);
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
528 * mca -> memory cache 545 * mca -> memory cache
529 */ 546 */
530 547
531static void mca_reinit(struct btree *b)
532{
533 unsigned i;
534
535 b->flags = 0;
536 b->written = 0;
537 b->nsets = 0;
538
539 for (i = 0; i < MAX_BSETS; i++)
540 b->sets[i].size = 0;
541 /*
542 * Second loop starts at 1 because b->sets[0]->data is the memory we
543 * allocated
544 */
545 for (i = 1; i < MAX_BSETS; i++)
546 b->sets[i].data = NULL;
547}
548
549#define mca_reserve(c) (((c->root && c->root->level) \ 548#define mca_reserve(c) (((c->root && c->root->level) \
550 ? c->root->level : 1) * 8 + 16) 549 ? c->root->level : 1) * 8 + 16)
551#define mca_can_free(c) \ 550#define mca_can_free(c) \
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
553 552
554static void mca_data_free(struct btree *b) 553static void mca_data_free(struct btree *b)
555{ 554{
556 struct bset_tree *t = b->sets; 555 BUG_ON(b->io_mutex.count != 1);
557 BUG_ON(!closure_is_unlocked(&b->io.cl));
558 556
559 if (bset_prev_bytes(b) < PAGE_SIZE) 557 bch_btree_keys_free(&b->keys);
560 kfree(t->prev);
561 else
562 free_pages((unsigned long) t->prev,
563 get_order(bset_prev_bytes(b)));
564 558
565 if (bset_tree_bytes(b) < PAGE_SIZE)
566 kfree(t->tree);
567 else
568 free_pages((unsigned long) t->tree,
569 get_order(bset_tree_bytes(b)));
570
571 free_pages((unsigned long) t->data, b->page_order);
572
573 t->prev = NULL;
574 t->tree = NULL;
575 t->data = NULL;
576 list_move(&b->list, &b->c->btree_cache_freed);
577 b->c->bucket_cache_used--; 559 b->c->bucket_cache_used--;
560 list_move(&b->list, &b->c->btree_cache_freed);
578} 561}
579 562
580static void mca_bucket_free(struct btree *b) 563static void mca_bucket_free(struct btree *b)
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
593 576
594static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 577static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595{ 578{
596 struct bset_tree *t = b->sets; 579 if (!bch_btree_keys_alloc(&b->keys,
597 BUG_ON(t->data); 580 max_t(unsigned,
598 581 ilog2(b->c->btree_pages),
599 b->page_order = max_t(unsigned, 582 btree_order(k)),
600 ilog2(b->c->btree_pages), 583 gfp)) {
601 btree_order(k)); 584 b->c->bucket_cache_used++;
602 585 list_move(&b->list, &b->c->btree_cache);
603 t->data = (void *) __get_free_pages(gfp, b->page_order); 586 } else {
604 if (!t->data) 587 list_move(&b->list, &b->c->btree_cache_freed);
605 goto err; 588 }
606
607 t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 ? kmalloc(bset_tree_bytes(b), gfp)
609 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 if (!t->tree)
611 goto err;
612
613 t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 ? kmalloc(bset_prev_bytes(b), gfp)
615 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 if (!t->prev)
617 goto err;
618
619 list_move(&b->list, &b->c->btree_cache);
620 b->c->bucket_cache_used++;
621 return;
622err:
623 mca_data_free(b);
624} 589}
625 590
626static struct btree *mca_bucket_alloc(struct cache_set *c, 591static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
635 INIT_LIST_HEAD(&b->list); 600 INIT_LIST_HEAD(&b->list);
636 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 601 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 b->c = c; 602 b->c = c;
638 closure_init_unlocked(&b->io); 603 sema_init(&b->io_mutex, 1);
639 604
640 mca_data_alloc(b, k, gfp); 605 mca_data_alloc(b, k, gfp);
641 return b; 606 return b;
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
651 if (!down_write_trylock(&b->lock)) 616 if (!down_write_trylock(&b->lock))
652 return -ENOMEM; 617 return -ENOMEM;
653 618
654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 619 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
655 620
656 if (b->page_order < min_order || 621 if (b->keys.page_order < min_order)
657 (!flush && 622 goto out_unlock;
658 (btree_node_dirty(b) || 623
659 atomic_read(&b->io.cl.remaining) != -1))) { 624 if (!flush) {
660 rw_unlock(true, b); 625 if (btree_node_dirty(b))
661 return -ENOMEM; 626 goto out_unlock;
627
628 if (down_trylock(&b->io_mutex))
629 goto out_unlock;
630 up(&b->io_mutex);
662 } 631 }
663 632
664 if (btree_node_dirty(b)) 633 if (btree_node_dirty(b))
665 bch_btree_node_write_sync(b); 634 bch_btree_node_write_sync(b);
666 635
667 /* wait for any in flight btree write */ 636 /* wait for any in flight btree write */
668 closure_wait_event(&b->io.wait, &cl, 637 down(&b->io_mutex);
669 atomic_read(&b->io.cl.remaining) == -1); 638 up(&b->io_mutex);
670 639
671 return 0; 640 return 0;
641out_unlock:
642 rw_unlock(true, b);
643 return -ENOMEM;
672} 644}
673 645
674static unsigned long bch_mca_scan(struct shrinker *shrink, 646static unsigned long bch_mca_scan(struct shrinker *shrink,
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
714 } 686 }
715 } 687 }
716 688
717 /*
718 * Can happen right when we first start up, before we've read in any
719 * btree nodes
720 */
721 if (list_empty(&c->btree_cache))
722 goto out;
723
724 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 689 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
690 if (list_empty(&c->btree_cache))
691 goto out;
692
725 b = list_first_entry(&c->btree_cache, struct btree, list); 693 b = list_first_entry(&c->btree_cache, struct btree, list);
726 list_rotate_left(&c->btree_cache); 694 list_rotate_left(&c->btree_cache);
727 695
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
767#ifdef CONFIG_BCACHE_DEBUG 735#ifdef CONFIG_BCACHE_DEBUG
768 if (c->verify_data) 736 if (c->verify_data)
769 list_move(&c->verify_data->list, &c->btree_cache); 737 list_move(&c->verify_data->list, &c->btree_cache);
738
739 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
770#endif 740#endif
771 741
772 list_splice(&c->btree_cache_freeable, 742 list_splice(&c->btree_cache_freeable,
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
807#ifdef CONFIG_BCACHE_DEBUG 777#ifdef CONFIG_BCACHE_DEBUG
808 mutex_init(&c->verify_lock); 778 mutex_init(&c->verify_lock);
809 779
780 c->verify_ondisk = (void *)
781 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
782
810 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 783 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 784
812 if (c->verify_data && 785 if (c->verify_data &&
813 c->verify_data->sets[0].data) 786 c->verify_data->keys.set->data)
814 list_del_init(&c->verify_data->list); 787 list_del_init(&c->verify_data->list);
815 else 788 else
816 c->verify_data = NULL; 789 c->verify_data = NULL;
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
908 list_for_each_entry(b, &c->btree_cache_freed, list) 881 list_for_each_entry(b, &c->btree_cache_freed, list)
909 if (!mca_reap(b, 0, false)) { 882 if (!mca_reap(b, 0, false)) {
910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 883 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 if (!b->sets[0].data) 884 if (!b->keys.set[0].data)
912 goto err; 885 goto err;
913 else 886 else
914 goto out; 887 goto out;
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
919 goto err; 892 goto err;
920 893
921 BUG_ON(!down_write_trylock(&b->lock)); 894 BUG_ON(!down_write_trylock(&b->lock));
922 if (!b->sets->data) 895 if (!b->keys.set->data)
923 goto err; 896 goto err;
924out: 897out:
925 BUG_ON(!closure_is_unlocked(&b->io.cl)); 898 BUG_ON(b->io_mutex.count != 1);
926 899
927 bkey_copy(&b->key, k); 900 bkey_copy(&b->key, k);
928 list_move(&b->list, &c->btree_cache); 901 list_move(&b->list, &c->btree_cache);
@@ -930,10 +903,17 @@ out:
930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 903 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 904
932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 905 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 b->level = level;
934 b->parent = (void *) ~0UL; 906 b->parent = (void *) ~0UL;
907 b->flags = 0;
908 b->written = 0;
909 b->level = level;
935 910
936 mca_reinit(b); 911 if (!b->level)
912 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
913 &b->c->expensive_debug_checks);
914 else
915 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
916 &b->c->expensive_debug_checks);
937 917
938 return b; 918 return b;
939err: 919err:
@@ -994,13 +974,13 @@ retry:
994 974
995 b->accessed = 1; 975 b->accessed = 1;
996 976
997 for (; i <= b->nsets && b->sets[i].size; i++) { 977 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
998 prefetch(b->sets[i].tree); 978 prefetch(b->keys.set[i].tree);
999 prefetch(b->sets[i].data); 979 prefetch(b->keys.set[i].data);
1000 } 980 }
1001 981
1002 for (; i <= b->nsets; i++) 982 for (; i <= b->keys.nsets; i++)
1003 prefetch(b->sets[i].data); 983 prefetch(b->keys.set[i].data);
1004 984
1005 if (btree_node_io_error(b)) { 985 if (btree_node_io_error(b)) {
1006 rw_unlock(write, b); 986 rw_unlock(write, b);
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1063 1043
1064 mutex_lock(&c->bucket_lock); 1044 mutex_lock(&c->bucket_lock);
1065retry: 1045retry:
1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1046 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1067 goto err; 1047 goto err;
1068 1048
1069 bkey_put(c, &k.key); 1049 bkey_put(c, &k.key);
@@ -1080,7 +1060,7 @@ retry:
1080 } 1060 }
1081 1061
1082 b->accessed = 1; 1062 b->accessed = 1;
1083 bch_bset_init_next(b); 1063 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1084 1064
1085 mutex_unlock(&c->bucket_lock); 1065 mutex_unlock(&c->bucket_lock);
1086 1066
@@ -1098,8 +1078,10 @@ err:
1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1078static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099{ 1079{
1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1080 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 if (!IS_ERR_OR_NULL(n)) 1081 if (!IS_ERR_OR_NULL(n)) {
1102 bch_btree_sort_into(b, n); 1082 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1083 bkey_copy_key(&n->key, &b->key);
1084 }
1103 1085
1104 return n; 1086 return n;
1105} 1087}
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1120 atomic_inc(&b->c->prio_blocked); 1102 atomic_inc(&b->c->prio_blocked);
1121} 1103}
1122 1104
1105static int btree_check_reserve(struct btree *b, struct btree_op *op)
1106{
1107 struct cache_set *c = b->c;
1108 struct cache *ca;
1109 unsigned i, reserve = c->root->level * 2 + 1;
1110 int ret = 0;
1111
1112 mutex_lock(&c->bucket_lock);
1113
1114 for_each_cache(ca, c, i)
1115 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1116 if (op)
1117 prepare_to_wait(&c->bucket_wait, &op->wait,
1118 TASK_UNINTERRUPTIBLE);
1119 ret = -EINTR;
1120 break;
1121 }
1122
1123 mutex_unlock(&c->bucket_lock);
1124 return ret;
1125}
1126
1123/* Garbage collection */ 1127/* Garbage collection */
1124 1128
1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1129uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1183 1187
1184 gc->nodes++; 1188 gc->nodes++;
1185 1189
1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1190 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1187 stale = max(stale, btree_mark_key(b, k)); 1191 stale = max(stale, btree_mark_key(b, k));
1188 keys++; 1192 keys++;
1189 1193
1190 if (bch_ptr_bad(b, k)) 1194 if (bch_ptr_bad(&b->keys, k))
1191 continue; 1195 continue;
1192 1196
1193 gc->key_bytes += bkey_u64s(k); 1197 gc->key_bytes += bkey_u64s(k);
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1197 gc->data += KEY_SIZE(k); 1201 gc->data += KEY_SIZE(k);
1198 } 1202 }
1199 1203
1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1204 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1201 btree_bug_on(t->size && 1205 btree_bug_on(t->size &&
1202 bset_written(b, t) && 1206 bset_written(&b->keys, t) &&
1203 bkey_cmp(&b->key, &t->end) < 0, 1207 bkey_cmp(&b->key, &t->end) < 0,
1204 b, "found short btree key in gc"); 1208 b, "found short btree key in gc");
1205 1209
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1243 blocks = btree_default_blocks(b->c) * 2 / 3; 1247 blocks = btree_default_blocks(b->c) * 2 / 3;
1244 1248
1245 if (nodes < 2 || 1249 if (nodes < 2 ||
1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1250 __set_blocks(b->keys.set[0].data, keys,
1251 block_bytes(b->c)) > blocks * (nodes - 1))
1247 return 0; 1252 return 0;
1248 1253
1249 for (i = 0; i < nodes; i++) { 1254 for (i = 0; i < nodes; i++) {
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1253 } 1258 }
1254 1259
1255 for (i = nodes - 1; i > 0; --i) { 1260 for (i = nodes - 1; i > 0; --i) {
1256 struct bset *n1 = new_nodes[i]->sets->data; 1261 struct bset *n1 = btree_bset_first(new_nodes[i]);
1257 struct bset *n2 = new_nodes[i - 1]->sets->data; 1262 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1258 struct bkey *k, *last = NULL; 1263 struct bkey *k, *last = NULL;
1259 1264
1260 keys = 0; 1265 keys = 0;
1261 1266
1262 if (i > 1) { 1267 if (i > 1) {
1263 for (k = n2->start; 1268 for (k = n2->start;
1264 k < end(n2); 1269 k < bset_bkey_last(n2);
1265 k = bkey_next(k)) { 1270 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys + 1271 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks) 1272 bkey_u64s(k),
1273 block_bytes(b->c)) > blocks)
1268 break; 1274 break;
1269 1275
1270 last = k; 1276 last = k;
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1280 * though) 1286 * though)
1281 */ 1287 */
1282 if (__set_blocks(n1, n1->keys + n2->keys, 1288 if (__set_blocks(n1, n1->keys + n2->keys,
1283 b->c) > btree_blocks(new_nodes[i])) 1289 block_bytes(b->c)) >
1290 btree_blocks(new_nodes[i]))
1284 goto out_nocoalesce; 1291 goto out_nocoalesce;
1285 1292
1286 keys = n2->keys; 1293 keys = n2->keys;
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1288 last = &r->b->key; 1295 last = &r->b->key;
1289 } 1296 }
1290 1297
1291 BUG_ON(__set_blocks(n1, n1->keys + keys, 1298 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1292 b->c) > btree_blocks(new_nodes[i])); 1299 btree_blocks(new_nodes[i]));
1293 1300
1294 if (last) 1301 if (last)
1295 bkey_copy_key(&new_nodes[i]->key, last); 1302 bkey_copy_key(&new_nodes[i]->key, last);
1296 1303
1297 memcpy(end(n1), 1304 memcpy(bset_bkey_last(n1),
1298 n2->start, 1305 n2->start,
1299 (void *) node(n2, keys) - (void *) n2->start); 1306 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1300 1307
1301 n1->keys += keys; 1308 n1->keys += keys;
1302 r[i].keys = n1->keys; 1309 r[i].keys = n1->keys;
1303 1310
1304 memmove(n2->start, 1311 memmove(n2->start,
1305 node(n2, keys), 1312 bset_bkey_idx(n2, keys),
1306 (void *) end(n2) - (void *) node(n2, keys)); 1313 (void *) bset_bkey_last(n2) -
1314 (void *) bset_bkey_idx(n2, keys));
1307 1315
1308 n2->keys -= keys; 1316 n2->keys -= keys;
1309 1317
1310 if (bch_keylist_realloc(keylist, 1318 if (__bch_keylist_realloc(keylist,
1311 KEY_PTRS(&new_nodes[i]->key), b->c)) 1319 bkey_u64s(&new_nodes[i]->key)))
1312 goto out_nocoalesce; 1320 goto out_nocoalesce;
1313 1321
1314 bch_btree_node_write(new_nodes[i], &cl); 1322 bch_btree_node_write(new_nodes[i], &cl);
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1316 } 1324 }
1317 1325
1318 for (i = 0; i < nodes; i++) { 1326 for (i = 0; i < nodes; i++) {
1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1327 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
1320 goto out_nocoalesce; 1328 goto out_nocoalesce;
1321 1329
1322 make_btree_freeing_key(r[i].b, keylist->top); 1330 make_btree_freeing_key(r[i].b, keylist->top);
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1324 } 1332 }
1325 1333
1326 /* We emptied out this node */ 1334 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys); 1335 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1328 btree_node_free(new_nodes[0]); 1336 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]); 1337 rw_unlock(true, new_nodes[0]);
1330 1338
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
1370 struct btree_iter iter; 1378 struct btree_iter iter;
1371 unsigned ret = 0; 1379 unsigned ret = 0;
1372 1380
1373 for_each_key_filter(b, k, &iter, bch_ptr_bad) 1381 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1374 ret += bkey_u64s(k); 1382 ret += bkey_u64s(k);
1375 1383
1376 return ret; 1384 return ret;
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1398 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 1399
1392 bch_keylist_init(&keys); 1400 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done); 1401 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1394 1402
1395 for (i = 0; i < GC_MERGE_NODES; i++) 1403 for (i = 0; i < GC_MERGE_NODES; i++)
1396 r[i].b = ERR_PTR(-EINTR); 1404 r[i].b = ERR_PTR(-EINTR);
1397 1405
1398 while (1) { 1406 while (1) {
1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1407 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1400 if (k) { 1408 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1409 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) { 1410 if (IS_ERR(r->b)) {
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1416 1424
1417 if (!IS_ERR(last->b)) { 1425 if (!IS_ERR(last->b)) {
1418 should_rewrite = btree_gc_mark_node(last->b, gc); 1426 should_rewrite = btree_gc_mark_node(last->b, gc);
1419 if (should_rewrite) { 1427 if (should_rewrite &&
1428 !btree_check_reserve(b, NULL)) {
1420 n = btree_node_alloc_replacement(last->b, 1429 n = btree_node_alloc_replacement(last->b,
1421 false); 1430 false);
1422 1431
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1705 struct bucket *g; 1714 struct bucket *g;
1706 struct btree_iter iter; 1715 struct btree_iter iter;
1707 1716
1708 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1717 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1709 for (i = 0; i < KEY_PTRS(k); i++) { 1718 for (i = 0; i < KEY_PTRS(k); i++) {
1710 if (!ptr_available(b->c, k, i)) 1719 if (!ptr_available(b->c, k, i))
1711 continue; 1720 continue;
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1728 } 1737 }
1729 1738
1730 if (b->level) { 1739 if (b->level) {
1731 bch_btree_iter_init(b, &iter, NULL); 1740 bch_btree_iter_init(&b->keys, &iter, NULL);
1732 1741
1733 do { 1742 do {
1734 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1743 k = bch_btree_iter_next_filter(&iter, &b->keys,
1744 bch_ptr_bad);
1735 if (k) 1745 if (k)
1736 btree_node_prefetch(b->c, k, b->level - 1); 1746 btree_node_prefetch(b->c, k, b->level - 1);
1737 1747
@@ -1774,235 +1784,36 @@ err:
1774 1784
1775/* Btree insertion */ 1785/* Btree insertion */
1776 1786
1777static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1787static bool btree_insert_key(struct btree *b, struct bkey *k,
1778{ 1788 struct bkey *replace_key)
1779 struct bset *i = b->sets[b->nsets].data;
1780
1781 memmove((uint64_t *) where + bkey_u64s(insert),
1782 where,
1783 (void *) end(i) - (void *) where);
1784
1785 i->keys += bkey_u64s(insert);
1786 bkey_copy(where, insert);
1787 bch_bset_fix_lookup_table(b, where);
1788}
1789
1790static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1791 struct btree_iter *iter,
1792 struct bkey *replace_key)
1793{ 1789{
1794 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1790 unsigned status;
1795 {
1796 if (KEY_DIRTY(k))
1797 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1798 offset, -sectors);
1799 }
1800
1801 uint64_t old_offset;
1802 unsigned old_size, sectors_found = 0;
1803
1804 while (1) {
1805 struct bkey *k = bch_btree_iter_next(iter);
1806 if (!k ||
1807 bkey_cmp(&START_KEY(k), insert) >= 0)
1808 break;
1809
1810 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1811 continue;
1812
1813 old_offset = KEY_START(k);
1814 old_size = KEY_SIZE(k);
1815
1816 /*
1817 * We might overlap with 0 size extents; we can't skip these
1818 * because if they're in the set we're inserting to we have to
1819 * adjust them so they don't overlap with the key we're
1820 * inserting. But we don't want to check them for replace
1821 * operations.
1822 */
1823
1824 if (replace_key && KEY_SIZE(k)) {
1825 /*
1826 * k might have been split since we inserted/found the
1827 * key we're replacing
1828 */
1829 unsigned i;
1830 uint64_t offset = KEY_START(k) -
1831 KEY_START(replace_key);
1832
1833 /* But it must be a subset of the replace key */
1834 if (KEY_START(k) < KEY_START(replace_key) ||
1835 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1836 goto check_failed;
1837
1838 /* We didn't find a key that we were supposed to */
1839 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1840 goto check_failed;
1841
1842 if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1843 KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1844 goto check_failed;
1845
1846 /* skip past gen */
1847 offset <<= 8;
1848
1849 BUG_ON(!KEY_PTRS(replace_key));
1850 1791
1851 for (i = 0; i < KEY_PTRS(replace_key); i++) 1792 BUG_ON(bkey_cmp(k, &b->key) > 0);
1852 if (k->ptr[i] != replace_key->ptr[i] + offset)
1853 goto check_failed;
1854
1855 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1856 }
1857
1858 if (bkey_cmp(insert, k) < 0 &&
1859 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1860 /*
1861 * We overlapped in the middle of an existing key: that
1862 * means we have to split the old key. But we have to do
1863 * slightly different things depending on whether the
1864 * old key has been written out yet.
1865 */
1866
1867 struct bkey *top;
1868
1869 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1870
1871 if (bkey_written(b, k)) {
1872 /*
1873 * We insert a new key to cover the top of the
1874 * old key, and the old key is modified in place
1875 * to represent the bottom split.
1876 *
1877 * It's completely arbitrary whether the new key
1878 * is the top or the bottom, but it has to match
1879 * up with what btree_sort_fixup() does - it
1880 * doesn't check for this kind of overlap, it
1881 * depends on us inserting a new key for the top
1882 * here.
1883 */
1884 top = bch_bset_search(b, &b->sets[b->nsets],
1885 insert);
1886 shift_keys(b, top, k);
1887 } else {
1888 BKEY_PADDED(key) temp;
1889 bkey_copy(&temp.key, k);
1890 shift_keys(b, k, &temp.key);
1891 top = bkey_next(k);
1892 }
1893
1894 bch_cut_front(insert, top);
1895 bch_cut_back(&START_KEY(insert), k);
1896 bch_bset_fix_invalidated_key(b, k);
1897 return false;
1898 }
1899
1900 if (bkey_cmp(insert, k) < 0) {
1901 bch_cut_front(insert, k);
1902 } else {
1903 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1904 old_offset = KEY_START(insert);
1905
1906 if (bkey_written(b, k) &&
1907 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1908 /*
1909 * Completely overwrote, so we don't have to
1910 * invalidate the binary search tree
1911 */
1912 bch_cut_front(k, k);
1913 } else {
1914 __bch_cut_back(&START_KEY(insert), k);
1915 bch_bset_fix_invalidated_key(b, k);
1916 }
1917 }
1918
1919 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1920 }
1921 1793
1922check_failed: 1794 status = bch_btree_insert_key(&b->keys, k, replace_key);
1923 if (replace_key) { 1795 if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1924 if (!sectors_found) { 1796 bch_check_keys(&b->keys, "%u for %s", status,
1925 return true; 1797 replace_key ? "replace" : "insert");
1926 } else if (sectors_found < KEY_SIZE(insert)) {
1927 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1928 (KEY_SIZE(insert) - sectors_found));
1929 SET_KEY_SIZE(insert, sectors_found);
1930 }
1931 }
1932 1798
1933 return false; 1799 trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1800 status);
1801 return true;
1802 } else
1803 return false;
1934} 1804}
1935 1805
1936static bool btree_insert_key(struct btree *b, struct btree_op *op, 1806static size_t insert_u64s_remaining(struct btree *b)
1937 struct bkey *k, struct bkey *replace_key)
1938{ 1807{
1939 struct bset *i = b->sets[b->nsets].data; 1808 ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys);
1940 struct bkey *m, *prev;
1941 unsigned status = BTREE_INSERT_STATUS_INSERT;
1942
1943 BUG_ON(bkey_cmp(k, &b->key) > 0);
1944 BUG_ON(b->level && !KEY_PTRS(k));
1945 BUG_ON(!b->level && !KEY_OFFSET(k));
1946
1947 if (!b->level) {
1948 struct btree_iter iter;
1949
1950 /*
1951 * bset_search() returns the first key that is strictly greater
1952 * than the search key - but for back merging, we want to find
1953 * the previous key.
1954 */
1955 prev = NULL;
1956 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1957 1809
1958 if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1810 /*
1959 op->insert_collision = true; 1811 * Might land in the middle of an existing extent and have to split it
1960 return false; 1812 */
1961 } 1813 if (b->keys.ops->is_extents)
1962 1814 ret -= KEY_MAX_U64S;
1963 if (KEY_DIRTY(k))
1964 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1965 KEY_START(k), KEY_SIZE(k));
1966
1967 while (m != end(i) &&
1968 bkey_cmp(k, &START_KEY(m)) > 0)
1969 prev = m, m = bkey_next(m);
1970
1971 if (key_merging_disabled(b->c))
1972 goto insert;
1973
1974 /* prev is in the tree, if we merge we're done */
1975 status = BTREE_INSERT_STATUS_BACK_MERGE;
1976 if (prev &&
1977 bch_bkey_try_merge(b, prev, k))
1978 goto merged;
1979
1980 status = BTREE_INSERT_STATUS_OVERWROTE;
1981 if (m != end(i) &&
1982 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1983 goto copy;
1984
1985 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1986 if (m != end(i) &&
1987 bch_bkey_try_merge(b, k, m))
1988 goto copy;
1989 } else {
1990 BUG_ON(replace_key);
1991 m = bch_bset_search(b, &b->sets[b->nsets], k);
1992 }
1993
1994insert: shift_keys(b, m, k);
1995copy: bkey_copy(m, k);
1996merged:
1997 bch_check_keys(b, "%u for %s", status,
1998 replace_key ? "replace" : "insert");
1999
2000 if (b->level && !KEY_OFFSET(k))
2001 btree_current_write(b)->prio_blocked++;
2002
2003 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2004 1815
2005 return true; 1816 return max(ret, 0L);
2006} 1817}
2007 1818
2008static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1819static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2010 struct bkey *replace_key) 1821 struct bkey *replace_key)
2011{ 1822{
2012 bool ret = false; 1823 bool ret = false;
2013 int oldsize = bch_count_data(b); 1824 int oldsize = bch_count_data(&b->keys);
2014 1825
2015 while (!bch_keylist_empty(insert_keys)) { 1826 while (!bch_keylist_empty(insert_keys)) {
2016 struct bset *i = write_block(b);
2017 struct bkey *k = insert_keys->keys; 1827 struct bkey *k = insert_keys->keys;
2018 1828
2019 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1829 if (bkey_u64s(k) > insert_u64s_remaining(b))
2020 > btree_blocks(b))
2021 break; 1830 break;
2022 1831
2023 if (bkey_cmp(k, &b->key) <= 0) { 1832 if (bkey_cmp(k, &b->key) <= 0) {
2024 if (!b->level) 1833 if (!b->level)
2025 bkey_put(b->c, k); 1834 bkey_put(b->c, k);
2026 1835
2027 ret |= btree_insert_key(b, op, k, replace_key); 1836 ret |= btree_insert_key(b, k, replace_key);
2028 bch_keylist_pop_front(insert_keys); 1837 bch_keylist_pop_front(insert_keys);
2029 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1838 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2030 BKEY_PADDED(key) temp; 1839 BKEY_PADDED(key) temp;
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2033 bch_cut_back(&b->key, &temp.key); 1842 bch_cut_back(&b->key, &temp.key);
2034 bch_cut_front(&b->key, insert_keys->keys); 1843 bch_cut_front(&b->key, insert_keys->keys);
2035 1844
2036 ret |= btree_insert_key(b, op, &temp.key, replace_key); 1845 ret |= btree_insert_key(b, &temp.key, replace_key);
2037 break; 1846 break;
2038 } else { 1847 } else {
2039 break; 1848 break;
2040 } 1849 }
2041 } 1850 }
2042 1851
1852 if (!ret)
1853 op->insert_collision = true;
1854
2043 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1855 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2044 1856
2045 BUG_ON(bch_count_data(b) < oldsize); 1857 BUG_ON(bch_count_data(&b->keys) < oldsize);
2046 return ret; 1858 return ret;
2047} 1859}
2048 1860
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2059 closure_init_stack(&cl); 1871 closure_init_stack(&cl);
2060 bch_keylist_init(&parent_keys); 1872 bch_keylist_init(&parent_keys);
2061 1873
1874 if (!b->level &&
1875 btree_check_reserve(b, op))
1876 return -EINTR;
1877
2062 n1 = btree_node_alloc_replacement(b, true); 1878 n1 = btree_node_alloc_replacement(b, true);
2063 if (IS_ERR(n1)) 1879 if (IS_ERR(n1))
2064 goto err; 1880 goto err;
2065 1881
2066 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1882 split = set_blocks(btree_bset_first(n1),
1883 block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
2067 1884
2068 if (split) { 1885 if (split) {
2069 unsigned keys = 0; 1886 unsigned keys = 0;
2070 1887
2071 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1888 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2072 1889
2073 n2 = bch_btree_node_alloc(b->c, b->level, true); 1890 n2 = bch_btree_node_alloc(b->c, b->level, true);
2074 if (IS_ERR(n2)) 1891 if (IS_ERR(n2))
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
2087 * search tree yet 1904 * search tree yet
2088 */ 1905 */
2089 1906
2090 while (keys < (n1->sets[0].data->keys * 3) / 5) 1907 while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2091 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1908 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
1909 keys));
2092 1910
2093 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1911 bkey_copy_key(&n1->key,
2094 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1912 bset_bkey_idx(btree_bset_first(n1), keys));
1913 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2095 1914
2096 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1915 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2097 n1->sets[0].data->keys = keys; 1916 btree_bset_first(n1)->keys = keys;
2098 1917
2099 memcpy(n2->sets[0].data->start, 1918 memcpy(btree_bset_first(n2)->start,
2100 end(n1->sets[0].data), 1919 bset_bkey_last(btree_bset_first(n1)),
2101 n2->sets[0].data->keys * sizeof(uint64_t)); 1920 btree_bset_first(n2)->keys * sizeof(uint64_t));
2102 1921
2103 bkey_copy_key(&n2->key, &b->key); 1922 bkey_copy_key(&n2->key, &b->key);
2104 1923
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
2106 bch_btree_node_write(n2, &cl); 1925 bch_btree_node_write(n2, &cl);
2107 rw_unlock(true, n2); 1926 rw_unlock(true, n2);
2108 } else { 1927 } else {
2109 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1928 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2110 1929
2111 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1930 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2112 } 1931 }
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2149 1968
2150 return 0; 1969 return 0;
2151err_free2: 1970err_free2:
1971 bkey_put(b->c, &n2->key);
2152 btree_node_free(n2); 1972 btree_node_free(n2);
2153 rw_unlock(true, n2); 1973 rw_unlock(true, n2);
2154err_free1: 1974err_free1:
1975 bkey_put(b->c, &n1->key);
2155 btree_node_free(n1); 1976 btree_node_free(n1);
2156 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
2157err: 1978err:
1979 WARN(1, "bcache: btree split failed");
1980
2158 if (n3 == ERR_PTR(-EAGAIN) || 1981 if (n3 == ERR_PTR(-EAGAIN) ||
2159 n2 == ERR_PTR(-EAGAIN) || 1982 n2 == ERR_PTR(-EAGAIN) ||
2160 n1 == ERR_PTR(-EAGAIN)) 1983 n1 == ERR_PTR(-EAGAIN))
2161 return -EAGAIN; 1984 return -EAGAIN;
2162 1985
2163 pr_warn("couldn't split");
2164 return -ENOMEM; 1986 return -ENOMEM;
2165} 1987}
2166 1988
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2171{ 1993{
2172 BUG_ON(b->level && replace_key); 1994 BUG_ON(b->level && replace_key);
2173 1995
2174 if (should_split(b)) { 1996 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2175 if (current->bio_list) { 1997 if (current->bio_list) {
2176 op->lock = b->c->root->level + 1; 1998 op->lock = b->c->root->level + 1;
2177 return -EAGAIN; 1999 return -EAGAIN;
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2180 return -EINTR; 2002 return -EINTR;
2181 } else { 2003 } else {
2182 /* Invalidated all iterators */ 2004 /* Invalidated all iterators */
2183 return btree_split(b, op, insert_keys, replace_key) ?: 2005 int ret = btree_split(b, op, insert_keys, replace_key);
2184 -EINTR; 2006
2007 return bch_keylist_empty(insert_keys) ?
2008 0 : ret ?: -EINTR;
2185 } 2009 }
2186 } else { 2010 } else {
2187 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2011 BUG_ON(write_block(b) != btree_bset_last(b));
2188 2012
2189 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2013 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2190 if (!b->level) 2014 if (!b->level)
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2323 struct bkey *k; 2147 struct bkey *k;
2324 struct btree_iter iter; 2148 struct btree_iter iter;
2325 2149
2326 bch_btree_iter_init(b, &iter, from); 2150 bch_btree_iter_init(&b->keys, &iter, from);
2327 2151
2328 while ((k = bch_btree_iter_next_filter(&iter, b, 2152 while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2329 bch_ptr_bad))) { 2153 bch_ptr_bad))) {
2330 ret = btree(map_nodes_recurse, k, b, 2154 ret = btree(map_nodes_recurse, k, b,
2331 op, from, fn, flags); 2155 op, from, fn, flags);
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2356 struct bkey *k; 2180 struct bkey *k;
2357 struct btree_iter iter; 2181 struct btree_iter iter;
2358 2182
2359 bch_btree_iter_init(b, &iter, from); 2183 bch_btree_iter_init(&b->keys, &iter, from);
2360 2184
2361 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2185 while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2362 ret = !b->level 2186 ret = !b->level
2363 ? fn(op, b, k) 2187 ? fn(op, b, k)
2364 : btree(map_keys_recurse, k, b, op, from, fn, flags); 2188 : btree(map_keys_recurse, k, b, op, from, fn, flags);
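
The btree.c hunks above replace the open-coded extent overlap handling with bch_btree_insert_key() operating on the embedded btree_keys, and gate both inserts and splits on insert_u64s_remaining(), which keeps KEY_MAX_U64S of headroom on extent nodes because an insert can land in the middle of an existing extent and force a split. Below is a minimal standalone sketch of that headroom check; every name and constant here is a toy stand-in, not the kernel's.

#include <stdbool.h>
#include <stddef.h>

#define TOY_KEY_MAX_U64S 8 /* assumed worst-case key size, in u64s */

struct toy_node {
	size_t capacity_u64s; /* space left in the unwritten bset */
	size_t used_u64s;
	bool is_extents;      /* leaf node holding extents? */
};

static size_t toy_insert_u64s_remaining(const struct toy_node *b)
{
	long ret = (long)b->capacity_u64s - (long)b->used_u64s;

	/* an insert may land mid-extent and split it, costing one more key */
	if (b->is_extents)
		ret -= TOY_KEY_MAX_U64S;

	return ret > 0 ? (size_t)ret : 0;
}

/* mirrors the check bch_btree_insert_node() now makes before splitting */
static bool toy_needs_split(const struct toy_node *b, size_t insert_u64s)
{
	return insert_u64s > toy_insert_u64s_remaining(b);
}
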
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 767e75570896..af065e97e55c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -130,20 +130,12 @@ struct btree {
130 unsigned long flags; 130 unsigned long flags;
131 uint16_t written; /* would be nice to kill */ 131 uint16_t written; /* would be nice to kill */
132 uint8_t level; 132 uint8_t level;
133 uint8_t nsets; 133
134 uint8_t page_order; 134 struct btree_keys keys;
135
136 /*
137 * Set of sorted keys - the real btree node - plus a binary search tree
138 *
139 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
140 * to the memory we have allocated for this btree node. Additionally,
141 * set[0]->data points to the entire btree node as it exists on disk.
142 */
143 struct bset_tree sets[MAX_BSETS];
144 135
145 /* For outstanding btree writes, used as a lock - protects write_idx */ 136 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 137 struct closure io;
138 struct semaphore io_mutex;
147 139
148 struct list_head list; 140 struct list_head list;
149 struct delayed_work work; 141 struct delayed_work work;
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
179 return b->writes + (btree_node_write_idx(b) ^ 1); 171 return b->writes + (btree_node_write_idx(b) ^ 1);
180} 172}
181 173
182static inline unsigned bset_offset(struct btree *b, struct bset *i) 174static inline struct bset *btree_bset_first(struct btree *b)
183{ 175{
184 return (((size_t) i) - ((size_t) b->sets->data)) >> 9; 176 return b->keys.set->data;
185} 177}
186 178
187static inline struct bset *write_block(struct btree *b) 179static inline struct bset *btree_bset_last(struct btree *b)
188{ 180{
189 return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); 181 return bset_tree_last(&b->keys)->data;
190} 182}
191 183
192static inline bool bset_written(struct btree *b, struct bset_tree *t) 184static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
193{ 185{
194 return t->data < write_block(b); 186 return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
195}
196
197static inline bool bkey_written(struct btree *b, struct bkey *k)
198{
199 return k < write_block(b)->start;
200} 187}
201 188
202static inline void set_gc_sectors(struct cache_set *c) 189static inline void set_gc_sectors(struct cache_set *c)
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c)
204 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); 191 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
205} 192}
206 193
207static inline struct bkey *bch_btree_iter_init(struct btree *b,
208 struct btree_iter *iter,
209 struct bkey *search)
210{
211 return __bch_btree_iter_init(b, iter, search, b->sets);
212}
213
214static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
215{
216 if (b->level)
217 return bch_btree_ptr_invalid(b->c, k);
218 else
219 return bch_extent_ptr_invalid(b->c, k);
220}
221
222void bkey_put(struct cache_set *c, struct bkey *k); 194void bkey_put(struct cache_set *c, struct bkey *k);
223 195
224/* Looping macros */ 196/* Looping macros */
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
229 iter++) \ 201 iter++) \
230 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) 202 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
231 203
232#define for_each_key_filter(b, k, iter, filter) \
233 for (bch_btree_iter_init((b), (iter), NULL); \
234 ((k) = bch_btree_iter_next_filter((iter), b, filter));)
235
236#define for_each_key(b, k, iter) \
237 for (bch_btree_iter_init((b), (iter), NULL); \
238 ((k) = bch_btree_iter_next(iter));)
239
240/* Recursing down the btree */ 204/* Recursing down the btree */
241 205
242struct btree_op { 206struct btree_op {
207 /* for waiting on btree reserve in btree_split() */
208 wait_queue_t wait;
209
243 /* Btree level at which we start taking write locks */ 210 /* Btree level at which we start taking write locks */
244 short lock; 211 short lock;
245 212
@@ -249,6 +216,7 @@ struct btree_op {
249static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) 216static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
250{ 217{
251 memset(op, 0, sizeof(struct btree_op)); 218 memset(op, 0, sizeof(struct btree_op));
219 init_wait(&op->wait);
252 op->lock = write_lock_level; 220 op->lock = write_lock_level;
253} 221}
254 222
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
267 (w ? up_write : up_read)(&b->lock); 235 (w ? up_write : up_read)(&b->lock);
268} 236}
269 237
270void bch_btree_node_read(struct btree *); 238void bch_btree_node_read_done(struct btree *);
271void bch_btree_node_write(struct btree *, struct closure *); 239void bch_btree_node_write(struct btree *, struct closure *);
272 240
273void bch_btree_set_root(struct btree *); 241void bch_btree_set_root(struct btree *);
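
With struct btree now embedding struct btree_keys, code that used to index b->sets[] reaches the bsets through the new accessors instead. A short sketch of the intended usage, assuming only the helpers declared in this header; the two wrapper functions are hypothetical.

/* first bset: the node as it exists on disk */
static unsigned node_ondisk_keys(struct btree *b)
{
	return btree_bset_first(b)->keys;
}

/* last bset: the in-memory set still being appended to */
static unsigned node_unwritten_keys(struct btree *b)
{
	return btree_bset_last(b)->keys;
}
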
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index dfff2410322e..7a228de95fd7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,19 +11,6 @@
11 11
12#include "closure.h" 12#include "closure.h"
13 13
14#define CL_FIELD(type, field) \
15 case TYPE_ ## type: \
16 return &container_of(cl, struct type, cl)->field
17
18static struct closure_waitlist *closure_waitlist(struct closure *cl)
19{
20 switch (cl->type) {
21 CL_FIELD(closure_with_waitlist, wait);
22 default:
23 return NULL;
24 }
25}
26
27static inline void closure_put_after_sub(struct closure *cl, int flags) 14static inline void closure_put_after_sub(struct closure *cl, int flags)
28{ 15{
29 int r = flags & CLOSURE_REMAINING_MASK; 16 int r = flags & CLOSURE_REMAINING_MASK;
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
42 closure_queue(cl); 29 closure_queue(cl);
43 } else { 30 } else {
44 struct closure *parent = cl->parent; 31 struct closure *parent = cl->parent;
45 struct closure_waitlist *wait = closure_waitlist(cl);
46 closure_fn *destructor = cl->fn; 32 closure_fn *destructor = cl->fn;
47 33
48 closure_debug_destroy(cl); 34 closure_debug_destroy(cl);
49 35
50 smp_mb();
51 atomic_set(&cl->remaining, -1);
52
53 if (wait)
54 closure_wake_up(wait);
55
56 if (destructor) 36 if (destructor)
57 destructor(cl); 37 destructor(cl);
58 38
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
69} 49}
70EXPORT_SYMBOL(closure_sub); 50EXPORT_SYMBOL(closure_sub);
71 51
52/**
53 * closure_put - decrement a closure's refcount
54 */
72void closure_put(struct closure *cl) 55void closure_put(struct closure *cl)
73{ 56{
74 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 57 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
75} 58}
76EXPORT_SYMBOL(closure_put); 59EXPORT_SYMBOL(closure_put);
77 60
78static void set_waiting(struct closure *cl, unsigned long f) 61/**
79{ 62 * __closure_wake_up - wake up all closures on a wait list, without memory barrier
80#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 63 */
81 cl->waiting_on = f;
82#endif
83}
84
85void __closure_wake_up(struct closure_waitlist *wait_list) 64void __closure_wake_up(struct closure_waitlist *wait_list)
86{ 65{
87 struct llist_node *list; 66 struct llist_node *list;
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
106 cl = container_of(reverse, struct closure, list); 85 cl = container_of(reverse, struct closure, list);
107 reverse = llist_next(reverse); 86 reverse = llist_next(reverse);
108 87
109 set_waiting(cl, 0); 88 closure_set_waiting(cl, 0);
110 closure_sub(cl, CLOSURE_WAITING + 1); 89 closure_sub(cl, CLOSURE_WAITING + 1);
111 } 90 }
112} 91}
113EXPORT_SYMBOL(__closure_wake_up); 92EXPORT_SYMBOL(__closure_wake_up);
114 93
115bool closure_wait(struct closure_waitlist *list, struct closure *cl) 94/**
95 * closure_wait - add a closure to a waitlist
96 *
97 * @waitlist will own a ref on @cl, which will be released when
98 * closure_wake_up() is called on @waitlist.
99 *
100 */
101bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
116{ 102{
117 if (atomic_read(&cl->remaining) & CLOSURE_WAITING) 103 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
118 return false; 104 return false;
119 105
120 set_waiting(cl, _RET_IP_); 106 closure_set_waiting(cl, _RET_IP_);
121 atomic_add(CLOSURE_WAITING + 1, &cl->remaining); 107 atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
122 llist_add(&cl->list, &list->list); 108 llist_add(&cl->list, &waitlist->list);
123 109
124 return true; 110 return true;
125} 111}
126EXPORT_SYMBOL(closure_wait); 112EXPORT_SYMBOL(closure_wait);
127 113
128/** 114/**
129 * closure_sync() - sleep until a closure has nothing left to wait on 115 * closure_sync - sleep until a closure has nothing left to wait on
130 * 116 *
131 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 117 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
132 * the last refcount. 118 * the last refcount.
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
148} 134}
149EXPORT_SYMBOL(closure_sync); 135EXPORT_SYMBOL(closure_sync);
150 136
151/**
152 * closure_trylock() - try to acquire the closure, without waiting
153 * @cl: closure to lock
154 *
155 * Returns true if the closure was successfully locked.
156 */
157bool closure_trylock(struct closure *cl, struct closure *parent)
158{
159 if (atomic_cmpxchg(&cl->remaining, -1,
160 CLOSURE_REMAINING_INITIALIZER) != -1)
161 return false;
162
163 smp_mb();
164
165 cl->parent = parent;
166 if (parent)
167 closure_get(parent);
168
169 closure_set_ret_ip(cl);
170 closure_debug_create(cl);
171 return true;
172}
173EXPORT_SYMBOL(closure_trylock);
174
175void __closure_lock(struct closure *cl, struct closure *parent,
176 struct closure_waitlist *wait_list)
177{
178 struct closure wait;
179 closure_init_stack(&wait);
180
181 while (1) {
182 if (closure_trylock(cl, parent))
183 return;
184
185 closure_wait_event(wait_list, &wait,
186 atomic_read(&cl->remaining) == -1);
187 }
188}
189EXPORT_SYMBOL(__closure_lock);
190
191#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 137#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
192 138
193static LIST_HEAD(closure_list); 139static LIST_HEAD(closure_list);
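
closure_wait() parks a closure on a waitlist and takes a CLOSURE_WAITING ref that __closure_wake_up() later drops, so a waiter can simply sync on its own closure until someone wakes the list. A hedged sketch of that pattern follows; struct pending_io and both functions are hypothetical callers, and only the closure_* calls come from this file and closure.h.

struct pending_io {
	struct closure_waitlist wait; /* woken when the resource frees up */
};

/* @cl is assumed to have been set up with closure_init_stack() */
static void park_until_woken(struct pending_io *p, struct closure *cl)
{
	/*
	 * closure_wait() returns false if @cl is already on a waitlist;
	 * otherwise it takes a ref on @cl that closure_wake_up() drops.
	 */
	if (closure_wait(&p->wait, cl))
		closure_sync(cl); /* sleep until that ref is dropped */
}

static void resource_freed(struct pending_io *p)
{
	closure_wake_up(&p->wait); /* wakes every closure parked above */
}
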
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9762f1be3304..7ef7461912be 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -72,30 +72,6 @@
72 * closure - _always_ use continue_at(). Doing so consistently will help 72 * closure - _always_ use continue_at(). Doing so consistently will help
73 * eliminate an entire class of particularly pernicious races. 73 * eliminate an entire class of particularly pernicious races.
74 * 74 *
75 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
76 *
77 * struct closure_waitlist list;
78 * closure_wait_event(list, cl, condition);
79 * closure_wake_up(wait_list);
80 *
81 * These work analogously to wait_event() and wake_up() - except that instead of
82 * operating on the current thread (for wait_event()) and lists of threads, they
83 * operate on an explicit closure and lists of closures.
84 *
85 * Because it's a closure we can now wait either synchronously or
86 * asynchronously. closure_wait_event() returns the current value of the
87 * condition, and if it returned false continue_at() or closure_sync() can be
88 * used to wait for it to become true.
89 *
90 * It's useful for waiting on things when you can't sleep in the context in
91 * which you must check the condition (perhaps a spinlock held, or you might be
92 * beneath generic_make_request() - in which case you can't sleep on IO).
93 *
94 * closure_wait_event() will wait either synchronously or asynchronously,
95 * depending on whether the closure is in blocking mode or not. You can pick a
96 * mode explicitly with closure_wait_event_sync() and
97 * closure_wait_event_async(), which do just what you might expect.
98 *
99 * Lastly, you might have a wait list dedicated to a specific event, and have no 75 * Lastly, you might have a wait list dedicated to a specific event, and have no
100 * need for specifying the condition - you just want to wait until someone runs 76 * need for specifying the condition - you just want to wait until someone runs
101 * closure_wake_up() on the appropriate wait list. In that case, just use 77 * closure_wake_up() on the appropriate wait list. In that case, just use
@@ -121,40 +97,6 @@
121 * All this implies that a closure should typically be embedded in a particular 97 * All this implies that a closure should typically be embedded in a particular
122 * struct (which its refcount will normally control the lifetime of), and that 98 * struct (which its refcount will normally control the lifetime of), and that
123 * struct can very much be thought of as a stack frame. 99 * struct can very much be thought of as a stack frame.
124 *
125 * Locking:
126 *
127 * Closures are based on work items but they can be thought of as more like
128 * threads - in that like threads and unlike work items they have a well
129 * defined lifetime; they are created (with closure_init()) and eventually
130 * complete after a continue_at(cl, NULL, NULL).
131 *
132 * Suppose you've got some larger structure with a closure embedded in it that's
133 * used for periodically doing garbage collection. You only want one garbage
134 * collection happening at a time, so the natural thing to do is protect it with
135 * a lock. However, it's difficult to use a lock protecting a closure correctly
136 * because the unlock should come after the last continue_at() (additionally, if
137 * you're using the closure asynchronously a mutex won't work since a mutex has
138 * to be unlocked by the same process that locked it).
139 *
140 * So to make it less error prone and more efficient, we also have the ability
141 * to use closures as locks:
142 *
143 * closure_init_unlocked();
144 * closure_trylock();
145 *
146 * That's all we need for trylock() - the last closure_put() implicitly unlocks
147 * it for you. But for closure_lock(), we also need a wait list:
148 *
149 * struct closure_with_waitlist frobnicator_cl;
150 *
151 * closure_init_unlocked(&frobnicator_cl);
152 * closure_lock(&frobnicator_cl);
153 *
154 * A closure_with_waitlist embeds a closure and a wait list - much like struct
155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you.
158 */ 100 */
159 101
160struct closure; 102struct closure;
@@ -164,12 +106,6 @@ struct closure_waitlist {
164 struct llist_head list; 106 struct llist_head list;
165}; 107};
166 108
167enum closure_type {
168 TYPE_closure = 0,
169 TYPE_closure_with_waitlist = 1,
170 MAX_CLOSURE_TYPE = 1,
171};
172
173enum closure_state { 109enum closure_state {
174 /* 110 /*
175 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 111 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
@@ -224,8 +160,6 @@ struct closure {
224 160
225 atomic_t remaining; 161 atomic_t remaining;
226 162
227 enum closure_type type;
228
229#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 163#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
230#define CLOSURE_MAGIC_DEAD 0xc054dead 164#define CLOSURE_MAGIC_DEAD 0xc054dead
231#define CLOSURE_MAGIC_ALIVE 0xc054a11e 165#define CLOSURE_MAGIC_ALIVE 0xc054a11e
@@ -237,34 +171,12 @@ struct closure {
237#endif 171#endif
238}; 172};
239 173
240struct closure_with_waitlist {
241 struct closure cl;
242 struct closure_waitlist wait;
243};
244
245extern unsigned invalid_closure_type(void);
246
247#define __CLOSURE_TYPE(cl, _t) \
248 __builtin_types_compatible_p(typeof(cl), struct _t) \
249 ? TYPE_ ## _t : \
250
251#define __closure_type(cl) \
252( \
253 __CLOSURE_TYPE(cl, closure) \
254 __CLOSURE_TYPE(cl, closure_with_waitlist) \
255 invalid_closure_type() \
256)
257
258void closure_sub(struct closure *cl, int v); 174void closure_sub(struct closure *cl, int v);
259void closure_put(struct closure *cl); 175void closure_put(struct closure *cl);
260void __closure_wake_up(struct closure_waitlist *list); 176void __closure_wake_up(struct closure_waitlist *list);
261bool closure_wait(struct closure_waitlist *list, struct closure *cl); 177bool closure_wait(struct closure_waitlist *list, struct closure *cl);
262void closure_sync(struct closure *cl); 178void closure_sync(struct closure *cl);
263 179
264bool closure_trylock(struct closure *cl, struct closure *parent);
265void __closure_lock(struct closure *cl, struct closure *parent,
266 struct closure_waitlist *wait_list);
267
268#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 180#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
269 181
270void closure_debug_init(void); 182void closure_debug_init(void);
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl)
293#endif 205#endif
294} 206}
295 207
296static inline void closure_get(struct closure *cl) 208static inline void closure_set_waiting(struct closure *cl, unsigned long f)
297{ 209{
298#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 210#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
299 BUG_ON((atomic_inc_return(&cl->remaining) & 211 cl->waiting_on = f;
300 CLOSURE_REMAINING_MASK) <= 1);
301#else
302 atomic_inc(&cl->remaining);
303#endif 212#endif
304} 213}
305 214
306static inline void closure_set_stopped(struct closure *cl) 215static inline void __closure_end_sleep(struct closure *cl)
307{ 216{
308 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 217 __set_current_state(TASK_RUNNING);
218
219 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
220 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
309} 221}
310 222
311static inline bool closure_is_unlocked(struct closure *cl) 223static inline void __closure_start_sleep(struct closure *cl)
312{ 224{
313 return atomic_read(&cl->remaining) == -1; 225 closure_set_ip(cl);
226 cl->task = current;
227 set_current_state(TASK_UNINTERRUPTIBLE);
228
229 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
230 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
314} 231}
315 232
316static inline void do_closure_init(struct closure *cl, struct closure *parent, 233static inline void closure_set_stopped(struct closure *cl)
317 bool running)
318{ 234{
319 cl->parent = parent; 235 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
320 if (parent) 236}
321 closure_get(parent);
322
323 if (running) {
324 closure_debug_create(cl);
325 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
326 } else
327 atomic_set(&cl->remaining, -1);
328 237
238static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
239 struct workqueue_struct *wq)
240{
241 BUG_ON(object_is_on_stack(cl));
329 closure_set_ip(cl); 242 closure_set_ip(cl);
243 cl->fn = fn;
244 cl->wq = wq;
245 /* between atomic_dec() in closure_put() */
246 smp_mb__before_atomic_dec();
330} 247}
331 248
332/* 249static inline void closure_queue(struct closure *cl)
333 * Hack to get at the embedded closure if there is one, by doing an unsafe cast: 250{
334 * the result of __closure_type() is thrown away, it's used merely for type 251 struct workqueue_struct *wq = cl->wq;
335 * checking. 252 if (wq) {
336 */ 253 INIT_WORK(&cl->work, cl->work.func);
337#define __to_internal_closure(cl) \ 254 BUG_ON(!queue_work(wq, &cl->work));
338({ \ 255 } else
339 BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ 256 cl->fn(cl);
340 (struct closure *) cl; \ 257}
341})
342
343#define closure_init_type(cl, parent, running) \
344do { \
345 struct closure *_cl = __to_internal_closure(cl); \
346 _cl->type = __closure_type(*(cl)); \
347 do_closure_init(_cl, parent, running); \
348} while (0)
349 258
350/** 259/**
351 * __closure_init() - Initialize a closure, skipping the memset() 260 * closure_get - increment a closure's refcount
352 *
353 * May be used instead of closure_init() when memory has already been zeroed.
354 */ 261 */
355#define __closure_init(cl, parent) \ 262static inline void closure_get(struct closure *cl)
356 closure_init_type(cl, parent, true) 263{
264#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
265 BUG_ON((atomic_inc_return(&cl->remaining) &
266 CLOSURE_REMAINING_MASK) <= 1);
267#else
268 atomic_inc(&cl->remaining);
269#endif
270}
357 271
358/** 272/**
359 * closure_init() - Initialize a closure, setting the refcount to 1 273 * closure_init - Initialize a closure, setting the refcount to 1
360 * @cl: closure to initialize 274 * @cl: closure to initialize
361 * @parent: parent of the new closure. cl will take a refcount on it for its 275 * @parent: parent of the new closure. cl will take a refcount on it for its
362 * lifetime; may be NULL. 276 * lifetime; may be NULL.
363 */ 277 */
364#define closure_init(cl, parent) \ 278static inline void closure_init(struct closure *cl, struct closure *parent)
365do { \
366 memset((cl), 0, sizeof(*(cl))); \
367 __closure_init(cl, parent); \
368} while (0)
369
370static inline void closure_init_stack(struct closure *cl)
371{ 279{
372 memset(cl, 0, sizeof(struct closure)); 280 memset(cl, 0, sizeof(struct closure));
373 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 281 cl->parent = parent;
374} 282 if (parent)
375 283 closure_get(parent);
376/**
377 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
378 * @cl: closure to initialize
379 *
380 * For when the closure will be used as a lock. The closure may not be used
381 * until after a closure_lock() or closure_trylock().
382 */
383#define closure_init_unlocked(cl) \
384do { \
385 memset((cl), 0, sizeof(*(cl))); \
386 closure_init_type(cl, NULL, false); \
387} while (0)
388
389/**
390 * closure_lock() - lock and initialize a closure.
391 * @cl: the closure to lock
392 * @parent: the new parent for this closure
393 *
394 * The closure must be of one of the types that has a waitlist (otherwise we
395 * wouldn't be able to sleep on contention).
396 *
397 * @parent has exactly the same meaning as in closure_init(); if non null, the
398 * closure will take a reference on @parent which will be released when it is
399 * unlocked.
400 */
401#define closure_lock(cl, parent) \
402 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
403 284
404static inline void __closure_end_sleep(struct closure *cl) 285 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
405{
406 __set_current_state(TASK_RUNNING);
407 286
408 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) 287 closure_debug_create(cl);
409 atomic_sub(CLOSURE_SLEEPING, &cl->remaining); 288 closure_set_ip(cl);
410} 289}
411 290
412static inline void __closure_start_sleep(struct closure *cl) 291static inline void closure_init_stack(struct closure *cl)
413{ 292{
414 closure_set_ip(cl); 293 memset(cl, 0, sizeof(struct closure));
415 cl->task = current; 294 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
416 set_current_state(TASK_UNINTERRUPTIBLE);
417
418 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
419 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
420} 295}
421 296
422/** 297/**
423 * closure_wake_up() - wake up all closures on a wait list. 298 * closure_wake_up - wake up all closures on a wait list.
424 */ 299 */
425static inline void closure_wake_up(struct closure_waitlist *list) 300static inline void closure_wake_up(struct closure_waitlist *list)
426{ 301{
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list)
428 __closure_wake_up(list); 303 __closure_wake_up(list);
429} 304}
430 305
431/* 306/**
432 * Wait on an event, synchronously or asynchronously - analogous to wait_event() 307 * continue_at - jump to another function with barrier
433 * but for closures. 308 *
434 * 309 * After @cl is no longer waiting on anything (i.e. all outstanding refs have
435 * The loop is oddly structured so as to avoid a race; we must check the 310 * been dropped with closure_put()), it will resume execution at @fn running out
436 * condition again after we've added ourself to the waitlist. We know if we were 311 * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
437 * already on the waitlist because closure_wait() returns false; thus, we only 312 *
438 * schedule or break if closure_wait() returns false. If it returns true, we 313 * NOTE: This macro expands to a return in the calling function!
439 * just loop again - rechecking the condition. 314 *
440 * 315 * This is because after calling continue_at() you no longer have a ref on @cl,
441 * The __closure_wake_up() is necessary because we may race with the event 316 * and whatever @cl owns may be freed out from under you - a running closure fn
442 * becoming true; i.e. we see event false -> wait -> recheck condition, but the 317 * has a ref on its own closure which continue_at() drops.
443 * thread that made the event true may have called closure_wake_up() before we
444 * added ourself to the wait list.
445 *
446 * We have to call closure_sync() at the end instead of just
447 * __closure_end_sleep() because a different thread might've called
448 * closure_wake_up() before us and gotten preempted before they dropped the
449 * refcount on our closure. If this was a stack allocated closure, that would be
450 * bad.
451 */ 318 */
452#define closure_wait_event(list, cl, condition) \
453({ \
454 typeof(condition) ret; \
455 \
456 while (1) { \
457 ret = (condition); \
458 if (ret) { \
459 __closure_wake_up(list); \
460 closure_sync(cl); \
461 break; \
462 } \
463 \
464 __closure_start_sleep(cl); \
465 \
466 if (!closure_wait(list, cl)) \
467 schedule(); \
468 } \
469 \
470 ret; \
471})
472
473static inline void closure_queue(struct closure *cl)
474{
475 struct workqueue_struct *wq = cl->wq;
476 if (wq) {
477 INIT_WORK(&cl->work, cl->work.func);
478 BUG_ON(!queue_work(wq, &cl->work));
479 } else
480 cl->fn(cl);
481}
482
483static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
484 struct workqueue_struct *wq)
485{
486 BUG_ON(object_is_on_stack(cl));
487 closure_set_ip(cl);
488 cl->fn = fn;
489 cl->wq = wq;
490 /* between atomic_dec() in closure_put() */
491 smp_mb__before_atomic_dec();
492}
493
494#define continue_at(_cl, _fn, _wq) \ 319#define continue_at(_cl, _fn, _wq) \
495do { \ 320do { \
496 set_closure_fn(_cl, _fn, _wq); \ 321 set_closure_fn(_cl, _fn, _wq); \
@@ -498,8 +323,28 @@ do { \
498 return; \ 323 return; \
499} while (0) 324} while (0)
500 325
326/**
327 * closure_return - finish execution of a closure
328 *
329 * This is used to indicate that @cl is finished: when all outstanding refs on
330 * @cl have been dropped @cl's ref on its parent closure (as passed to
331 * closure_init()) will be dropped, if one was specified - thus this can be
332 * thought of as returning to the parent closure.
333 */
501#define closure_return(_cl) continue_at((_cl), NULL, NULL) 334#define closure_return(_cl) continue_at((_cl), NULL, NULL)
502 335
336/**
337 * continue_at_nobarrier - jump to another function without barrier
338 *
339 * Causes @fn to be executed out of @cl, in @wq context (or called directly if
340 * @wq is NULL).
341 *
342 * NOTE: like continue_at(), this macro expands to a return in the caller!
343 *
344 * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
345 * thus it's not safe to touch anything protected by @cl after a
346 * continue_at_nobarrier().
347 */
503#define continue_at_nobarrier(_cl, _fn, _wq) \ 348#define continue_at_nobarrier(_cl, _fn, _wq) \
504do { \ 349do { \
505 set_closure_fn(_cl, _fn, _wq); \ 350 set_closure_fn(_cl, _fn, _wq); \
@@ -507,6 +352,15 @@ do { \
507 return; \ 352 return; \
508} while (0) 353} while (0)
509 354
355/**
356 * closure_return_with_destructor - finish execution of a closure, with destructor
357 *
358 * Works like closure_return(), except @destructor will be called when all
359 * outstanding refs on @cl have been dropped; @destructor may be used to safely
360 * free the memory occupied by @cl, and it is called with the ref on the parent
361 * closure still held - so @destructor could safely return an item to a
362 * freelist protected by @cl's parent.
363 */
510#define closure_return_with_destructor(_cl, _destructor) \ 364#define closure_return_with_destructor(_cl, _destructor) \
511do { \ 365do { \
512 set_closure_fn(_cl, _destructor, NULL); \ 366 set_closure_fn(_cl, _destructor, NULL); \
@@ -514,6 +368,13 @@ do { \
514 return; \ 368 return; \
515} while (0) 369} while (0)
516 370
371/**
372 * closure_call - execute @fn out of a new, uninitialized closure
373 *
374 * Typically used when running out of one closure, and we want to run @fn
375 * asynchronously out of a new closure - @parent will then wait for @cl to
376 * finish.
377 */
517static inline void closure_call(struct closure *cl, closure_fn fn, 378static inline void closure_call(struct closure *cl, closure_fn fn,
518 struct workqueue_struct *wq, 379 struct workqueue_struct *wq,
519 struct closure *parent) 380 struct closure *parent)
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
522 continue_at_nobarrier(cl, fn, wq); 383 continue_at_nobarrier(cl, fn, wq);
523} 384}
524 385
525static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
526 struct workqueue_struct *wq,
527 struct closure *parent)
528{
529 if (closure_trylock(cl, parent))
530 continue_at_nobarrier(cl, fn, wq);
531}
532
533#endif /* _LINUX_CLOSURE_H */ 386#endif /* _LINUX_CLOSURE_H */
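
The continue_at()/closure_return() documentation added above describes the usual shape of an asynchronous bcache-style operation: initialize a closure, hand out refs to outstanding work, and let the last closure_put() resume the next stage. The sketch below shows that shape under stated assumptions: struct example_op, example_wq and the stage functions are invented for illustration, the op is assumed heap-allocated (set_closure_fn() rejects stack closures), and only the closure_* calls and macros come from this header.

static struct workqueue_struct *example_wq; /* assumed created elsewhere */

struct example_op {
	struct closure cl;
	/* ... per-operation state ... */
};

static void example_finish(struct closure *cl)
{
	/* drops our ref on the parent passed to closure_init(), if any */
	closure_return(cl);
}

static void example_stage2(struct closure *cl)
{
	struct example_op *op = container_of(cl, struct example_op, cl);

	/* ... issue more async work that holds refs on op->cl ... */

	continue_at(&op->cl, example_finish, example_wq);
}

static void example_start(struct example_op *op, struct closure *parent)
{
	closure_init(&op->cl, parent);

	/* ... kick off async work, each piece holding a ref on op->cl ... */

	/* once those refs drop, example_stage2 runs out of example_wq */
	continue_at(&op->cl, example_stage2, example_wq);
}
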
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 03cb4d114e16..8b1f1d5c1819 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,6 +8,7 @@
8#include "bcache.h" 8#include "bcache.h"
9#include "btree.h" 9#include "btree.h"
10#include "debug.h" 10#include "debug.h"
11#include "extents.h"
11 12
12#include <linux/console.h> 13#include <linux/console.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
@@ -17,156 +18,88 @@
17 18
18static struct dentry *debug; 19static struct dentry *debug;
19 20
20const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
21{
22 unsigned i;
23
24 for (i = 0; i < KEY_PTRS(k); i++)
25 if (ptr_available(c, k, i)) {
26 struct cache *ca = PTR_CACHE(c, k, i);
27 size_t bucket = PTR_BUCKET_NR(c, k, i);
28 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
29
30 if (KEY_SIZE(k) + r > c->sb.bucket_size)
31 return "bad, length too big";
32 if (bucket < ca->sb.first_bucket)
33 return "bad, short offset";
34 if (bucket >= ca->sb.nbuckets)
35 return "bad, offset past end of device";
36 if (ptr_stale(c, k, i))
37 return "stale";
38 }
39
40 if (!bkey_cmp(k, &ZERO_KEY))
41 return "bad, null key";
42 if (!KEY_PTRS(k))
43 return "bad, no pointers";
44 if (!KEY_SIZE(k))
45 return "zeroed key";
46 return "";
47}
48
49int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
50{
51 unsigned i = 0;
52 char *out = buf, *end = buf + size;
53
54#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
55
56 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
57
58 if (KEY_PTRS(k))
59 while (1) {
60 p("%llu:%llu gen %llu",
61 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
62
63 if (++i == KEY_PTRS(k))
64 break;
65
66 p(", ");
67 }
68
69 p("]");
70
71 if (KEY_DIRTY(k))
72 p(" dirty");
73 if (KEY_CSUM(k))
74 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
75#undef p
76 return out - buf;
77}
78
79#ifdef CONFIG_BCACHE_DEBUG 21#ifdef CONFIG_BCACHE_DEBUG
80 22
81static void dump_bset(struct btree *b, struct bset *i) 23#define for_each_written_bset(b, start, i) \
82{ 24 for (i = (start); \
83 struct bkey *k, *next; 25 (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
84 unsigned j; 26 i->seq == (start)->seq; \
85 char buf[80]; 27 i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
86 28 block_bytes(b->c))
87 for (k = i->start; k < end(i); k = next) {
88 next = bkey_next(k);
89
90 bch_bkey_to_text(buf, sizeof(buf), k);
91 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
92 (uint64_t *) k - i->d, i->keys, buf);
93
94 for (j = 0; j < KEY_PTRS(k); j++) {
95 size_t n = PTR_BUCKET_NR(b->c, k, j);
96 printk(" bucket %zu", n);
97
98 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
99 printk(" prio %i",
100 PTR_BUCKET(b->c, k, j)->prio);
101 }
102 29
103 printk(" %s\n", bch_ptr_status(b->c, k)); 30void bch_btree_verify(struct btree *b)
104
105 if (next < end(i) &&
106 bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
107 printk(KERN_ERR "Key skipped backwards\n");
108 }
109}
110
111static void bch_dump_bucket(struct btree *b)
112{
113 unsigned i;
114
115 console_lock();
116 for (i = 0; i <= b->nsets; i++)
117 dump_bset(b, b->sets[i].data);
118 console_unlock();
119}
120
121void bch_btree_verify(struct btree *b, struct bset *new)
122{ 31{
123 struct btree *v = b->c->verify_data; 32 struct btree *v = b->c->verify_data;
124 struct closure cl; 33 struct bset *ondisk, *sorted, *inmemory;
125 closure_init_stack(&cl); 34 struct bio *bio;
126 35
127 if (!b->c->verify) 36 if (!b->c->verify || !b->c->verify_ondisk)
128 return; 37 return;
129 38
130 closure_wait_event(&b->io.wait, &cl, 39 down(&b->io_mutex);
131 atomic_read(&b->io.cl.remaining) == -1);
132
133 mutex_lock(&b->c->verify_lock); 40 mutex_lock(&b->c->verify_lock);
134 41
42 ondisk = b->c->verify_ondisk;
43 sorted = b->c->verify_data->keys.set->data;
44 inmemory = b->keys.set->data;
45
135 bkey_copy(&v->key, &b->key); 46 bkey_copy(&v->key, &b->key);
136 v->written = 0; 47 v->written = 0;
137 v->level = b->level; 48 v->level = b->level;
49 v->keys.ops = b->keys.ops;
50
51 bio = bch_bbio_alloc(b->c);
52 bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
53 bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
54 bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
55 bch_bio_map(bio, sorted);
138 56
139 bch_btree_node_read(v); 57 submit_bio_wait(REQ_META|READ_SYNC, bio);
140 closure_wait_event(&v->io.wait, &cl, 58 bch_bbio_free(bio, b->c);
141 atomic_read(&b->io.cl.remaining) == -1);
142 59
143 if (new->keys != v->sets[0].data->keys || 60 memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
144 memcmp(new->start, 61
145 v->sets[0].data->start, 62 bch_btree_node_read_done(v);
146 (void *) end(new) - (void *) new->start)) { 63 sorted = v->keys.set->data;
147 unsigned i, j; 64
65 if (inmemory->keys != sorted->keys ||
66 memcmp(inmemory->start,
67 sorted->start,
68 (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
69 struct bset *i;
70 unsigned j;
148 71
149 console_lock(); 72 console_lock();
150 73
151 printk(KERN_ERR "*** original memory node:\n"); 74 printk(KERN_ERR "*** in memory:\n");
152 for (i = 0; i <= b->nsets; i++) 75 bch_dump_bset(&b->keys, inmemory, 0);
153 dump_bset(b, b->sets[i].data);
154 76
155 printk(KERN_ERR "*** sorted memory node:\n"); 77 printk(KERN_ERR "*** read back in:\n");
156 dump_bset(b, new); 78 bch_dump_bset(&v->keys, sorted, 0);
157 79
158 printk(KERN_ERR "*** on disk node:\n"); 80 for_each_written_bset(b, ondisk, i) {
159 dump_bset(v, v->sets[0].data); 81 unsigned block = ((void *) i - (void *) ondisk) /
82 block_bytes(b->c);
83
84 printk(KERN_ERR "*** on disk block %u:\n", block);
85 bch_dump_bset(&b->keys, i, block);
86 }
160 87
161 for (j = 0; j < new->keys; j++) 88 printk(KERN_ERR "*** block %zu not written\n",
162 if (new->d[j] != v->sets[0].data->d[j]) 89 ((void *) i - (void *) ondisk) / block_bytes(b->c));
90
91 for (j = 0; j < inmemory->keys; j++)
92 if (inmemory->d[j] != sorted->d[j])
163 break; 93 break;
164 94
95 printk(KERN_ERR "b->written %u\n", b->written);
96
165 console_unlock(); 97 console_unlock();
166 panic("verify failed at %u\n", j); 98 panic("verify failed at %u\n", j);
167 } 99 }
168 100
169 mutex_unlock(&b->c->verify_lock); 101 mutex_unlock(&b->c->verify_lock);
102 up(&b->io_mutex);
170} 103}
171 104
172void bch_data_verify(struct cached_dev *dc, struct bio *bio) 105void bch_data_verify(struct cached_dev *dc, struct bio *bio)
@@ -207,74 +140,6 @@ out_put:
207 bio_put(check); 140 bio_put(check);
208} 141}
209 142
210int __bch_count_data(struct btree *b)
211{
212 unsigned ret = 0;
213 struct btree_iter iter;
214 struct bkey *k;
215
216 if (!b->level)
217 for_each_key(b, k, &iter)
218 ret += KEY_SIZE(k);
219 return ret;
220}
221
222void __bch_check_keys(struct btree *b, const char *fmt, ...)
223{
224 va_list args;
225 struct bkey *k, *p = NULL;
226 struct btree_iter iter;
227 const char *err;
228
229 for_each_key(b, k, &iter) {
230 if (!b->level) {
231 err = "Keys out of order";
232 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
233 goto bug;
234
235 if (bch_ptr_invalid(b, k))
236 continue;
237
238 err = "Overlapping keys";
239 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
240 goto bug;
241 } else {
242 if (bch_ptr_bad(b, k))
243 continue;
244
245 err = "Duplicate keys";
246 if (p && !bkey_cmp(p, k))
247 goto bug;
248 }
249 p = k;
250 }
251
252 err = "Key larger than btree node key";
253 if (p && bkey_cmp(p, &b->key) > 0)
254 goto bug;
255
256 return;
257bug:
258 bch_dump_bucket(b);
259
260 va_start(args, fmt);
261 vprintk(fmt, args);
262 va_end(args);
263
264 panic("bcache error: %s:\n", err);
265}
266
267void bch_btree_iter_next_check(struct btree_iter *iter)
268{
269 struct bkey *k = iter->data->k, *next = bkey_next(k);
270
271 if (next < iter->data->end &&
272 bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
273 bch_dump_bucket(iter->b);
274 panic("Key skipped backwards\n");
275 }
276}
277
278#endif 143#endif
279 144
280#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
@@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
321 if (!w) 186 if (!w)
322 break; 187 break;
323 188
324 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 189 bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
325 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 190 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
326 bch_keybuf_del(&i->keys, w); 191 bch_keybuf_del(&i->keys, w);
327 } 192 }
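
bch_btree_verify() now reads the node back with a synchronous bio and walks the written bsets with for_each_written_bset(), which steps through back-to-back bsets that share the first bset's sequence number, each rounded up to whole blocks. Below is a self-contained toy illustration of that walk; the types, field names and rounding helper are stand-ins, not bcache's.

#include <stddef.h>
#include <stdint.h>

struct toy_bset {
	uint64_t seq;
	uint32_t keys; /* number of u64s of key data that follow */
	uint64_t d[];
};

/* bytes occupied on disk, rounded up to whole blocks like set_blocks() */
static size_t toy_bset_bytes(const struct toy_bset *i, size_t block_bytes)
{
	size_t bytes = sizeof(*i) + i->keys * sizeof(uint64_t);

	return (bytes + block_bytes - 1) / block_bytes * block_bytes;
}

static void toy_walk_written(const void *start, size_t node_bytes,
			     size_t block_bytes,
			     void (*fn)(const struct toy_bset *))
{
	const struct toy_bset *first = start;
	const struct toy_bset *i = start;

	/* stop at the end of the node or at the first unwritten bset */
	while ((const char *)i < (const char *)start + node_bytes &&
	       i->seq == first->seq) {
		fn(i);
		i = (const struct toy_bset *)
			((const char *)i + toy_bset_bytes(i, block_bytes));
	}
}
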
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 2ede60e31874..1f63c195d247 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -1,47 +1,30 @@
1#ifndef _BCACHE_DEBUG_H 1#ifndef _BCACHE_DEBUG_H
2#define _BCACHE_DEBUG_H 2#define _BCACHE_DEBUG_H
3 3
4/* Btree/bkey debug printing */ 4struct bio;
5 5struct cached_dev;
6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 6struct cache_set;
7 7
8#ifdef CONFIG_BCACHE_DEBUG 8#ifdef CONFIG_BCACHE_DEBUG
9 9
10void bch_btree_verify(struct btree *, struct bset *); 10void bch_btree_verify(struct btree *);
11void bch_data_verify(struct cached_dev *, struct bio *); 11void bch_data_verify(struct cached_dev *, struct bio *);
12int __bch_count_data(struct btree *);
13void __bch_check_keys(struct btree *, const char *, ...);
14void bch_btree_iter_next_check(struct btree_iter *);
15 12
16#define EBUG_ON(cond) BUG_ON(cond)
17#define expensive_debug_checks(c) ((c)->expensive_debug_checks) 13#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
18#define key_merging_disabled(c) ((c)->key_merging_disabled) 14#define key_merging_disabled(c) ((c)->key_merging_disabled)
19#define bypass_torture_test(d) ((d)->bypass_torture_test) 15#define bypass_torture_test(d) ((d)->bypass_torture_test)
20 16
21#else /* DEBUG */ 17#else /* DEBUG */
22 18
23static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 19static inline void bch_btree_verify(struct btree *b) {}
24static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} 20static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
25static inline int __bch_count_data(struct btree *b) { return -1; }
26static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
27static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
28 21
29#define EBUG_ON(cond) do { if (cond); } while (0)
30#define expensive_debug_checks(c) 0 22#define expensive_debug_checks(c) 0
31#define key_merging_disabled(c) 0 23#define key_merging_disabled(c) 0
32#define bypass_torture_test(d) 0 24#define bypass_torture_test(d) 0
33 25
34#endif 26#endif
35 27
36#define bch_count_data(b) \
37 (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
38
39#define bch_check_keys(b, ...) \
40do { \
41 if (expensive_debug_checks((b)->c)) \
42 __bch_check_keys(b, __VA_ARGS__); \
43} while (0)
44
45#ifdef CONFIG_DEBUG_FS 28#ifdef CONFIG_DEBUG_FS
46void bch_debug_init_cache_set(struct cache_set *); 29void bch_debug_init_cache_set(struct cache_set *);
47#else 30#else
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 000000000000..c3ead586dc27
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,616 @@
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted; rather,
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we re-sort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
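
The 8-bit counter mentioned above is what makes "freeing by incrementing" cheap: each pointer records the bucket's generation at allocation time, and once the bucket's generation has moved past it the pointer is stale and garbage collection can drop it. A standalone sketch of that comparison under the same signed 8-bit wrap-around rule the driver uses (the function name here is invented, not the driver's API):

#include <stdint.h>
#include <stdio.h>

/* Illustrative model: a pointer is stale once the bucket's generation has
 * advanced past the generation stored in the pointer. */
static int ptr_is_stale(uint8_t bucket_gen, uint8_t ptr_gen)
{
    return (int8_t)(bucket_gen - ptr_gen) > 0;
}

int main(void)
{
    printf("%d\n", ptr_is_stale(5, 5));    /* 0: same gen, pointer valid   */
    printf("%d\n", ptr_is_stale(6, 5));    /* 1: bucket reused since alloc */
    printf("%d\n", ptr_is_stale(2, 250));  /* 1: still stale across wrap   */
    return 0;
}
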
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "extents.h"
27#include "writeback.h"
28
29static void sort_key_next(struct btree_iter *iter,
30 struct btree_iter_set *i)
31{
32 i->k = bkey_next(i->k);
33
34 if (i->k == i->end)
35 *i = iter->data[--iter->used];
36}
37
38static bool bch_key_sort_cmp(struct btree_iter_set l,
39 struct btree_iter_set r)
40{
41 int64_t c = bkey_cmp(l.k, r.k);
42
43 return c ? c > 0 : l.k < r.k;
44}
45
46static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
47{
48 unsigned i;
49
50 for (i = 0; i < KEY_PTRS(k); i++)
51 if (ptr_available(c, k, i)) {
52 struct cache *ca = PTR_CACHE(c, k, i);
53 size_t bucket = PTR_BUCKET_NR(c, k, i);
54 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
55
56 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
57 bucket < ca->sb.first_bucket ||
58 bucket >= ca->sb.nbuckets)
59 return true;
60 }
61
62 return false;
63}
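
__ptr_invalid() treats a pointer as bad when the extent would run past the end of its bucket or the bucket index falls outside the device. A userspace model of the same bounds check, with invented geometry (1024-sector buckets, buckets 1..999) and plain division/modulo standing in for the PTR_BUCKET_NR()/bucket_remainder() macros:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct geom { uint64_t bucket_size, first_bucket, nbuckets; };

static bool ptr_out_of_bounds(struct geom g, uint64_t ptr_offset,
                              uint64_t key_size)
{
    uint64_t bucket    = ptr_offset / g.bucket_size;
    uint64_t remainder = ptr_offset % g.bucket_size;

    return key_size + remainder > g.bucket_size ||  /* spills past bucket end */
           bucket < g.first_bucket ||
           bucket >= g.nbuckets;
}

int main(void)
{
    struct geom g = { .bucket_size = 1024, .first_bucket = 1, .nbuckets = 1000 };

    /* 128 sectors starting 64 sectors into bucket 10: fits. */
    printf("%d\n", ptr_out_of_bounds(g, 10 * 1024 + 64, 128));   /* 0 */
    /* 256 sectors starting 900 sectors into bucket 10: crosses a bucket. */
    printf("%d\n", ptr_out_of_bounds(g, 10 * 1024 + 900, 256));  /* 1 */
    return 0;
}
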
64
65/* Common among btree and extent ptrs */
66
67static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
68{
69 unsigned i;
70
71 for (i = 0; i < KEY_PTRS(k); i++)
72 if (ptr_available(c, k, i)) {
73 struct cache *ca = PTR_CACHE(c, k, i);
74 size_t bucket = PTR_BUCKET_NR(c, k, i);
75 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
76
77 if (KEY_SIZE(k) + r > c->sb.bucket_size)
78 return "bad, length too big";
79 if (bucket < ca->sb.first_bucket)
80 return "bad, short offset";
81 if (bucket >= ca->sb.nbuckets)
82 return "bad, offset past end of device";
83 if (ptr_stale(c, k, i))
84 return "stale";
85 }
86
87 if (!bkey_cmp(k, &ZERO_KEY))
88 return "bad, null key";
89 if (!KEY_PTRS(k))
90 return "bad, no pointers";
91 if (!KEY_SIZE(k))
92 return "zeroed key";
93 return "";
94}
95
96void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
97{
98 unsigned i = 0;
99 char *out = buf, *end = buf + size;
100
101#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
102
103 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
104
105 for (i = 0; i < KEY_PTRS(k); i++) {
106 if (i)
107 p(", ");
108
109 if (PTR_DEV(k, i) == PTR_CHECK_DEV)
110 p("check dev");
111 else
112 p("%llu:%llu gen %llu", PTR_DEV(k, i),
113 PTR_OFFSET(k, i), PTR_GEN(k, i));
114 }
115
116 p("]");
117
118 if (KEY_DIRTY(k))
119 p(" dirty");
120 if (KEY_CSUM(k))
121 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
122#undef p
123}
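
bch_extent_to_text() builds lines of the form inode:end_offset len size -> [dev:offset gen g], which is the format that shows up in the debugfs dump and the cache_bug() messages below. A small userspace sketch of the same accumulate-into-a-buffer pattern, with invented key values and snprintf standing in for the kernel's scnprintf (the buffer is large enough that the difference doesn't matter here):

#include <stdio.h>

int main(void)
{
    char buf[80], *out = buf, *end = buf + sizeof(buf);

    out += snprintf(out, end - out, "%u:%u len %u -> [", 0u, 4096u, 128u);
    out += snprintf(out, end - out, "%u:%u gen %u", 1u, 8192u, 3u);
    out += snprintf(out, end - out, "]");
    out += snprintf(out, end - out, " dirty");

    puts(buf);    /* 0:4096 len 128 -> [1:8192 gen 3] dirty */
    return 0;
}
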
124
125static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
126{
127 struct btree *b = container_of(keys, struct btree, keys);
128 unsigned j;
129 char buf[80];
130
131 bch_extent_to_text(buf, sizeof(buf), k);
132 printk(" %s", buf);
133
134 for (j = 0; j < KEY_PTRS(k); j++) {
135 size_t n = PTR_BUCKET_NR(b->c, k, j);
136 printk(" bucket %zu", n);
137
138 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
139 printk(" prio %i",
140 PTR_BUCKET(b->c, k, j)->prio);
141 }
142
143 printk(" %s\n", bch_ptr_status(b->c, k));
144}
145
146/* Btree ptrs */
147
148bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
149{
150 char buf[80];
151
152 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
153 goto bad;
154
155 if (__ptr_invalid(c, k))
156 goto bad;
157
158 return false;
159bad:
160 bch_extent_to_text(buf, sizeof(buf), k);
161 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
162 return true;
163}
164
165static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
166{
167 struct btree *b = container_of(bk, struct btree, keys);
168 return __bch_btree_ptr_invalid(b->c, k);
169}
170
171static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
172{
173 unsigned i;
174 char buf[80];
175 struct bucket *g;
176
177 if (mutex_trylock(&b->c->bucket_lock)) {
178 for (i = 0; i < KEY_PTRS(k); i++)
179 if (ptr_available(b->c, k, i)) {
180 g = PTR_BUCKET(b->c, k, i);
181
182 if (KEY_DIRTY(k) ||
183 g->prio != BTREE_PRIO ||
184 (b->c->gc_mark_valid &&
185 GC_MARK(g) != GC_MARK_METADATA))
186 goto err;
187 }
188
189 mutex_unlock(&b->c->bucket_lock);
190 }
191
192 return false;
193err:
194 mutex_unlock(&b->c->bucket_lock);
195 bch_extent_to_text(buf, sizeof(buf), k);
196 btree_bug(b,
197"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
198 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
199 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
200 return true;
201}
202
203static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
204{
205 struct btree *b = container_of(bk, struct btree, keys);
206 unsigned i;
207
208 if (!bkey_cmp(k, &ZERO_KEY) ||
209 !KEY_PTRS(k) ||
210 bch_ptr_invalid(bk, k))
211 return true;
212
213 for (i = 0; i < KEY_PTRS(k); i++)
214 if (!ptr_available(b->c, k, i) ||
215 ptr_stale(b->c, k, i))
216 return true;
217
218 if (expensive_debug_checks(b->c) &&
219 btree_ptr_bad_expensive(b, k))
220 return true;
221
222 return false;
223}
224
225static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
226 struct bkey *insert,
227 struct btree_iter *iter,
228 struct bkey *replace_key)
229{
230 struct btree *b = container_of(bk, struct btree, keys);
231
232 if (!KEY_OFFSET(insert))
233 btree_current_write(b)->prio_blocked++;
234
235 return false;
236}
237
238const struct btree_keys_ops bch_btree_keys_ops = {
239 .sort_cmp = bch_key_sort_cmp,
240 .insert_fixup = bch_btree_ptr_insert_fixup,
241 .key_invalid = bch_btree_ptr_invalid,
242 .key_bad = bch_btree_ptr_bad,
243 .key_to_text = bch_extent_to_text,
244 .key_dump = bch_bkey_dump,
245};
246
247/* Extents */
248
249/*
250 * Returns true if l > r - unless l == r, in which case returns true if l is
251 * older than r.
252 *
253 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
254 * equal in different sets, we have to process them newest to oldest.
255 */
256static bool bch_extent_sort_cmp(struct btree_iter_set l,
257 struct btree_iter_set r)
258{
259 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
260
261 return c ? c > 0 : l.k < r.k;
262}
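
A userspace model of that two-level comparison: the primary sort is by the key's position, and on a tie the bare pointer compare l.k < r.k above decides which copy wins. Below, 'start' collapses bkey_cmp(&START_KEY(l), &START_KEY(r)) to a single signed offset and 'set_idx' is an invented stand-in for the pointer compare, with lower meaning the older set, as the comment describes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct iter_set { int64_t start; unsigned set_idx; };

static bool extent_sort_cmp(struct iter_set l, struct iter_set r)
{
    int64_t c = l.start - r.start;

    return c ? c > 0 : l.set_idx < r.set_idx;
}

int main(void)
{
    struct iter_set a = { .start = 100, .set_idx = 0 };  /* older set */
    struct iter_set b = { .start = 100, .set_idx = 2 };  /* newer set */
    struct iter_set c = { .start = 200, .set_idx = 1 };

    printf("%d %d %d\n",
           extent_sort_cmp(c, a),   /* 1: larger start                  */
           extent_sort_cmp(a, c),   /* 0                                */
           extent_sort_cmp(a, b));  /* 1: equal start, older set "wins" */
    return 0;
}
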
263
264static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
265 struct bkey *tmp)
266{
267 while (iter->used > 1) {
268 struct btree_iter_set *top = iter->data, *i = top + 1;
269
270 if (iter->used > 2 &&
271 bch_extent_sort_cmp(i[0], i[1]))
272 i++;
273
274 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
275 break;
276
277 if (!KEY_SIZE(i->k)) {
278 sort_key_next(iter, i);
279 heap_sift(iter, i - top, bch_extent_sort_cmp);
280 continue;
281 }
282
283 if (top->k > i->k) {
284 if (bkey_cmp(top->k, i->k) >= 0)
285 sort_key_next(iter, i);
286 else
287 bch_cut_front(top->k, i->k);
288
289 heap_sift(iter, i - top, bch_extent_sort_cmp);
290 } else {
291 /* can't happen because of comparison func */
292 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
293
294 if (bkey_cmp(i->k, top->k) < 0) {
295 bkey_copy(tmp, top->k);
296
297 bch_cut_back(&START_KEY(i->k), tmp);
298 bch_cut_front(i->k, top->k);
299 heap_sift(iter, 0, bch_extent_sort_cmp);
300
301 return tmp;
302 } else {
303 bch_cut_back(&START_KEY(i->k), top->k);
304 }
305 }
306 }
307
308 return NULL;
309}
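
The overlap resolution above, and the insert path further down, lean on two primitives: bch_cut_front(where, k) drops the part of k before where, and bch_cut_back(where, k) drops the part after it, remembering that a bkey's offset is the end of the extent (KEY_START = offset - size). A standalone sketch of that arithmetic on a plain (end, size) pair, trimming a 0..100 extent around an overlapping 40..60 one:

#include <stdint.h>
#include <stdio.h>

/* A key is modelled as (end, size), so start = end - size, matching the way
 * a bkey stores its offset; cut_front()/cut_back() mirror what
 * bch_cut_front()/bch_cut_back() do to the size and offset fields. */
struct ext { uint64_t end, size; };

static void cut_front(uint64_t where, struct ext *k)  /* drop [start, where) */
{
    k->size = k->end - where;
}

static void cut_back(uint64_t where, struct ext *k)   /* drop [where, end)   */
{
    k->size = where - (k->end - k->size);
    k->end  = where;
}

int main(void)
{
    /* An extent covering 0..100 overlapped in the middle by 40..60:
     * keep the left piece in one copy and the right piece in another. */
    struct ext left  = { .end = 100, .size = 100 };
    struct ext right = { .end = 100, .size = 100 };

    cut_back(40, &left);      /* left  becomes  0..40  */
    cut_front(60, &right);    /* right becomes 60..100 */

    printf("left %llu..%llu, right %llu..%llu\n",
           (unsigned long long)(left.end - left.size),
           (unsigned long long)left.end,
           (unsigned long long)(right.end - right.size),
           (unsigned long long)right.end);
    return 0;
}
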
310
311static bool bch_extent_insert_fixup(struct btree_keys *b,
312 struct bkey *insert,
313 struct btree_iter *iter,
314 struct bkey *replace_key)
315{
316 struct cache_set *c = container_of(b, struct btree, keys)->c;
317
318 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
319 {
320 if (KEY_DIRTY(k))
321 bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
322 offset, -sectors);
323 }
324
325 uint64_t old_offset;
326 unsigned old_size, sectors_found = 0;
327
328 BUG_ON(!KEY_OFFSET(insert));
329 BUG_ON(!KEY_SIZE(insert));
330
331 while (1) {
332 struct bkey *k = bch_btree_iter_next(iter);
333 if (!k)
334 break;
335
336 if (bkey_cmp(&START_KEY(k), insert) >= 0) {
337 if (KEY_SIZE(k))
338 break;
339 else
340 continue;
341 }
342
343 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
344 continue;
345
346 old_offset = KEY_START(k);
347 old_size = KEY_SIZE(k);
348
349 /*
350 * We might overlap with zero-size extents; we can't skip these,
351 * because if they're in the set we're inserting into, we have to
352 * adjust them so they don't overlap with the key we're
353 * inserting. But we don't want to check them for replace
354 * operations.
355 */
356
357 if (replace_key && KEY_SIZE(k)) {
358 /*
359 * k might have been split since we inserted/found the
360 * key we're replacing
361 */
362 unsigned i;
363 uint64_t offset = KEY_START(k) -
364 KEY_START(replace_key);
365
366 /* But it must be a subset of the replace key */
367 if (KEY_START(k) < KEY_START(replace_key) ||
368 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
369 goto check_failed;
370
371 /* We didn't find a key that we were supposed to */
372 if (KEY_START(k) > KEY_START(insert) + sectors_found)
373 goto check_failed;
374
375 if (!bch_bkey_equal_header(k, replace_key))
376 goto check_failed;
377
378 /* skip past gen */
379 offset <<= 8;
380
381 BUG_ON(!KEY_PTRS(replace_key));
382
383 for (i = 0; i < KEY_PTRS(replace_key); i++)
384 if (k->ptr[i] != replace_key->ptr[i] + offset)
385 goto check_failed;
386
387 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
388 }
389
390 if (bkey_cmp(insert, k) < 0 &&
391 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
392 /*
393 * We overlapped in the middle of an existing key: that
394 * means we have to split the old key. But we have to do
395 * slightly different things depending on whether the
396 * old key has been written out yet.
397 */
398
399 struct bkey *top;
400
401 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
402
403 if (bkey_written(b, k)) {
404 /*
405 * We insert a new key to cover the top of the
406 * old key, and the old key is modified in place
407 * to represent the bottom split.
408 *
409 * It's completely arbitrary whether the new key
410 * is the top or the bottom, but it has to match
411 * up with what btree_sort_fixup() does - it
412 * doesn't check for this kind of overlap; it
413 * depends on us inserting a new key for the top
414 * here.
415 */
416 top = bch_bset_search(b, bset_tree_last(b),
417 insert);
418 bch_bset_insert(b, top, k);
419 } else {
420 BKEY_PADDED(key) temp;
421 bkey_copy(&temp.key, k);
422 bch_bset_insert(b, k, &temp.key);
423 top = bkey_next(k);
424 }
425
426 bch_cut_front(insert, top);
427 bch_cut_back(&START_KEY(insert), k);
428 bch_bset_fix_invalidated_key(b, k);
429 goto out;
430 }
431
432 if (bkey_cmp(insert, k) < 0) {
433 bch_cut_front(insert, k);
434 } else {
435 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
436 old_offset = KEY_START(insert);
437
438 if (bkey_written(b, k) &&
439 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
440 /*
441 * Completely overwrote, so we don't have to
442 * invalidate the binary search tree
443 */
444 bch_cut_front(k, k);
445 } else {
446 __bch_cut_back(&START_KEY(insert), k);
447 bch_bset_fix_invalidated_key(b, k);
448 }
449 }
450
451 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
452 }
453
454check_failed:
455 if (replace_key) {
456 if (!sectors_found) {
457 return true;
458 } else if (sectors_found < KEY_SIZE(insert)) {
459 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
460 (KEY_SIZE(insert) - sectors_found));
461 SET_KEY_SIZE(insert, sectors_found);
462 }
463 }
464out:
465 if (KEY_DIRTY(insert))
466 bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
467 KEY_START(insert),
468 KEY_SIZE(insert));
469
470 return false;
471}
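
When a replace-style insert only partially matches - the key it expected has since been split or partly overwritten - the check_failed path above shrinks the insert to just the sectors that did match. The same arithmetic worked through with invented numbers (a key's offset is its end):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Invented numbers: the insert spans sectors 0..128, but only the
     * first 64 sectors of replace_key were still in place. */
    uint64_t key_offset    = 128;   /* end of the insert        */
    uint64_t key_size      = 128;   /* so it starts at sector 0 */
    uint64_t sectors_found = 64;

    key_offset -= key_size - sectors_found;   /* 128 - (128 - 64) = 64 */
    key_size    = sectors_found;              /* 64                    */

    printf("trimmed insert: %llu..%llu (%llu sectors)\n",
           (unsigned long long)(key_offset - key_size),
           (unsigned long long)key_offset,
           (unsigned long long)key_size);     /* 0..64 (64 sectors) */
    return 0;
}
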
472
473static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
474{
475 struct btree *b = container_of(bk, struct btree, keys);
476 char buf[80];
477
478 if (!KEY_SIZE(k))
479 return true;
480
481 if (KEY_SIZE(k) > KEY_OFFSET(k))
482 goto bad;
483
484 if (__ptr_invalid(b->c, k))
485 goto bad;
486
487 return false;
488bad:
489 bch_extent_to_text(buf, sizeof(buf), k);
490 cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
491 return true;
492}
493
494static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
495 unsigned ptr)
496{
497 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
498 char buf[80];
499
500 if (mutex_trylock(&b->c->bucket_lock)) {
501 if (b->c->gc_mark_valid &&
502 ((GC_MARK(g) != GC_MARK_DIRTY &&
503 KEY_DIRTY(k)) ||
504 GC_MARK(g) == GC_MARK_METADATA))
505 goto err;
506
507 if (g->prio == BTREE_PRIO)
508 goto err;
509
510 mutex_unlock(&b->c->bucket_lock);
511 }
512
513 return false;
514err:
515 mutex_unlock(&b->c->bucket_lock);
516 bch_extent_to_text(buf, sizeof(buf), k);
517 btree_bug(b,
518"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
519 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
520 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
521 return true;
522}
523
524static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
525{
526 struct btree *b = container_of(bk, struct btree, keys);
527 struct bucket *g;
528 unsigned i, stale;
529
530 if (!KEY_PTRS(k) ||
531 bch_extent_invalid(bk, k))
532 return true;
533
534 for (i = 0; i < KEY_PTRS(k); i++)
535 if (!ptr_available(b->c, k, i))
536 return true;
537
538 if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
539 return false;
540
541 for (i = 0; i < KEY_PTRS(k); i++) {
542 g = PTR_BUCKET(b->c, k, i);
543 stale = ptr_stale(b->c, k, i);
544
545 btree_bug_on(stale > 96, b,
546 "key too stale: %i, need_gc %u",
547 stale, b->c->need_gc);
548
549 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
550 b, "stale dirty pointer");
551
552 if (stale)
553 return true;
554
555 if (expensive_debug_checks(b->c) &&
556 bch_extent_bad_expensive(b, k, i))
557 return true;
558 }
559
560 return false;
561}
562
563static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
564{
565 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
566 ~((uint64_t)1 << 63);
567}
568
569static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
570{
571 struct btree *b = container_of(bk, struct btree, keys);
572 unsigned i;
573
574 if (key_merging_disabled(b->c))
575 return false;
576
577 for (i = 0; i < KEY_PTRS(l); i++)
578 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
579 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
580 return false;
581
582 /* Keys with no pointers aren't restricted to one bucket and could
583 * overflow KEY_SIZE
584 */
585 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
586 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
587 SET_KEY_SIZE(l, USHRT_MAX);
588
589 bch_cut_front(l, r);
590 return false;
591 }
592
593 if (KEY_CSUM(l)) {
594 if (KEY_CSUM(r))
595 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
596 else
597 SET_KEY_CSUM(l, 0);
598 }
599
600 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
601 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
602
603 return true;
604}
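
KEY_SIZE() is a 16-bit field, so bch_extent_merge() refuses to produce an oversized key: it grows the left key to exactly USHRT_MAX, leaves the remainder in the right key, and reports no merge. A standalone model of just that size handling (the pointer-adjacency and checksum parts are left out):

#include <stdint.h>
#include <stdio.h>

#define MAX_KEY_SIZE 65535u   /* USHRT_MAX: KEY_SIZE() is 16 bits wide */

/* Offsets are extent end offsets, as in bcache. */
struct ext { uint64_t end; uint32_t size; };

static int try_merge(struct ext *l, struct ext *r)
{
    if (l->size + r->size > MAX_KEY_SIZE) {
        uint32_t grow = MAX_KEY_SIZE - l->size;

        l->end  += grow;            /* l absorbs what it can           */
        l->size  = MAX_KEY_SIZE;
        r->size -= grow;            /* r keeps the rest, end unchanged */
        return 0;                   /* not merged                      */
    }

    l->end  += r->size;
    l->size += r->size;
    return 1;                       /* fully merged into l             */
}

int main(void)
{
    struct ext l = { .end = 60000, .size = 60000 };   /* 0..60000     */
    struct ext r = { .end = 70000, .size = 10000 };   /* 60000..70000 */
    int merged = try_merge(&l, &r);

    printf("merged=%d l=[%llu..%llu] r=[%llu..%llu]\n", merged,
           (unsigned long long)(l.end - l.size), (unsigned long long)l.end,
           (unsigned long long)(r.end - r.size), (unsigned long long)r.end);
    /* merged=0 l=[0..65535] r=[65535..70000] */
    return 0;
}
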
605
606const struct btree_keys_ops bch_extent_keys_ops = {
607 .sort_cmp = bch_extent_sort_cmp,
608 .sort_fixup = bch_extent_sort_fixup,
609 .insert_fixup = bch_extent_insert_fixup,
610 .key_invalid = bch_extent_invalid,
611 .key_bad = bch_extent_bad,
612 .key_merge = bch_extent_merge,
613 .key_to_text = bch_extent_to_text,
614 .key_dump = bch_bkey_dump,
615 .is_extents = true,
616};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 000000000000..e4e23409782d
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@
1#ifndef _BCACHE_EXTENTS_H
2#define _BCACHE_EXTENTS_H
3
4extern const struct btree_keys_ops bch_btree_keys_ops;
5extern const struct btree_keys_ops bch_extent_keys_ops;
6
7struct bkey;
8struct cache_set;
9
10void bch_extent_to_text(char *, size_t, const struct bkey *);
11bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
12
13#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7eafdf09a0ae..18039affc306 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
44 44
45 closure_init_stack(&cl); 45 closure_init_stack(&cl);
46 46
47 pr_debug("reading %llu", (uint64_t) bucket); 47 pr_debug("reading %u", bucket_index);
48 48
49 while (offset < ca->sb.bucket_size) { 49 while (offset < ca->sb.bucket_size) {
50reread: left = ca->sb.bucket_size - offset; 50reread: left = ca->sb.bucket_size - offset;
51 len = min_t(unsigned, left, PAGE_SECTORS * 8); 51 len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
52 52
53 bio_reset(bio); 53 bio_reset(bio);
54 bio->bi_iter.bi_sector = bucket + offset; 54 bio->bi_iter.bi_sector = bucket + offset;
@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset;
74 struct list_head *where; 74 struct list_head *where;
75 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
76 76
77 if (j->magic != jset_magic(&ca->sb)) 77 if (j->magic != jset_magic(&ca->sb)) {
78 pr_debug("%u: bad magic", bucket_index);
78 return ret; 79 return ret;
80 }
79 81
80 if (bytes > left << 9) 82 if (bytes > left << 9 ||
83 bytes > PAGE_SIZE << JSET_BITS) {
84 pr_info("%u: too big, %zu bytes, offset %u",
85 bucket_index, bytes, offset);
81 return ret; 86 return ret;
87 }
82 88
83 if (bytes > len << 9) 89 if (bytes > len << 9)
84 goto reread; 90 goto reread;
85 91
86 if (j->csum != csum_set(j)) 92 if (j->csum != csum_set(j)) {
93 pr_info("%u: bad csum, %zu bytes, offset %u",
94 bucket_index, bytes, offset);
87 return ret; 95 return ret;
96 }
88 97
89 blocks = set_blocks(j, ca->set); 98 blocks = set_blocks(j, block_bytes(ca->set));
90 99
91 while (!list_empty(list)) { 100 while (!list_empty(list)) {
92 i = list_first_entry(list, 101 i = list_first_entry(list,
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
275 } 284 }
276 285
277 for (k = i->j.start; 286 for (k = i->j.start;
278 k < end(&i->j); 287 k < bset_bkey_last(&i->j);
279 k = bkey_next(k)) { 288 k = bkey_next(k)) {
280 unsigned j; 289 unsigned j;
281 290
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
313 n, i->j.seq - 1, start, end); 322 n, i->j.seq - 1, start, end);
314 323
315 for (k = i->j.start; 324 for (k = i->j.start;
316 k < end(&i->j); 325 k < bset_bkey_last(&i->j);
317 k = bkey_next(k)) { 326 k = bkey_next(k)) {
318 trace_bcache_journal_replay_key(k); 327 trace_bcache_journal_replay_key(k);
319 328
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
555 continue_at_nobarrier(cl, journal_write, system_wq); 564 continue_at_nobarrier(cl, journal_write, system_wq);
556} 565}
557 566
567static void journal_write_unlock(struct closure *cl)
568{
569 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
570
571 c->journal.io_in_flight = 0;
572 spin_unlock(&c->journal.lock);
573}
574
558static void journal_write_unlocked(struct closure *cl) 575static void journal_write_unlocked(struct closure *cl)
559 __releases(c->journal.lock) 576 __releases(c->journal.lock)
560{ 577{
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
562 struct cache *ca; 579 struct cache *ca;
563 struct journal_write *w = c->journal.cur; 580 struct journal_write *w = c->journal.cur;
564 struct bkey *k = &c->journal.key; 581 struct bkey *k = &c->journal.key;
565 unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; 582 unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
583 c->sb.block_size;
566 584
567 struct bio *bio; 585 struct bio *bio;
568 struct bio_list list; 586 struct bio_list list;
569 bio_list_init(&list); 587 bio_list_init(&list);
570 588
571 if (!w->need_write) { 589 if (!w->need_write) {
572 /* 590 closure_return_with_destructor(cl, journal_write_unlock);
573 * XXX: have to unlock closure before we unlock journal lock,
574 * else we race with bch_journal(). But this way we race
575 * against cache set unregister. Doh.
576 */
577 set_closure_fn(cl, NULL, NULL);
578 closure_sub(cl, CLOSURE_RUNNING + 1);
579 spin_unlock(&c->journal.lock);
580 return;
581 } else if (journal_full(&c->journal)) { 591 } else if (journal_full(&c->journal)) {
582 journal_reclaim(c); 592 journal_reclaim(c);
583 spin_unlock(&c->journal.lock); 593 spin_unlock(&c->journal.lock);
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
586 continue_at(cl, journal_write, system_wq); 596 continue_at(cl, journal_write, system_wq);
587 } 597 }
588 598
589 c->journal.blocks_free -= set_blocks(w->data, c); 599 c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
590 600
591 w->data->btree_level = c->root->level; 601 w->data->btree_level = c->root->level;
592 602
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c)
653 663
654 w->need_write = true; 664 w->need_write = true;
655 665
656 if (closure_trylock(cl, &c->cl)) 666 if (!c->journal.io_in_flight) {
657 journal_write_unlocked(cl); 667 c->journal.io_in_flight = 1;
658 else 668 closure_call(cl, journal_write_unlocked, NULL, &c->cl);
669 } else {
659 spin_unlock(&c->journal.lock); 670 spin_unlock(&c->journal.lock);
671 }
660} 672}
661 673
662static struct journal_write *journal_wait_for_write(struct cache_set *c, 674static struct journal_write *journal_wait_for_write(struct cache_set *c,
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
664{ 676{
665 size_t sectors; 677 size_t sectors;
666 struct closure cl; 678 struct closure cl;
679 bool wait = false;
667 680
668 closure_init_stack(&cl); 681 closure_init_stack(&cl);
669 682
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
673 struct journal_write *w = c->journal.cur; 686 struct journal_write *w = c->journal.cur;
674 687
675 sectors = __set_blocks(w->data, w->data->keys + nkeys, 688 sectors = __set_blocks(w->data, w->data->keys + nkeys,
676 c) * c->sb.block_size; 689 block_bytes(c)) * c->sb.block_size;
677 690
678 if (sectors <= min_t(size_t, 691 if (sectors <= min_t(size_t,
679 c->journal.blocks_free * c->sb.block_size, 692 c->journal.blocks_free * c->sb.block_size,
680 PAGE_SECTORS << JSET_BITS)) 693 PAGE_SECTORS << JSET_BITS))
681 return w; 694 return w;
682 695
683 /* XXX: tracepoint */ 696 if (wait)
697 closure_wait(&c->journal.wait, &cl);
698
684 if (!journal_full(&c->journal)) { 699 if (!journal_full(&c->journal)) {
685 trace_bcache_journal_entry_full(c); 700 if (wait)
701 trace_bcache_journal_entry_full(c);
686 702
687 /* 703 /*
688 * XXX: If we were inserting so many keys that they 704 * XXX: If we were inserting so many keys that they
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
692 */ 708 */
693 BUG_ON(!w->data->keys); 709 BUG_ON(!w->data->keys);
694 710
695 closure_wait(&w->wait, &cl);
696 journal_try_write(c); /* unlocks */ 711 journal_try_write(c); /* unlocks */
697 } else { 712 } else {
698 trace_bcache_journal_full(c); 713 if (wait)
714 trace_bcache_journal_full(c);
699 715
700 closure_wait(&c->journal.wait, &cl);
701 journal_reclaim(c); 716 journal_reclaim(c);
702 spin_unlock(&c->journal.lock); 717 spin_unlock(&c->journal.lock);
703 718
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
706 721
707 closure_sync(&cl); 722 closure_sync(&cl);
708 spin_lock(&c->journal.lock); 723 spin_lock(&c->journal.lock);
724 wait = true;
709 } 725 }
710} 726}
711 727
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,
736 752
737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); 753 w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
738 754
739 memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); 755 memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
740 w->data->keys += bch_keylist_nkeys(keys); 756 w->data->keys += bch_keylist_nkeys(keys);
741 757
742 ret = &fifo_back(&c->journal.pin); 758 ret = &fifo_back(&c->journal.pin);
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
780{ 796{
781 struct journal *j = &c->journal; 797 struct journal *j = &c->journal;
782 798
783 closure_init_unlocked(&j->io);
784 spin_lock_init(&j->lock); 799 spin_lock_init(&j->lock);
785 INIT_DELAYED_WORK(&j->work, journal_write_work); 800 INIT_DELAYED_WORK(&j->work, journal_write_work);
786 801
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index a6472fda94b2..9180c4465075 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -104,6 +104,7 @@ struct journal {
104 /* used when waiting because the journal was full */ 104 /* used when waiting because the journal was full */
105 struct closure_waitlist wait; 105 struct closure_waitlist wait;
106 struct closure io; 106 struct closure io;
107 int io_in_flight;
107 struct delayed_work work; 108 struct delayed_work work;
108 109
109 /* Number of blocks free in the bucket(s) we're currently writing to */ 110 /* Number of blocks free in the bucket(s) we're currently writing to */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 052bd24d24b4..9eb60d102de8 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
211 for_each_cache(ca, c, i) { 211 for_each_cache(ca, c, i) {
212 unsigned sectors_to_move = 0; 212 unsigned sectors_to_move = 0;
213 unsigned reserve_sectors = ca->sb.bucket_size * 213 unsigned reserve_sectors = ca->sb.bucket_size *
214 min(fifo_used(&ca->free), ca->free.size / 2); 214 fifo_used(&ca->free[RESERVE_MOVINGGC]);
215 215
216 ca->heap.used = 0; 216 ca->heap.used = 0;
217 217
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index c906571997d7..72cd213f213f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl)
254 closure_return(cl); 254 closure_return(cl);
255} 255}
256 256
257static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
258 struct cache_set *c)
259{
260 size_t oldsize = bch_keylist_nkeys(l);
261 size_t newsize = oldsize + u64s;
262
263 /*
264 * The journalling code doesn't handle the case where the keys to insert
265 * are bigger than an empty write: if we just return -ENOMEM here,
266 * bio_insert() and bio_invalidate() will insert the keys created so far
267 * and finish the rest when the keylist is empty.
268 */
269 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
270 return -ENOMEM;
271
272 return __bch_keylist_realloc(l, u64s);
273}
274
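
The bound above says the whole keylist must fit in a single journal write: one cache block minus the jset header. Worked through with invented sizes (the real values come from block_bytes(c) and sizeof(struct jset)):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    size_t block_bytes = 4096;   /* assumed cache block size            */
    size_t jset_header = 96;     /* placeholder for sizeof(struct jset) */
    size_t budget_u64s = (block_bytes - jset_header) / sizeof(uint64_t);

    /* bch_keylist_realloc() returns -ENOMEM once the keylist would grow
     * past this budget, forcing the keys gathered so far to be inserted. */
    printf("one journal write holds at most %zu u64s of keys\n",
           budget_u64s);         /* 500 with these made-up numbers */
    return 0;
}
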
257static void bch_data_invalidate(struct closure *cl) 275static void bch_data_invalidate(struct closure *cl)
258{ 276{
259 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 277 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
@@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl)
266 unsigned sectors = min(bio_sectors(bio), 284 unsigned sectors = min(bio_sectors(bio),
267 1U << (KEY_SIZE_BITS - 1)); 285 1U << (KEY_SIZE_BITS - 1));
268 286
269 if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 287 if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
270 goto out; 288 goto out;
271 289
272 bio->bi_iter.bi_sector += sectors; 290 bio->bi_iter.bi_sector += sectors;
@@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl)
356 374
357 /* 1 for the device pointer and 1 for the chksum */ 375 /* 1 for the device pointer and 1 for the chksum */
358 if (bch_keylist_realloc(&op->insert_keys, 376 if (bch_keylist_realloc(&op->insert_keys,
359 1 + (op->csum ? 1 : 0), 377 3 + (op->csum ? 1 : 0),
360 op->c)) 378 op->c))
361 continue_at(cl, bch_data_insert_keys, bcache_wq); 379 continue_at(cl, bch_data_insert_keys, bcache_wq);
362 380
@@ -596,14 +614,12 @@ struct search {
596 /* Stack frame for bio_complete */ 614 /* Stack frame for bio_complete */
597 struct closure cl; 615 struct closure cl;
598 616
599 struct bcache_device *d;
600
601 struct bbio bio; 617 struct bbio bio;
602 struct bio *orig_bio; 618 struct bio *orig_bio;
603 struct bio *cache_miss; 619 struct bio *cache_miss;
620 struct bcache_device *d;
604 621
605 unsigned insert_bio_sectors; 622 unsigned insert_bio_sectors;
606
607 unsigned recoverable:1; 623 unsigned recoverable:1;
608 unsigned write:1; 624 unsigned write:1;
609 unsigned read_dirty_data:1; 625 unsigned read_dirty_data:1;
@@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)
629 645
630 if (error) 646 if (error)
631 s->iop.error = error; 647 s->iop.error = error;
632 else if (ptr_stale(s->iop.c, &b->key, 0)) { 648 else if (!KEY_DIRTY(&b->key) &&
649 ptr_stale(s->iop.c, &b->key, 0)) {
633 atomic_long_inc(&s->iop.c->cache_read_races); 650 atomic_long_inc(&s->iop.c->cache_read_races);
634 s->iop.error = -EINTR; 651 s->iop.error = -EINTR;
635 } 652 }
@@ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl)
710{ 727{
711 struct search *s = container_of(cl, struct search, iop.cl); 728 struct search *s = container_of(cl, struct search, iop.cl);
712 struct bio *bio = &s->bio.bio; 729 struct bio *bio = &s->bio.bio;
730 int ret;
731
732 bch_btree_op_init(&s->op, -1);
713 733
714 int ret = bch_btree_map_keys(&s->op, s->iop.c, 734 ret = bch_btree_map_keys(&s->op, s->iop.c,
715 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), 735 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
716 cache_lookup_fn, MAP_END_KEY); 736 cache_lookup_fn, MAP_END_KEY);
717 if (ret == -EAGAIN) 737 if (ret == -EAGAIN)
718 continue_at(cl, cache_lookup, bcache_wq); 738 continue_at(cl, cache_lookup, bcache_wq);
719 739
@@ -754,12 +774,12 @@ static void bio_complete(struct search *s)
754 } 774 }
755} 775}
756 776
757static void do_bio_hook(struct search *s) 777static void do_bio_hook(struct search *s, struct bio *orig_bio)
758{ 778{
759 struct bio *bio = &s->bio.bio; 779 struct bio *bio = &s->bio.bio;
760 780
761 bio_init(bio); 781 bio_init(bio);
762 __bio_clone_fast(bio, s->orig_bio); 782 __bio_clone_fast(bio, orig_bio);
763 bio->bi_end_io = request_endio; 783 bio->bi_end_io = request_endio;
764 bio->bi_private = &s->cl; 784 bio->bi_private = &s->cl;
765 785
@@ -778,26 +798,32 @@ static void search_free(struct closure *cl)
778 mempool_free(s, s->d->c->search); 798 mempool_free(s, s->d->c->search);
779} 799}
780 800
781static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 801static inline struct search *search_alloc(struct bio *bio,
802 struct bcache_device *d)
782{ 803{
783 struct search *s; 804 struct search *s;
784 805
785 s = mempool_alloc(d->c->search, GFP_NOIO); 806 s = mempool_alloc(d->c->search, GFP_NOIO);
786 memset(s, 0, offsetof(struct search, iop.insert_keys));
787 807
788 __closure_init(&s->cl, NULL); 808 closure_init(&s->cl, NULL);
809 do_bio_hook(s, bio);
789 810
790 s->iop.inode = d->id;
791 s->iop.c = d->c;
792 s->d = d;
793 s->op.lock = -1;
794 s->iop.write_point = hash_long((unsigned long) current, 16);
795 s->orig_bio = bio; 811 s->orig_bio = bio;
796 s->write = (bio->bi_rw & REQ_WRITE) != 0; 812 s->cache_miss = NULL;
797 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 813 s->d = d;
798 s->recoverable = 1; 814 s->recoverable = 1;
815 s->write = (bio->bi_rw & REQ_WRITE) != 0;
816 s->read_dirty_data = 0;
799 s->start_time = jiffies; 817 s->start_time = jiffies;
800 do_bio_hook(s); 818
819 s->iop.c = d->c;
820 s->iop.bio = NULL;
821 s->iop.inode = d->id;
822 s->iop.write_point = hash_long((unsigned long) current, 16);
823 s->iop.write_prio = 0;
824 s->iop.error = 0;
825 s->iop.flags = 0;
826 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
801 827
802 return s; 828 return s;
803} 829}
@@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl)
843 trace_bcache_read_retry(s->orig_bio); 869 trace_bcache_read_retry(s->orig_bio);
844 870
845 s->iop.error = 0; 871 s->iop.error = 0;
846 do_bio_hook(s); 872 do_bio_hook(s, s->orig_bio);
847 873
848 /* XXX: invalidate cache */ 874 /* XXX: invalidate cache */
849 875
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 2cd65bf073c2..39f21dbedc38 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -13,17 +13,22 @@ struct data_insert_op {
13 uint16_t write_prio; 13 uint16_t write_prio;
14 short error; 14 short error;
15 15
16 unsigned bypass:1; 16 union {
17 unsigned writeback:1; 17 uint16_t flags;
18 unsigned flush_journal:1;
19 unsigned csum:1;
20 18
21 unsigned replace:1; 19 struct {
22 unsigned replace_collision:1; 20 unsigned bypass:1;
21 unsigned writeback:1;
22 unsigned flush_journal:1;
23 unsigned csum:1;
23 24
24 unsigned insert_data_done:1; 25 unsigned replace:1;
26 unsigned replace_collision:1;
27
28 unsigned insert_data_done:1;
29 };
30 };
25 31
26 /* Anything past this point won't get zeroed in search_alloc() */
27 struct keylist insert_keys; 32 struct keylist insert_keys;
28 BKEY_PADDED(replace_key); 33 BKEY_PADDED(replace_key);
29}; 34};
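
Folding the flag bitfields into a union with a 16-bit flags word is what lets search_alloc() in the request.c hunk above reset every flag with a single s->iop.flags = 0 store, replacing the old memset-up-to-an-offset trick. A minimal standalone mirror of that layout (field names kept, container type invented; anonymous struct/union members need C11 or GNU C, and, like the driver, this relies on the bits landing in the low 16 bits):

#include <stdint.h>
#include <stdio.h>

struct op_flags {
    union {
        uint16_t flags;
        struct {
            unsigned bypass:1;
            unsigned writeback:1;
            unsigned flush_journal:1;
            unsigned csum:1;
            unsigned replace:1;
            unsigned replace_collision:1;
            unsigned insert_data_done:1;
        };
    };
};

int main(void)
{
    struct op_flags o = { 0 };

    o.writeback = 1;
    o.replace   = 1;
    o.flags     = 0;    /* one store clears every flag at once */

    printf("%d %d\n", o.writeback, o.replace);   /* 0 0 */
    return 0;
}
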
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 93d593f957f6..24a3a1546caa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -9,6 +9,7 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "extents.h"
12#include "request.h" 13#include "request.h"
13#include "writeback.h" 14#include "writeback.h"
14 15
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
225 struct cached_dev *dc = bio->bi_private; 226 struct cached_dev *dc = bio->bi_private;
226 /* XXX: error checking */ 227 /* XXX: error checking */
227 228
228 closure_put(&dc->sb_write.cl); 229 closure_put(&dc->sb_write);
229} 230}
230 231
231static void __write_super(struct cache_sb *sb, struct bio *bio) 232static void __write_super(struct cache_sb *sb, struct bio *bio)
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
263 submit_bio(REQ_WRITE, bio); 264 submit_bio(REQ_WRITE, bio);
264} 265}
265 266
267static void bch_write_bdev_super_unlock(struct closure *cl)
268{
269 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
270
271 up(&dc->sb_write_mutex);
272}
273
266void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) 274void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
267{ 275{
268 struct closure *cl = &dc->sb_write.cl; 276 struct closure *cl = &dc->sb_write;
269 struct bio *bio = &dc->sb_bio; 277 struct bio *bio = &dc->sb_bio;
270 278
271 closure_lock(&dc->sb_write, parent); 279 down(&dc->sb_write_mutex);
280 closure_init(cl, parent);
272 281
273 bio_reset(bio); 282 bio_reset(bio);
274 bio->bi_bdev = dc->bdev; 283 bio->bi_bdev = dc->bdev;
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
278 closure_get(cl); 287 closure_get(cl);
279 __write_super(&dc->sb, bio); 288 __write_super(&dc->sb, bio);
280 289
281 closure_return(cl); 290 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
282} 291}
283 292
284static void write_super_endio(struct bio *bio, int error) 293static void write_super_endio(struct bio *bio, int error)
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
286 struct cache *ca = bio->bi_private; 295 struct cache *ca = bio->bi_private;
287 296
288 bch_count_io_errors(ca, error, "writing superblock"); 297 bch_count_io_errors(ca, error, "writing superblock");
289 closure_put(&ca->set->sb_write.cl); 298 closure_put(&ca->set->sb_write);
299}
300
301static void bcache_write_super_unlock(struct closure *cl)
302{
303 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
304
305 up(&c->sb_write_mutex);
290} 306}
291 307
292void bcache_write_super(struct cache_set *c) 308void bcache_write_super(struct cache_set *c)
293{ 309{
294 struct closure *cl = &c->sb_write.cl; 310 struct closure *cl = &c->sb_write;
295 struct cache *ca; 311 struct cache *ca;
296 unsigned i; 312 unsigned i;
297 313
298 closure_lock(&c->sb_write, &c->cl); 314 down(&c->sb_write_mutex);
315 closure_init(cl, &c->cl);
299 316
300 c->sb.seq++; 317 c->sb.seq++;
301 318
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
317 __write_super(&ca->sb, bio); 334 __write_super(&ca->sb, bio);
318 } 335 }
319 336
320 closure_return(cl); 337 closure_return_with_destructor(cl, bcache_write_super_unlock);
321} 338}
322 339
323/* UUID io */ 340/* UUID io */
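
The sb_write changes above replace closure_lock() with an explicit semaphore taken before closure_init() and released by a destructor that only runs when the closure's last reference is put, i.e. after every superblock bio has completed. A much-simplified userspace model of that shape (plain refcount, no threads, no real locking or bios):

#include <stdio.h>

struct closure {
    int remaining;
    void (*destructor)(struct closure *);
};

static int sb_write_lock_held;                 /* stand-in for the semaphore */

static void closure_put(struct closure *cl)
{
    if (--cl->remaining == 0 && cl->destructor)
        cl->destructor(cl);
}

static void sb_write_unlock(struct closure *cl)
{
    (void)cl;
    sb_write_lock_held = 0;                    /* up(&sb_write_mutex) */
    printf("last reference dropped, lock released\n");
}

int main(void)
{
    struct closure cl = { .remaining = 1, .destructor = sb_write_unlock };

    sb_write_lock_held = 1;                    /* down(&sb_write_mutex) */

    cl.remaining++;                            /* closure_get(): one per bio */
    /* ...submit the superblock write; its endio handler would then do: */
    closure_put(&cl);                          /* bio completion */

    closure_put(&cl);                          /* closure_return_with_destructor() */
    return 0;
}
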
@@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c)
325static void uuid_endio(struct bio *bio, int error) 342static void uuid_endio(struct bio *bio, int error)
326{ 343{
327 struct closure *cl = bio->bi_private; 344 struct closure *cl = bio->bi_private;
328 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); 345 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
329 346
330 cache_set_err_on(error, c, "accessing uuids"); 347 cache_set_err_on(error, c, "accessing uuids");
331 bch_bbio_free(bio, c); 348 bch_bbio_free(bio, c);
332 closure_put(cl); 349 closure_put(cl);
333} 350}
334 351
352static void uuid_io_unlock(struct closure *cl)
353{
354 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
355
356 up(&c->uuid_write_mutex);
357}
358
335static void uuid_io(struct cache_set *c, unsigned long rw, 359static void uuid_io(struct cache_set *c, unsigned long rw,
336 struct bkey *k, struct closure *parent) 360 struct bkey *k, struct closure *parent)
337{ 361{
338 struct closure *cl = &c->uuid_write.cl; 362 struct closure *cl = &c->uuid_write;
339 struct uuid_entry *u; 363 struct uuid_entry *u;
340 unsigned i; 364 unsigned i;
341 char buf[80]; 365 char buf[80];
342 366
343 BUG_ON(!parent); 367 BUG_ON(!parent);
344 closure_lock(&c->uuid_write, parent); 368 down(&c->uuid_write_mutex);
369 closure_init(cl, parent);
345 370
346 for (i = 0; i < KEY_PTRS(k); i++) { 371 for (i = 0; i < KEY_PTRS(k); i++) {
347 struct bio *bio = bch_bbio_alloc(c); 372 struct bio *bio = bch_bbio_alloc(c);
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
359 break; 384 break;
360 } 385 }
361 386
362 bch_bkey_to_text(buf, sizeof(buf), k); 387 bch_extent_to_text(buf, sizeof(buf), k);
363 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); 388 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
364 389
365 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 390 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
368 u - c->uuids, u->uuid, u->label, 393 u - c->uuids, u->uuid, u->label,
369 u->first_reg, u->last_reg, u->invalidated); 394 u->first_reg, u->last_reg, u->invalidated);
370 395
371 closure_return(cl); 396 closure_return_with_destructor(cl, uuid_io_unlock);
372} 397}
373 398
374static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) 399static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
375{ 400{
376 struct bkey *k = &j->uuid_bucket; 401 struct bkey *k = &j->uuid_bucket;
377 402
378 if (bch_btree_ptr_invalid(c, k)) 403 if (__bch_btree_ptr_invalid(c, k))
379 return "bad uuid pointer"; 404 return "bad uuid pointer";
380 405
381 bkey_copy(&c->uuid_bucket, k); 406 bkey_copy(&c->uuid_bucket, k);
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)
420 445
421 lockdep_assert_held(&bch_register_lock); 446 lockdep_assert_held(&bch_register_lock);
422 447
423 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) 448 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
424 return 1; 449 return 1;
425 450
426 SET_KEY_SIZE(&k.key, c->sb.bucket_size); 451 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
538 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), 563 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
539 &ca->meta_sectors_written); 564 &ca->meta_sectors_written);
540 565
541 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 566 //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
542 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 567 // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
543 568
544 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 569 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
545 long bucket; 570 long bucket;
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
558 p->magic = pset_magic(&ca->sb); 583 p->magic = pset_magic(&ca->sb);
559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 584 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
560 585
561 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 586 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
562 BUG_ON(bucket == -1); 587 BUG_ON(bucket == -1);
563 588
564 mutex_unlock(&ca->set->bucket_lock); 589 mutex_unlock(&ca->set->bucket_lock);
@@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1098 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); 1123 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1099 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); 1124 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1100 INIT_WORK(&dc->detach, cached_dev_detach_finish); 1125 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1101 closure_init_unlocked(&dc->sb_write); 1126 sema_init(&dc->sb_write_mutex, 1);
1102 INIT_LIST_HEAD(&dc->io_lru); 1127 INIT_LIST_HEAD(&dc->io_lru);
1103 spin_lock_init(&dc->io_lock); 1128 spin_lock_init(&dc->io_lock);
1104 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1129 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
@@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1110 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1135 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1111 } 1136 }
1112 1137
1138 dc->disk.stripe_size = q->limits.io_opt >> 9;
1139
1140 if (dc->disk.stripe_size)
1141 dc->partial_stripes_expensive =
1142 q->limits.raid_partial_stripes_expensive;
1143
1113 ret = bcache_device_init(&dc->disk, block_size, 1144 ret = bcache_device_init(&dc->disk, block_size,
1114 dc->bdev->bd_part->nr_sects - dc->sb.data_offset); 1145 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1115 if (ret) 1146 if (ret)
@@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
1321 if (ca) 1352 if (ca)
1322 kobject_put(&ca->kobj); 1353 kobject_put(&ca->kobj);
1323 1354
1355 bch_bset_sort_state_free(&c->sort);
1324 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1356 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1325 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1326 1357
1327 if (c->bio_split) 1358 if (c->bio_split)
1328 bioset_free(c->bio_split); 1359 bioset_free(c->bio_split);
@@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1447 c->block_bits = ilog2(sb->block_size); 1478 c->block_bits = ilog2(sb->block_size);
1448 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1479 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1449 1480
1450 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; 1481 c->btree_pages = bucket_pages(c);
1451 if (c->btree_pages > BTREE_MAX_PAGES) 1482 if (c->btree_pages > BTREE_MAX_PAGES)
1452 c->btree_pages = max_t(int, c->btree_pages / 4, 1483 c->btree_pages = max_t(int, c->btree_pages / 4,
1453 BTREE_MAX_PAGES); 1484 BTREE_MAX_PAGES);
1454 1485
1455 c->sort_crit_factor = int_sqrt(c->btree_pages); 1486 sema_init(&c->sb_write_mutex, 1);
1456
1457 closure_init_unlocked(&c->sb_write);
1458 mutex_init(&c->bucket_lock); 1487 mutex_init(&c->bucket_lock);
1459 init_waitqueue_head(&c->try_wait); 1488 init_waitqueue_head(&c->try_wait);
1460 init_waitqueue_head(&c->bucket_wait); 1489 init_waitqueue_head(&c->bucket_wait);
1461 closure_init_unlocked(&c->uuid_write); 1490 sema_init(&c->uuid_write_mutex, 1);
1462 mutex_init(&c->sort_lock);
1463 1491
1464 spin_lock_init(&c->sort_time.lock);
1465 spin_lock_init(&c->btree_gc_time.lock); 1492 spin_lock_init(&c->btree_gc_time.lock);
1466 spin_lock_init(&c->btree_split_time.lock); 1493 spin_lock_init(&c->btree_split_time.lock);
1467 spin_lock_init(&c->btree_read_time.lock); 1494 spin_lock_init(&c->btree_read_time.lock);
@@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1489 bucket_pages(c))) || 1516 bucket_pages(c))) ||
1490 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1517 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1491 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1518 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1492 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1493 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1519 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1494 bch_journal_alloc(c) || 1520 bch_journal_alloc(c) ||
1495 bch_btree_cache_alloc(c) || 1521 bch_btree_cache_alloc(c) ||
1496 bch_open_buckets_alloc(c)) 1522 bch_open_buckets_alloc(c) ||
1523 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1497 goto err; 1524 goto err;
1498 1525
1499 c->congested_read_threshold_us = 2000; 1526 c->congested_read_threshold_us = 2000;
@@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
1549 k = &j->btree_root; 1576 k = &j->btree_root;
1550 1577
1551 err = "bad btree root"; 1578 err = "bad btree root";
1552 if (bch_btree_ptr_invalid(c, k)) 1579 if (__bch_btree_ptr_invalid(c, k))
1553 goto err; 1580 goto err;
1554 1581
1555 err = "error reading btree root"; 1582 err = "error reading btree root";
@@ -1743,6 +1770,7 @@ err:
1743void bch_cache_release(struct kobject *kobj) 1770void bch_cache_release(struct kobject *kobj)
1744{ 1771{
1745 struct cache *ca = container_of(kobj, struct cache, kobj); 1772 struct cache *ca = container_of(kobj, struct cache, kobj);
1773 unsigned i;
1746 1774
1747 if (ca->set) 1775 if (ca->set)
1748 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1776 ca->set->cache[ca->sb.nr_this_dev] = NULL;
@@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
1756 free_heap(&ca->heap); 1784 free_heap(&ca->heap);
1757 free_fifo(&ca->unused); 1785 free_fifo(&ca->unused);
1758 free_fifo(&ca->free_inc); 1786 free_fifo(&ca->free_inc);
1759 free_fifo(&ca->free); 1787
1788 for (i = 0; i < RESERVE_NR; i++)
1789 free_fifo(&ca->free[i]);
1760 1790
1761 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1791 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1762 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1792 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
@@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1782 ca->journal.bio.bi_max_vecs = 8; 1812 ca->journal.bio.bi_max_vecs = 8;
1783 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1813 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1784 1814
1785 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; 1815 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
1786 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1787 1816
1788 if (!init_fifo(&ca->free, free, GFP_KERNEL) || 1817 if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
1818 !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1819 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
1820 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1789 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || 1821 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1790 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || 1822 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1791 !init_heap(&ca->heap, free << 3, GFP_KERNEL) || 1823 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
@@ -2030,7 +2062,8 @@ static void bcache_exit(void)
2030 kobject_put(bcache_kobj); 2062 kobject_put(bcache_kobj);
2031 if (bcache_wq) 2063 if (bcache_wq)
2032 destroy_workqueue(bcache_wq); 2064 destroy_workqueue(bcache_wq);
2033 unregister_blkdev(bcache_major, "bcache"); 2065 if (bcache_major)
2066 unregister_blkdev(bcache_major, "bcache");
2034 unregister_reboot_notifier(&reboot); 2067 unregister_reboot_notifier(&reboot);
2035} 2068}
2036 2069
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a1f85612f0b3..c6ab69333a6d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
102rw_attribute(key_merging_disabled); 102rw_attribute(key_merging_disabled);
103rw_attribute(gc_always_rewrite); 103rw_attribute(gc_always_rewrite);
104rw_attribute(expensive_debug_checks); 104rw_attribute(expensive_debug_checks);
105rw_attribute(freelist_percent);
106rw_attribute(cache_replacement_policy); 105rw_attribute(cache_replacement_policy);
107rw_attribute(btree_shrinker_disabled); 106rw_attribute(btree_shrinker_disabled);
108rw_attribute(copy_gc_enabled); 107rw_attribute(copy_gc_enabled);
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
401}; 400};
402KTYPE(bch_flash_dev); 401KTYPE(bch_flash_dev);
403 402
403struct bset_stats_op {
404 struct btree_op op;
405 size_t nodes;
406 struct bset_stats stats;
407};
408
409static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
410{
411 struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
412
413 op->nodes++;
414 bch_btree_keys_stats(&b->keys, &op->stats);
415
416 return MAP_CONTINUE;
417}
418
419int bch_bset_print_stats(struct cache_set *c, char *buf)
420{
421 struct bset_stats_op op;
422 int ret;
423
424 memset(&op, 0, sizeof(op));
425 bch_btree_op_init(&op.op, -1);
426
427 ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
428 if (ret < 0)
429 return ret;
430
431 return snprintf(buf, PAGE_SIZE,
432 "btree nodes: %zu\n"
433 "written sets: %zu\n"
434 "unwritten sets: %zu\n"
435 "written key bytes: %zu\n"
436 "unwritten key bytes: %zu\n"
437 "floats: %zu\n"
438 "failed: %zu\n",
439 op.nodes,
440 op.stats.sets_written, op.stats.sets_unwritten,
441 op.stats.bytes_written, op.stats.bytes_unwritten,
442 op.stats.floats, op.stats.failed);
443}
444
404SHOW(__bch_cache_set) 445SHOW(__bch_cache_set)
405{ 446{
406 unsigned root_usage(struct cache_set *c) 447 unsigned root_usage(struct cache_set *c)
@@ -419,7 +460,7 @@ lock_root:
419 rw_lock(false, b, b->level); 460 rw_lock(false, b, b->level);
420 } while (b != c->root); 461 } while (b != c->root);
421 462
422 for_each_key_filter(b, k, &iter, bch_ptr_bad) 463 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
423 bytes += bkey_bytes(k); 464 bytes += bkey_bytes(k);
424 465
425 rw_unlock(false, b); 466 rw_unlock(false, b);
@@ -434,7 +475,7 @@ lock_root:
434 475
435 mutex_lock(&c->bucket_lock); 476 mutex_lock(&c->bucket_lock);
436 list_for_each_entry(b, &c->btree_cache, list) 477 list_for_each_entry(b, &c->btree_cache, list)
437 ret += 1 << (b->page_order + PAGE_SHIFT); 478 ret += 1 << (b->keys.page_order + PAGE_SHIFT);
438 479
439 mutex_unlock(&c->bucket_lock); 480 mutex_unlock(&c->bucket_lock);
440 return ret; 481 return ret;
@@ -491,7 +532,7 @@ lock_root:
491 532
492 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); 533 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
493 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); 534 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
494 sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); 535 sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
495 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); 536 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
496 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); 537 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
497 538
@@ -711,9 +752,6 @@ SHOW(__bch_cache)
711 sysfs_print(io_errors, 752 sysfs_print(io_errors,
712 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); 753 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
713 754
714 sysfs_print(freelist_percent, ca->free.size * 100 /
715 ((size_t) ca->sb.nbuckets));
716
717 if (attr == &sysfs_cache_replacement_policy) 755 if (attr == &sysfs_cache_replacement_policy)
718 return bch_snprint_string_list(buf, PAGE_SIZE, 756 return bch_snprint_string_list(buf, PAGE_SIZE,
719 cache_replacement_policies, 757 cache_replacement_policies,
@@ -820,32 +858,6 @@ STORE(__bch_cache)
820 } 858 }
821 } 859 }
822 860
823 if (attr == &sysfs_freelist_percent) {
824 DECLARE_FIFO(long, free);
825 long i;
826 size_t p = strtoul_or_return(buf);
827
828 p = clamp_t(size_t,
829 ((size_t) ca->sb.nbuckets * p) / 100,
830 roundup_pow_of_two(ca->sb.nbuckets) >> 9,
831 ca->sb.nbuckets / 2);
832
833 if (!init_fifo_exact(&free, p, GFP_KERNEL))
834 return -ENOMEM;
835
836 mutex_lock(&ca->set->bucket_lock);
837
838 fifo_move(&free, &ca->free);
839 fifo_swap(&free, &ca->free);
840
841 mutex_unlock(&ca->set->bucket_lock);
842
843 while (fifo_pop(&free, i))
844 atomic_dec(&ca->buckets[i].pin);
845
846 free_fifo(&free);
847 }
848
849 if (attr == &sysfs_clear_stats) { 861 if (attr == &sysfs_clear_stats) {
850 atomic_long_set(&ca->sectors_written, 0); 862 atomic_long_set(&ca->sectors_written, 0);
851 atomic_long_set(&ca->btree_sectors_written, 0); 863 atomic_long_set(&ca->btree_sectors_written, 0);
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
869 &sysfs_metadata_written, 881 &sysfs_metadata_written,
870 &sysfs_io_errors, 882 &sysfs_io_errors,
871 &sysfs_clear_stats, 883 &sysfs_clear_stats,
872 &sysfs_freelist_percent,
873 &sysfs_cache_replacement_policy, 884 &sysfs_cache_replacement_policy,
874 NULL 885 NULL
875}; 886};
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1030c6020e98..ac7d0d1f70d7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -2,6 +2,7 @@
2#ifndef _BCACHE_UTIL_H 2#ifndef _BCACHE_UTIL_H
3#define _BCACHE_UTIL_H 3#define _BCACHE_UTIL_H
4 4
5#include <linux/blkdev.h>
5#include <linux/errno.h> 6#include <linux/errno.h>
6#include <linux/kernel.h> 7#include <linux/kernel.h>
7#include <linux/llist.h> 8#include <linux/llist.h>
@@ -17,11 +18,13 @@ struct closure;
17 18
18#ifdef CONFIG_BCACHE_DEBUG 19#ifdef CONFIG_BCACHE_DEBUG
19 20
21#define EBUG_ON(cond) BUG_ON(cond)
20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 23#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 24
23#else /* DEBUG */ 25#else /* DEBUG */
24 26
27#define EBUG_ON(cond) do { if (cond); } while (0)
25#define atomic_dec_bug(v) atomic_dec(v) 28#define atomic_dec_bug(v) atomic_dec(v)
26#define atomic_inc_bug(v, i) atomic_inc(v) 29#define atomic_inc_bug(v, i) atomic_inc(v)
27 30
@@ -391,6 +394,11 @@ struct time_stats {
391 394
392void bch_time_stats_update(struct time_stats *stats, uint64_t time); 395void bch_time_stats_update(struct time_stats *stats, uint64_t time);
393 396
397static inline unsigned local_clock_us(void)
398{
399 return local_clock() >> 10;
400}
401
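
local_clock() returns nanoseconds, so shifting right by 10 divides by 1024 instead of 1000: about 2.3% low, in exchange for avoiding a 64-bit division in hot paths. The difference, worked out for an invented 5-second reading:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t ns = 5000000000ull;                /* 5 s worth of local_clock() */

    printf("exact: %llu us, shifted: %llu\n",
           (unsigned long long)(ns / 1000),     /* 5000000 */
           (unsigned long long)(ns >> 10));     /* 4882812 */
    return 0;
}
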
394#define NSEC_PER_ns 1L 402#define NSEC_PER_ns 1L
395#define NSEC_PER_us NSEC_PER_USEC 403#define NSEC_PER_us NSEC_PER_USEC
396#define NSEC_PER_ms NSEC_PER_MSEC 404#define NSEC_PER_ms NSEC_PER_MSEC