author     Linus Torvalds <torvalds@linux-foundation.org>    2014-01-30 14:40:10 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2014-01-30 14:40:10 -0500
commit     53d8ab29f8f6d67e37857b68189b38fa3d87dd8e
tree       3c770b58f0404c67b1b084f626dcafa8464c7512  /drivers/md/bcache
parent     f568849edac8611d603e00bd6cbbcfea09395ae6
parent     14424be4dbfa127001ad623869f7ee4c7635e991
Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:
- bcache update from Kent Overstreet.
- two bcache fixes from Nicholas Swenson.
- cciss pci init error fix from Andrew.
- underflow fix in the parallel IDE pg_write code from Dan Carpenter.
I'm sure the 1 (or 0) users of that are now happy.
- two PCI related fixes for sx8 from Jingoo Han.
- floppy init fix for first block read from Jiri Kosina.
- pktcdvd error return miss fix from Julia Lawall.
- removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
Michael Opdenacker.
- comment typo fix for the loop driver from Olaf Hering.
- potential oops fix for null_blk from Raghavendra K T.
- two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
an OOM problem and a problem with handling security locked conditions.
* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
mg_disk: Spelling s/finised/finished/
null_blk: Null pointer dereference problem in alloc_page_buffers
mtip32xx: Correctly handle security locked condition
mtip32xx: Make SGL container per-command to eliminate high order dma allocation
drivers/block/loop.c: fix comment typo in loop_config_discard
drivers/block/cciss.c:cciss_init_one(): use proper errnos
drivers/block/paride/pg.c: underflow bug in pg_write()
drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
drivers/block/sx8.c: use module_pci_driver()
floppy: bail out in open() if drive is not responding to block0 read
bcache: Fix auxiliary search trees for key size > cacheline size
bcache: Don't return -EINTR when insert finished
bcache: Improve bucket_prio() calculation
bcache: Add bch_bkey_equal_header()
bcache: update bch_bkey_try_merge
bcache: Move insert_fixup() to btree_keys_ops
bcache: Convert sorting to btree_keys
bcache: Convert debug code to btree_keys
bcache: Convert btree_iter to struct btree_keys
bcache: Refactor bset_tree sysfs stats
...
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--  drivers/md/bcache/Makefile   |    5
-rw-r--r--  drivers/md/bcache/alloc.c    |   89
-rw-r--r--  drivers/md/bcache/bcache.h   |   82
-rw-r--r--  drivers/md/bcache/bset.c     |  904
-rw-r--r--  drivers/md/bcache/bset.h     |  440
-rw-r--r--  drivers/md/bcache/btree.c    |  676
-rw-r--r--  drivers/md/bcache/btree.h    |   62
-rw-r--r--  drivers/md/bcache/closure.c  |   90
-rw-r--r--  drivers/md/bcache/closure.h  |  355
-rw-r--r--  drivers/md/bcache/debug.c    |  247
-rw-r--r--  drivers/md/bcache/debug.h    |   27
-rw-r--r--  drivers/md/bcache/extents.c  |  616
-rw-r--r--  drivers/md/bcache/extents.h  |   13
-rw-r--r--  drivers/md/bcache/journal.c  |   75
-rw-r--r--  drivers/md/bcache/journal.h  |    1
-rw-r--r--  drivers/md/bcache/movinggc.c |    2
-rw-r--r--  drivers/md/bcache/request.c  |   72
-rw-r--r--  drivers/md/bcache/request.h  |   21
-rw-r--r--  drivers/md/bcache/super.c    |  103
-rw-r--r--  drivers/md/bcache/sysfs.c    |   79
-rw-r--r--  drivers/md/bcache/util.h     |    8
21 files changed, 2212 insertions, 1755 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@ | |||
1 | 1 | ||
2 | obj-$(CONFIG_BCACHE) += bcache.o | 2 | obj-$(CONFIG_BCACHE) += bcache.o |
3 | 3 | ||
4 | bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ | 4 | bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ |
5 | movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o | 5 | io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ |
6 | util.o writeback.o | ||
6 | 7 | ||
7 | CFLAGS_request.o += -Iblock | 8 | CFLAGS_request.o += -Iblock |
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | |||
132 | { | 132 | { |
133 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | 133 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); |
134 | 134 | ||
135 | if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && | 135 | if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { |
136 | CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) | 136 | unsigned i; |
137 | return false; | 137 | |
138 | for (i = 0; i < RESERVE_NONE; i++) | ||
139 | if (!fifo_full(&ca->free[i])) | ||
140 | goto add; | ||
138 | 141 | ||
142 | return false; | ||
143 | } | ||
144 | add: | ||
139 | b->prio = 0; | 145 | b->prio = 0; |
140 | 146 | ||
141 | if (can_inc_bucket_gen(b) && | 147 | if (can_inc_bucket_gen(b) && |
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | |||
162 | fifo_push(&ca->free_inc, b - ca->buckets); | 168 | fifo_push(&ca->free_inc, b - ca->buckets); |
163 | } | 169 | } |
164 | 170 | ||
165 | #define bucket_prio(b) \ | 171 | /* |
166 | (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) | 172 | * Determines what order we're going to reuse buckets, smallest bucket_prio() |
173 | * first: we also take into account the number of sectors of live data in that | ||
174 | * bucket, and in order for that multiply to make sense we have to scale bucket | ||
175 | * | ||
176 | * Thus, we scale the bucket priorities so that the bucket with the smallest | ||
177 | * prio is worth 1/8th of what INITIAL_PRIO is worth. | ||
178 | */ | ||
179 | |||
180 | #define bucket_prio(b) \ | ||
181 | ({ \ | ||
182 | unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ | ||
183 | \ | ||
184 | (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ | ||
185 | }) | ||
167 | 186 | ||
168 | #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) | 187 | #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) |
169 | #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) | 188 | #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) |
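The rescaled bucket_prio() above keeps GC_SECTORS_USED() relevant even for the coldest buckets: instead of collapsing to zero at prio == min_prio, the smallest prio is floored at 1/8th of the INITIAL_PRIO range. A minimal userspace sketch of the before/after arithmetic; INITIAL_PRIO matches the kernel constant, but min_prio and the sector counts are invented inputs, and the real macros read struct bucket and struct cache_set fields rather than plain parameters:

```c
#include <stdio.h>

#define INITIAL_PRIO 32768U

static unsigned old_bucket_prio(unsigned prio, unsigned set_min_prio,
                                unsigned sectors_used)
{
        return (prio - set_min_prio) * sectors_used;
}

static unsigned new_bucket_prio(unsigned prio, unsigned set_min_prio,
                                unsigned sectors_used)
{
        /* smallest prio is still worth 1/8th of the INITIAL_PRIO range */
        unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

        return (prio - set_min_prio + min_prio) * sectors_used;
}

int main(void)
{
        unsigned set_min_prio = 1000;   /* stands in for ca->set->min_prio */

        /* coldest possible bucket (prio == min_prio) holding 100 live sectors */
        printf("old %u, new %u\n",
               old_bucket_prio(1000, set_min_prio, 100),
               new_bucket_prio(1000, set_min_prio, 100));

        /* a slightly warmer but nearly empty bucket */
        printf("old %u, new %u\n",
               old_bucket_prio(1200, set_min_prio, 2),
               new_bucket_prio(1200, set_min_prio, 2));
        return 0;
}
```

Since the comment above says buckets are reused smallest bucket_prio() first, the old formula would reclaim the cold bucket full of live data (priority 0) ahead of the nearly empty one; with the scaled version the nearly empty bucket is reclaimed first.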
@@ -304,6 +323,21 @@ do { \ | |||
304 | __set_current_state(TASK_RUNNING); \ | 323 | __set_current_state(TASK_RUNNING); \ |
305 | } while (0) | 324 | } while (0) |
306 | 325 | ||
326 | static int bch_allocator_push(struct cache *ca, long bucket) | ||
327 | { | ||
328 | unsigned i; | ||
329 | |||
330 | /* Prios/gens are actually the most important reserve */ | ||
331 | if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) | ||
332 | return true; | ||
333 | |||
334 | for (i = 0; i < RESERVE_NR; i++) | ||
335 | if (fifo_push(&ca->free[i], bucket)) | ||
336 | return true; | ||
337 | |||
338 | return false; | ||
339 | } | ||
340 | |||
307 | static int bch_allocator_thread(void *arg) | 341 | static int bch_allocator_thread(void *arg) |
308 | { | 342 | { |
309 | struct cache *ca = arg; | 343 | struct cache *ca = arg; |
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg) | |||
336 | mutex_lock(&ca->set->bucket_lock); | 370 | mutex_lock(&ca->set->bucket_lock); |
337 | } | 371 | } |
338 | 372 | ||
339 | allocator_wait(ca, !fifo_full(&ca->free)); | 373 | allocator_wait(ca, bch_allocator_push(ca, bucket)); |
340 | |||
341 | fifo_push(&ca->free, bucket); | ||
342 | wake_up(&ca->set->bucket_wait); | 374 | wake_up(&ca->set->bucket_wait); |
343 | } | 375 | } |
344 | 376 | ||
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg) | |||
365 | } | 397 | } |
366 | } | 398 | } |
367 | 399 | ||
368 | long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) | 400 | long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) |
369 | { | 401 | { |
370 | DEFINE_WAIT(w); | 402 | DEFINE_WAIT(w); |
371 | struct bucket *b; | 403 | struct bucket *b; |
372 | long r; | 404 | long r; |
373 | 405 | ||
374 | /* fastpath */ | 406 | /* fastpath */ |
375 | if (fifo_used(&ca->free) > ca->watermark[watermark]) { | 407 | if (fifo_pop(&ca->free[RESERVE_NONE], r) || |
376 | fifo_pop(&ca->free, r); | 408 | fifo_pop(&ca->free[reserve], r)) |
377 | goto out; | 409 | goto out; |
378 | } | ||
379 | 410 | ||
380 | if (!wait) | 411 | if (!wait) |
381 | return -1; | 412 | return -1; |
382 | 413 | ||
383 | while (1) { | 414 | do { |
384 | if (fifo_used(&ca->free) > ca->watermark[watermark]) { | ||
385 | fifo_pop(&ca->free, r); | ||
386 | break; | ||
387 | } | ||
388 | |||
389 | prepare_to_wait(&ca->set->bucket_wait, &w, | 415 | prepare_to_wait(&ca->set->bucket_wait, &w, |
390 | TASK_UNINTERRUPTIBLE); | 416 | TASK_UNINTERRUPTIBLE); |
391 | 417 | ||
392 | mutex_unlock(&ca->set->bucket_lock); | 418 | mutex_unlock(&ca->set->bucket_lock); |
393 | schedule(); | 419 | schedule(); |
394 | mutex_lock(&ca->set->bucket_lock); | 420 | mutex_lock(&ca->set->bucket_lock); |
395 | } | 421 | } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && |
422 | !fifo_pop(&ca->free[reserve], r)); | ||
396 | 423 | ||
397 | finish_wait(&ca->set->bucket_wait, &w); | 424 | finish_wait(&ca->set->bucket_wait, &w); |
398 | out: | 425 | out: |
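The reworked fastpath above drops the single free FIFO plus watermark thresholds in favour of per-reserve FIFOs: the shared RESERVE_NONE pool is tried first, then the caller's dedicated reserve, and only then does a waiting caller sleep on bucket_wait. A toy model of that fallback order; the fixed-size ring below stands in for the kernel's DECLARE_FIFO()/fifo_pop() machinery and is not the real implementation:

```c
#include <stdbool.h>
#include <stdio.h>

enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC,
                     RESERVE_NONE, RESERVE_NR };

/* toy stand-in for DECLARE_FIFO(long, free)[RESERVE_NR] */
struct toy_fifo {
        long data[8];
        unsigned used;
};

static bool toy_pop(struct toy_fifo *f, long *r)
{
        if (!f->used)
                return false;
        *r = f->data[--f->used];
        return true;
}

/* same ordering as the new bch_bucket_alloc() fastpath */
static long toy_bucket_alloc(struct toy_fifo freelists[RESERVE_NR],
                             enum alloc_reserve reserve)
{
        long r;

        if (toy_pop(&freelists[RESERVE_NONE], &r) ||
            toy_pop(&freelists[reserve], &r))
                return r;

        return -1;      /* a waiting caller would sleep on bucket_wait instead */
}

int main(void)
{
        struct toy_fifo freelists[RESERVE_NR] = {
                [RESERVE_PRIO] = { .data = { 7 }, .used = 1 },
        };

        /* general pool is empty, so the prio reserve is tapped */
        printf("%ld\n", toy_bucket_alloc(freelists, RESERVE_PRIO));    /* 7 */
        printf("%ld\n", toy_bucket_alloc(freelists, RESERVE_PRIO));    /* -1 */
        return 0;
}
```

Trying RESERVE_NONE first means the dedicated btree, prio and moving-GC reserves are only drained when the general pool is exhausted, which is roughly what the old watermark arithmetic was approximating.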
@@ -401,12 +428,14 @@ out: | |||
401 | if (expensive_debug_checks(ca->set)) { | 428 | if (expensive_debug_checks(ca->set)) { |
402 | size_t iter; | 429 | size_t iter; |
403 | long i; | 430 | long i; |
431 | unsigned j; | ||
404 | 432 | ||
405 | for (iter = 0; iter < prio_buckets(ca) * 2; iter++) | 433 | for (iter = 0; iter < prio_buckets(ca) * 2; iter++) |
406 | BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); | 434 | BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); |
407 | 435 | ||
408 | fifo_for_each(i, &ca->free, iter) | 436 | for (j = 0; j < RESERVE_NR; j++) |
409 | BUG_ON(i == r); | 437 | fifo_for_each(i, &ca->free[j], iter) |
438 | BUG_ON(i == r); | ||
410 | fifo_for_each(i, &ca->free_inc, iter) | 439 | fifo_for_each(i, &ca->free_inc, iter) |
411 | BUG_ON(i == r); | 440 | BUG_ON(i == r); |
412 | fifo_for_each(i, &ca->unused, iter) | 441 | fifo_for_each(i, &ca->unused, iter) |
@@ -419,7 +448,7 @@ out: | |||
419 | 448 | ||
420 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | 449 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); |
421 | 450 | ||
422 | if (watermark <= WATERMARK_METADATA) { | 451 | if (reserve <= RESERVE_PRIO) { |
423 | SET_GC_MARK(b, GC_MARK_METADATA); | 452 | SET_GC_MARK(b, GC_MARK_METADATA); |
424 | SET_GC_MOVE(b, 0); | 453 | SET_GC_MOVE(b, 0); |
425 | b->prio = BTREE_PRIO; | 454 | b->prio = BTREE_PRIO; |
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) | |||
445 | } | 474 | } |
446 | } | 475 | } |
447 | 476 | ||
448 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | 477 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, |
449 | struct bkey *k, int n, bool wait) | 478 | struct bkey *k, int n, bool wait) |
450 | { | 479 | { |
451 | int i; | 480 | int i; |
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | |||
459 | 488 | ||
460 | for (i = 0; i < n; i++) { | 489 | for (i = 0; i < n; i++) { |
461 | struct cache *ca = c->cache_by_alloc[i]; | 490 | struct cache *ca = c->cache_by_alloc[i]; |
462 | long b = bch_bucket_alloc(ca, watermark, wait); | 491 | long b = bch_bucket_alloc(ca, reserve, wait); |
463 | 492 | ||
464 | if (b == -1) | 493 | if (b == -1) |
465 | goto err; | 494 | goto err; |
@@ -478,12 +507,12 @@ err: | |||
478 | return -1; | 507 | return -1; |
479 | } | 508 | } |
480 | 509 | ||
481 | int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | 510 | int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, |
482 | struct bkey *k, int n, bool wait) | 511 | struct bkey *k, int n, bool wait) |
483 | { | 512 | { |
484 | int ret; | 513 | int ret; |
485 | mutex_lock(&c->bucket_lock); | 514 | mutex_lock(&c->bucket_lock); |
486 | ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); | 515 | ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); |
487 | mutex_unlock(&c->bucket_lock); | 516 | mutex_unlock(&c->bucket_lock); |
488 | return ret; | 517 | return ret; |
489 | } | 518 | } |
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, | |||
573 | 602 | ||
574 | while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { | 603 | while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { |
575 | unsigned watermark = write_prio | 604 | unsigned watermark = write_prio |
576 | ? WATERMARK_MOVINGGC | 605 | ? RESERVE_MOVINGGC |
577 | : WATERMARK_NONE; | 606 | : RESERVE_NONE; |
578 | 607 | ||
579 | spin_unlock(&c->data_bucket_lock); | 608 | spin_unlock(&c->data_bucket_lock); |
580 | 609 | ||
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca) | |||
689 | * Then 8 for btree allocations | 718 | * Then 8 for btree allocations |
690 | * Then half for the moving garbage collector | 719 | * Then half for the moving garbage collector |
691 | */ | 720 | */ |
692 | 721 | #if 0 | |
693 | ca->watermark[WATERMARK_PRIO] = 0; | 722 | ca->watermark[WATERMARK_PRIO] = 0; |
694 | 723 | ||
695 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | 724 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); |
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca) | |||
699 | 728 | ||
700 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | 729 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + |
701 | ca->watermark[WATERMARK_MOVINGGC]; | 730 | ca->watermark[WATERMARK_MOVINGGC]; |
702 | 731 | #endif | |
703 | return 0; | 732 | return 0; |
704 | } | 733 | } |
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dbdbca5a9591..0c707e4f4eaf 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@ | |||
187 | #include <linux/types.h> | 187 | #include <linux/types.h> |
188 | #include <linux/workqueue.h> | 188 | #include <linux/workqueue.h> |
189 | 189 | ||
190 | #include "bset.h" | ||
190 | #include "util.h" | 191 | #include "util.h" |
191 | #include "closure.h" | 192 | #include "closure.h" |
192 | 193 | ||
@@ -309,7 +310,8 @@ struct cached_dev { | |||
309 | struct cache_sb sb; | 310 | struct cache_sb sb; |
310 | struct bio sb_bio; | 311 | struct bio sb_bio; |
311 | struct bio_vec sb_bv[1]; | 312 | struct bio_vec sb_bv[1]; |
312 | struct closure_with_waitlist sb_write; | 313 | struct closure sb_write; |
314 | struct semaphore sb_write_mutex; | ||
313 | 315 | ||
314 | /* Refcount on the cache set. Always nonzero when we're caching. */ | 316 | /* Refcount on the cache set. Always nonzero when we're caching. */ |
315 | atomic_t count; | 317 | atomic_t count; |
@@ -382,12 +384,12 @@ struct cached_dev { | |||
382 | unsigned writeback_rate_p_term_inverse; | 384 | unsigned writeback_rate_p_term_inverse; |
383 | }; | 385 | }; |
384 | 386 | ||
385 | enum alloc_watermarks { | 387 | enum alloc_reserve { |
386 | WATERMARK_PRIO, | 388 | RESERVE_BTREE, |
387 | WATERMARK_METADATA, | 389 | RESERVE_PRIO, |
388 | WATERMARK_MOVINGGC, | 390 | RESERVE_MOVINGGC, |
389 | WATERMARK_NONE, | 391 | RESERVE_NONE, |
390 | WATERMARK_MAX | 392 | RESERVE_NR, |
391 | }; | 393 | }; |
392 | 394 | ||
393 | struct cache { | 395 | struct cache { |
@@ -399,8 +401,6 @@ struct cache { | |||
399 | struct kobject kobj; | 401 | struct kobject kobj; |
400 | struct block_device *bdev; | 402 | struct block_device *bdev; |
401 | 403 | ||
402 | unsigned watermark[WATERMARK_MAX]; | ||
403 | |||
404 | struct task_struct *alloc_thread; | 404 | struct task_struct *alloc_thread; |
405 | 405 | ||
406 | struct closure prio; | 406 | struct closure prio; |
@@ -429,7 +429,7 @@ struct cache { | |||
429 | * because all the data they contained was overwritten), so we only | 429 | * because all the data they contained was overwritten), so we only |
430 | * need to discard them before they can be moved to the free list. | 430 | * need to discard them before they can be moved to the free list. |
431 | */ | 431 | */ |
432 | DECLARE_FIFO(long, free); | 432 | DECLARE_FIFO(long, free)[RESERVE_NR]; |
433 | DECLARE_FIFO(long, free_inc); | 433 | DECLARE_FIFO(long, free_inc); |
434 | DECLARE_FIFO(long, unused); | 434 | DECLARE_FIFO(long, unused); |
435 | 435 | ||
@@ -514,7 +514,8 @@ struct cache_set { | |||
514 | uint64_t cached_dev_sectors; | 514 | uint64_t cached_dev_sectors; |
515 | struct closure caching; | 515 | struct closure caching; |
516 | 516 | ||
517 | struct closure_with_waitlist sb_write; | 517 | struct closure sb_write; |
518 | struct semaphore sb_write_mutex; | ||
518 | 519 | ||
519 | mempool_t *search; | 520 | mempool_t *search; |
520 | mempool_t *bio_meta; | 521 | mempool_t *bio_meta; |
@@ -629,13 +630,15 @@ struct cache_set { | |||
629 | 630 | ||
630 | #ifdef CONFIG_BCACHE_DEBUG | 631 | #ifdef CONFIG_BCACHE_DEBUG |
631 | struct btree *verify_data; | 632 | struct btree *verify_data; |
633 | struct bset *verify_ondisk; | ||
632 | struct mutex verify_lock; | 634 | struct mutex verify_lock; |
633 | #endif | 635 | #endif |
634 | 636 | ||
635 | unsigned nr_uuids; | 637 | unsigned nr_uuids; |
636 | struct uuid_entry *uuids; | 638 | struct uuid_entry *uuids; |
637 | BKEY_PADDED(uuid_bucket); | 639 | BKEY_PADDED(uuid_bucket); |
638 | struct closure_with_waitlist uuid_write; | 640 | struct closure uuid_write; |
641 | struct semaphore uuid_write_mutex; | ||
639 | 642 | ||
640 | /* | 643 | /* |
641 | * A btree node on disk could have too many bsets for an iterator to fit | 644 | * A btree node on disk could have too many bsets for an iterator to fit |
@@ -643,13 +646,7 @@ struct cache_set { | |||
643 | */ | 646 | */ |
644 | mempool_t *fill_iter; | 647 | mempool_t *fill_iter; |
645 | 648 | ||
646 | /* | 649 | struct bset_sort_state sort; |
647 | * btree_sort() is a merge sort and requires temporary space - single | ||
648 | * element mempool | ||
649 | */ | ||
650 | struct mutex sort_lock; | ||
651 | struct bset *sort; | ||
652 | unsigned sort_crit_factor; | ||
653 | 650 | ||
654 | /* List of buckets we're currently writing data to */ | 651 | /* List of buckets we're currently writing data to */ |
655 | struct list_head data_buckets; | 652 | struct list_head data_buckets; |
@@ -665,7 +662,6 @@ struct cache_set { | |||
665 | unsigned congested_read_threshold_us; | 662 | unsigned congested_read_threshold_us; |
666 | unsigned congested_write_threshold_us; | 663 | unsigned congested_write_threshold_us; |
667 | 664 | ||
668 | struct time_stats sort_time; | ||
669 | struct time_stats btree_gc_time; | 665 | struct time_stats btree_gc_time; |
670 | struct time_stats btree_split_time; | 666 | struct time_stats btree_split_time; |
671 | struct time_stats btree_read_time; | 667 | struct time_stats btree_read_time; |
@@ -683,9 +679,9 @@ struct cache_set { | |||
683 | unsigned error_decay; | 679 | unsigned error_decay; |
684 | 680 | ||
685 | unsigned short journal_delay_ms; | 681 | unsigned short journal_delay_ms; |
682 | bool expensive_debug_checks; | ||
686 | unsigned verify:1; | 683 | unsigned verify:1; |
687 | unsigned key_merging_disabled:1; | 684 | unsigned key_merging_disabled:1; |
688 | unsigned expensive_debug_checks:1; | ||
689 | unsigned gc_always_rewrite:1; | 685 | unsigned gc_always_rewrite:1; |
690 | unsigned shrinker_disabled:1; | 686 | unsigned shrinker_disabled:1; |
691 | unsigned copy_gc_enabled:1; | 687 | unsigned copy_gc_enabled:1; |
@@ -707,13 +703,8 @@ struct bbio { | |||
707 | struct bio bio; | 703 | struct bio bio; |
708 | }; | 704 | }; |
709 | 705 | ||
710 | static inline unsigned local_clock_us(void) | ||
711 | { | ||
712 | return local_clock() >> 10; | ||
713 | } | ||
714 | |||
715 | #define BTREE_PRIO USHRT_MAX | 706 | #define BTREE_PRIO USHRT_MAX |
716 | #define INITIAL_PRIO 32768 | 707 | #define INITIAL_PRIO 32768U |
717 | 708 | ||
718 | #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) | 709 | #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) |
719 | #define btree_blocks(b) \ | 710 | #define btree_blocks(b) \ |
@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void) | |||
726 | #define bucket_bytes(c) ((c)->sb.bucket_size << 9) | 717 | #define bucket_bytes(c) ((c)->sb.bucket_size << 9) |
727 | #define block_bytes(c) ((c)->sb.block_size << 9) | 718 | #define block_bytes(c) ((c)->sb.block_size << 9) |
728 | 719 | ||
729 | #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) | ||
730 | #define set_bytes(i) __set_bytes(i, i->keys) | ||
731 | |||
732 | #define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) | ||
733 | #define set_blocks(i, c) __set_blocks(i, (i)->keys, c) | ||
734 | |||
735 | #define node(i, j) ((struct bkey *) ((i)->d + (j))) | ||
736 | #define end(i) node(i, (i)->keys) | ||
737 | |||
738 | #define index(i, b) \ | ||
739 | ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ | ||
740 | block_bytes(b->c))) | ||
741 | |||
742 | #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) | ||
743 | |||
744 | #define prios_per_bucket(c) \ | 720 | #define prios_per_bucket(c) \ |
745 | ((bucket_bytes(c) - sizeof(struct prio_set)) / \ | 721 | ((bucket_bytes(c) - sizeof(struct prio_set)) / \ |
746 | sizeof(struct bucket_disk)) | 722 | sizeof(struct bucket_disk)) |
@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, | |||
783 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); | 759 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); |
784 | } | 760 | } |
785 | 761 | ||
786 | /* Btree key macros */ | 762 | static inline uint8_t gen_after(uint8_t a, uint8_t b) |
763 | { | ||
764 | uint8_t r = a - b; | ||
765 | return r > 128U ? 0 : r; | ||
766 | } | ||
787 | 767 | ||
788 | static inline void bkey_init(struct bkey *k) | 768 | static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, |
769 | unsigned i) | ||
789 | { | 770 | { |
790 | *k = ZERO_KEY; | 771 | return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); |
791 | } | 772 | } |
792 | 773 | ||
774 | static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||
775 | unsigned i) | ||
776 | { | ||
777 | return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||
778 | } | ||
779 | |||
780 | /* Btree key macros */ | ||
781 | |||
793 | /* | 782 | /* |
794 | * This is used for various on disk data structures - cache_sb, prio_set, bset, | 783 | * This is used for various on disk data structures - cache_sb, prio_set, bset, |
795 | * jset: The checksum is _always_ the first 8 bytes of these structs | 784 | * jset: The checksum is _always_ the first 8 bytes of these structs |
796 | */ | 785 | */ |
797 | #define csum_set(i) \ | 786 | #define csum_set(i) \ |
798 | bch_crc64(((void *) (i)) + sizeof(uint64_t), \ | 787 | bch_crc64(((void *) (i)) + sizeof(uint64_t), \ |
799 | ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) | 788 | ((void *) bset_bkey_last(i)) - \ |
789 | (((void *) (i)) + sizeof(uint64_t))) | ||
800 | 790 | ||
801 | /* Error handling macros */ | 791 | /* Error handling macros */ |
802 | 792 | ||
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..4f6b5940e609 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@ | |||
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "bcache.h" | 8 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ |
9 | #include "btree.h" | ||
10 | #include "debug.h" | ||
11 | 9 | ||
10 | #include "util.h" | ||
11 | #include "bset.h" | ||
12 | |||
13 | #include <linux/console.h> | ||
12 | #include <linux/random.h> | 14 | #include <linux/random.h> |
13 | #include <linux/prefetch.h> | 15 | #include <linux/prefetch.h> |
14 | 16 | ||
17 | #ifdef CONFIG_BCACHE_DEBUG | ||
18 | |||
19 | void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) | ||
20 | { | ||
21 | struct bkey *k, *next; | ||
22 | |||
23 | for (k = i->start; k < bset_bkey_last(i); k = next) { | ||
24 | next = bkey_next(k); | ||
25 | |||
26 | printk(KERN_ERR "block %u key %zi/%u: ", set, | ||
27 | (uint64_t *) k - i->d, i->keys); | ||
28 | |||
29 | if (b->ops->key_dump) | ||
30 | b->ops->key_dump(b, k); | ||
31 | else | ||
32 | printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); | ||
33 | |||
34 | if (next < bset_bkey_last(i) && | ||
35 | bkey_cmp(k, b->ops->is_extents ? | ||
36 | &START_KEY(next) : next) > 0) | ||
37 | printk(KERN_ERR "Key skipped backwards\n"); | ||
38 | } | ||
39 | } | ||
40 | |||
41 | void bch_dump_bucket(struct btree_keys *b) | ||
42 | { | ||
43 | unsigned i; | ||
44 | |||
45 | console_lock(); | ||
46 | for (i = 0; i <= b->nsets; i++) | ||
47 | bch_dump_bset(b, b->set[i].data, | ||
48 | bset_sector_offset(b, b->set[i].data)); | ||
49 | console_unlock(); | ||
50 | } | ||
51 | |||
52 | int __bch_count_data(struct btree_keys *b) | ||
53 | { | ||
54 | unsigned ret = 0; | ||
55 | struct btree_iter iter; | ||
56 | struct bkey *k; | ||
57 | |||
58 | if (b->ops->is_extents) | ||
59 | for_each_key(b, k, &iter) | ||
60 | ret += KEY_SIZE(k); | ||
61 | return ret; | ||
62 | } | ||
63 | |||
64 | void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) | ||
65 | { | ||
66 | va_list args; | ||
67 | struct bkey *k, *p = NULL; | ||
68 | struct btree_iter iter; | ||
69 | const char *err; | ||
70 | |||
71 | for_each_key(b, k, &iter) { | ||
72 | if (b->ops->is_extents) { | ||
73 | err = "Keys out of order"; | ||
74 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) | ||
75 | goto bug; | ||
76 | |||
77 | if (bch_ptr_invalid(b, k)) | ||
78 | continue; | ||
79 | |||
80 | err = "Overlapping keys"; | ||
81 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) | ||
82 | goto bug; | ||
83 | } else { | ||
84 | if (bch_ptr_bad(b, k)) | ||
85 | continue; | ||
86 | |||
87 | err = "Duplicate keys"; | ||
88 | if (p && !bkey_cmp(p, k)) | ||
89 | goto bug; | ||
90 | } | ||
91 | p = k; | ||
92 | } | ||
93 | #if 0 | ||
94 | err = "Key larger than btree node key"; | ||
95 | if (p && bkey_cmp(p, &b->key) > 0) | ||
96 | goto bug; | ||
97 | #endif | ||
98 | return; | ||
99 | bug: | ||
100 | bch_dump_bucket(b); | ||
101 | |||
102 | va_start(args, fmt); | ||
103 | vprintk(fmt, args); | ||
104 | va_end(args); | ||
105 | |||
106 | panic("bch_check_keys error: %s:\n", err); | ||
107 | } | ||
108 | |||
109 | static void bch_btree_iter_next_check(struct btree_iter *iter) | ||
110 | { | ||
111 | struct bkey *k = iter->data->k, *next = bkey_next(k); | ||
112 | |||
113 | if (next < iter->data->end && | ||
114 | bkey_cmp(k, iter->b->ops->is_extents ? | ||
115 | &START_KEY(next) : next) > 0) { | ||
116 | bch_dump_bucket(iter->b); | ||
117 | panic("Key skipped backwards\n"); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | #else | ||
122 | |||
123 | static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} | ||
124 | |||
125 | #endif | ||
126 | |||
15 | /* Keylists */ | 127 | /* Keylists */ |
16 | 128 | ||
17 | int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) | 129 | int __bch_keylist_realloc(struct keylist *l, unsigned u64s) |
18 | { | 130 | { |
19 | size_t oldsize = bch_keylist_nkeys(l); | 131 | size_t oldsize = bch_keylist_nkeys(l); |
20 | size_t newsize = oldsize + 2 + nptrs; | 132 | size_t newsize = oldsize + u64s; |
21 | uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; | 133 | uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; |
22 | uint64_t *new_keys; | 134 | uint64_t *new_keys; |
23 | 135 | ||
24 | /* The journalling code doesn't handle the case where the keys to insert | ||
25 | * is bigger than an empty write: If we just return -ENOMEM here, | ||
26 | * bio_insert() and bio_invalidate() will insert the keys created so far | ||
27 | * and finish the rest when the keylist is empty. | ||
28 | */ | ||
29 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | ||
30 | return -ENOMEM; | ||
31 | |||
32 | newsize = roundup_pow_of_two(newsize); | 136 | newsize = roundup_pow_of_two(newsize); |
33 | 137 | ||
34 | if (newsize <= KEYLIST_INLINE || | 138 | if (newsize <= KEYLIST_INLINE || |
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l) | |||
71 | bch_keylist_bytes(l)); | 175 | bch_keylist_bytes(l)); |
72 | } | 176 | } |
73 | 177 | ||
74 | /* Pointer validation */ | ||
75 | |||
76 | static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
77 | { | ||
78 | unsigned i; | ||
79 | |||
80 | for (i = 0; i < KEY_PTRS(k); i++) | ||
81 | if (ptr_available(c, k, i)) { | ||
82 | struct cache *ca = PTR_CACHE(c, k, i); | ||
83 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
84 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
85 | |||
86 | if (KEY_SIZE(k) + r > c->sb.bucket_size || | ||
87 | bucket < ca->sb.first_bucket || | ||
88 | bucket >= ca->sb.nbuckets) | ||
89 | return true; | ||
90 | } | ||
91 | |||
92 | return false; | ||
93 | } | ||
94 | |||
95 | bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
96 | { | ||
97 | char buf[80]; | ||
98 | |||
99 | if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) | ||
100 | goto bad; | ||
101 | |||
102 | if (__ptr_invalid(c, k)) | ||
103 | goto bad; | ||
104 | |||
105 | return false; | ||
106 | bad: | ||
107 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
108 | cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); | ||
109 | return true; | ||
110 | } | ||
111 | |||
112 | bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
113 | { | ||
114 | char buf[80]; | ||
115 | |||
116 | if (!KEY_SIZE(k)) | ||
117 | return true; | ||
118 | |||
119 | if (KEY_SIZE(k) > KEY_OFFSET(k)) | ||
120 | goto bad; | ||
121 | |||
122 | if (__ptr_invalid(c, k)) | ||
123 | goto bad; | ||
124 | |||
125 | return false; | ||
126 | bad: | ||
127 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
128 | cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); | ||
129 | return true; | ||
130 | } | ||
131 | |||
132 | static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, | ||
133 | unsigned ptr) | ||
134 | { | ||
135 | struct bucket *g = PTR_BUCKET(b->c, k, ptr); | ||
136 | char buf[80]; | ||
137 | |||
138 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
139 | if (b->level) { | ||
140 | if (KEY_DIRTY(k) || | ||
141 | g->prio != BTREE_PRIO || | ||
142 | (b->c->gc_mark_valid && | ||
143 | GC_MARK(g) != GC_MARK_METADATA)) | ||
144 | goto err; | ||
145 | |||
146 | } else { | ||
147 | if (g->prio == BTREE_PRIO) | ||
148 | goto err; | ||
149 | |||
150 | if (KEY_DIRTY(k) && | ||
151 | b->c->gc_mark_valid && | ||
152 | GC_MARK(g) != GC_MARK_DIRTY) | ||
153 | goto err; | ||
154 | } | ||
155 | mutex_unlock(&b->c->bucket_lock); | ||
156 | } | ||
157 | |||
158 | return false; | ||
159 | err: | ||
160 | mutex_unlock(&b->c->bucket_lock); | ||
161 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
162 | btree_bug(b, | ||
163 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
164 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | ||
165 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
166 | return true; | ||
167 | } | ||
168 | |||
169 | bool bch_ptr_bad(struct btree *b, const struct bkey *k) | ||
170 | { | ||
171 | struct bucket *g; | ||
172 | unsigned i, stale; | ||
173 | |||
174 | if (!bkey_cmp(k, &ZERO_KEY) || | ||
175 | !KEY_PTRS(k) || | ||
176 | bch_ptr_invalid(b, k)) | ||
177 | return true; | ||
178 | |||
179 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
180 | if (!ptr_available(b->c, k, i)) | ||
181 | return true; | ||
182 | |||
183 | g = PTR_BUCKET(b->c, k, i); | ||
184 | stale = ptr_stale(b->c, k, i); | ||
185 | |||
186 | btree_bug_on(stale > 96, b, | ||
187 | "key too stale: %i, need_gc %u", | ||
188 | stale, b->c->need_gc); | ||
189 | |||
190 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||
191 | b, "stale dirty pointer"); | ||
192 | |||
193 | if (stale) | ||
194 | return true; | ||
195 | |||
196 | if (expensive_debug_checks(b->c) && | ||
197 | ptr_bad_expensive_checks(b, k, i)) | ||
198 | return true; | ||
199 | } | ||
200 | |||
201 | return false; | ||
202 | } | ||
203 | |||
204 | /* Key/pointer manipulation */ | 178 | /* Key/pointer manipulation */ |
205 | 179 | ||
206 | void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, | 180 | void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, |
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) | |||
255 | return true; | 229 | return true; |
256 | } | 230 | } |
257 | 231 | ||
258 | static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | 232 | /* Auxiliary search trees */ |
233 | |||
234 | /* 32 bits total: */ | ||
235 | #define BKEY_MID_BITS 3 | ||
236 | #define BKEY_EXPONENT_BITS 7 | ||
237 | #define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) | ||
238 | #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) | ||
239 | |||
240 | struct bkey_float { | ||
241 | unsigned exponent:BKEY_EXPONENT_BITS; | ||
242 | unsigned m:BKEY_MID_BITS; | ||
243 | unsigned mantissa:BKEY_MANTISSA_BITS; | ||
244 | } __packed; | ||
245 | |||
246 | /* | ||
247 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||
248 | * it used to be 64, but I realized the lookup code would touch slightly less | ||
249 | * memory if it was 128. | ||
250 | * | ||
251 | * It definites the number of bytes (in struct bset) per struct bkey_float in | ||
252 | * the auxiliar search tree - when we're done searching the bset_float tree we | ||
253 | * have this many bytes left that we do a linear search over. | ||
254 | * | ||
255 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||
256 | * we're touching one fewer cacheline in the bset tree in exchange for one more | ||
257 | * cacheline in the linear search - but the linear search might stop before it | ||
258 | * gets to the second cacheline. | ||
259 | */ | ||
260 | |||
261 | #define BSET_CACHELINE 128 | ||
262 | |||
263 | /* Space required for the btree node keys */ | ||
264 | static inline size_t btree_keys_bytes(struct btree_keys *b) | ||
259 | { | 265 | { |
260 | return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | 266 | return PAGE_SIZE << b->page_order; |
261 | ~((uint64_t)1 << 63); | ||
262 | } | 267 | } |
263 | 268 | ||
264 | /* Tries to merge l and r: l should be lower than r | 269 | static inline size_t btree_keys_cachelines(struct btree_keys *b) |
265 | * Returns true if we were able to merge. If we did merge, l will be the merged | ||
266 | * key, r will be untouched. | ||
267 | */ | ||
268 | bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) | ||
269 | { | 270 | { |
270 | unsigned i; | 271 | return btree_keys_bytes(b) / BSET_CACHELINE; |
272 | } | ||
271 | 273 | ||
272 | if (key_merging_disabled(b->c)) | 274 | /* Space required for the auxiliary search trees */ |
273 | return false; | 275 | static inline size_t bset_tree_bytes(struct btree_keys *b) |
276 | { | ||
277 | return btree_keys_cachelines(b) * sizeof(struct bkey_float); | ||
278 | } | ||
274 | 279 | ||
275 | if (KEY_PTRS(l) != KEY_PTRS(r) || | 280 | /* Space required for the prev pointers */ |
276 | KEY_DIRTY(l) != KEY_DIRTY(r) || | 281 | static inline size_t bset_prev_bytes(struct btree_keys *b) |
277 | bkey_cmp(l, &START_KEY(r))) | 282 | { |
278 | return false; | 283 | return btree_keys_cachelines(b) * sizeof(uint8_t); |
284 | } | ||
279 | 285 | ||
280 | for (i = 0; i < KEY_PTRS(l); i++) | 286 | /* Memory allocation */ |
281 | if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || | ||
282 | PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) | ||
283 | return false; | ||
284 | 287 | ||
285 | /* Keys with no pointers aren't restricted to one bucket and could | 288 | void bch_btree_keys_free(struct btree_keys *b) |
286 | * overflow KEY_SIZE | 289 | { |
287 | */ | 290 | struct bset_tree *t = b->set; |
288 | if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { | ||
289 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); | ||
290 | SET_KEY_SIZE(l, USHRT_MAX); | ||
291 | 291 | ||
292 | bch_cut_front(l, r); | 292 | if (bset_prev_bytes(b) < PAGE_SIZE) |
293 | return false; | 293 | kfree(t->prev); |
294 | } | 294 | else |
295 | free_pages((unsigned long) t->prev, | ||
296 | get_order(bset_prev_bytes(b))); | ||
295 | 297 | ||
296 | if (KEY_CSUM(l)) { | 298 | if (bset_tree_bytes(b) < PAGE_SIZE) |
297 | if (KEY_CSUM(r)) | 299 | kfree(t->tree); |
298 | l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); | 300 | else |
299 | else | 301 | free_pages((unsigned long) t->tree, |
300 | SET_KEY_CSUM(l, 0); | 302 | get_order(bset_tree_bytes(b))); |
301 | } | ||
302 | 303 | ||
303 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); | 304 | free_pages((unsigned long) t->data, b->page_order); |
304 | SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); | ||
305 | 305 | ||
306 | return true; | 306 | t->prev = NULL; |
307 | t->tree = NULL; | ||
308 | t->data = NULL; | ||
309 | } | ||
310 | EXPORT_SYMBOL(bch_btree_keys_free); | ||
311 | |||
312 | int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) | ||
313 | { | ||
314 | struct bset_tree *t = b->set; | ||
315 | |||
316 | BUG_ON(t->data); | ||
317 | |||
318 | b->page_order = page_order; | ||
319 | |||
320 | t->data = (void *) __get_free_pages(gfp, b->page_order); | ||
321 | if (!t->data) | ||
322 | goto err; | ||
323 | |||
324 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | ||
325 | ? kmalloc(bset_tree_bytes(b), gfp) | ||
326 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | ||
327 | if (!t->tree) | ||
328 | goto err; | ||
329 | |||
330 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | ||
331 | ? kmalloc(bset_prev_bytes(b), gfp) | ||
332 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | ||
333 | if (!t->prev) | ||
334 | goto err; | ||
335 | |||
336 | return 0; | ||
337 | err: | ||
338 | bch_btree_keys_free(b); | ||
339 | return -ENOMEM; | ||
307 | } | 340 | } |
341 | EXPORT_SYMBOL(bch_btree_keys_alloc); | ||
342 | |||
343 | void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, | ||
344 | bool *expensive_debug_checks) | ||
345 | { | ||
346 | unsigned i; | ||
347 | |||
348 | b->ops = ops; | ||
349 | b->expensive_debug_checks = expensive_debug_checks; | ||
350 | b->nsets = 0; | ||
351 | b->last_set_unwritten = 0; | ||
352 | |||
353 | /* XXX: shouldn't be needed */ | ||
354 | for (i = 0; i < MAX_BSETS; i++) | ||
355 | b->set[i].size = 0; | ||
356 | /* | ||
357 | * Second loop starts at 1 because b->keys[0]->data is the memory we | ||
358 | * allocated | ||
359 | */ | ||
360 | for (i = 1; i < MAX_BSETS; i++) | ||
361 | b->set[i].data = NULL; | ||
362 | } | ||
363 | EXPORT_SYMBOL(bch_btree_keys_init); | ||
308 | 364 | ||
309 | /* Binary tree stuff for auxiliary search trees */ | 365 | /* Binary tree stuff for auxiliary search trees */ |
310 | 366 | ||
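With BSET_CACHELINE fixed at 128, the helpers above size the auxiliary structures straight from the node's page_order: one packed 4-byte bkey_float plus one prev byte per 128-byte cacheline of bset data. A rough back-of-the-envelope sketch, assuming 4 KiB pages and the 32-bit bkey_float layout shown above:

```c
#include <stdio.h>

#define TOY_PAGE_SIZE   4096UL          /* assumes 4 KiB pages */
#define BSET_CACHELINE  128

/* mirrors btree_keys_bytes()/btree_keys_cachelines()/bset_tree_bytes() above */
static void aux_footprint(unsigned page_order)
{
        size_t key_bytes  = TOY_PAGE_SIZE << page_order;
        size_t cachelines = key_bytes / BSET_CACHELINE;
        size_t tree_bytes = cachelines * 4;     /* sizeof(struct bkey_float) */
        size_t prev_bytes = cachelines * 1;     /* one uint8_t per cacheline */

        printf("%3zu KiB node: %4zu bkey_floats (%zu B tree, %zu B prev)\n",
               key_bytes >> 10, cachelines, tree_bytes, prev_bytes);
}

int main(void)
{
        aux_footprint(0);       /*  4 KiB node ->  32 entries */
        aux_footprint(2);       /* 16 KiB node -> 128 entries */
        return 0;
}
```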
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) | |||
455 | return ((void *) k - (void *) t->data) / BSET_CACHELINE; | 511 | return ((void *) k - (void *) t->data) / BSET_CACHELINE; |
456 | } | 512 | } |
457 | 513 | ||
458 | static unsigned bkey_to_cacheline_offset(struct bkey *k) | 514 | static unsigned bkey_to_cacheline_offset(struct bset_tree *t, |
515 | unsigned cacheline, | ||
516 | struct bkey *k) | ||
459 | { | 517 | { |
460 | return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); | 518 | return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); |
461 | } | 519 | } |
462 | 520 | ||
463 | static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) | 521 | static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) |
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) | |||
504 | : tree_to_prev_bkey(t, j >> ffs(j)); | 562 | : tree_to_prev_bkey(t, j >> ffs(j)); |
505 | 563 | ||
506 | struct bkey *r = is_power_of_2(j + 1) | 564 | struct bkey *r = is_power_of_2(j + 1) |
507 | ? node(t->data, t->data->keys - bkey_u64s(&t->end)) | 565 | ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) |
508 | : tree_to_bkey(t, j >> (ffz(j) + 1)); | 566 | : tree_to_bkey(t, j >> (ffz(j) + 1)); |
509 | 567 | ||
510 | BUG_ON(m < l || m > r); | 568 | BUG_ON(m < l || m > r); |
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j) | |||
528 | f->exponent = 127; | 586 | f->exponent = 127; |
529 | } | 587 | } |
530 | 588 | ||
531 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | 589 | static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) |
532 | { | 590 | { |
533 | if (t != b->sets) { | 591 | if (t != b->set) { |
534 | unsigned j = roundup(t[-1].size, | 592 | unsigned j = roundup(t[-1].size, |
535 | 64 / sizeof(struct bkey_float)); | 593 | 64 / sizeof(struct bkey_float)); |
536 | 594 | ||
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | |||
538 | t->prev = t[-1].prev + j; | 596 | t->prev = t[-1].prev + j; |
539 | } | 597 | } |
540 | 598 | ||
541 | while (t < b->sets + MAX_BSETS) | 599 | while (t < b->set + MAX_BSETS) |
542 | t++->size = 0; | 600 | t++->size = 0; |
543 | } | 601 | } |
544 | 602 | ||
545 | static void bset_build_unwritten_tree(struct btree *b) | 603 | static void bch_bset_build_unwritten_tree(struct btree_keys *b) |
546 | { | 604 | { |
547 | struct bset_tree *t = b->sets + b->nsets; | 605 | struct bset_tree *t = bset_tree_last(b); |
606 | |||
607 | BUG_ON(b->last_set_unwritten); | ||
608 | b->last_set_unwritten = 1; | ||
548 | 609 | ||
549 | bset_alloc_tree(b, t); | 610 | bset_alloc_tree(b, t); |
550 | 611 | ||
551 | if (t->tree != b->sets->tree + bset_tree_space(b)) { | 612 | if (t->tree != b->set->tree + btree_keys_cachelines(b)) { |
552 | t->prev[0] = bkey_to_cacheline_offset(t->data->start); | 613 | t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); |
553 | t->size = 1; | 614 | t->size = 1; |
554 | } | 615 | } |
555 | } | 616 | } |
556 | 617 | ||
557 | static void bset_build_written_tree(struct btree *b) | 618 | void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) |
619 | { | ||
620 | if (i != b->set->data) { | ||
621 | b->set[++b->nsets].data = i; | ||
622 | i->seq = b->set->data->seq; | ||
623 | } else | ||
624 | get_random_bytes(&i->seq, sizeof(uint64_t)); | ||
625 | |||
626 | i->magic = magic; | ||
627 | i->version = 0; | ||
628 | i->keys = 0; | ||
629 | |||
630 | bch_bset_build_unwritten_tree(b); | ||
631 | } | ||
632 | EXPORT_SYMBOL(bch_bset_init_next); | ||
633 | |||
634 | void bch_bset_build_written_tree(struct btree_keys *b) | ||
558 | { | 635 | { |
559 | struct bset_tree *t = b->sets + b->nsets; | 636 | struct bset_tree *t = bset_tree_last(b); |
560 | struct bkey *k = t->data->start; | 637 | struct bkey *prev = NULL, *k = t->data->start; |
561 | unsigned j, cacheline = 1; | 638 | unsigned j, cacheline = 1; |
562 | 639 | ||
640 | b->last_set_unwritten = 0; | ||
641 | |||
563 | bset_alloc_tree(b, t); | 642 | bset_alloc_tree(b, t); |
564 | 643 | ||
565 | t->size = min_t(unsigned, | 644 | t->size = min_t(unsigned, |
566 | bkey_to_cacheline(t, end(t->data)), | 645 | bkey_to_cacheline(t, bset_bkey_last(t->data)), |
567 | b->sets->tree + bset_tree_space(b) - t->tree); | 646 | b->set->tree + btree_keys_cachelines(b) - t->tree); |
568 | 647 | ||
569 | if (t->size < 2) { | 648 | if (t->size < 2) { |
570 | t->size = 0; | 649 | t->size = 0; |
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b) | |||
577 | for (j = inorder_next(0, t->size); | 656 | for (j = inorder_next(0, t->size); |
578 | j; | 657 | j; |
579 | j = inorder_next(j, t->size)) { | 658 | j = inorder_next(j, t->size)) { |
580 | while (bkey_to_cacheline(t, k) != cacheline) | 659 | while (bkey_to_cacheline(t, k) < cacheline) |
581 | k = bkey_next(k); | 660 | prev = k, k = bkey_next(k); |
582 | 661 | ||
583 | t->prev[j] = bkey_u64s(k); | 662 | t->prev[j] = bkey_u64s(prev); |
584 | k = bkey_next(k); | 663 | t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); |
585 | cacheline++; | ||
586 | t->tree[j].m = bkey_to_cacheline_offset(k); | ||
587 | } | 664 | } |
588 | 665 | ||
589 | while (bkey_next(k) != end(t->data)) | 666 | while (bkey_next(k) != bset_bkey_last(t->data)) |
590 | k = bkey_next(k); | 667 | k = bkey_next(k); |
591 | 668 | ||
592 | t->end = *k; | 669 | t->end = *k; |
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b) | |||
597 | j = inorder_next(j, t->size)) | 674 | j = inorder_next(j, t->size)) |
598 | make_bfloat(t, j); | 675 | make_bfloat(t, j); |
599 | } | 676 | } |
677 | EXPORT_SYMBOL(bch_bset_build_written_tree); | ||
600 | 678 | ||
601 | void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) | 679 | /* Insert */ |
680 | |||
681 | void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) | ||
602 | { | 682 | { |
603 | struct bset_tree *t; | 683 | struct bset_tree *t; |
604 | unsigned inorder, j = 1; | 684 | unsigned inorder, j = 1; |
605 | 685 | ||
606 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 686 | for (t = b->set; t <= bset_tree_last(b); t++) |
607 | if (k < end(t->data)) | 687 | if (k < bset_bkey_last(t->data)) |
608 | goto found_set; | 688 | goto found_set; |
609 | 689 | ||
610 | BUG(); | 690 | BUG(); |
@@ -617,7 +697,7 @@ found_set: | |||
617 | if (k == t->data->start) | 697 | if (k == t->data->start) |
618 | goto fix_left; | 698 | goto fix_left; |
619 | 699 | ||
620 | if (bkey_next(k) == end(t->data)) { | 700 | if (bkey_next(k) == bset_bkey_last(t->data)) { |
621 | t->end = *k; | 701 | t->end = *k; |
622 | goto fix_right; | 702 | goto fix_right; |
623 | } | 703 | } |
@@ -642,10 +722,12 @@ fix_right: do { | |||
642 | j = j * 2 + 1; | 722 | j = j * 2 + 1; |
643 | } while (j < t->size); | 723 | } while (j < t->size); |
644 | } | 724 | } |
725 | EXPORT_SYMBOL(bch_bset_fix_invalidated_key); | ||
645 | 726 | ||
646 | void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) | 727 | static void bch_bset_fix_lookup_table(struct btree_keys *b, |
728 | struct bset_tree *t, | ||
729 | struct bkey *k) | ||
647 | { | 730 | { |
648 | struct bset_tree *t = &b->sets[b->nsets]; | ||
649 | unsigned shift = bkey_u64s(k); | 731 | unsigned shift = bkey_u64s(k); |
650 | unsigned j = bkey_to_cacheline(t, k); | 732 | unsigned j = bkey_to_cacheline(t, k); |
651 | 733 | ||
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) | |||
657 | * lookup table for the first key that is strictly greater than k: | 739 | * lookup table for the first key that is strictly greater than k: |
658 | * it's either k's cacheline or the next one | 740 | * it's either k's cacheline or the next one |
659 | */ | 741 | */ |
660 | if (j < t->size && | 742 | while (j < t->size && |
661 | table_to_bkey(t, j) <= k) | 743 | table_to_bkey(t, j) <= k) |
662 | j++; | 744 | j++; |
663 | 745 | ||
664 | /* Adjust all the lookup table entries, and find a new key for any that | 746 | /* Adjust all the lookup table entries, and find a new key for any that |
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) | |||
673 | while (k < cacheline_to_bkey(t, j, 0)) | 755 | while (k < cacheline_to_bkey(t, j, 0)) |
674 | k = bkey_next(k); | 756 | k = bkey_next(k); |
675 | 757 | ||
676 | t->prev[j] = bkey_to_cacheline_offset(k); | 758 | t->prev[j] = bkey_to_cacheline_offset(t, j, k); |
677 | } | 759 | } |
678 | } | 760 | } |
679 | 761 | ||
680 | if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) | 762 | if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) |
681 | return; | 763 | return; |
682 | 764 | ||
683 | /* Possibly add a new entry to the end of the lookup table */ | 765 | /* Possibly add a new entry to the end of the lookup table */ |
684 | 766 | ||
685 | for (k = table_to_bkey(t, t->size - 1); | 767 | for (k = table_to_bkey(t, t->size - 1); |
686 | k != end(t->data); | 768 | k != bset_bkey_last(t->data); |
687 | k = bkey_next(k)) | 769 | k = bkey_next(k)) |
688 | if (t->size == bkey_to_cacheline(t, k)) { | 770 | if (t->size == bkey_to_cacheline(t, k)) { |
689 | t->prev[t->size] = bkey_to_cacheline_offset(k); | 771 | t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); |
690 | t->size++; | 772 | t->size++; |
691 | } | 773 | } |
692 | } | 774 | } |
693 | 775 | ||
694 | void bch_bset_init_next(struct btree *b) | 776 | /* |
777 | * Tries to merge l and r: l should be lower than r | ||
778 | * Returns true if we were able to merge. If we did merge, l will be the merged | ||
779 | * key, r will be untouched. | ||
780 | */ | ||
781 | bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) | ||
695 | { | 782 | { |
696 | struct bset *i = write_block(b); | 783 | if (!b->ops->key_merge) |
784 | return false; | ||
697 | 785 | ||
698 | if (i != b->sets[0].data) { | 786 | /* |
699 | b->sets[++b->nsets].data = i; | 787 | * Generic header checks |
700 | i->seq = b->sets[0].data->seq; | 788 | * Assumes left and right are in order |
701 | } else | 789 | * Left and right must be exactly aligned |
702 | get_random_bytes(&i->seq, sizeof(uint64_t)); | 790 | */ |
791 | if (!bch_bkey_equal_header(l, r) || | ||
792 | bkey_cmp(l, &START_KEY(r))) | ||
793 | return false; | ||
703 | 794 | ||
704 | i->magic = bset_magic(&b->c->sb); | 795 | return b->ops->key_merge(b, l, r); |
705 | i->version = 0; | 796 | } |
706 | i->keys = 0; | 797 | EXPORT_SYMBOL(bch_bkey_try_merge); |
707 | 798 | ||
708 | bset_build_unwritten_tree(b); | 799 | void bch_bset_insert(struct btree_keys *b, struct bkey *where, |
800 | struct bkey *insert) | ||
801 | { | ||
802 | struct bset_tree *t = bset_tree_last(b); | ||
803 | |||
804 | BUG_ON(!b->last_set_unwritten); | ||
805 | BUG_ON(bset_byte_offset(b, t->data) + | ||
806 | __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > | ||
807 | PAGE_SIZE << b->page_order); | ||
808 | |||
809 | memmove((uint64_t *) where + bkey_u64s(insert), | ||
810 | where, | ||
811 | (void *) bset_bkey_last(t->data) - (void *) where); | ||
812 | |||
813 | t->data->keys += bkey_u64s(insert); | ||
814 | bkey_copy(where, insert); | ||
815 | bch_bset_fix_lookup_table(b, t, where); | ||
709 | } | 816 | } |
817 | EXPORT_SYMBOL(bch_bset_insert); | ||
818 | |||
819 | unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, | ||
820 | struct bkey *replace_key) | ||
821 | { | ||
822 | unsigned status = BTREE_INSERT_STATUS_NO_INSERT; | ||
823 | struct bset *i = bset_tree_last(b)->data; | ||
824 | struct bkey *m, *prev = NULL; | ||
825 | struct btree_iter iter; | ||
826 | |||
827 | BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); | ||
828 | |||
829 | m = bch_btree_iter_init(b, &iter, b->ops->is_extents | ||
830 | ? PRECEDING_KEY(&START_KEY(k)) | ||
831 | : PRECEDING_KEY(k)); | ||
832 | |||
833 | if (b->ops->insert_fixup(b, k, &iter, replace_key)) | ||
834 | return status; | ||
835 | |||
836 | status = BTREE_INSERT_STATUS_INSERT; | ||
837 | |||
838 | while (m != bset_bkey_last(i) && | ||
839 | bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) | ||
840 | prev = m, m = bkey_next(m); | ||
841 | |||
842 | /* prev is in the tree, if we merge we're done */ | ||
843 | status = BTREE_INSERT_STATUS_BACK_MERGE; | ||
844 | if (prev && | ||
845 | bch_bkey_try_merge(b, prev, k)) | ||
846 | goto merged; | ||
847 | #if 0 | ||
848 | status = BTREE_INSERT_STATUS_OVERWROTE; | ||
849 | if (m != bset_bkey_last(i) && | ||
850 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | ||
851 | goto copy; | ||
852 | #endif | ||
853 | status = BTREE_INSERT_STATUS_FRONT_MERGE; | ||
854 | if (m != bset_bkey_last(i) && | ||
855 | bch_bkey_try_merge(b, k, m)) | ||
856 | goto copy; | ||
857 | |||
858 | bch_bset_insert(b, m, k); | ||
859 | copy: bkey_copy(m, k); | ||
860 | merged: | ||
861 | return status; | ||
862 | } | ||
863 | EXPORT_SYMBOL(bch_btree_insert_key); | ||
864 | |||
865 | /* Lookup */ | ||
710 | 866 | ||
711 | struct bset_search_iter { | 867 | struct bset_search_iter { |
712 | struct bkey *l, *r; | 868 | struct bkey *l, *r; |
713 | }; | 869 | }; |
714 | 870 | ||
715 | static struct bset_search_iter bset_search_write_set(struct btree *b, | 871 | static struct bset_search_iter bset_search_write_set(struct bset_tree *t, |
716 | struct bset_tree *t, | ||
717 | const struct bkey *search) | 872 | const struct bkey *search) |
718 | { | 873 | { |
719 | unsigned li = 0, ri = t->size; | 874 | unsigned li = 0, ri = t->size; |
720 | 875 | ||
721 | BUG_ON(!b->nsets && | ||
722 | t->size < bkey_to_cacheline(t, end(t->data))); | ||
723 | |||
724 | while (li + 1 != ri) { | 876 | while (li + 1 != ri) { |
725 | unsigned m = (li + ri) >> 1; | 877 | unsigned m = (li + ri) >> 1; |
726 | 878 | ||
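bch_bset_insert() in the hunk above makes room by shifting everything from the insertion point to bset_bkey_last() up by bkey_u64s(insert), copying the new key in, and then repairing the lookup table. Reduced to a flat array of u64s, with no bkeys and no lookup table, the move is just this:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* toy version of the memmove pattern in bch_bset_insert(); sizes are in u64s */
static void flat_insert(uint64_t *set, unsigned *nr_u64s, unsigned where,
                        const uint64_t *key, unsigned key_u64s)
{
        memmove(set + where + key_u64s, set + where,
                (*nr_u64s - where) * sizeof(uint64_t));
        memcpy(set + where, key, key_u64s * sizeof(uint64_t));
        *nr_u64s += key_u64s;
}

int main(void)
{
        uint64_t set[8] = { 10, 20, 40, 50 };
        unsigned nr = 4, i;
        uint64_t key = 30;

        flat_insert(set, &nr, 2, &key, 1);

        for (i = 0; i < nr; i++)
                printf("%llu ", (unsigned long long) set[i]);
        printf("\n");   /* 10 20 30 40 50 */
        return 0;
}
```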
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, | |||
732 | 884 | ||
733 | return (struct bset_search_iter) { | 885 | return (struct bset_search_iter) { |
734 | table_to_bkey(t, li), | 886 | table_to_bkey(t, li), |
735 | ri < t->size ? table_to_bkey(t, ri) : end(t->data) | 887 | ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) |
736 | }; | 888 | }; |
737 | } | 889 | } |
738 | 890 | ||
739 | static struct bset_search_iter bset_search_tree(struct btree *b, | 891 | static struct bset_search_iter bset_search_tree(struct bset_tree *t, |
740 | struct bset_tree *t, | ||
741 | const struct bkey *search) | 892 | const struct bkey *search) |
742 | { | 893 | { |
743 | struct bkey *l, *r; | 894 | struct bkey *l, *r; |
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, | |||
784 | f = &t->tree[inorder_next(j, t->size)]; | 935 | f = &t->tree[inorder_next(j, t->size)]; |
785 | r = cacheline_to_bkey(t, inorder, f->m); | 936 | r = cacheline_to_bkey(t, inorder, f->m); |
786 | } else | 937 | } else |
787 | r = end(t->data); | 938 | r = bset_bkey_last(t->data); |
788 | } else { | 939 | } else { |
789 | r = cacheline_to_bkey(t, inorder, f->m); | 940 | r = cacheline_to_bkey(t, inorder, f->m); |
790 | 941 | ||
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, | |||
798 | return (struct bset_search_iter) {l, r}; | 949 | return (struct bset_search_iter) {l, r}; |
799 | } | 950 | } |
800 | 951 | ||
801 | struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | 952 | struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, |
802 | const struct bkey *search) | 953 | const struct bkey *search) |
803 | { | 954 | { |
804 | struct bset_search_iter i; | 955 | struct bset_search_iter i; |
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
820 | 971 | ||
821 | if (unlikely(!t->size)) { | 972 | if (unlikely(!t->size)) { |
822 | i.l = t->data->start; | 973 | i.l = t->data->start; |
823 | i.r = end(t->data); | 974 | i.r = bset_bkey_last(t->data); |
824 | } else if (bset_written(b, t)) { | 975 | } else if (bset_written(b, t)) { |
825 | /* | 976 | /* |
826 | * Each node in the auxiliary search tree covers a certain range | 977 | * Each node in the auxiliary search tree covers a certain range |
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
830 | */ | 981 | */ |
831 | 982 | ||
832 | if (unlikely(bkey_cmp(search, &t->end) >= 0)) | 983 | if (unlikely(bkey_cmp(search, &t->end) >= 0)) |
833 | return end(t->data); | 984 | return bset_bkey_last(t->data); |
834 | 985 | ||
835 | if (unlikely(bkey_cmp(search, t->data->start) < 0)) | 986 | if (unlikely(bkey_cmp(search, t->data->start) < 0)) |
836 | return t->data->start; | 987 | return t->data->start; |
837 | 988 | ||
838 | i = bset_search_tree(b, t, search); | 989 | i = bset_search_tree(t, search); |
839 | } else | 990 | } else { |
840 | i = bset_search_write_set(b, t, search); | 991 | BUG_ON(!b->nsets && |
992 | t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); | ||
841 | 993 | ||
842 | if (expensive_debug_checks(b->c)) { | 994 | i = bset_search_write_set(t, search); |
995 | } | ||
996 | |||
997 | if (btree_keys_expensive_checks(b)) { | ||
843 | BUG_ON(bset_written(b, t) && | 998 | BUG_ON(bset_written(b, t) && |
844 | i.l != t->data->start && | 999 | i.l != t->data->start && |
845 | bkey_cmp(tree_to_prev_bkey(t, | 1000 | bkey_cmp(tree_to_prev_bkey(t, |
846 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), | 1001 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), |
847 | search) > 0); | 1002 | search) > 0); |
848 | 1003 | ||
849 | BUG_ON(i.r != end(t->data) && | 1004 | BUG_ON(i.r != bset_bkey_last(t->data) && |
850 | bkey_cmp(i.r, search) <= 0); | 1005 | bkey_cmp(i.r, search) <= 0); |
851 | } | 1006 | } |
852 | 1007 | ||
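
The lookup path above stays two-phase: __bch_bset_search() first narrows the range (via the auxiliary tree for written sets, or the lookup table for the write set), then finishes with a short linear scan. Below is a minimal userspace sketch of that shape, not bcache's actual structure: a sorted array of u64 keys is indexed by the first key of each fixed-size "cacheline", a binary search over those boundary keys picks the line, and a linear scan inside it finds the first key strictly greater than the search key. The names (cl_search, KEYS_PER_LINE) are invented for the sketch.

#include <stdint.h>
#include <stdio.h>

#define KEYS_PER_LINE 16        /* stand-in for BSET_CACHELINE / key size */

/* Index of the first key strictly greater than 'search', or n if none. */
static size_t cl_search(const uint64_t *keys, size_t n, uint64_t search)
{
        size_t lines = (n + KEYS_PER_LINE - 1) / KEYS_PER_LINE;
        size_t lo = 0, hi = lines, i;

        /* Phase 1: binary search over the per-"cacheline" boundary keys. */
        while (lo < hi) {
                size_t mid = lo + (hi - lo) / 2;

                if (keys[mid * KEYS_PER_LINE] <= search)
                        lo = mid + 1;
                else
                        hi = mid;
        }

        /* lo - 1 is the last line whose first key is <= search. */
        i = lo ? (lo - 1) * KEYS_PER_LINE : 0;

        /* Phase 2: short linear scan, like the BSET_CACHELINE tail scan. */
        while (i < n && keys[i] <= search)
                i++;
        return i;
}

int main(void)
{
        uint64_t keys[40];
        size_t i;

        for (i = 0; i < 40; i++)
                keys[i] = i * 10;

        printf("%zu\n", cl_search(keys, 40, 105));      /* 11, i.e. key 110 */
        return 0;
}
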
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
856 | 1011 | ||
857 | return i.l; | 1012 | return i.l; |
858 | } | 1013 | } |
1014 | EXPORT_SYMBOL(__bch_bset_search); | ||
859 | 1015 | ||
860 | /* Btree iterator */ | 1016 | /* Btree iterator */ |
861 | 1017 | ||
862 | /* | 1018 | typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, |
863 | * Returns true if l > r - unless l == r, in which case returns true if l is | 1019 | struct btree_iter_set); |
864 | * older than r. | 1020 | |
865 | * | ||
866 | * Necessary for btree_sort_fixup() - if there are multiple keys that compare | ||
867 | * equal in different sets, we have to process them newest to oldest. | ||
868 | */ | ||
869 | static inline bool btree_iter_cmp(struct btree_iter_set l, | 1021 | static inline bool btree_iter_cmp(struct btree_iter_set l, |
870 | struct btree_iter_set r) | 1022 | struct btree_iter_set r) |
871 | { | 1023 | { |
872 | int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | 1024 | return bkey_cmp(l.k, r.k) > 0; |
873 | |||
874 | return c ? c > 0 : l.k < r.k; | ||
875 | } | 1025 | } |
876 | 1026 | ||
877 | static inline bool btree_iter_end(struct btree_iter *iter) | 1027 | static inline bool btree_iter_end(struct btree_iter *iter) |
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, | |||
888 | btree_iter_cmp)); | 1038 | btree_iter_cmp)); |
889 | } | 1039 | } |
890 | 1040 | ||
891 | struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | 1041 | static struct bkey *__bch_btree_iter_init(struct btree_keys *b, |
892 | struct bkey *search, struct bset_tree *start) | 1042 | struct btree_iter *iter, |
1043 | struct bkey *search, | ||
1044 | struct bset_tree *start) | ||
893 | { | 1045 | { |
894 | struct bkey *ret = NULL; | 1046 | struct bkey *ret = NULL; |
895 | iter->size = ARRAY_SIZE(iter->data); | 1047 | iter->size = ARRAY_SIZE(iter->data); |
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | |||
899 | iter->b = b; | 1051 | iter->b = b; |
900 | #endif | 1052 | #endif |
901 | 1053 | ||
902 | for (; start <= &b->sets[b->nsets]; start++) { | 1054 | for (; start <= bset_tree_last(b); start++) { |
903 | ret = bch_bset_search(b, start, search); | 1055 | ret = bch_bset_search(b, start, search); |
904 | bch_btree_iter_push(iter, ret, end(start->data)); | 1056 | bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); |
905 | } | 1057 | } |
906 | 1058 | ||
907 | return ret; | 1059 | return ret; |
908 | } | 1060 | } |
909 | 1061 | ||
910 | struct bkey *bch_btree_iter_next(struct btree_iter *iter) | 1062 | struct bkey *bch_btree_iter_init(struct btree_keys *b, |
1063 | struct btree_iter *iter, | ||
1064 | struct bkey *search) | ||
1065 | { | ||
1066 | return __bch_btree_iter_init(b, iter, search, b->set); | ||
1067 | } | ||
1068 | EXPORT_SYMBOL(bch_btree_iter_init); | ||
1069 | |||
1070 | static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, | ||
1071 | btree_iter_cmp_fn *cmp) | ||
911 | { | 1072 | { |
912 | struct btree_iter_set unused; | 1073 | struct btree_iter_set unused; |
913 | struct bkey *ret = NULL; | 1074 | struct bkey *ret = NULL; |
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) | |||
924 | } | 1085 | } |
925 | 1086 | ||
926 | if (iter->data->k == iter->data->end) | 1087 | if (iter->data->k == iter->data->end) |
927 | heap_pop(iter, unused, btree_iter_cmp); | 1088 | heap_pop(iter, unused, cmp); |
928 | else | 1089 | else |
929 | heap_sift(iter, 0, btree_iter_cmp); | 1090 | heap_sift(iter, 0, cmp); |
930 | } | 1091 | } |
931 | 1092 | ||
932 | return ret; | 1093 | return ret; |
933 | } | 1094 | } |
934 | 1095 | ||
1096 | struct bkey *bch_btree_iter_next(struct btree_iter *iter) | ||
1097 | { | ||
1098 | return __bch_btree_iter_next(iter, btree_iter_cmp); | ||
1099 | |||
1100 | } | ||
1101 | EXPORT_SYMBOL(bch_btree_iter_next); | ||
1102 | |||
935 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | 1103 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, |
936 | struct btree *b, ptr_filter_fn fn) | 1104 | struct btree_keys *b, ptr_filter_fn fn) |
937 | { | 1105 | { |
938 | struct bkey *ret; | 1106 | struct bkey *ret; |
939 | 1107 | ||
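
__bch_btree_iter_next() now takes the heap comparator as a parameter so the normal iterator and the sort path can each pass their own ordering (btree_iter_cmp vs. ops->sort_cmp). The following is a rough userspace model of the same k-way merge, with a hand-rolled binary heap standing in for bcache's heap_sift()/heap_pop() macros; all names here are invented for illustration.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct iter_set { const uint64_t *k, *end; };

struct iter {
        size_t used;
        struct iter_set data[4];                /* MAX_BSETS-style bound */
};

/* "greater than" comparator: the smaller current key wins the heap top. */
static int cmp_gt(struct iter_set l, struct iter_set r)
{
        return *l.k > *r.k;
}

static void sift(struct iter *it, size_t i,
                 int (*gt)(struct iter_set, struct iter_set))
{
        for (;;) {
                size_t l = 2 * i + 1, r = l + 1, s = i;

                if (l < it->used && gt(it->data[s], it->data[l]))
                        s = l;
                if (r < it->used && gt(it->data[s], it->data[r]))
                        s = r;
                if (s == i)
                        break;
                struct iter_set tmp = it->data[i];
                it->data[i] = it->data[s];
                it->data[s] = tmp;
                i = s;
        }
}

/* Pop the smallest current key across all sets, then fix up the heap. */
static const uint64_t *iter_next(struct iter *it,
                                 int (*gt)(struct iter_set, struct iter_set))
{
        const uint64_t *ret;

        if (!it->used)
                return NULL;

        ret = it->data[0].k++;
        if (it->data[0].k == it->data[0].end)
                it->data[0] = it->data[--it->used];     /* like heap_pop */
        sift(it, 0, gt);                                /* like heap_sift */
        return ret;
}

int main(void)
{
        uint64_t a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 };
        struct iter it = { .used = 2, .data = { { a, a + 3 }, { b, b + 3 } } };
        const uint64_t *k;
        size_t i;

        /* heapify first, as btree_mergesort() now does explicitly */
        for (i = it.used / 2; i-- > 0; )
                sift(&it, i, cmp_gt);

        while ((k = iter_next(&it, cmp_gt)))
                printf("%llu ", (unsigned long long)*k);
        printf("\n");                           /* 1 2 3 4 9 10 */
        return 0;
}
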
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | |||
946 | 1114 | ||
947 | /* Mergesort */ | 1115 | /* Mergesort */ |
948 | 1116 | ||
949 | static void sort_key_next(struct btree_iter *iter, | 1117 | void bch_bset_sort_state_free(struct bset_sort_state *state) |
950 | struct btree_iter_set *i) | ||
951 | { | 1118 | { |
952 | i->k = bkey_next(i->k); | 1119 | if (state->pool) |
953 | 1120 | mempool_destroy(state->pool); | |
954 | if (i->k == i->end) | ||
955 | *i = iter->data[--iter->used]; | ||
956 | } | 1121 | } |
957 | 1122 | ||
958 | static void btree_sort_fixup(struct btree_iter *iter) | 1123 | int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) |
959 | { | 1124 | { |
960 | while (iter->used > 1) { | 1125 | spin_lock_init(&state->time.lock); |
961 | struct btree_iter_set *top = iter->data, *i = top + 1; | ||
962 | 1126 | ||
963 | if (iter->used > 2 && | 1127 | state->page_order = page_order; |
964 | btree_iter_cmp(i[0], i[1])) | 1128 | state->crit_factor = int_sqrt(1 << page_order); |
965 | i++; | ||
966 | 1129 | ||
967 | if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) | 1130 | state->pool = mempool_create_page_pool(1, page_order); |
968 | break; | 1131 | if (!state->pool) |
969 | 1132 | return -ENOMEM; | |
970 | if (!KEY_SIZE(i->k)) { | ||
971 | sort_key_next(iter, i); | ||
972 | heap_sift(iter, i - top, btree_iter_cmp); | ||
973 | continue; | ||
974 | } | ||
975 | |||
976 | if (top->k > i->k) { | ||
977 | if (bkey_cmp(top->k, i->k) >= 0) | ||
978 | sort_key_next(iter, i); | ||
979 | else | ||
980 | bch_cut_front(top->k, i->k); | ||
981 | 1133 | ||
982 | heap_sift(iter, i - top, btree_iter_cmp); | 1134 | return 0; |
983 | } else { | ||
984 | /* can't happen because of comparison func */ | ||
985 | BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); | ||
986 | bch_cut_back(&START_KEY(i->k), top->k); | ||
987 | } | ||
988 | } | ||
989 | } | 1135 | } |
1136 | EXPORT_SYMBOL(bch_bset_sort_state_init); | ||
990 | 1137 | ||
991 | static void btree_mergesort(struct btree *b, struct bset *out, | 1138 | static void btree_mergesort(struct btree_keys *b, struct bset *out, |
992 | struct btree_iter *iter, | 1139 | struct btree_iter *iter, |
993 | bool fixup, bool remove_stale) | 1140 | bool fixup, bool remove_stale) |
994 | { | 1141 | { |
1142 | int i; | ||
995 | struct bkey *k, *last = NULL; | 1143 | struct bkey *k, *last = NULL; |
996 | bool (*bad)(struct btree *, const struct bkey *) = remove_stale | 1144 | BKEY_PADDED(k) tmp; |
1145 | bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale | ||
997 | ? bch_ptr_bad | 1146 | ? bch_ptr_bad |
998 | : bch_ptr_invalid; | 1147 | : bch_ptr_invalid; |
999 | 1148 | ||
1149 | /* Heapify the iterator, using our comparison function */ | ||
1150 | for (i = iter->used / 2 - 1; i >= 0; --i) | ||
1151 | heap_sift(iter, i, b->ops->sort_cmp); | ||
1152 | |||
1000 | while (!btree_iter_end(iter)) { | 1153 | while (!btree_iter_end(iter)) { |
1001 | if (fixup && !b->level) | 1154 | if (b->ops->sort_fixup && fixup) |
1002 | btree_sort_fixup(iter); | 1155 | k = b->ops->sort_fixup(iter, &tmp.k); |
1156 | else | ||
1157 | k = NULL; | ||
1158 | |||
1159 | if (!k) | ||
1160 | k = __bch_btree_iter_next(iter, b->ops->sort_cmp); | ||
1003 | 1161 | ||
1004 | k = bch_btree_iter_next(iter); | ||
1005 | if (bad(b, k)) | 1162 | if (bad(b, k)) |
1006 | continue; | 1163 | continue; |
1007 | 1164 | ||
1008 | if (!last) { | 1165 | if (!last) { |
1009 | last = out->start; | 1166 | last = out->start; |
1010 | bkey_copy(last, k); | 1167 | bkey_copy(last, k); |
1011 | } else if (b->level || | 1168 | } else if (!bch_bkey_try_merge(b, last, k)) { |
1012 | !bch_bkey_try_merge(b, last, k)) { | ||
1013 | last = bkey_next(last); | 1169 | last = bkey_next(last); |
1014 | bkey_copy(last, k); | 1170 | bkey_copy(last, k); |
1015 | } | 1171 | } |
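
In the reworked btree_mergesort(), adjacent output keys are coalesced through bch_bkey_try_merge() on every copy, with the format-specific ops deciding whether a merge is legal. A toy version of that append-and-coalesce loop, assuming plain [start, end) extents and a merge rule of "previous extent ends exactly where the next begins" (both invented here, not bcache's key format):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct extent { unsigned long start, end; };    /* [start, end) */

/* Extend *last in place when k starts where it ends; mirrors key_merge(). */
static bool try_merge(struct extent *last, const struct extent *k)
{
        if (last->end != k->start)
                return false;
        last->end = k->end;
        return true;
}

/* Append the already-sorted input, coalescing where possible. */
static size_t merge_out(struct extent *out, const struct extent *in, size_t n)
{
        size_t i, used = 0;

        for (i = 0; i < n; i++) {
                if (used && try_merge(&out[used - 1], &in[i]))
                        continue;
                out[used++] = in[i];
        }
        return used;
}

int main(void)
{
        struct extent in[] = { {0, 4}, {4, 8}, {10, 12}, {12, 16} };
        struct extent out[4];
        size_t n = merge_out(out, in, 4), i;

        for (i = 0; i < n; i++)
                printf("[%lu,%lu) ", out[i].start, out[i].end);
        printf("\n");                           /* [0,8) [10,16) */
        return 0;
}
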
@@ -1020,27 +1176,27 @@ static void btree_mergesort(struct btree *b, struct bset *out, | |||
1020 | pr_debug("sorted %i keys", out->keys); | 1176 | pr_debug("sorted %i keys", out->keys); |
1021 | } | 1177 | } |
1022 | 1178 | ||
1023 | static void __btree_sort(struct btree *b, struct btree_iter *iter, | 1179 | static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, |
1024 | unsigned start, unsigned order, bool fixup) | 1180 | unsigned start, unsigned order, bool fixup, |
1181 | struct bset_sort_state *state) | ||
1025 | { | 1182 | { |
1026 | uint64_t start_time; | 1183 | uint64_t start_time; |
1027 | bool remove_stale = !b->written; | 1184 | bool used_mempool = false; |
1028 | struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, | 1185 | struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, |
1029 | order); | 1186 | order); |
1030 | if (!out) { | 1187 | if (!out) { |
1031 | mutex_lock(&b->c->sort_lock); | 1188 | BUG_ON(order > state->page_order); |
1032 | out = b->c->sort; | 1189 | |
1033 | order = ilog2(bucket_pages(b->c)); | 1190 | out = page_address(mempool_alloc(state->pool, GFP_NOIO)); |
1191 | used_mempool = true; | ||
1192 | order = state->page_order; | ||
1034 | } | 1193 | } |
1035 | 1194 | ||
1036 | start_time = local_clock(); | 1195 | start_time = local_clock(); |
1037 | 1196 | ||
1038 | btree_mergesort(b, out, iter, fixup, remove_stale); | 1197 | btree_mergesort(b, out, iter, fixup, false); |
1039 | b->nsets = start; | 1198 | b->nsets = start; |
1040 | 1199 | ||
1041 | if (!fixup && !start && b->written) | ||
1042 | bch_btree_verify(b, out); | ||
1043 | |||
1044 | if (!start && order == b->page_order) { | 1200 | if (!start && order == b->page_order) { |
1045 | /* | 1201 | /* |
1046 | * Our temporary buffer is the same size as the btree node's | 1202 | * Our temporary buffer is the same size as the btree node's |
@@ -1048,84 +1204,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, | |||
1048 | * memcpy() | 1204 | * memcpy() |
1049 | */ | 1205 | */ |
1050 | 1206 | ||
1051 | out->magic = bset_magic(&b->c->sb); | 1207 | out->magic = b->set->data->magic; |
1052 | out->seq = b->sets[0].data->seq; | 1208 | out->seq = b->set->data->seq; |
1053 | out->version = b->sets[0].data->version; | 1209 | out->version = b->set->data->version; |
1054 | swap(out, b->sets[0].data); | 1210 | swap(out, b->set->data); |
1055 | |||
1056 | if (b->c->sort == b->sets[0].data) | ||
1057 | b->c->sort = out; | ||
1058 | } else { | 1211 | } else { |
1059 | b->sets[start].data->keys = out->keys; | 1212 | b->set[start].data->keys = out->keys; |
1060 | memcpy(b->sets[start].data->start, out->start, | 1213 | memcpy(b->set[start].data->start, out->start, |
1061 | (void *) end(out) - (void *) out->start); | 1214 | (void *) bset_bkey_last(out) - (void *) out->start); |
1062 | } | 1215 | } |
1063 | 1216 | ||
1064 | if (out == b->c->sort) | 1217 | if (used_mempool) |
1065 | mutex_unlock(&b->c->sort_lock); | 1218 | mempool_free(virt_to_page(out), state->pool); |
1066 | else | 1219 | else |
1067 | free_pages((unsigned long) out, order); | 1220 | free_pages((unsigned long) out, order); |
1068 | 1221 | ||
1069 | if (b->written) | 1222 | bch_bset_build_written_tree(b); |
1070 | bset_build_written_tree(b); | ||
1071 | 1223 | ||
1072 | if (!start) | 1224 | if (!start) |
1073 | bch_time_stats_update(&b->c->sort_time, start_time); | 1225 | bch_time_stats_update(&state->time, start_time); |
1074 | } | 1226 | } |
1075 | 1227 | ||
1076 | void bch_btree_sort_partial(struct btree *b, unsigned start) | 1228 | void bch_btree_sort_partial(struct btree_keys *b, unsigned start, |
1229 | struct bset_sort_state *state) | ||
1077 | { | 1230 | { |
1078 | size_t order = b->page_order, keys = 0; | 1231 | size_t order = b->page_order, keys = 0; |
1079 | struct btree_iter iter; | 1232 | struct btree_iter iter; |
1080 | int oldsize = bch_count_data(b); | 1233 | int oldsize = bch_count_data(b); |
1081 | 1234 | ||
1082 | __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); | 1235 | __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); |
1083 | |||
1084 | BUG_ON(b->sets[b->nsets].data == write_block(b) && | ||
1085 | (b->sets[b->nsets].size || b->nsets)); | ||
1086 | |||
1087 | 1236 | ||
1088 | if (start) { | 1237 | if (start) { |
1089 | unsigned i; | 1238 | unsigned i; |
1090 | 1239 | ||
1091 | for (i = start; i <= b->nsets; i++) | 1240 | for (i = start; i <= b->nsets; i++) |
1092 | keys += b->sets[i].data->keys; | 1241 | keys += b->set[i].data->keys; |
1093 | 1242 | ||
1094 | order = roundup_pow_of_two(__set_bytes(b->sets->data, | 1243 | order = get_order(__set_bytes(b->set->data, keys)); |
1095 | keys)) / PAGE_SIZE; | ||
1096 | if (order) | ||
1097 | order = ilog2(order); | ||
1098 | } | 1244 | } |
1099 | 1245 | ||
1100 | __btree_sort(b, &iter, start, order, false); | 1246 | __btree_sort(b, &iter, start, order, false, state); |
1101 | 1247 | ||
1102 | EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); | 1248 | EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); |
1103 | } | 1249 | } |
1250 | EXPORT_SYMBOL(bch_btree_sort_partial); | ||
1104 | 1251 | ||
1105 | void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) | 1252 | void bch_btree_sort_and_fix_extents(struct btree_keys *b, |
1253 | struct btree_iter *iter, | ||
1254 | struct bset_sort_state *state) | ||
1106 | { | 1255 | { |
1107 | BUG_ON(!b->written); | 1256 | __btree_sort(b, iter, 0, b->page_order, true, state); |
1108 | __btree_sort(b, iter, 0, b->page_order, true); | ||
1109 | } | 1257 | } |
1110 | 1258 | ||
1111 | void bch_btree_sort_into(struct btree *b, struct btree *new) | 1259 | void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, |
1260 | struct bset_sort_state *state) | ||
1112 | { | 1261 | { |
1113 | uint64_t start_time = local_clock(); | 1262 | uint64_t start_time = local_clock(); |
1114 | 1263 | ||
1115 | struct btree_iter iter; | 1264 | struct btree_iter iter; |
1116 | bch_btree_iter_init(b, &iter, NULL); | 1265 | bch_btree_iter_init(b, &iter, NULL); |
1117 | 1266 | ||
1118 | btree_mergesort(b, new->sets->data, &iter, false, true); | 1267 | btree_mergesort(b, new->set->data, &iter, false, true); |
1119 | 1268 | ||
1120 | bch_time_stats_update(&b->c->sort_time, start_time); | 1269 | bch_time_stats_update(&state->time, start_time); |
1121 | 1270 | ||
1122 | bkey_copy_key(&new->key, &b->key); | 1271 | new->set->size = 0; // XXX: why? |
1123 | new->sets->size = 0; | ||
1124 | } | 1272 | } |
1125 | 1273 | ||
1126 | #define SORT_CRIT (4096 / sizeof(uint64_t)) | 1274 | #define SORT_CRIT (4096 / sizeof(uint64_t)) |
1127 | 1275 | ||
1128 | void bch_btree_sort_lazy(struct btree *b) | 1276 | void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) |
1129 | { | 1277 | { |
1130 | unsigned crit = SORT_CRIT; | 1278 | unsigned crit = SORT_CRIT; |
1131 | int i; | 1279 | int i; |
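
__btree_sort() above tries an opportunistic page allocation and, if that fails, falls back to the one-page mempool created by bch_bset_sort_state_init(), so a sort can always make forward progress without the old shared c->sort buffer and sort_lock. A userspace sketch of the same try-then-reserve pattern, with malloc() standing in for __get_free_pages() and a preallocated buffer standing in for the mempool; all names are invented:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct sort_state {
        void *reserve;            /* preallocated, always available       */
        size_t reserve_size;      /* stand-in for 1 << page_order pages   */
};

static int sort_state_init(struct sort_state *s, size_t reserve_size)
{
        s->reserve = malloc(reserve_size);
        s->reserve_size = reserve_size;
        return s->reserve ? 0 : -1;
}

static void sort_state_free(struct sort_state *s)
{
        free(s->reserve);
}

static void *sort_buf_get(struct sort_state *s, size_t size, bool *used_reserve)
{
        void *buf = malloc(size); /* opportunistic, like __get_free_pages */

        if (buf) {
                *used_reserve = false;
                return buf;
        }
        /* Fall back to the reserve; callers must not ask for more than it. */
        *used_reserve = true;
        return size <= s->reserve_size ? s->reserve : NULL;
}

static void sort_buf_put(struct sort_state *s, void *buf, bool used_reserve)
{
        if (!used_reserve)
                free(buf);        /* the reserve stays owned by the state */
}

int main(void)
{
        struct sort_state s;
        bool used;
        void *buf;

        if (sort_state_init(&s, 4096))
                return 1;
        buf = sort_buf_get(&s, 4096, &used);
        memset(buf, 0, 4096);     /* "sort" into the temporary buffer */
        sort_buf_put(&s, buf, used);
        sort_state_free(&s);
        return 0;
}
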
@@ -1134,50 +1282,32 @@ void bch_btree_sort_lazy(struct btree *b) | |||
1134 | if (!b->nsets) | 1282 | if (!b->nsets) |
1135 | goto out; | 1283 | goto out; |
1136 | 1284 | ||
1137 | /* If not a leaf node, always sort */ | ||
1138 | if (b->level) { | ||
1139 | bch_btree_sort(b); | ||
1140 | return; | ||
1141 | } | ||
1142 | |||
1143 | for (i = b->nsets - 1; i >= 0; --i) { | 1285 | for (i = b->nsets - 1; i >= 0; --i) { |
1144 | crit *= b->c->sort_crit_factor; | 1286 | crit *= state->crit_factor; |
1145 | 1287 | ||
1146 | if (b->sets[i].data->keys < crit) { | 1288 | if (b->set[i].data->keys < crit) { |
1147 | bch_btree_sort_partial(b, i); | 1289 | bch_btree_sort_partial(b, i, state); |
1148 | return; | 1290 | return; |
1149 | } | 1291 | } |
1150 | } | 1292 | } |
1151 | 1293 | ||
1152 | /* Sort if we'd overflow */ | 1294 | /* Sort if we'd overflow */ |
1153 | if (b->nsets + 1 == MAX_BSETS) { | 1295 | if (b->nsets + 1 == MAX_BSETS) { |
1154 | bch_btree_sort(b); | 1296 | bch_btree_sort(b, state); |
1155 | return; | 1297 | return; |
1156 | } | 1298 | } |
1157 | 1299 | ||
1158 | out: | 1300 | out: |
1159 | bset_build_written_tree(b); | 1301 | bch_bset_build_written_tree(b); |
1160 | } | 1302 | } |
1303 | EXPORT_SYMBOL(bch_btree_sort_lazy); | ||
1161 | 1304 | ||
1162 | /* Sysfs stuff */ | 1305 | void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) |
1163 | |||
1164 | struct bset_stats { | ||
1165 | struct btree_op op; | ||
1166 | size_t nodes; | ||
1167 | size_t sets_written, sets_unwritten; | ||
1168 | size_t bytes_written, bytes_unwritten; | ||
1169 | size_t floats, failed; | ||
1170 | }; | ||
1171 | |||
1172 | static int btree_bset_stats(struct btree_op *op, struct btree *b) | ||
1173 | { | 1306 | { |
1174 | struct bset_stats *stats = container_of(op, struct bset_stats, op); | ||
1175 | unsigned i; | 1307 | unsigned i; |
1176 | 1308 | ||
1177 | stats->nodes++; | ||
1178 | |||
1179 | for (i = 0; i <= b->nsets; i++) { | 1309 | for (i = 0; i <= b->nsets; i++) { |
1180 | struct bset_tree *t = &b->sets[i]; | 1310 | struct bset_tree *t = &b->set[i]; |
1181 | size_t bytes = t->data->keys * sizeof(uint64_t); | 1311 | size_t bytes = t->data->keys * sizeof(uint64_t); |
1182 | size_t j; | 1312 | size_t j; |
1183 | 1313 | ||
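
bch_btree_sort_lazy() now walks the sets newest to oldest and resorts from set i downward once set i holds fewer keys than a threshold that grows by state->crit_factor per older set, falling back to a full sort only when all MAX_BSETS slots would otherwise be used. The decision logic in isolation, with made-up key counts (this sketch returns an index instead of calling the sort functions):

#include <stdio.h>

#define MAX_BSETS 4U
#define SORT_CRIT (4096 / sizeof(unsigned long long))

/*
 * Returns the set index to start sorting from, MAX_BSETS for "full sort",
 * or -1 for "no sort needed yet" -- the same three outcomes as
 * bch_btree_sort_lazy().
 */
static int lazy_sort_start(const unsigned keys_per_set[], unsigned nsets,
                           unsigned crit_factor)
{
        unsigned long crit = SORT_CRIT;
        int i;

        if (!nsets)
                return -1;

        for (i = nsets - 1; i >= 0; --i) {
                crit *= crit_factor;

                if (keys_per_set[i] < crit)
                        return i;       /* sort sets i..nsets together */
        }

        if (nsets + 1 == MAX_BSETS)
                return MAX_BSETS;       /* would overflow: sort everything */

        return -1;
}

int main(void)
{
        unsigned keys[MAX_BSETS] = { 5000, 900, 40, 10 };

        /* nsets == 2 means sets 0..2 exist and set 2 is being appended to */
        printf("%d\n", lazy_sort_start(keys, 2, 3));    /* 1 */
        return 0;
}
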
@@ -1195,32 +1325,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b) | |||
1195 | stats->bytes_unwritten += bytes; | 1325 | stats->bytes_unwritten += bytes; |
1196 | } | 1326 | } |
1197 | } | 1327 | } |
1198 | |||
1199 | return MAP_CONTINUE; | ||
1200 | } | ||
1201 | |||
1202 | int bch_bset_print_stats(struct cache_set *c, char *buf) | ||
1203 | { | ||
1204 | struct bset_stats t; | ||
1205 | int ret; | ||
1206 | |||
1207 | memset(&t, 0, sizeof(struct bset_stats)); | ||
1208 | bch_btree_op_init(&t.op, -1); | ||
1209 | |||
1210 | ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); | ||
1211 | if (ret < 0) | ||
1212 | return ret; | ||
1213 | |||
1214 | return snprintf(buf, PAGE_SIZE, | ||
1215 | "btree nodes: %zu\n" | ||
1216 | "written sets: %zu\n" | ||
1217 | "unwritten sets: %zu\n" | ||
1218 | "written key bytes: %zu\n" | ||
1219 | "unwritten key bytes: %zu\n" | ||
1220 | "floats: %zu\n" | ||
1221 | "failed: %zu\n", | ||
1222 | t.nodes, | ||
1223 | t.sets_written, t.sets_unwritten, | ||
1224 | t.bytes_written, t.bytes_unwritten, | ||
1225 | t.floats, t.failed); | ||
1226 | } | 1328 | } |
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 1d3c24f9fa0e..003260f4ddf6 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h | |||
@@ -1,7 +1,11 @@ | |||
1 | #ifndef _BCACHE_BSET_H | 1 | #ifndef _BCACHE_BSET_H |
2 | #define _BCACHE_BSET_H | 2 | #define _BCACHE_BSET_H |
3 | 3 | ||
4 | #include <linux/slab.h> | 4 | #include <linux/bcache.h> |
5 | #include <linux/kernel.h> | ||
6 | #include <linux/types.h> | ||
7 | |||
8 | #include "util.h" /* for time_stats */ | ||
5 | 9 | ||
6 | /* | 10 | /* |
7 | * BKEYS: | 11 | * BKEYS: |
@@ -142,20 +146,13 @@ | |||
142 | * first key in that range of bytes again. | 146 | * first key in that range of bytes again. |
143 | */ | 147 | */ |
144 | 148 | ||
145 | /* Btree key comparison/iteration */ | 149 | struct btree_keys; |
150 | struct btree_iter; | ||
151 | struct btree_iter_set; | ||
152 | struct bkey_float; | ||
146 | 153 | ||
147 | #define MAX_BSETS 4U | 154 | #define MAX_BSETS 4U |
148 | 155 | ||
149 | struct btree_iter { | ||
150 | size_t size, used; | ||
151 | #ifdef CONFIG_BCACHE_DEBUG | ||
152 | struct btree *b; | ||
153 | #endif | ||
154 | struct btree_iter_set { | ||
155 | struct bkey *k, *end; | ||
156 | } data[MAX_BSETS]; | ||
157 | }; | ||
158 | |||
159 | struct bset_tree { | 156 | struct bset_tree { |
160 | /* | 157 | /* |
161 | * We construct a binary tree in an array as if the array | 158 | * We construct a binary tree in an array as if the array |
@@ -165,14 +162,14 @@ struct bset_tree { | |||
165 | */ | 162 | */ |
166 | 163 | ||
167 | /* size of the binary tree and prev array */ | 164 | /* size of the binary tree and prev array */ |
168 | unsigned size; | 165 | unsigned size; |
169 | 166 | ||
170 | /* function of size - precalculated for to_inorder() */ | 167 | /* function of size - precalculated for to_inorder() */ |
171 | unsigned extra; | 168 | unsigned extra; |
172 | 169 | ||
173 | /* copy of the last key in the set */ | 170 | /* copy of the last key in the set */ |
174 | struct bkey end; | 171 | struct bkey end; |
175 | struct bkey_float *tree; | 172 | struct bkey_float *tree; |
176 | 173 | ||
177 | /* | 174 | /* |
178 | * The nodes in the bset tree point to specific keys - this | 175 | * The nodes in the bset tree point to specific keys - this |
@@ -182,12 +179,219 @@ struct bset_tree { | |||
182 | * to keep bkey_float to 4 bytes and prev isn't used in the fast | 179 | * to keep bkey_float to 4 bytes and prev isn't used in the fast |
183 | * path. | 180 | * path. |
184 | */ | 181 | */ |
185 | uint8_t *prev; | 182 | uint8_t *prev; |
186 | 183 | ||
187 | /* The actual btree node, with pointers to each sorted set */ | 184 | /* The actual btree node, with pointers to each sorted set */ |
188 | struct bset *data; | 185 | struct bset *data; |
186 | }; | ||
187 | |||
188 | struct btree_keys_ops { | ||
189 | bool (*sort_cmp)(struct btree_iter_set, | ||
190 | struct btree_iter_set); | ||
191 | struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); | ||
192 | bool (*insert_fixup)(struct btree_keys *, struct bkey *, | ||
193 | struct btree_iter *, struct bkey *); | ||
194 | bool (*key_invalid)(struct btree_keys *, | ||
195 | const struct bkey *); | ||
196 | bool (*key_bad)(struct btree_keys *, const struct bkey *); | ||
197 | bool (*key_merge)(struct btree_keys *, | ||
198 | struct bkey *, struct bkey *); | ||
199 | void (*key_to_text)(char *, size_t, const struct bkey *); | ||
200 | void (*key_dump)(struct btree_keys *, const struct bkey *); | ||
201 | |||
202 | /* | ||
203 | * Only used for deciding whether to use START_KEY(k) or just the key | ||
204 | * itself in a couple places | ||
205 | */ | ||
206 | bool is_extents; | ||
207 | }; | ||
208 | |||
209 | struct btree_keys { | ||
210 | const struct btree_keys_ops *ops; | ||
211 | uint8_t page_order; | ||
212 | uint8_t nsets; | ||
213 | unsigned last_set_unwritten:1; | ||
214 | bool *expensive_debug_checks; | ||
215 | |||
216 | /* | ||
217 | * Sets of sorted keys - the real btree node - plus a binary search tree | ||
218 | * | ||
219 | * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||
220 | * to the memory we have allocated for this btree node. Additionally, | ||
221 | * set[0]->data points to the entire btree node as it exists on disk. | ||
222 | */ | ||
223 | struct bset_tree set[MAX_BSETS]; | ||
224 | }; | ||
225 | |||
226 | static inline struct bset_tree *bset_tree_last(struct btree_keys *b) | ||
227 | { | ||
228 | return b->set + b->nsets; | ||
229 | } | ||
230 | |||
231 | static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) | ||
232 | { | ||
233 | return t <= b->set + b->nsets - b->last_set_unwritten; | ||
234 | } | ||
235 | |||
236 | static inline bool bkey_written(struct btree_keys *b, struct bkey *k) | ||
237 | { | ||
238 | return !b->last_set_unwritten || k < b->set[b->nsets].data->start; | ||
239 | } | ||
240 | |||
241 | static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) | ||
242 | { | ||
243 | return ((size_t) i) - ((size_t) b->set->data); | ||
244 | } | ||
245 | |||
246 | static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) | ||
247 | { | ||
248 | return bset_byte_offset(b, i) >> 9; | ||
249 | } | ||
250 | |||
251 | #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) | ||
252 | #define set_bytes(i) __set_bytes(i, i->keys) | ||
253 | |||
254 | #define __set_blocks(i, k, block_bytes) \ | ||
255 | DIV_ROUND_UP(__set_bytes(i, k), block_bytes) | ||
256 | #define set_blocks(i, block_bytes) \ | ||
257 | __set_blocks(i, (i)->keys, block_bytes) | ||
258 | |||
259 | static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) | ||
260 | { | ||
261 | struct bset_tree *t = bset_tree_last(b); | ||
262 | |||
263 | BUG_ON((PAGE_SIZE << b->page_order) < | ||
264 | (bset_byte_offset(b, t->data) + set_bytes(t->data))); | ||
265 | |||
266 | if (!b->last_set_unwritten) | ||
267 | return 0; | ||
268 | |||
269 | return ((PAGE_SIZE << b->page_order) - | ||
270 | (bset_byte_offset(b, t->data) + set_bytes(t->data))) / | ||
271 | sizeof(u64); | ||
272 | } | ||
273 | |||
274 | static inline struct bset *bset_next_set(struct btree_keys *b, | ||
275 | unsigned block_bytes) | ||
276 | { | ||
277 | struct bset *i = bset_tree_last(b)->data; | ||
278 | |||
279 | return ((void *) i) + roundup(set_bytes(i), block_bytes); | ||
280 | } | ||
281 | |||
282 | void bch_btree_keys_free(struct btree_keys *); | ||
283 | int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); | ||
284 | void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, | ||
285 | bool *); | ||
286 | |||
287 | void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); | ||
288 | void bch_bset_build_written_tree(struct btree_keys *); | ||
289 | void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); | ||
290 | bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); | ||
291 | void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); | ||
292 | unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, | ||
293 | struct bkey *); | ||
294 | |||
295 | enum { | ||
296 | BTREE_INSERT_STATUS_NO_INSERT = 0, | ||
297 | BTREE_INSERT_STATUS_INSERT, | ||
298 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
299 | BTREE_INSERT_STATUS_OVERWROTE, | ||
300 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
189 | }; | 301 | }; |
190 | 302 | ||
303 | /* Btree key iteration */ | ||
304 | |||
305 | struct btree_iter { | ||
306 | size_t size, used; | ||
307 | #ifdef CONFIG_BCACHE_DEBUG | ||
308 | struct btree_keys *b; | ||
309 | #endif | ||
310 | struct btree_iter_set { | ||
311 | struct bkey *k, *end; | ||
312 | } data[MAX_BSETS]; | ||
313 | }; | ||
314 | |||
315 | typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); | ||
316 | |||
317 | struct bkey *bch_btree_iter_next(struct btree_iter *); | ||
318 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||
319 | struct btree_keys *, ptr_filter_fn); | ||
320 | |||
321 | void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||
322 | struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, | ||
323 | struct bkey *); | ||
324 | |||
325 | struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, | ||
326 | const struct bkey *); | ||
327 | |||
328 | /* | ||
329 | * Returns the first key that is strictly greater than search | ||
330 | */ | ||
331 | static inline struct bkey *bch_bset_search(struct btree_keys *b, | ||
332 | struct bset_tree *t, | ||
333 | const struct bkey *search) | ||
334 | { | ||
335 | return search ? __bch_bset_search(b, t, search) : t->data->start; | ||
336 | } | ||
337 | |||
338 | #define for_each_key_filter(b, k, iter, filter) \ | ||
339 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
340 | ((k) = bch_btree_iter_next_filter((iter), (b), filter));) | ||
341 | |||
342 | #define for_each_key(b, k, iter) \ | ||
343 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
344 | ((k) = bch_btree_iter_next(iter));) | ||
345 | |||
346 | /* Sorting */ | ||
347 | |||
348 | struct bset_sort_state { | ||
349 | mempool_t *pool; | ||
350 | |||
351 | unsigned page_order; | ||
352 | unsigned crit_factor; | ||
353 | |||
354 | struct time_stats time; | ||
355 | }; | ||
356 | |||
357 | void bch_bset_sort_state_free(struct bset_sort_state *); | ||
358 | int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); | ||
359 | void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); | ||
360 | void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, | ||
361 | struct bset_sort_state *); | ||
362 | void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, | ||
363 | struct bset_sort_state *); | ||
364 | void bch_btree_sort_partial(struct btree_keys *, unsigned, | ||
365 | struct bset_sort_state *); | ||
366 | |||
367 | static inline void bch_btree_sort(struct btree_keys *b, | ||
368 | struct bset_sort_state *state) | ||
369 | { | ||
370 | bch_btree_sort_partial(b, 0, state); | ||
371 | } | ||
372 | |||
373 | struct bset_stats { | ||
374 | size_t sets_written, sets_unwritten; | ||
375 | size_t bytes_written, bytes_unwritten; | ||
376 | size_t floats, failed; | ||
377 | }; | ||
378 | |||
379 | void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); | ||
380 | |||
381 | /* Bkey utility code */ | ||
382 | |||
383 | #define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) | ||
384 | |||
385 | static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) | ||
386 | { | ||
387 | return bkey_idx(i->start, idx); | ||
388 | } | ||
389 | |||
390 | static inline void bkey_init(struct bkey *k) | ||
391 | { | ||
392 | *k = ZERO_KEY; | ||
393 | } | ||
394 | |||
191 | static __always_inline int64_t bkey_cmp(const struct bkey *l, | 395 | static __always_inline int64_t bkey_cmp(const struct bkey *l, |
192 | const struct bkey *r) | 396 | const struct bkey *r) |
193 | { | 397 | { |
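
struct btree_keys_ops is the seam that keeps the generic bset code format-agnostic: extent keys and btree-pointer keys each supply their own comparison, merge and validity hooks plus the is_extents flag. A stripped-down userspace illustration of the same ops-table pattern; the key layout and both ops tables below are invented for the sketch and are not bcache's:

#include <stdbool.h>
#include <stdio.h>

struct key { unsigned long long offset, size; };

struct keys_ops {
        bool (*key_invalid)(const struct key *);
        bool (*key_merge)(struct key *l, const struct key *r);
        bool is_extents;
};

/* "extent" flavour: size matters, adjacent extents can merge */
static bool extent_invalid(const struct key *k) { return !k->size; }
static bool extent_merge(struct key *l, const struct key *r)
{
        if (l->offset + l->size != r->offset)
                return false;
        l->size += r->size;
        return true;
}

/* "btree pointer" flavour: no sizes, never merged */
static bool bptr_invalid(const struct key *k) { return k->size != 0; }
static bool bptr_merge(struct key *l, const struct key *r) { return false; }

static const struct keys_ops extent_ops = {
        .key_invalid = extent_invalid,
        .key_merge   = extent_merge,
        .is_extents  = true,
};

static const struct keys_ops bptr_ops = {
        .key_invalid = bptr_invalid,
        .key_merge   = bptr_merge,
        .is_extents  = false,
};

/* Generic code only goes through the ops table, like bch_bkey_try_merge(). */
static bool try_merge(const struct keys_ops *ops, struct key *l,
                      const struct key *r)
{
        return !ops->key_invalid(l) && !ops->key_invalid(r) &&
                ops->key_merge(l, r);
}

int main(void)
{
        struct key a = { 0, 8 }, b = { 8, 4 };

        printf("%d\n", try_merge(&extent_ops, &a, &b)); /* 1: a is now [0,12) */
        printf("%d\n", try_merge(&bptr_ops, &a, &b));   /* 0: never merged    */
        return 0;
}
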
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, | |||
196 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); | 400 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); |
197 | } | 401 | } |
198 | 402 | ||
403 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||
404 | unsigned); | ||
405 | bool __bch_cut_front(const struct bkey *, struct bkey *); | ||
406 | bool __bch_cut_back(const struct bkey *, struct bkey *); | ||
407 | |||
408 | static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | ||
409 | { | ||
410 | BUG_ON(bkey_cmp(where, k) > 0); | ||
411 | return __bch_cut_front(where, k); | ||
412 | } | ||
413 | |||
414 | static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | ||
415 | { | ||
416 | BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||
417 | return __bch_cut_back(where, k); | ||
418 | } | ||
419 | |||
420 | #define PRECEDING_KEY(_k) \ | ||
421 | ({ \ | ||
422 | struct bkey *_ret = NULL; \ | ||
423 | \ | ||
424 | if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ | ||
425 | _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ | ||
426 | \ | ||
427 | if (!_ret->low) \ | ||
428 | _ret->high--; \ | ||
429 | _ret->low--; \ | ||
430 | } \ | ||
431 | \ | ||
432 | _ret; \ | ||
433 | }) | ||
434 | |||
435 | static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) | ||
436 | { | ||
437 | return b->ops->key_invalid(b, k); | ||
438 | } | ||
439 | |||
440 | static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) | ||
441 | { | ||
442 | return b->ops->key_bad(b, k); | ||
443 | } | ||
444 | |||
445 | static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, | ||
446 | size_t size, const struct bkey *k) | ||
447 | { | ||
448 | return b->ops->key_to_text(buf, size, k); | ||
449 | } | ||
450 | |||
451 | static inline bool bch_bkey_equal_header(const struct bkey *l, | ||
452 | const struct bkey *r) | ||
453 | { | ||
454 | return (KEY_DIRTY(l) == KEY_DIRTY(r) && | ||
455 | KEY_PTRS(l) == KEY_PTRS(r) && | ||
456 | KEY_CSUM(l) == KEY_CSUM(r)); | ||

457 | } | ||
458 | |||
199 | /* Keylists */ | 459 | /* Keylists */ |
200 | 460 | ||
201 | struct keylist { | 461 | struct keylist { |
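
PRECEDING_KEY() builds the key immediately before _k by decrementing the packed 128-bit (inode, offset) value held in the key's high/low words, borrowing from high when low is zero, and yields NULL for the zero key since nothing precedes it. The same borrow logic in plain C, with an invented two-word struct in place of struct bkey:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct k128 { uint64_t high, low; };    /* stand-in for bkey high/low words */

/* Decrement the 128-bit value by one; false for zero (no predecessor). */
static bool preceding(struct k128 *k)
{
        if (!k->high && !k->low)
                return false;

        if (!k->low)
                k->high--;              /* borrow, as PRECEDING_KEY() does */
        k->low--;
        return true;
}

int main(void)
{
        struct k128 k = { 1, 0 };

        if (preceding(&k))
                printf("%llx %llx\n",   /* 0 ffffffffffffffff */
                       (unsigned long long)k.high,
                       (unsigned long long)k.low);
        return 0;
}
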
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l) | |||
257 | 517 | ||
258 | struct bkey *bch_keylist_pop(struct keylist *); | 518 | struct bkey *bch_keylist_pop(struct keylist *); |
259 | void bch_keylist_pop_front(struct keylist *); | 519 | void bch_keylist_pop_front(struct keylist *); |
260 | int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | 520 | int __bch_keylist_realloc(struct keylist *, unsigned); |
261 | |||
262 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||
263 | unsigned); | ||
264 | bool __bch_cut_front(const struct bkey *, struct bkey *); | ||
265 | bool __bch_cut_back(const struct bkey *, struct bkey *); | ||
266 | 521 | ||
267 | static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | 522 | /* Debug stuff */ |
268 | { | ||
269 | BUG_ON(bkey_cmp(where, k) > 0); | ||
270 | return __bch_cut_front(where, k); | ||
271 | } | ||
272 | 523 | ||
273 | static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | 524 | #ifdef CONFIG_BCACHE_DEBUG |
274 | { | ||
275 | BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||
276 | return __bch_cut_back(where, k); | ||
277 | } | ||
278 | |||
279 | const char *bch_ptr_status(struct cache_set *, const struct bkey *); | ||
280 | bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); | ||
281 | bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); | ||
282 | |||
283 | bool bch_ptr_bad(struct btree *, const struct bkey *); | ||
284 | |||
285 | static inline uint8_t gen_after(uint8_t a, uint8_t b) | ||
286 | { | ||
287 | uint8_t r = a - b; | ||
288 | return r > 128U ? 0 : r; | ||
289 | } | ||
290 | |||
291 | static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, | ||
292 | unsigned i) | ||
293 | { | ||
294 | return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); | ||
295 | } | ||
296 | |||
297 | static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||
298 | unsigned i) | ||
299 | { | ||
300 | return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||
301 | } | ||
302 | |||
303 | |||
304 | typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | ||
305 | |||
306 | struct bkey *bch_btree_iter_next(struct btree_iter *); | ||
307 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||
308 | struct btree *, ptr_filter_fn); | ||
309 | |||
310 | void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||
311 | struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, | ||
312 | struct bkey *, struct bset_tree *); | ||
313 | |||
314 | /* 32 bits total: */ | ||
315 | #define BKEY_MID_BITS 3 | ||
316 | #define BKEY_EXPONENT_BITS 7 | ||
317 | #define BKEY_MANTISSA_BITS 22 | ||
318 | #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) | ||
319 | |||
320 | struct bkey_float { | ||
321 | unsigned exponent:BKEY_EXPONENT_BITS; | ||
322 | unsigned m:BKEY_MID_BITS; | ||
323 | unsigned mantissa:BKEY_MANTISSA_BITS; | ||
324 | } __packed; | ||
325 | |||
326 | /* | ||
327 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||
328 | * it used to be 64, but I realized the lookup code would touch slightly less | ||
329 | * memory if it was 128. | ||
330 | * | ||
331 | * It defines the number of bytes (in struct bset) per struct bkey_float in | ||
332 | * the auxiliary search tree - when we're done searching the bset_float tree we | ||
333 | * have this many bytes left that we do a linear search over. | ||
334 | * | ||
335 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||
336 | * we're touching one fewer cacheline in the bset tree in exchange for one more | ||
337 | * cacheline in the linear search - but the linear search might stop before it | ||
338 | * gets to the second cacheline. | ||
339 | */ | ||
340 | |||
341 | #define BSET_CACHELINE 128 | ||
342 | #define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) | ||
343 | 525 | ||
344 | #define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) | 526 | int __bch_count_data(struct btree_keys *); |
345 | #define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) | 527 | void __bch_check_keys(struct btree_keys *, const char *, ...); |
528 | void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); | ||
529 | void bch_dump_bucket(struct btree_keys *); | ||
346 | 530 | ||
347 | void bch_bset_init_next(struct btree *); | 531 | #else |
348 | 532 | ||
349 | void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); | 533 | static inline int __bch_count_data(struct btree_keys *b) { return -1; } |
350 | void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | 534 | static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} |
535 | static inline void bch_dump_bucket(struct btree_keys *b) {} | ||
536 | void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); | ||
351 | 537 | ||
352 | struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | 538 | #endif |
353 | const struct bkey *); | ||
354 | 539 | ||
355 | /* | 540 | static inline bool btree_keys_expensive_checks(struct btree_keys *b) |
356 | * Returns the first key that is strictly greater than search | ||
357 | */ | ||
358 | static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | ||
359 | const struct bkey *search) | ||
360 | { | 541 | { |
361 | return search ? __bch_bset_search(b, t, search) : t->data->start; | 542 | #ifdef CONFIG_BCACHE_DEBUG |
543 | return *b->expensive_debug_checks; | ||
544 | #else | ||
545 | return false; | ||
546 | #endif | ||
362 | } | 547 | } |
363 | 548 | ||
364 | #define PRECEDING_KEY(_k) \ | 549 | static inline int bch_count_data(struct btree_keys *b) |
365 | ({ \ | ||
366 | struct bkey *_ret = NULL; \ | ||
367 | \ | ||
368 | if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ | ||
369 | _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ | ||
370 | \ | ||
371 | if (!_ret->low) \ | ||
372 | _ret->high--; \ | ||
373 | _ret->low--; \ | ||
374 | } \ | ||
375 | \ | ||
376 | _ret; \ | ||
377 | }) | ||
378 | |||
379 | bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | ||
380 | void bch_btree_sort_lazy(struct btree *); | ||
381 | void bch_btree_sort_into(struct btree *, struct btree *); | ||
382 | void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); | ||
383 | void bch_btree_sort_partial(struct btree *, unsigned); | ||
384 | |||
385 | static inline void bch_btree_sort(struct btree *b) | ||
386 | { | 550 | { |
387 | bch_btree_sort_partial(b, 0); | 551 | return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; |
388 | } | 552 | } |
389 | 553 | ||
390 | int bch_bset_print_stats(struct cache_set *, char *); | 554 | #define bch_check_keys(b, ...) \ |
555 | do { \ | ||
556 | if (btree_keys_expensive_checks(b)) \ | ||
557 | __bch_check_keys(b, __VA_ARGS__); \ | ||
558 | } while (0) | ||
391 | 559 | ||
392 | #endif | 560 | #endif |
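
The reworked debug helpers gate the expensive checks twice: at compile time via CONFIG_BCACHE_DEBUG and at run time via the *expensive_debug_checks flag handed to bch_btree_keys_init(). A compact userspace version of the same double gate, using a DEBUG_BUILD macro as a stand-in for the Kconfig option (all names invented):

#include <stdbool.h>
#include <stdio.h>

#define DEBUG_BUILD 1                   /* stand-in for CONFIG_BCACHE_DEBUG */

struct keys {
        bool *expensive_debug_checks;   /* points at a runtime toggle */
};

static inline bool expensive_checks(const struct keys *b)
{
#if DEBUG_BUILD
        return *b->expensive_debug_checks;
#else
        return false;                   /* compiles away entirely */
#endif
}

static int __count_data(const struct keys *b)
{
        /* imagine an O(n) walk over every key here */
        return 42;
}

/* Cheap wrapper callers can sprinkle freely, like bch_count_data(). */
static int count_data(const struct keys *b)
{
        return expensive_checks(b) ? __count_data(b) : -1;
}

int main(void)
{
        bool toggle = false;
        struct keys b = { &toggle };

        printf("%d\n", count_data(&b)); /* -1: checks disabled at runtime */
        toggle = true;
        printf("%d\n", count_data(&b)); /* 42: checks enabled */
        return 0;
}
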
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 946ecd3b048b..98cc0a810a36 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #include "bcache.h" | 23 | #include "bcache.h" |
24 | #include "btree.h" | 24 | #include "btree.h" |
25 | #include "debug.h" | 25 | #include "debug.h" |
26 | #include "writeback.h" | 26 | #include "extents.h" |
27 | 27 | ||
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/bitops.h> | 29 | #include <linux/bitops.h> |
@@ -89,13 +89,6 @@ | |||
89 | * Test module load/unload | 89 | * Test module load/unload |
90 | */ | 90 | */ |
91 | 91 | ||
92 | enum { | ||
93 | BTREE_INSERT_STATUS_INSERT, | ||
94 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
95 | BTREE_INSERT_STATUS_OVERWROTE, | ||
96 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
97 | }; | ||
98 | |||
99 | #define MAX_NEED_GC 64 | 92 | #define MAX_NEED_GC 64 |
100 | #define MAX_SAVE_PRIO 72 | 93 | #define MAX_SAVE_PRIO 72 |
101 | 94 | ||
@@ -106,14 +99,6 @@ enum { | |||
106 | 99 | ||
107 | static struct workqueue_struct *btree_io_wq; | 100 | static struct workqueue_struct *btree_io_wq; |
108 | 101 | ||
109 | static inline bool should_split(struct btree *b) | ||
110 | { | ||
111 | struct bset *i = write_block(b); | ||
112 | return b->written >= btree_blocks(b) || | ||
113 | (b->written + __set_blocks(i, i->keys + 15, b->c) | ||
114 | > btree_blocks(b)); | ||
115 | } | ||
116 | |||
117 | #define insert_lock(s, b) ((b)->level <= (s)->lock) | 102 | #define insert_lock(s, b) ((b)->level <= (s)->lock) |
118 | 103 | ||
119 | /* | 104 | /* |
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b) | |||
167 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | 152 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ |
168 | } \ | 153 | } \ |
169 | rw_unlock(_w, _b); \ | 154 | rw_unlock(_w, _b); \ |
155 | if (_r == -EINTR) \ | ||
156 | schedule(); \ | ||
170 | bch_cannibalize_unlock(c); \ | 157 | bch_cannibalize_unlock(c); \ |
171 | if (_r == -ENOSPC) { \ | 158 | if (_r == -ENOSPC) { \ |
172 | wait_event((c)->try_wait, \ | 159 | wait_event((c)->try_wait, \ |
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b) | |||
175 | } \ | 162 | } \ |
176 | } while (_r == -EINTR); \ | 163 | } while (_r == -EINTR); \ |
177 | \ | 164 | \ |
165 | finish_wait(&(c)->bucket_wait, &(op)->wait); \ | ||
178 | _r; \ | 166 | _r; \ |
179 | }) | 167 | }) |
180 | 168 | ||
169 | static inline struct bset *write_block(struct btree *b) | ||
170 | { | ||
171 | return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); | ||
172 | } | ||
173 | |||
181 | /* Btree key manipulation */ | 174 | /* Btree key manipulation */ |
182 | 175 | ||
183 | void bkey_put(struct cache_set *c, struct bkey *k) | 176 | void bkey_put(struct cache_set *c, struct bkey *k) |
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k) | |||
194 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | 187 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) |
195 | { | 188 | { |
196 | uint64_t crc = b->key.ptr[0]; | 189 | uint64_t crc = b->key.ptr[0]; |
197 | void *data = (void *) i + 8, *end = end(i); | 190 | void *data = (void *) i + 8, *end = bset_bkey_last(i); |
198 | 191 | ||
199 | crc = bch_crc64_update(crc, data, end - data); | 192 | crc = bch_crc64_update(crc, data, end - data); |
200 | return crc ^ 0xffffffffffffffffULL; | 193 | return crc ^ 0xffffffffffffffffULL; |
201 | } | 194 | } |
202 | 195 | ||
203 | static void bch_btree_node_read_done(struct btree *b) | 196 | void bch_btree_node_read_done(struct btree *b) |
204 | { | 197 | { |
205 | const char *err = "bad btree header"; | 198 | const char *err = "bad btree header"; |
206 | struct bset *i = b->sets[0].data; | 199 | struct bset *i = btree_bset_first(b); |
207 | struct btree_iter *iter; | 200 | struct btree_iter *iter; |
208 | 201 | ||
209 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); | 202 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); |
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b) | |||
211 | iter->used = 0; | 204 | iter->used = 0; |
212 | 205 | ||
213 | #ifdef CONFIG_BCACHE_DEBUG | 206 | #ifdef CONFIG_BCACHE_DEBUG |
214 | iter->b = b; | 207 | iter->b = &b->keys; |
215 | #endif | 208 | #endif |
216 | 209 | ||
217 | if (!i->seq) | 210 | if (!i->seq) |
218 | goto err; | 211 | goto err; |
219 | 212 | ||
220 | for (; | 213 | for (; |
221 | b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; | 214 | b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; |
222 | i = write_block(b)) { | 215 | i = write_block(b)) { |
223 | err = "unsupported bset version"; | 216 | err = "unsupported bset version"; |
224 | if (i->version > BCACHE_BSET_VERSION) | 217 | if (i->version > BCACHE_BSET_VERSION) |
225 | goto err; | 218 | goto err; |
226 | 219 | ||
227 | err = "bad btree header"; | 220 | err = "bad btree header"; |
228 | if (b->written + set_blocks(i, b->c) > btree_blocks(b)) | 221 | if (b->written + set_blocks(i, block_bytes(b->c)) > |
222 | btree_blocks(b)) | ||
229 | goto err; | 223 | goto err; |
230 | 224 | ||
231 | err = "bad magic"; | 225 | err = "bad magic"; |
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b) | |||
245 | } | 239 | } |
246 | 240 | ||
247 | err = "empty set"; | 241 | err = "empty set"; |
248 | if (i != b->sets[0].data && !i->keys) | 242 | if (i != b->keys.set[0].data && !i->keys) |
249 | goto err; | 243 | goto err; |
250 | 244 | ||
251 | bch_btree_iter_push(iter, i->start, end(i)); | 245 | bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); |
252 | 246 | ||
253 | b->written += set_blocks(i, b->c); | 247 | b->written += set_blocks(i, block_bytes(b->c)); |
254 | } | 248 | } |
255 | 249 | ||
256 | err = "corrupted btree"; | 250 | err = "corrupted btree"; |
257 | for (i = write_block(b); | 251 | for (i = write_block(b); |
258 | index(i, b) < btree_blocks(b); | 252 | bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); |
259 | i = ((void *) i) + block_bytes(b->c)) | 253 | i = ((void *) i) + block_bytes(b->c)) |
260 | if (i->seq == b->sets[0].data->seq) | 254 | if (i->seq == b->keys.set[0].data->seq) |
261 | goto err; | 255 | goto err; |
262 | 256 | ||
263 | bch_btree_sort_and_fix_extents(b, iter); | 257 | bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); |
264 | 258 | ||
265 | i = b->sets[0].data; | 259 | i = b->keys.set[0].data; |
266 | err = "short btree key"; | 260 | err = "short btree key"; |
267 | if (b->sets[0].size && | 261 | if (b->keys.set[0].size && |
268 | bkey_cmp(&b->key, &b->sets[0].end) < 0) | 262 | bkey_cmp(&b->key, &b->keys.set[0].end) < 0) |
269 | goto err; | 263 | goto err; |
270 | 264 | ||
271 | if (b->written < btree_blocks(b)) | 265 | if (b->written < btree_blocks(b)) |
272 | bch_bset_init_next(b); | 266 | bch_bset_init_next(&b->keys, write_block(b), |
267 | bset_magic(&b->c->sb)); | ||
273 | out: | 268 | out: |
274 | mempool_free(iter, b->c->fill_iter); | 269 | mempool_free(iter, b->c->fill_iter); |
275 | return; | 270 | return; |
276 | err: | 271 | err: |
277 | set_btree_node_io_error(b); | 272 | set_btree_node_io_error(b); |
278 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", | 273 | bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", |
279 | err, PTR_BUCKET_NR(b->c, &b->key, 0), | 274 | err, PTR_BUCKET_NR(b->c, &b->key, 0), |
280 | index(i, b), i->keys); | 275 | bset_block_offset(b, i), i->keys); |
281 | goto out; | 276 | goto out; |
282 | } | 277 | } |
283 | 278 | ||
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error) | |||
287 | closure_put(cl); | 282 | closure_put(cl); |
288 | } | 283 | } |
289 | 284 | ||
290 | void bch_btree_node_read(struct btree *b) | 285 | static void bch_btree_node_read(struct btree *b) |
291 | { | 286 | { |
292 | uint64_t start_time = local_clock(); | 287 | uint64_t start_time = local_clock(); |
293 | struct closure cl; | 288 | struct closure cl; |
@@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b) | |||
303 | bio->bi_end_io = btree_node_read_endio; | 298 | bio->bi_end_io = btree_node_read_endio; |
304 | bio->bi_private = &cl; | 299 | bio->bi_private = &cl; |
305 | 300 | ||
306 | bch_bio_map(bio, b->sets[0].data); | 301 | bch_bio_map(bio, b->keys.set[0].data); |
307 | 302 | ||
308 | bch_submit_bbio(bio, b->c, &b->key, 0); | 303 | bch_submit_bbio(bio, b->c, &b->key, 0); |
309 | closure_sync(&cl); | 304 | closure_sync(&cl); |
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) | |||
340 | w->journal = NULL; | 335 | w->journal = NULL; |
341 | } | 336 | } |
342 | 337 | ||
338 | static void btree_node_write_unlock(struct closure *cl) | ||
339 | { | ||
340 | struct btree *b = container_of(cl, struct btree, io); | ||
341 | |||
342 | up(&b->io_mutex); | ||
343 | } | ||
344 | |||
343 | static void __btree_node_write_done(struct closure *cl) | 345 | static void __btree_node_write_done(struct closure *cl) |
344 | { | 346 | { |
345 | struct btree *b = container_of(cl, struct btree, io.cl); | 347 | struct btree *b = container_of(cl, struct btree, io); |
346 | struct btree_write *w = btree_prev_write(b); | 348 | struct btree_write *w = btree_prev_write(b); |
347 | 349 | ||
348 | bch_bbio_free(b->bio, b->c); | 350 | bch_bbio_free(b->bio, b->c); |
@@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl) | |||
353 | queue_delayed_work(btree_io_wq, &b->work, | 355 | queue_delayed_work(btree_io_wq, &b->work, |
354 | msecs_to_jiffies(30000)); | 356 | msecs_to_jiffies(30000)); |
355 | 357 | ||
356 | closure_return(cl); | 358 | closure_return_with_destructor(cl, btree_node_write_unlock); |
357 | } | 359 | } |
358 | 360 | ||
359 | static void btree_node_write_done(struct closure *cl) | 361 | static void btree_node_write_done(struct closure *cl) |
360 | { | 362 | { |
361 | struct btree *b = container_of(cl, struct btree, io.cl); | 363 | struct btree *b = container_of(cl, struct btree, io); |
362 | struct bio_vec *bv; | 364 | struct bio_vec *bv; |
363 | int n; | 365 | int n; |
364 | 366 | ||
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl) | |||
371 | static void btree_node_write_endio(struct bio *bio, int error) | 373 | static void btree_node_write_endio(struct bio *bio, int error) |
372 | { | 374 | { |
373 | struct closure *cl = bio->bi_private; | 375 | struct closure *cl = bio->bi_private; |
374 | struct btree *b = container_of(cl, struct btree, io.cl); | 376 | struct btree *b = container_of(cl, struct btree, io); |
375 | 377 | ||
376 | if (error) | 378 | if (error) |
377 | set_btree_node_io_error(b); | 379 | set_btree_node_io_error(b); |
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error) | |||
382 | 384 | ||
383 | static void do_btree_node_write(struct btree *b) | 385 | static void do_btree_node_write(struct btree *b) |
384 | { | 386 | { |
385 | struct closure *cl = &b->io.cl; | 387 | struct closure *cl = &b->io; |
386 | struct bset *i = b->sets[b->nsets].data; | 388 | struct bset *i = btree_bset_last(b); |
387 | BKEY_PADDED(key) k; | 389 | BKEY_PADDED(key) k; |
388 | 390 | ||
389 | i->version = BCACHE_BSET_VERSION; | 391 | i->version = BCACHE_BSET_VERSION; |
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b) | |||
395 | b->bio->bi_end_io = btree_node_write_endio; | 397 | b->bio->bi_end_io = btree_node_write_endio; |
396 | b->bio->bi_private = cl; | 398 | b->bio->bi_private = cl; |
397 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; | 399 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; |
398 | b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); | 400 | b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); |
399 | bch_bio_map(b->bio, i); | 401 | bch_bio_map(b->bio, i); |
400 | 402 | ||
401 | /* | 403 | /* |
@@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b) | |||
414 | */ | 416 | */ |
415 | 417 | ||
416 | bkey_copy(&k.key, &b->key); | 418 | bkey_copy(&k.key, &b->key); |
417 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | 419 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + |
420 | bset_sector_offset(&b->keys, i)); | ||
418 | 421 | ||
419 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { | 422 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { |
420 | int j; | 423 | int j; |
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b) | |||
435 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 438 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
436 | 439 | ||
437 | closure_sync(cl); | 440 | closure_sync(cl); |
438 | __btree_node_write_done(cl); | 441 | continue_at_nobarrier(cl, __btree_node_write_done, NULL); |
439 | } | 442 | } |
440 | } | 443 | } |
441 | 444 | ||
442 | void bch_btree_node_write(struct btree *b, struct closure *parent) | 445 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
443 | { | 446 | { |
444 | struct bset *i = b->sets[b->nsets].data; | 447 | struct bset *i = btree_bset_last(b); |
445 | 448 | ||
446 | trace_bcache_btree_write(b); | 449 | trace_bcache_btree_write(b); |
447 | 450 | ||
448 | BUG_ON(current->bio_list); | 451 | BUG_ON(current->bio_list); |
449 | BUG_ON(b->written >= btree_blocks(b)); | 452 | BUG_ON(b->written >= btree_blocks(b)); |
450 | BUG_ON(b->written && !i->keys); | 453 | BUG_ON(b->written && !i->keys); |
451 | BUG_ON(b->sets->data->seq != i->seq); | 454 | BUG_ON(btree_bset_first(b)->seq != i->seq); |
452 | bch_check_keys(b, "writing"); | 455 | bch_check_keys(&b->keys, "writing"); |
453 | 456 | ||
454 | cancel_delayed_work(&b->work); | 457 | cancel_delayed_work(&b->work); |
455 | 458 | ||
456 | /* If caller isn't waiting for write, parent refcount is cache set */ | 459 | /* If caller isn't waiting for write, parent refcount is cache set */ |
457 | closure_lock(&b->io, parent ?: &b->c->cl); | 460 | down(&b->io_mutex); |
461 | closure_init(&b->io, parent ?: &b->c->cl); | ||
458 | 462 | ||
459 | clear_bit(BTREE_NODE_dirty, &b->flags); | 463 | clear_bit(BTREE_NODE_dirty, &b->flags); |
460 | change_bit(BTREE_NODE_write_idx, &b->flags); | 464 | change_bit(BTREE_NODE_write_idx, &b->flags); |
461 | 465 | ||
462 | do_btree_node_write(b); | 466 | do_btree_node_write(b); |
463 | 467 | ||
464 | b->written += set_blocks(i, b->c); | 468 | atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, |
465 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | ||
466 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | 469 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); |
467 | 470 | ||
468 | bch_btree_sort_lazy(b); | 471 | b->written += set_blocks(i, block_bytes(b->c)); |
472 | |||
473 | /* If not a leaf node, always sort */ | ||
474 | if (b->level && b->keys.nsets) | ||
475 | bch_btree_sort(&b->keys, &b->c->sort); | ||
476 | else | ||
477 | bch_btree_sort_lazy(&b->keys, &b->c->sort); | ||
478 | |||
479 | /* | ||
480 | * do verify if there was more than one set initially (i.e. we did a | ||
481 | * sort) and we sorted down to a single set: | ||
482 | */ | ||
483 | if (i != b->keys.set->data && !b->keys.nsets) | ||
484 | bch_btree_verify(b); | ||
469 | 485 | ||
470 | if (b->written < btree_blocks(b)) | 486 | if (b->written < btree_blocks(b)) |
471 | bch_bset_init_next(b); | 487 | bch_bset_init_next(&b->keys, write_block(b), |
488 | bset_magic(&b->c->sb)); | ||
472 | } | 489 | } |
473 | 490 | ||
474 | static void bch_btree_node_write_sync(struct btree *b) | 491 | static void bch_btree_node_write_sync(struct btree *b) |
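
bch_btree_node_write() above replaces closure_lock() with an explicit semaphore: down(&b->io_mutex) is taken before the write is issued and the closure's destructor (btree_node_write_unlock) releases it with up(), so at most one write per node is ever in flight. A userspace analogue with POSIX semaphores, where a thread plays the role of the endio/closure path; names are invented, compile with -pthread:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

struct node {
        sem_t io_mutex;                 /* one in-flight write at a time */
        int data;
};

static void *io_completion(void *p)     /* plays the endio + closure role */
{
        struct node *b = p;

        usleep(1000);                   /* pretend the write took a while */
        printf("wrote %d\n", b->data);
        sem_post(&b->io_mutex);         /* btree_node_write_unlock() */
        return NULL;
}

static void node_write(struct node *b, pthread_t *t)
{
        sem_wait(&b->io_mutex);         /* down(&b->io_mutex) */
        pthread_create(t, NULL, io_completion, b);
}

int main(void)
{
        struct node b = { .data = 1 };
        pthread_t t1, t2;

        sem_init(&b.io_mutex, 0, 1);

        node_write(&b, &t1);            /* takes io_mutex, starts "I/O"     */
        node_write(&b, &t2);            /* blocks until the first up() runs */

        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        sem_destroy(&b.io_mutex);
        return 0;
}
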
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w) | |||
493 | 510 | ||
494 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | 511 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) |
495 | { | 512 | { |
496 | struct bset *i = b->sets[b->nsets].data; | 513 | struct bset *i = btree_bset_last(b); |
497 | struct btree_write *w = btree_current_write(b); | 514 | struct btree_write *w = btree_current_write(b); |
498 | 515 | ||
499 | BUG_ON(!b->written); | 516 | BUG_ON(!b->written); |
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | |||
528 | * mca -> memory cache | 545 | * mca -> memory cache |
529 | */ | 546 | */ |
530 | 547 | ||
531 | static void mca_reinit(struct btree *b) | ||
532 | { | ||
533 | unsigned i; | ||
534 | |||
535 | b->flags = 0; | ||
536 | b->written = 0; | ||
537 | b->nsets = 0; | ||
538 | |||
539 | for (i = 0; i < MAX_BSETS; i++) | ||
540 | b->sets[i].size = 0; | ||
541 | /* | ||
542 | * Second loop starts at 1 because b->sets[0]->data is the memory we | ||
543 | * allocated | ||
544 | */ | ||
545 | for (i = 1; i < MAX_BSETS; i++) | ||
546 | b->sets[i].data = NULL; | ||
547 | } | ||
548 | |||
549 | #define mca_reserve(c) (((c->root && c->root->level) \ | 548 | #define mca_reserve(c) (((c->root && c->root->level) \ |
550 | ? c->root->level : 1) * 8 + 16) | 549 | ? c->root->level : 1) * 8 + 16) |
551 | #define mca_can_free(c) \ | 550 | #define mca_can_free(c) \ |
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b) | |||
553 | 552 | ||
554 | static void mca_data_free(struct btree *b) | 553 | static void mca_data_free(struct btree *b) |
555 | { | 554 | { |
556 | struct bset_tree *t = b->sets; | 555 | BUG_ON(b->io_mutex.count != 1); |
557 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | ||
558 | 556 | ||
559 | if (bset_prev_bytes(b) < PAGE_SIZE) | 557 | bch_btree_keys_free(&b->keys); |
560 | kfree(t->prev); | ||
561 | else | ||
562 | free_pages((unsigned long) t->prev, | ||
563 | get_order(bset_prev_bytes(b))); | ||
564 | 558 | ||
565 | if (bset_tree_bytes(b) < PAGE_SIZE) | ||
566 | kfree(t->tree); | ||
567 | else | ||
568 | free_pages((unsigned long) t->tree, | ||
569 | get_order(bset_tree_bytes(b))); | ||
570 | |||
571 | free_pages((unsigned long) t->data, b->page_order); | ||
572 | |||
573 | t->prev = NULL; | ||
574 | t->tree = NULL; | ||
575 | t->data = NULL; | ||
576 | list_move(&b->list, &b->c->btree_cache_freed); | ||
577 | b->c->bucket_cache_used--; | 559 | b->c->bucket_cache_used--; |
560 | list_move(&b->list, &b->c->btree_cache_freed); | ||
578 | } | 561 | } |
579 | 562 | ||
580 | static void mca_bucket_free(struct btree *b) | 563 | static void mca_bucket_free(struct btree *b) |
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k) | |||
593 | 576 | ||
594 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | 577 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) |
595 | { | 578 | { |
596 | struct bset_tree *t = b->sets; | 579 | if (!bch_btree_keys_alloc(&b->keys, |
597 | BUG_ON(t->data); | 580 | max_t(unsigned, |
598 | 581 | ilog2(b->c->btree_pages), | |
599 | b->page_order = max_t(unsigned, | 582 | btree_order(k)), |
600 | ilog2(b->c->btree_pages), | 583 | gfp)) { |
601 | btree_order(k)); | 584 | b->c->bucket_cache_used++; |
602 | 585 | list_move(&b->list, &b->c->btree_cache); | |
603 | t->data = (void *) __get_free_pages(gfp, b->page_order); | 586 | } else { |
604 | if (!t->data) | 587 | list_move(&b->list, &b->c->btree_cache_freed); |
605 | goto err; | 588 | } |
606 | |||
607 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | ||
608 | ? kmalloc(bset_tree_bytes(b), gfp) | ||
609 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | ||
610 | if (!t->tree) | ||
611 | goto err; | ||
612 | |||
613 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | ||
614 | ? kmalloc(bset_prev_bytes(b), gfp) | ||
615 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | ||
616 | if (!t->prev) | ||
617 | goto err; | ||
618 | |||
619 | list_move(&b->list, &b->c->btree_cache); | ||
620 | b->c->bucket_cache_used++; | ||
621 | return; | ||
622 | err: | ||
623 | mca_data_free(b); | ||
624 | } | 589 | } |
625 | 590 | ||
626 | static struct btree *mca_bucket_alloc(struct cache_set *c, | 591 | static struct btree *mca_bucket_alloc(struct cache_set *c, |
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, | |||
635 | INIT_LIST_HEAD(&b->list); | 600 | INIT_LIST_HEAD(&b->list); |
636 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); | 601 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
637 | b->c = c; | 602 | b->c = c; |
638 | closure_init_unlocked(&b->io); | 603 | sema_init(&b->io_mutex, 1); |
639 | 604 | ||
640 | mca_data_alloc(b, k, gfp); | 605 | mca_data_alloc(b, k, gfp); |
641 | return b; | 606 | return b; |
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) | |||
651 | if (!down_write_trylock(&b->lock)) | 616 | if (!down_write_trylock(&b->lock)) |
652 | return -ENOMEM; | 617 | return -ENOMEM; |
653 | 618 | ||
654 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 619 | BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); |
655 | 620 | ||
656 | if (b->page_order < min_order || | 621 | if (b->keys.page_order < min_order) |
657 | (!flush && | 622 | goto out_unlock; |
658 | (btree_node_dirty(b) || | 623 | |
659 | atomic_read(&b->io.cl.remaining) != -1))) { | 624 | if (!flush) { |
660 | rw_unlock(true, b); | 625 | if (btree_node_dirty(b)) |
661 | return -ENOMEM; | 626 | goto out_unlock; |
627 | |||
628 | if (down_trylock(&b->io_mutex)) | ||
629 | goto out_unlock; | ||
630 | up(&b->io_mutex); | ||
662 | } | 631 | } |
663 | 632 | ||
664 | if (btree_node_dirty(b)) | 633 | if (btree_node_dirty(b)) |
665 | bch_btree_node_write_sync(b); | 634 | bch_btree_node_write_sync(b); |
666 | 635 | ||
667 | /* wait for any in flight btree write */ | 636 | /* wait for any in flight btree write */ |
668 | closure_wait_event(&b->io.wait, &cl, | 637 | down(&b->io_mutex); |
669 | atomic_read(&b->io.cl.remaining) == -1); | 638 | up(&b->io_mutex); |
670 | 639 | ||
671 | return 0; | 640 | return 0; |
641 | out_unlock: | ||
642 | rw_unlock(true, b); | ||
643 | return -ENOMEM; | ||
672 | } | 644 | } |
673 | 645 | ||
674 | static unsigned long bch_mca_scan(struct shrinker *shrink, | 646 | static unsigned long bch_mca_scan(struct shrinker *shrink, |
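The mca_reap() hunk above replaces the closure-with-waitlist that used to track an in-flight btree write with a plain semaphore initialized to 1: a failed down_trylock() means a write is still outstanding, and a down()/up() pair waits for it to drain. Here is a small userspace analogue of that pattern, with POSIX sem_trywait/sem_wait/sem_post standing in for the kernel's down_trylock/down/up; it sketches the idea only and is not kernel code.

    #include <semaphore.h>
    #include <stdbool.h>
    #include <stdio.h>

    static sem_t io_mutex;

    static bool write_in_flight(void)
    {
        if (sem_trywait(&io_mutex) != 0)
            return true;        /* could not take it: a writer holds it */
        sem_post(&io_mutex);    /* we only probed, give it back */
        return false;
    }

    static void wait_for_write(void)
    {
        sem_wait(&io_mutex);    /* blocks until the writer releases it */
        sem_post(&io_mutex);
    }

    int main(void)
    {
        sem_init(&io_mutex, 0, 1);  /* count 1 means "no write in flight" */

        printf("write in flight? %s\n", write_in_flight() ? "yes" : "no");
        wait_for_write();           /* returns immediately when idle */

        sem_destroy(&io_mutex);
        return 0;
    }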
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
714 | } | 686 | } |
715 | } | 687 | } |
716 | 688 | ||
717 | /* | ||
718 | * Can happen right when we first start up, before we've read in any | ||
719 | * btree nodes | ||
720 | */ | ||
721 | if (list_empty(&c->btree_cache)) | ||
722 | goto out; | ||
723 | |||
724 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { | 689 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { |
690 | if (list_empty(&c->btree_cache)) | ||
691 | goto out; | ||
692 | |||
725 | b = list_first_entry(&c->btree_cache, struct btree, list); | 693 | b = list_first_entry(&c->btree_cache, struct btree, list); |
726 | list_rotate_left(&c->btree_cache); | 694 | list_rotate_left(&c->btree_cache); |
727 | 695 | ||
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c) | |||
767 | #ifdef CONFIG_BCACHE_DEBUG | 735 | #ifdef CONFIG_BCACHE_DEBUG |
768 | if (c->verify_data) | 736 | if (c->verify_data) |
769 | list_move(&c->verify_data->list, &c->btree_cache); | 737 | list_move(&c->verify_data->list, &c->btree_cache); |
738 | |||
739 | free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); | ||
770 | #endif | 740 | #endif |
771 | 741 | ||
772 | list_splice(&c->btree_cache_freeable, | 742 | list_splice(&c->btree_cache_freeable, |
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c) | |||
807 | #ifdef CONFIG_BCACHE_DEBUG | 777 | #ifdef CONFIG_BCACHE_DEBUG |
808 | mutex_init(&c->verify_lock); | 778 | mutex_init(&c->verify_lock); |
809 | 779 | ||
780 | c->verify_ondisk = (void *) | ||
781 | __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); | ||
782 | |||
810 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 783 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); |
811 | 784 | ||
812 | if (c->verify_data && | 785 | if (c->verify_data && |
813 | c->verify_data->sets[0].data) | 786 | c->verify_data->keys.set->data) |
814 | list_del_init(&c->verify_data->list); | 787 | list_del_init(&c->verify_data->list); |
815 | else | 788 | else |
816 | c->verify_data = NULL; | 789 | c->verify_data = NULL; |
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) | |||
908 | list_for_each_entry(b, &c->btree_cache_freed, list) | 881 | list_for_each_entry(b, &c->btree_cache_freed, list) |
909 | if (!mca_reap(b, 0, false)) { | 882 | if (!mca_reap(b, 0, false)) { |
910 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | 883 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); |
911 | if (!b->sets[0].data) | 884 | if (!b->keys.set[0].data) |
912 | goto err; | 885 | goto err; |
913 | else | 886 | else |
914 | goto out; | 887 | goto out; |
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) | |||
919 | goto err; | 892 | goto err; |
920 | 893 | ||
921 | BUG_ON(!down_write_trylock(&b->lock)); | 894 | BUG_ON(!down_write_trylock(&b->lock)); |
922 | if (!b->sets->data) | 895 | if (!b->keys.set->data) |
923 | goto err; | 896 | goto err; |
924 | out: | 897 | out: |
925 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | 898 | BUG_ON(b->io_mutex.count != 1); |
926 | 899 | ||
927 | bkey_copy(&b->key, k); | 900 | bkey_copy(&b->key, k); |
928 | list_move(&b->list, &c->btree_cache); | 901 | list_move(&b->list, &c->btree_cache); |
@@ -930,10 +903,17 @@ out: | |||
930 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | 903 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); |
931 | 904 | ||
932 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | 905 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); |
933 | b->level = level; | ||
934 | b->parent = (void *) ~0UL; | 906 | b->parent = (void *) ~0UL; |
907 | b->flags = 0; | ||
908 | b->written = 0; | ||
909 | b->level = level; | ||
935 | 910 | ||
936 | mca_reinit(b); | 911 | if (!b->level) |
912 | bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, | ||
913 | &b->c->expensive_debug_checks); | ||
914 | else | ||
915 | bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, | ||
916 | &b->c->expensive_debug_checks); | ||
937 | 917 | ||
938 | return b; | 918 | return b; |
939 | err: | 919 | err: |
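In the mca_alloc() hunk above, the old mca_reinit() helper is gone; the node is reset inline and a key-ops table is chosen by level, with leaves getting bch_extent_keys_ops and internal nodes bch_btree_keys_ops. A hedged sketch of that kind of ops-table dispatch follows; struct key_ops, the two example tables and keys_init() are names invented here, not the bcache definitions.

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented, minimal ops table standing in for struct btree_keys_ops. */
    struct key_ops {
        const char *name;
        bool is_extents; /* extent keys may need splitting on insert */
    };

    static const struct key_ops extent_ops = { "extents", true };
    static const struct key_ops node_ptr_ops = { "btree pointers", false };

    struct keys {
        const struct key_ops *ops;
    };

    /* Models the level check in the hunk: leaf level 0 gets extent ops. */
    static void keys_init(struct keys *k, int level)
    {
        k->ops = level ? &node_ptr_ops : &extent_ops;
    }

    int main(void)
    {
        struct keys leaf, internal;

        keys_init(&leaf, 0);
        keys_init(&internal, 1);
        printf("leaf uses %s, internal uses %s\n",
               leaf.ops->name, internal.ops->name);
        return 0;
    }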
@@ -994,13 +974,13 @@ retry: | |||
994 | 974 | ||
995 | b->accessed = 1; | 975 | b->accessed = 1; |
996 | 976 | ||
997 | for (; i <= b->nsets && b->sets[i].size; i++) { | 977 | for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { |
998 | prefetch(b->sets[i].tree); | 978 | prefetch(b->keys.set[i].tree); |
999 | prefetch(b->sets[i].data); | 979 | prefetch(b->keys.set[i].data); |
1000 | } | 980 | } |
1001 | 981 | ||
1002 | for (; i <= b->nsets; i++) | 982 | for (; i <= b->keys.nsets; i++) |
1003 | prefetch(b->sets[i].data); | 983 | prefetch(b->keys.set[i].data); |
1004 | 984 | ||
1005 | if (btree_node_io_error(b)) { | 985 | if (btree_node_io_error(b)) { |
1006 | rw_unlock(write, b); | 986 | rw_unlock(write, b); |
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) | |||
1063 | 1043 | ||
1064 | mutex_lock(&c->bucket_lock); | 1044 | mutex_lock(&c->bucket_lock); |
1065 | retry: | 1045 | retry: |
1066 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) | 1046 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) |
1067 | goto err; | 1047 | goto err; |
1068 | 1048 | ||
1069 | bkey_put(c, &k.key); | 1049 | bkey_put(c, &k.key); |
@@ -1080,7 +1060,7 @@ retry: | |||
1080 | } | 1060 | } |
1081 | 1061 | ||
1082 | b->accessed = 1; | 1062 | b->accessed = 1; |
1083 | bch_bset_init_next(b); | 1063 | bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); |
1084 | 1064 | ||
1085 | mutex_unlock(&c->bucket_lock); | 1065 | mutex_unlock(&c->bucket_lock); |
1086 | 1066 | ||
@@ -1098,8 +1078,10 @@ err: | |||
1098 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) | 1078 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) |
1099 | { | 1079 | { |
1100 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); | 1080 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); |
1101 | if (!IS_ERR_OR_NULL(n)) | 1081 | if (!IS_ERR_OR_NULL(n)) { |
1102 | bch_btree_sort_into(b, n); | 1082 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); |
1083 | bkey_copy_key(&n->key, &b->key); | ||
1084 | } | ||
1103 | 1085 | ||
1104 | return n; | 1086 | return n; |
1105 | } | 1087 | } |
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) | |||
1120 | atomic_inc(&b->c->prio_blocked); | 1102 | atomic_inc(&b->c->prio_blocked); |
1121 | } | 1103 | } |
1122 | 1104 | ||
1105 | static int btree_check_reserve(struct btree *b, struct btree_op *op) | ||
1106 | { | ||
1107 | struct cache_set *c = b->c; | ||
1108 | struct cache *ca; | ||
1109 | unsigned i, reserve = c->root->level * 2 + 1; | ||
1110 | int ret = 0; | ||
1111 | |||
1112 | mutex_lock(&c->bucket_lock); | ||
1113 | |||
1114 | for_each_cache(ca, c, i) | ||
1115 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { | ||
1116 | if (op) | ||
1117 | prepare_to_wait(&c->bucket_wait, &op->wait, | ||
1118 | TASK_UNINTERRUPTIBLE); | ||
1119 | ret = -EINTR; | ||
1120 | break; | ||
1121 | } | ||
1122 | |||
1123 | mutex_unlock(&c->bucket_lock); | ||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1123 | /* Garbage collection */ | 1127 | /* Garbage collection */ |
1124 | 1128 | ||
1125 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | 1129 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) |
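btree_check_reserve(), added in the hunk above, sizes the btree reserve as root->level * 2 + 1 buckets per cache and bails out with -EINTR (after queueing the caller on the bucket wait queue) when any cache's RESERVE_BTREE freelist is shorter than that. The arithmetic and the retry contract can be modelled in plain C as below; cache_free_count and check_reserve() are invented for the sketch, and only the return-value convention mirrors the kernel code.

    #include <errno.h>
    #include <stdio.h>

    #define NCACHES 2

    /* Invented model: free BTREE-reserve buckets per cache device. */
    static unsigned cache_free_count[NCACHES] = { 6, 2 };

    /*
     * Models btree_check_reserve(): every cache must hold at least
     * root_level * 2 + 1 reserved buckets (presumably enough for splits
     * up to the root); otherwise return -EINTR so the caller retries
     * once the allocator has refilled the reserve.
     */
    static int check_reserve(unsigned root_level)
    {
        unsigned i, reserve = root_level * 2 + 1;

        for (i = 0; i < NCACHES; i++)
            if (cache_free_count[i] < reserve)
                return -EINTR; /* real code also queues on bucket_wait */

        return 0;
    }

    int main(void)
    {
        printf("root level 1: %d\n", check_reserve(1)); /* needs 3, cache 1 has 2 */
        printf("root level 0: %d\n", check_reserve(0)); /* needs 1, both fine */
        return 0;
    }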
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | |||
1183 | 1187 | ||
1184 | gc->nodes++; | 1188 | gc->nodes++; |
1185 | 1189 | ||
1186 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1190 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { |
1187 | stale = max(stale, btree_mark_key(b, k)); | 1191 | stale = max(stale, btree_mark_key(b, k)); |
1188 | keys++; | 1192 | keys++; |
1189 | 1193 | ||
1190 | if (bch_ptr_bad(b, k)) | 1194 | if (bch_ptr_bad(&b->keys, k)) |
1191 | continue; | 1195 | continue; |
1192 | 1196 | ||
1193 | gc->key_bytes += bkey_u64s(k); | 1197 | gc->key_bytes += bkey_u64s(k); |
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | |||
1197 | gc->data += KEY_SIZE(k); | 1201 | gc->data += KEY_SIZE(k); |
1198 | } | 1202 | } |
1199 | 1203 | ||
1200 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1204 | for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) |
1201 | btree_bug_on(t->size && | 1205 | btree_bug_on(t->size && |
1202 | bset_written(b, t) && | 1206 | bset_written(&b->keys, t) && |
1203 | bkey_cmp(&b->key, &t->end) < 0, | 1207 | bkey_cmp(&b->key, &t->end) < 0, |
1204 | b, "found short btree key in gc"); | 1208 | b, "found short btree key in gc"); |
1205 | 1209 | ||
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1243 | blocks = btree_default_blocks(b->c) * 2 / 3; | 1247 | blocks = btree_default_blocks(b->c) * 2 / 3; |
1244 | 1248 | ||
1245 | if (nodes < 2 || | 1249 | if (nodes < 2 || |
1246 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | 1250 | __set_blocks(b->keys.set[0].data, keys, |
1251 | block_bytes(b->c)) > blocks * (nodes - 1)) | ||
1247 | return 0; | 1252 | return 0; |
1248 | 1253 | ||
1249 | for (i = 0; i < nodes; i++) { | 1254 | for (i = 0; i < nodes; i++) { |
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1253 | } | 1258 | } |
1254 | 1259 | ||
1255 | for (i = nodes - 1; i > 0; --i) { | 1260 | for (i = nodes - 1; i > 0; --i) { |
1256 | struct bset *n1 = new_nodes[i]->sets->data; | 1261 | struct bset *n1 = btree_bset_first(new_nodes[i]); |
1257 | struct bset *n2 = new_nodes[i - 1]->sets->data; | 1262 | struct bset *n2 = btree_bset_first(new_nodes[i - 1]); |
1258 | struct bkey *k, *last = NULL; | 1263 | struct bkey *k, *last = NULL; |
1259 | 1264 | ||
1260 | keys = 0; | 1265 | keys = 0; |
1261 | 1266 | ||
1262 | if (i > 1) { | 1267 | if (i > 1) { |
1263 | for (k = n2->start; | 1268 | for (k = n2->start; |
1264 | k < end(n2); | 1269 | k < bset_bkey_last(n2); |
1265 | k = bkey_next(k)) { | 1270 | k = bkey_next(k)) { |
1266 | if (__set_blocks(n1, n1->keys + keys + | 1271 | if (__set_blocks(n1, n1->keys + keys + |
1267 | bkey_u64s(k), b->c) > blocks) | 1272 | bkey_u64s(k), |
1273 | block_bytes(b->c)) > blocks) | ||
1268 | break; | 1274 | break; |
1269 | 1275 | ||
1270 | last = k; | 1276 | last = k; |
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1280 | * though) | 1286 | * though) |
1281 | */ | 1287 | */ |
1282 | if (__set_blocks(n1, n1->keys + n2->keys, | 1288 | if (__set_blocks(n1, n1->keys + n2->keys, |
1283 | b->c) > btree_blocks(new_nodes[i])) | 1289 | block_bytes(b->c)) > |
1290 | btree_blocks(new_nodes[i])) | ||
1284 | goto out_nocoalesce; | 1291 | goto out_nocoalesce; |
1285 | 1292 | ||
1286 | keys = n2->keys; | 1293 | keys = n2->keys; |
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1288 | last = &r->b->key; | 1295 | last = &r->b->key; |
1289 | } | 1296 | } |
1290 | 1297 | ||
1291 | BUG_ON(__set_blocks(n1, n1->keys + keys, | 1298 | BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > |
1292 | b->c) > btree_blocks(new_nodes[i])); | 1299 | btree_blocks(new_nodes[i])); |
1293 | 1300 | ||
1294 | if (last) | 1301 | if (last) |
1295 | bkey_copy_key(&new_nodes[i]->key, last); | 1302 | bkey_copy_key(&new_nodes[i]->key, last); |
1296 | 1303 | ||
1297 | memcpy(end(n1), | 1304 | memcpy(bset_bkey_last(n1), |
1298 | n2->start, | 1305 | n2->start, |
1299 | (void *) node(n2, keys) - (void *) n2->start); | 1306 | (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); |
1300 | 1307 | ||
1301 | n1->keys += keys; | 1308 | n1->keys += keys; |
1302 | r[i].keys = n1->keys; | 1309 | r[i].keys = n1->keys; |
1303 | 1310 | ||
1304 | memmove(n2->start, | 1311 | memmove(n2->start, |
1305 | node(n2, keys), | 1312 | bset_bkey_idx(n2, keys), |
1306 | (void *) end(n2) - (void *) node(n2, keys)); | 1313 | (void *) bset_bkey_last(n2) - |
1314 | (void *) bset_bkey_idx(n2, keys)); | ||
1307 | 1315 | ||
1308 | n2->keys -= keys; | 1316 | n2->keys -= keys; |
1309 | 1317 | ||
1310 | if (bch_keylist_realloc(keylist, | 1318 | if (__bch_keylist_realloc(keylist, |
1311 | KEY_PTRS(&new_nodes[i]->key), b->c)) | 1319 | bkey_u64s(&new_nodes[i]->key))) |
1312 | goto out_nocoalesce; | 1320 | goto out_nocoalesce; |
1313 | 1321 | ||
1314 | bch_btree_node_write(new_nodes[i], &cl); | 1322 | bch_btree_node_write(new_nodes[i], &cl); |
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1316 | } | 1324 | } |
1317 | 1325 | ||
1318 | for (i = 0; i < nodes; i++) { | 1326 | for (i = 0; i < nodes; i++) { |
1319 | if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) | 1327 | if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) |
1320 | goto out_nocoalesce; | 1328 | goto out_nocoalesce; |
1321 | 1329 | ||
1322 | make_btree_freeing_key(r[i].b, keylist->top); | 1330 | make_btree_freeing_key(r[i].b, keylist->top); |
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1324 | } | 1332 | } |
1325 | 1333 | ||
1326 | /* We emptied out this node */ | 1334 | /* We emptied out this node */ |
1327 | BUG_ON(new_nodes[0]->sets->data->keys); | 1335 | BUG_ON(btree_bset_first(new_nodes[0])->keys); |
1328 | btree_node_free(new_nodes[0]); | 1336 | btree_node_free(new_nodes[0]); |
1329 | rw_unlock(true, new_nodes[0]); | 1337 | rw_unlock(true, new_nodes[0]); |
1330 | 1338 | ||
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b) | |||
1370 | struct btree_iter iter; | 1378 | struct btree_iter iter; |
1371 | unsigned ret = 0; | 1379 | unsigned ret = 0; |
1372 | 1380 | ||
1373 | for_each_key_filter(b, k, &iter, bch_ptr_bad) | 1381 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) |
1374 | ret += bkey_u64s(k); | 1382 | ret += bkey_u64s(k); |
1375 | 1383 | ||
1376 | return ret; | 1384 | return ret; |
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1390 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; | 1398 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; |
1391 | 1399 | ||
1392 | bch_keylist_init(&keys); | 1400 | bch_keylist_init(&keys); |
1393 | bch_btree_iter_init(b, &iter, &b->c->gc_done); | 1401 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); |
1394 | 1402 | ||
1395 | for (i = 0; i < GC_MERGE_NODES; i++) | 1403 | for (i = 0; i < GC_MERGE_NODES; i++) |
1396 | r[i].b = ERR_PTR(-EINTR); | 1404 | r[i].b = ERR_PTR(-EINTR); |
1397 | 1405 | ||
1398 | while (1) { | 1406 | while (1) { |
1399 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 1407 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); |
1400 | if (k) { | 1408 | if (k) { |
1401 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); | 1409 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); |
1402 | if (IS_ERR(r->b)) { | 1410 | if (IS_ERR(r->b)) { |
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1416 | 1424 | ||
1417 | if (!IS_ERR(last->b)) { | 1425 | if (!IS_ERR(last->b)) { |
1418 | should_rewrite = btree_gc_mark_node(last->b, gc); | 1426 | should_rewrite = btree_gc_mark_node(last->b, gc); |
1419 | if (should_rewrite) { | 1427 | if (should_rewrite && |
1428 | !btree_check_reserve(b, NULL)) { | ||
1420 | n = btree_node_alloc_replacement(last->b, | 1429 | n = btree_node_alloc_replacement(last->b, |
1421 | false); | 1430 | false); |
1422 | 1431 | ||
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |||
1705 | struct bucket *g; | 1714 | struct bucket *g; |
1706 | struct btree_iter iter; | 1715 | struct btree_iter iter; |
1707 | 1716 | ||
1708 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1717 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { |
1709 | for (i = 0; i < KEY_PTRS(k); i++) { | 1718 | for (i = 0; i < KEY_PTRS(k); i++) { |
1710 | if (!ptr_available(b->c, k, i)) | 1719 | if (!ptr_available(b->c, k, i)) |
1711 | continue; | 1720 | continue; |
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |||
1728 | } | 1737 | } |
1729 | 1738 | ||
1730 | if (b->level) { | 1739 | if (b->level) { |
1731 | bch_btree_iter_init(b, &iter, NULL); | 1740 | bch_btree_iter_init(&b->keys, &iter, NULL); |
1732 | 1741 | ||
1733 | do { | 1742 | do { |
1734 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 1743 | k = bch_btree_iter_next_filter(&iter, &b->keys, |
1744 | bch_ptr_bad); | ||
1735 | if (k) | 1745 | if (k) |
1736 | btree_node_prefetch(b->c, k, b->level - 1); | 1746 | btree_node_prefetch(b->c, k, b->level - 1); |
1737 | 1747 | ||
@@ -1774,235 +1784,36 @@ err: | |||
1774 | 1784 | ||
1775 | /* Btree insertion */ | 1785 | /* Btree insertion */ |
1776 | 1786 | ||
1777 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | 1787 | static bool btree_insert_key(struct btree *b, struct bkey *k, |
1778 | { | 1788 | struct bkey *replace_key) |
1779 | struct bset *i = b->sets[b->nsets].data; | ||
1780 | |||
1781 | memmove((uint64_t *) where + bkey_u64s(insert), | ||
1782 | where, | ||
1783 | (void *) end(i) - (void *) where); | ||
1784 | |||
1785 | i->keys += bkey_u64s(insert); | ||
1786 | bkey_copy(where, insert); | ||
1787 | bch_bset_fix_lookup_table(b, where); | ||
1788 | } | ||
1789 | |||
1790 | static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, | ||
1791 | struct btree_iter *iter, | ||
1792 | struct bkey *replace_key) | ||
1793 | { | 1789 | { |
1794 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | 1790 | unsigned status; |
1795 | { | ||
1796 | if (KEY_DIRTY(k)) | ||
1797 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | ||
1798 | offset, -sectors); | ||
1799 | } | ||
1800 | |||
1801 | uint64_t old_offset; | ||
1802 | unsigned old_size, sectors_found = 0; | ||
1803 | |||
1804 | while (1) { | ||
1805 | struct bkey *k = bch_btree_iter_next(iter); | ||
1806 | if (!k || | ||
1807 | bkey_cmp(&START_KEY(k), insert) >= 0) | ||
1808 | break; | ||
1809 | |||
1810 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||
1811 | continue; | ||
1812 | |||
1813 | old_offset = KEY_START(k); | ||
1814 | old_size = KEY_SIZE(k); | ||
1815 | |||
1816 | /* | ||
1817 | * We might overlap with 0 size extents; we can't skip these | ||
1818 | * because if they're in the set we're inserting to we have to | ||
1819 | * adjust them so they don't overlap with the key we're | ||
1820 | * inserting. But we don't want to check them for replace | ||
1821 | * operations. | ||
1822 | */ | ||
1823 | |||
1824 | if (replace_key && KEY_SIZE(k)) { | ||
1825 | /* | ||
1826 | * k might have been split since we inserted/found the | ||
1827 | * key we're replacing | ||
1828 | */ | ||
1829 | unsigned i; | ||
1830 | uint64_t offset = KEY_START(k) - | ||
1831 | KEY_START(replace_key); | ||
1832 | |||
1833 | /* But it must be a subset of the replace key */ | ||
1834 | if (KEY_START(k) < KEY_START(replace_key) || | ||
1835 | KEY_OFFSET(k) > KEY_OFFSET(replace_key)) | ||
1836 | goto check_failed; | ||
1837 | |||
1838 | /* We didn't find a key that we were supposed to */ | ||
1839 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||
1840 | goto check_failed; | ||
1841 | |||
1842 | if (KEY_PTRS(k) != KEY_PTRS(replace_key) || | ||
1843 | KEY_DIRTY(k) != KEY_DIRTY(replace_key)) | ||
1844 | goto check_failed; | ||
1845 | |||
1846 | /* skip past gen */ | ||
1847 | offset <<= 8; | ||
1848 | |||
1849 | BUG_ON(!KEY_PTRS(replace_key)); | ||
1850 | 1791 | ||
1851 | for (i = 0; i < KEY_PTRS(replace_key); i++) | 1792 | BUG_ON(bkey_cmp(k, &b->key) > 0); |
1852 | if (k->ptr[i] != replace_key->ptr[i] + offset) | ||
1853 | goto check_failed; | ||
1854 | |||
1855 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||
1856 | } | ||
1857 | |||
1858 | if (bkey_cmp(insert, k) < 0 && | ||
1859 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||
1860 | /* | ||
1861 | * We overlapped in the middle of an existing key: that | ||
1862 | * means we have to split the old key. But we have to do | ||
1863 | * slightly different things depending on whether the | ||
1864 | * old key has been written out yet. | ||
1865 | */ | ||
1866 | |||
1867 | struct bkey *top; | ||
1868 | |||
1869 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | ||
1870 | |||
1871 | if (bkey_written(b, k)) { | ||
1872 | /* | ||
1873 | * We insert a new key to cover the top of the | ||
1874 | * old key, and the old key is modified in place | ||
1875 | * to represent the bottom split. | ||
1876 | * | ||
1877 | * It's completely arbitrary whether the new key | ||
1878 | * is the top or the bottom, but it has to match | ||
1879 | * up with what btree_sort_fixup() does - it | ||
1880 | * doesn't check for this kind of overlap, it | ||
1881 | * depends on us inserting a new key for the top | ||
1882 | * here. | ||
1883 | */ | ||
1884 | top = bch_bset_search(b, &b->sets[b->nsets], | ||
1885 | insert); | ||
1886 | shift_keys(b, top, k); | ||
1887 | } else { | ||
1888 | BKEY_PADDED(key) temp; | ||
1889 | bkey_copy(&temp.key, k); | ||
1890 | shift_keys(b, k, &temp.key); | ||
1891 | top = bkey_next(k); | ||
1892 | } | ||
1893 | |||
1894 | bch_cut_front(insert, top); | ||
1895 | bch_cut_back(&START_KEY(insert), k); | ||
1896 | bch_bset_fix_invalidated_key(b, k); | ||
1897 | return false; | ||
1898 | } | ||
1899 | |||
1900 | if (bkey_cmp(insert, k) < 0) { | ||
1901 | bch_cut_front(insert, k); | ||
1902 | } else { | ||
1903 | if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) | ||
1904 | old_offset = KEY_START(insert); | ||
1905 | |||
1906 | if (bkey_written(b, k) && | ||
1907 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||
1908 | /* | ||
1909 | * Completely overwrote, so we don't have to | ||
1910 | * invalidate the binary search tree | ||
1911 | */ | ||
1912 | bch_cut_front(k, k); | ||
1913 | } else { | ||
1914 | __bch_cut_back(&START_KEY(insert), k); | ||
1915 | bch_bset_fix_invalidated_key(b, k); | ||
1916 | } | ||
1917 | } | ||
1918 | |||
1919 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | ||
1920 | } | ||
1921 | 1793 | ||
1922 | check_failed: | 1794 | status = bch_btree_insert_key(&b->keys, k, replace_key); |
1923 | if (replace_key) { | 1795 | if (status != BTREE_INSERT_STATUS_NO_INSERT) { |
1924 | if (!sectors_found) { | 1796 | bch_check_keys(&b->keys, "%u for %s", status, |
1925 | return true; | 1797 | replace_key ? "replace" : "insert"); |
1926 | } else if (sectors_found < KEY_SIZE(insert)) { | ||
1927 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||
1928 | (KEY_SIZE(insert) - sectors_found)); | ||
1929 | SET_KEY_SIZE(insert, sectors_found); | ||
1930 | } | ||
1931 | } | ||
1932 | 1798 | ||
1933 | return false; | 1799 | trace_bcache_btree_insert_key(b, k, replace_key != NULL, |
1800 | status); | ||
1801 | return true; | ||
1802 | } else | ||
1803 | return false; | ||
1934 | } | 1804 | } |
1935 | 1805 | ||
1936 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | 1806 | static size_t insert_u64s_remaining(struct btree *b) |
1937 | struct bkey *k, struct bkey *replace_key) | ||
1938 | { | 1807 | { |
1939 | struct bset *i = b->sets[b->nsets].data; | 1808 | ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); |
1940 | struct bkey *m, *prev; | ||
1941 | unsigned status = BTREE_INSERT_STATUS_INSERT; | ||
1942 | |||
1943 | BUG_ON(bkey_cmp(k, &b->key) > 0); | ||
1944 | BUG_ON(b->level && !KEY_PTRS(k)); | ||
1945 | BUG_ON(!b->level && !KEY_OFFSET(k)); | ||
1946 | |||
1947 | if (!b->level) { | ||
1948 | struct btree_iter iter; | ||
1949 | |||
1950 | /* | ||
1951 | * bset_search() returns the first key that is strictly greater | ||
1952 | * than the search key - but for back merging, we want to find | ||
1953 | * the previous key. | ||
1954 | */ | ||
1955 | prev = NULL; | ||
1956 | m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); | ||
1957 | 1809 | ||
1958 | if (fix_overlapping_extents(b, k, &iter, replace_key)) { | 1810 | /* |
1959 | op->insert_collision = true; | 1811 | * Might land in the middle of an existing extent and have to split it |
1960 | return false; | 1812 | */ |
1961 | } | 1813 | if (b->keys.ops->is_extents) |
1962 | 1814 | ret -= KEY_MAX_U64S; | |
1963 | if (KEY_DIRTY(k)) | ||
1964 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | ||
1965 | KEY_START(k), KEY_SIZE(k)); | ||
1966 | |||
1967 | while (m != end(i) && | ||
1968 | bkey_cmp(k, &START_KEY(m)) > 0) | ||
1969 | prev = m, m = bkey_next(m); | ||
1970 | |||
1971 | if (key_merging_disabled(b->c)) | ||
1972 | goto insert; | ||
1973 | |||
1974 | /* prev is in the tree, if we merge we're done */ | ||
1975 | status = BTREE_INSERT_STATUS_BACK_MERGE; | ||
1976 | if (prev && | ||
1977 | bch_bkey_try_merge(b, prev, k)) | ||
1978 | goto merged; | ||
1979 | |||
1980 | status = BTREE_INSERT_STATUS_OVERWROTE; | ||
1981 | if (m != end(i) && | ||
1982 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | ||
1983 | goto copy; | ||
1984 | |||
1985 | status = BTREE_INSERT_STATUS_FRONT_MERGE; | ||
1986 | if (m != end(i) && | ||
1987 | bch_bkey_try_merge(b, k, m)) | ||
1988 | goto copy; | ||
1989 | } else { | ||
1990 | BUG_ON(replace_key); | ||
1991 | m = bch_bset_search(b, &b->sets[b->nsets], k); | ||
1992 | } | ||
1993 | |||
1994 | insert: shift_keys(b, m, k); | ||
1995 | copy: bkey_copy(m, k); | ||
1996 | merged: | ||
1997 | bch_check_keys(b, "%u for %s", status, | ||
1998 | replace_key ? "replace" : "insert"); | ||
1999 | |||
2000 | if (b->level && !KEY_OFFSET(k)) | ||
2001 | btree_current_write(b)->prio_blocked++; | ||
2002 | |||
2003 | trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); | ||
2004 | 1815 | ||
2005 | return true; | 1816 | return max(ret, 0L); |
2006 | } | 1817 | } |
2007 | 1818 | ||
2008 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | 1819 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, |
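The new btree_insert_key()/insert_u64s_remaining() pair above pushes overlap handling down into the bset layer and budgets the insert path in u64s: extent nodes keep one worst-case key (KEY_MAX_U64S) spare because landing in the middle of an existing extent can force a split. A standalone sketch of that budget check follows; MAX_KEY_U64S here is an invented constant, and insert_budget()/key_fits() are illustration names, not the bcache functions.

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented stand-in for KEY_MAX_U64S: worst-case size of one key. */
    #define MAX_KEY_U64S 8

    /*
     * Models insert_u64s_remaining(): start from the free space left in
     * the last bset (in u64s) and, for extent nodes, hold back one
     * worst-case key because the insert may split an existing extent.
     */
    static long insert_budget(long free_u64s, bool is_extents)
    {
        long ret = free_u64s;

        if (is_extents)
            ret -= MAX_KEY_U64S;

        return ret > 0 ? ret : 0;
    }

    /* Models the fit check at the top of bch_btree_insert_keys(). */
    static bool key_fits(unsigned key_u64s, long free_u64s, bool is_extents)
    {
        return key_u64s <= insert_budget(free_u64s, is_extents);
    }

    int main(void)
    {
        printf("%d\n", key_fits(5, 10, true));  /* 0: only 2 u64s budgeted */
        printf("%d\n", key_fits(5, 10, false)); /* 1: whole 10 u64s usable */
        return 0;
    }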
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | |||
2010 | struct bkey *replace_key) | 1821 | struct bkey *replace_key) |
2011 | { | 1822 | { |
2012 | bool ret = false; | 1823 | bool ret = false; |
2013 | int oldsize = bch_count_data(b); | 1824 | int oldsize = bch_count_data(&b->keys); |
2014 | 1825 | ||
2015 | while (!bch_keylist_empty(insert_keys)) { | 1826 | while (!bch_keylist_empty(insert_keys)) { |
2016 | struct bset *i = write_block(b); | ||
2017 | struct bkey *k = insert_keys->keys; | 1827 | struct bkey *k = insert_keys->keys; |
2018 | 1828 | ||
2019 | if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) | 1829 | if (bkey_u64s(k) > insert_u64s_remaining(b)) |
2020 | > btree_blocks(b)) | ||
2021 | break; | 1830 | break; |
2022 | 1831 | ||
2023 | if (bkey_cmp(k, &b->key) <= 0) { | 1832 | if (bkey_cmp(k, &b->key) <= 0) { |
2024 | if (!b->level) | 1833 | if (!b->level) |
2025 | bkey_put(b->c, k); | 1834 | bkey_put(b->c, k); |
2026 | 1835 | ||
2027 | ret |= btree_insert_key(b, op, k, replace_key); | 1836 | ret |= btree_insert_key(b, k, replace_key); |
2028 | bch_keylist_pop_front(insert_keys); | 1837 | bch_keylist_pop_front(insert_keys); |
2029 | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { | 1838 | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { |
2030 | BKEY_PADDED(key) temp; | 1839 | BKEY_PADDED(key) temp; |
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | |||
2033 | bch_cut_back(&b->key, &temp.key); | 1842 | bch_cut_back(&b->key, &temp.key); |
2034 | bch_cut_front(&b->key, insert_keys->keys); | 1843 | bch_cut_front(&b->key, insert_keys->keys); |
2035 | 1844 | ||
2036 | ret |= btree_insert_key(b, op, &temp.key, replace_key); | 1845 | ret |= btree_insert_key(b, &temp.key, replace_key); |
2037 | break; | 1846 | break; |
2038 | } else { | 1847 | } else { |
2039 | break; | 1848 | break; |
2040 | } | 1849 | } |
2041 | } | 1850 | } |
2042 | 1851 | ||
1852 | if (!ret) | ||
1853 | op->insert_collision = true; | ||
1854 | |||
2043 | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); | 1855 | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); |
2044 | 1856 | ||
2045 | BUG_ON(bch_count_data(b) < oldsize); | 1857 | BUG_ON(bch_count_data(&b->keys) < oldsize); |
2046 | return ret; | 1858 | return ret; |
2047 | } | 1859 | } |
2048 | 1860 | ||
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2059 | closure_init_stack(&cl); | 1871 | closure_init_stack(&cl); |
2060 | bch_keylist_init(&parent_keys); | 1872 | bch_keylist_init(&parent_keys); |
2061 | 1873 | ||
1874 | if (!b->level && | ||
1875 | btree_check_reserve(b, op)) | ||
1876 | return -EINTR; | ||
1877 | |||
2062 | n1 = btree_node_alloc_replacement(b, true); | 1878 | n1 = btree_node_alloc_replacement(b, true); |
2063 | if (IS_ERR(n1)) | 1879 | if (IS_ERR(n1)) |
2064 | goto err; | 1880 | goto err; |
2065 | 1881 | ||
2066 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | 1882 | split = set_blocks(btree_bset_first(n1), |
1883 | block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; | ||
2067 | 1884 | ||
2068 | if (split) { | 1885 | if (split) { |
2069 | unsigned keys = 0; | 1886 | unsigned keys = 0; |
2070 | 1887 | ||
2071 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | 1888 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); |
2072 | 1889 | ||
2073 | n2 = bch_btree_node_alloc(b->c, b->level, true); | 1890 | n2 = bch_btree_node_alloc(b->c, b->level, true); |
2074 | if (IS_ERR(n2)) | 1891 | if (IS_ERR(n2)) |
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2087 | * search tree yet | 1904 | * search tree yet |
2088 | */ | 1905 | */ |
2089 | 1906 | ||
2090 | while (keys < (n1->sets[0].data->keys * 3) / 5) | 1907 | while (keys < (btree_bset_first(n1)->keys * 3) / 5) |
2091 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1908 | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), |
1909 | keys)); | ||
2092 | 1910 | ||
2093 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); | 1911 | bkey_copy_key(&n1->key, |
2094 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1912 | bset_bkey_idx(btree_bset_first(n1), keys)); |
1913 | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); | ||
2095 | 1914 | ||
2096 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; | 1915 | btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; |
2097 | n1->sets[0].data->keys = keys; | 1916 | btree_bset_first(n1)->keys = keys; |
2098 | 1917 | ||
2099 | memcpy(n2->sets[0].data->start, | 1918 | memcpy(btree_bset_first(n2)->start, |
2100 | end(n1->sets[0].data), | 1919 | bset_bkey_last(btree_bset_first(n1)), |
2101 | n2->sets[0].data->keys * sizeof(uint64_t)); | 1920 | btree_bset_first(n2)->keys * sizeof(uint64_t)); |
2102 | 1921 | ||
2103 | bkey_copy_key(&n2->key, &b->key); | 1922 | bkey_copy_key(&n2->key, &b->key); |
2104 | 1923 | ||
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2106 | bch_btree_node_write(n2, &cl); | 1925 | bch_btree_node_write(n2, &cl); |
2107 | rw_unlock(true, n2); | 1926 | rw_unlock(true, n2); |
2108 | } else { | 1927 | } else { |
2109 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | 1928 | trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); |
2110 | 1929 | ||
2111 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); | 1930 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
2112 | } | 1931 | } |
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2149 | 1968 | ||
2150 | return 0; | 1969 | return 0; |
2151 | err_free2: | 1970 | err_free2: |
1971 | bkey_put(b->c, &n2->key); | ||
2152 | btree_node_free(n2); | 1972 | btree_node_free(n2); |
2153 | rw_unlock(true, n2); | 1973 | rw_unlock(true, n2); |
2154 | err_free1: | 1974 | err_free1: |
1975 | bkey_put(b->c, &n1->key); | ||
2155 | btree_node_free(n1); | 1976 | btree_node_free(n1); |
2156 | rw_unlock(true, n1); | 1977 | rw_unlock(true, n1); |
2157 | err: | 1978 | err: |
1979 | WARN(1, "bcache: btree split failed"); | ||
1980 | |||
2158 | if (n3 == ERR_PTR(-EAGAIN) || | 1981 | if (n3 == ERR_PTR(-EAGAIN) || |
2159 | n2 == ERR_PTR(-EAGAIN) || | 1982 | n2 == ERR_PTR(-EAGAIN) || |
2160 | n1 == ERR_PTR(-EAGAIN)) | 1983 | n1 == ERR_PTR(-EAGAIN)) |
2161 | return -EAGAIN; | 1984 | return -EAGAIN; |
2162 | 1985 | ||
2163 | pr_warn("couldn't split"); | ||
2164 | return -ENOMEM; | 1986 | return -ENOMEM; |
2165 | } | 1987 | } |
2166 | 1988 | ||
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | |||
2171 | { | 1993 | { |
2172 | BUG_ON(b->level && replace_key); | 1994 | BUG_ON(b->level && replace_key); |
2173 | 1995 | ||
2174 | if (should_split(b)) { | 1996 | if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { |
2175 | if (current->bio_list) { | 1997 | if (current->bio_list) { |
2176 | op->lock = b->c->root->level + 1; | 1998 | op->lock = b->c->root->level + 1; |
2177 | return -EAGAIN; | 1999 | return -EAGAIN; |
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | |||
2180 | return -EINTR; | 2002 | return -EINTR; |
2181 | } else { | 2003 | } else { |
2182 | /* Invalidated all iterators */ | 2004 | /* Invalidated all iterators */ |
2183 | return btree_split(b, op, insert_keys, replace_key) ?: | 2005 | int ret = btree_split(b, op, insert_keys, replace_key); |
2184 | -EINTR; | 2006 | |
2007 | return bch_keylist_empty(insert_keys) ? | ||
2008 | 0 : ret ?: -EINTR; | ||
2185 | } | 2009 | } |
2186 | } else { | 2010 | } else { |
2187 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2011 | BUG_ON(write_block(b) != btree_bset_last(b)); |
2188 | 2012 | ||
2189 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { | 2013 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { |
2190 | if (!b->level) | 2014 | if (!b->level) |
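In the reworked bch_btree_insert_node() above, the split decision is now simply "does the whole keylist fit in the remaining u64s budget", a split attempted while the caller holds queued bios returns -EAGAIN, and a failed reserve check surfaces as -EINTR so the operation can be retried from the top. A hedged sketch of a retry loop built around that error contract is below; insert_once() and its canned results are invented for illustration and do not reproduce the actual bcache call chain.

    #include <errno.h>
    #include <stdio.h>

    /* Invented result model for one insert attempt. */
    static int insert_once(int attempt)
    {
        if (attempt == 0)
            return -EINTR;  /* e.g. reserve was low; tree state may have changed */
        if (attempt == 1)
            return -EAGAIN; /* e.g. must retry from a context that can block */
        return 0;           /* keys inserted */
    }

    int main(void)
    {
        int attempt = 0, ret;

        /* Sketch of a caller honouring the -EINTR/-EAGAIN contract. */
        do {
            ret = insert_once(attempt++);
            if (ret)
                printf("attempt %d: retrying (%d)\n", attempt, ret);
        } while (ret == -EINTR || ret == -EAGAIN);

        printf("done after %d attempts\n", attempt);
        return 0;
    }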
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, | |||
2323 | struct bkey *k; | 2147 | struct bkey *k; |
2324 | struct btree_iter iter; | 2148 | struct btree_iter iter; |
2325 | 2149 | ||
2326 | bch_btree_iter_init(b, &iter, from); | 2150 | bch_btree_iter_init(&b->keys, &iter, from); |
2327 | 2151 | ||
2328 | while ((k = bch_btree_iter_next_filter(&iter, b, | 2152 | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, |
2329 | bch_ptr_bad))) { | 2153 | bch_ptr_bad))) { |
2330 | ret = btree(map_nodes_recurse, k, b, | 2154 | ret = btree(map_nodes_recurse, k, b, |
2331 | op, from, fn, flags); | 2155 | op, from, fn, flags); |
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, | |||
2356 | struct bkey *k; | 2180 | struct bkey *k; |
2357 | struct btree_iter iter; | 2181 | struct btree_iter iter; |
2358 | 2182 | ||
2359 | bch_btree_iter_init(b, &iter, from); | 2183 | bch_btree_iter_init(&b->keys, &iter, from); |
2360 | 2184 | ||
2361 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { | 2185 | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { |
2362 | ret = !b->level | 2186 | ret = !b->level |
2363 | ? fn(op, b, k) | 2187 | ? fn(op, b, k) |
2364 | : btree(map_keys_recurse, k, b, op, from, fn, flags); | 2188 | : btree(map_keys_recurse, k, b, op, from, fn, flags); |
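Both map_nodes_recurse and map_keys_recurse above now walk b->keys through bch_btree_iter_init()/bch_btree_iter_next_filter(), skipping keys that the bch_ptr_bad predicate rejects. The shape is just a filtered cursor; a self-contained sketch over a plain array is shown below, with struct iter, next_filter() and is_bad() being invented names rather than the bcache API.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Invented toy "keys": negative values play the role of bad pointers. */
    struct iter {
        const int *pos, *end;
    };

    static void iter_init(struct iter *it, const int *keys, size_t n)
    {
        it->pos = keys;
        it->end = keys + n;
    }

    static bool is_bad(int key)
    {
        return key < 0;
    }

    /* Models bch_btree_iter_next_filter(): next key passing the filter. */
    static const int *next_filter(struct iter *it, bool (*bad)(int))
    {
        while (it->pos < it->end) {
            const int *k = it->pos++;

            if (!bad(*k))
                return k;
        }
        return NULL;
    }

    int main(void)
    {
        const int keys[] = { 3, -1, 7, -2, 9 };
        struct iter it;
        const int *k;

        iter_init(&it, keys, sizeof(keys) / sizeof(keys[0]));
        while ((k = next_filter(&it, is_bad)))
            printf("%d\n", *k);
        return 0;
    }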
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 767e75570896..af065e97e55c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -130,20 +130,12 @@ struct btree { | |||
130 | unsigned long flags; | 130 | unsigned long flags; |
131 | uint16_t written; /* would be nice to kill */ | 131 | uint16_t written; /* would be nice to kill */ |
132 | uint8_t level; | 132 | uint8_t level; |
133 | uint8_t nsets; | 133 | |
134 | uint8_t page_order; | 134 | struct btree_keys keys; |
135 | |||
136 | /* | ||
137 | * Set of sorted keys - the real btree node - plus a binary search tree | ||
138 | * | ||
139 | * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||
140 | * to the memory we have allocated for this btree node. Additionally, | ||
141 | * set[0]->data points to the entire btree node as it exists on disk. | ||
142 | */ | ||
143 | struct bset_tree sets[MAX_BSETS]; | ||
144 | 135 | ||
145 | /* For outstanding btree writes, used as a lock - protects write_idx */ | 136 | /* For outstanding btree writes, used as a lock - protects write_idx */ |
146 | struct closure_with_waitlist io; | 137 | struct closure io; |
138 | struct semaphore io_mutex; | ||
147 | 139 | ||
148 | struct list_head list; | 140 | struct list_head list; |
149 | struct delayed_work work; | 141 | struct delayed_work work; |
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b) | |||
179 | return b->writes + (btree_node_write_idx(b) ^ 1); | 171 | return b->writes + (btree_node_write_idx(b) ^ 1); |
180 | } | 172 | } |
181 | 173 | ||
182 | static inline unsigned bset_offset(struct btree *b, struct bset *i) | 174 | static inline struct bset *btree_bset_first(struct btree *b) |
183 | { | 175 | { |
184 | return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | 176 | return b->keys.set->data; |
185 | } | 177 | } |
186 | 178 | ||
187 | static inline struct bset *write_block(struct btree *b) | 179 | static inline struct bset *btree_bset_last(struct btree *b) |
188 | { | 180 | { |
189 | return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | 181 | return bset_tree_last(&b->keys)->data; |
190 | } | 182 | } |
191 | 183 | ||
192 | static inline bool bset_written(struct btree *b, struct bset_tree *t) | 184 | static inline unsigned bset_block_offset(struct btree *b, struct bset *i) |
193 | { | 185 | { |
194 | return t->data < write_block(b); | 186 | return bset_sector_offset(&b->keys, i) >> b->c->block_bits; |
195 | } | ||
196 | |||
197 | static inline bool bkey_written(struct btree *b, struct bkey *k) | ||
198 | { | ||
199 | return k < write_block(b)->start; | ||
200 | } | 187 | } |
201 | 188 | ||
202 | static inline void set_gc_sectors(struct cache_set *c) | 189 | static inline void set_gc_sectors(struct cache_set *c) |
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c) | |||
204 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); | 191 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); |
205 | } | 192 | } |
206 | 193 | ||
207 | static inline struct bkey *bch_btree_iter_init(struct btree *b, | ||
208 | struct btree_iter *iter, | ||
209 | struct bkey *search) | ||
210 | { | ||
211 | return __bch_btree_iter_init(b, iter, search, b->sets); | ||
212 | } | ||
213 | |||
214 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||
215 | { | ||
216 | if (b->level) | ||
217 | return bch_btree_ptr_invalid(b->c, k); | ||
218 | else | ||
219 | return bch_extent_ptr_invalid(b->c, k); | ||
220 | } | ||
221 | |||
222 | void bkey_put(struct cache_set *c, struct bkey *k); | 194 | void bkey_put(struct cache_set *c, struct bkey *k); |
223 | 195 | ||
224 | /* Looping macros */ | 196 | /* Looping macros */ |
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k); | |||
229 | iter++) \ | 201 | iter++) \ |
230 | hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) | 202 | hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) |
231 | 203 | ||
232 | #define for_each_key_filter(b, k, iter, filter) \ | ||
233 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
234 | ((k) = bch_btree_iter_next_filter((iter), b, filter));) | ||
235 | |||
236 | #define for_each_key(b, k, iter) \ | ||
237 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
238 | ((k) = bch_btree_iter_next(iter));) | ||
239 | |||
240 | /* Recursing down the btree */ | 204 | /* Recursing down the btree */ |
241 | 205 | ||
242 | struct btree_op { | 206 | struct btree_op { |
207 | /* for waiting on btree reserve in btree_split() */ | ||
208 | wait_queue_t wait; | ||
209 | |||
243 | /* Btree level at which we start taking write locks */ | 210 | /* Btree level at which we start taking write locks */ |
244 | short lock; | 211 | short lock; |
245 | 212 | ||
@@ -249,6 +216,7 @@ struct btree_op { | |||
249 | static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) | 216 | static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) |
250 | { | 217 | { |
251 | memset(op, 0, sizeof(struct btree_op)); | 218 | memset(op, 0, sizeof(struct btree_op)); |
219 | init_wait(&op->wait); | ||
252 | op->lock = write_lock_level; | 220 | op->lock = write_lock_level; |
253 | } | 221 | } |
254 | 222 | ||
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b) | |||
267 | (w ? up_write : up_read)(&b->lock); | 235 | (w ? up_write : up_read)(&b->lock); |
268 | } | 236 | } |
269 | 237 | ||
270 | void bch_btree_node_read(struct btree *); | 238 | void bch_btree_node_read_done(struct btree *); |
271 | void bch_btree_node_write(struct btree *, struct closure *); | 239 | void bch_btree_node_write(struct btree *, struct closure *); |
272 | 240 | ||
273 | void bch_btree_set_root(struct btree *); | 241 | void bch_btree_set_root(struct btree *); |
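The btree.h hunk above replaces the open-coded sets[] array with struct btree_keys plus two small accessors: btree_bset_first() for set 0 (the memory backing the on-disk node) and btree_bset_last() for, roughly, the set currently being appended to. A sketch of that accessor shape is below; struct keys_model and its fields are invented stand-ins, not the real struct btree_keys layout.

    #include <stdio.h>

    #define MAX_SETS 4

    struct set_model {
        int data; /* stands in for the struct bset pointer */
    };

    /* Invented stand-in for struct btree_keys. */
    struct keys_model {
        unsigned nsets;                 /* index of the last valid set */
        struct set_model set[MAX_SETS];
    };

    static struct set_model *bset_first(struct keys_model *k)
    {
        return &k->set[0];
    }

    static struct set_model *bset_last(struct keys_model *k)
    {
        return &k->set[k->nsets];
    }

    int main(void)
    {
        struct keys_model k = { .nsets = 2,
                                .set = { { 10 }, { 20 }, { 30 } } };

        printf("first=%d last=%d\n", bset_first(&k)->data, bset_last(&k)->data);
        return 0;
    }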
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index dfff2410322e..7a228de95fd7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,19 +11,6 @@ | |||
11 | 11 | ||
12 | #include "closure.h" | 12 | #include "closure.h" |
13 | 13 | ||
14 | #define CL_FIELD(type, field) \ | ||
15 | case TYPE_ ## type: \ | ||
16 | return &container_of(cl, struct type, cl)->field | ||
17 | |||
18 | static struct closure_waitlist *closure_waitlist(struct closure *cl) | ||
19 | { | ||
20 | switch (cl->type) { | ||
21 | CL_FIELD(closure_with_waitlist, wait); | ||
22 | default: | ||
23 | return NULL; | ||
24 | } | ||
25 | } | ||
26 | |||
27 | static inline void closure_put_after_sub(struct closure *cl, int flags) | 14 | static inline void closure_put_after_sub(struct closure *cl, int flags) |
28 | { | 15 | { |
29 | int r = flags & CLOSURE_REMAINING_MASK; | 16 | int r = flags & CLOSURE_REMAINING_MASK; |
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
42 | closure_queue(cl); | 29 | closure_queue(cl); |
43 | } else { | 30 | } else { |
44 | struct closure *parent = cl->parent; | 31 | struct closure *parent = cl->parent; |
45 | struct closure_waitlist *wait = closure_waitlist(cl); | ||
46 | closure_fn *destructor = cl->fn; | 32 | closure_fn *destructor = cl->fn; |
47 | 33 | ||
48 | closure_debug_destroy(cl); | 34 | closure_debug_destroy(cl); |
49 | 35 | ||
50 | smp_mb(); | ||
51 | atomic_set(&cl->remaining, -1); | ||
52 | |||
53 | if (wait) | ||
54 | closure_wake_up(wait); | ||
55 | |||
56 | if (destructor) | 36 | if (destructor) |
57 | destructor(cl); | 37 | destructor(cl); |
58 | 38 | ||
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v) | |||
69 | } | 49 | } |
70 | EXPORT_SYMBOL(closure_sub); | 50 | EXPORT_SYMBOL(closure_sub); |
71 | 51 | ||
52 | /** | ||
53 | * closure_put - decrement a closure's refcount | ||
54 | */ | ||
72 | void closure_put(struct closure *cl) | 55 | void closure_put(struct closure *cl) |
73 | { | 56 | { |
74 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); | 57 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); |
75 | } | 58 | } |
76 | EXPORT_SYMBOL(closure_put); | 59 | EXPORT_SYMBOL(closure_put); |
77 | 60 | ||
78 | static void set_waiting(struct closure *cl, unsigned long f) | 61 | /** |
79 | { | 62 | * closure_wake_up - wake up all closures on a wait list, without memory barrier |
80 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 63 | */ |
81 | cl->waiting_on = f; | ||
82 | #endif | ||
83 | } | ||
84 | |||
85 | void __closure_wake_up(struct closure_waitlist *wait_list) | 64 | void __closure_wake_up(struct closure_waitlist *wait_list) |
86 | { | 65 | { |
87 | struct llist_node *list; | 66 | struct llist_node *list; |
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list) | |||
106 | cl = container_of(reverse, struct closure, list); | 85 | cl = container_of(reverse, struct closure, list); |
107 | reverse = llist_next(reverse); | 86 | reverse = llist_next(reverse); |
108 | 87 | ||
109 | set_waiting(cl, 0); | 88 | closure_set_waiting(cl, 0); |
110 | closure_sub(cl, CLOSURE_WAITING + 1); | 89 | closure_sub(cl, CLOSURE_WAITING + 1); |
111 | } | 90 | } |
112 | } | 91 | } |
113 | EXPORT_SYMBOL(__closure_wake_up); | 92 | EXPORT_SYMBOL(__closure_wake_up); |
114 | 93 | ||
115 | bool closure_wait(struct closure_waitlist *list, struct closure *cl) | 94 | /** |
95 | * closure_wait - add a closure to a waitlist | ||
96 | * | ||
97 | * @waitlist will own a ref on @cl, which will be released when | ||
98 | * closure_wake_up() is called on @waitlist. | ||
99 | * | ||
100 | */ | ||
101 | bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) | ||
116 | { | 102 | { |
117 | if (atomic_read(&cl->remaining) & CLOSURE_WAITING) | 103 | if (atomic_read(&cl->remaining) & CLOSURE_WAITING) |
118 | return false; | 104 | return false; |
119 | 105 | ||
120 | set_waiting(cl, _RET_IP_); | 106 | closure_set_waiting(cl, _RET_IP_); |
121 | atomic_add(CLOSURE_WAITING + 1, &cl->remaining); | 107 | atomic_add(CLOSURE_WAITING + 1, &cl->remaining); |
122 | llist_add(&cl->list, &list->list); | 108 | llist_add(&cl->list, &waitlist->list); |
123 | 109 | ||
124 | return true; | 110 | return true; |
125 | } | 111 | } |
126 | EXPORT_SYMBOL(closure_wait); | 112 | EXPORT_SYMBOL(closure_wait); |
127 | 113 | ||
128 | /** | 114 | /** |
129 | * closure_sync() - sleep until a closure a closure has nothing left to wait on | 115 | * closure_sync - sleep until a closure a closure has nothing left to wait on |
130 | * | 116 | * |
131 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns | 117 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns |
132 | * the last refcount. | 118 | * the last refcount. |
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl) | |||
148 | } | 134 | } |
149 | EXPORT_SYMBOL(closure_sync); | 135 | EXPORT_SYMBOL(closure_sync); |
150 | 136 | ||
151 | /** | ||
152 | * closure_trylock() - try to acquire the closure, without waiting | ||
153 | * @cl: closure to lock | ||
154 | * | ||
155 | * Returns true if the closure was succesfully locked. | ||
156 | */ | ||
157 | bool closure_trylock(struct closure *cl, struct closure *parent) | ||
158 | { | ||
159 | if (atomic_cmpxchg(&cl->remaining, -1, | ||
160 | CLOSURE_REMAINING_INITIALIZER) != -1) | ||
161 | return false; | ||
162 | |||
163 | smp_mb(); | ||
164 | |||
165 | cl->parent = parent; | ||
166 | if (parent) | ||
167 | closure_get(parent); | ||
168 | |||
169 | closure_set_ret_ip(cl); | ||
170 | closure_debug_create(cl); | ||
171 | return true; | ||
172 | } | ||
173 | EXPORT_SYMBOL(closure_trylock); | ||
174 | |||
175 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
176 | struct closure_waitlist *wait_list) | ||
177 | { | ||
178 | struct closure wait; | ||
179 | closure_init_stack(&wait); | ||
180 | |||
181 | while (1) { | ||
182 | if (closure_trylock(cl, parent)) | ||
183 | return; | ||
184 | |||
185 | closure_wait_event(wait_list, &wait, | ||
186 | atomic_read(&cl->remaining) == -1); | ||
187 | } | ||
188 | } | ||
189 | EXPORT_SYMBOL(__closure_lock); | ||
190 | |||
191 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 137 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
192 | 138 | ||
193 | static LIST_HEAD(closure_list); | 139 | static LIST_HEAD(closure_list); |
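The closure.c changes above drop the typed closure-with-waitlist and locking machinery; closure_wait() now just tags the closure as waiting and takes an extra reference, and __closure_wake_up() reverses the lock-free list and releases those references. Below is a rough userspace model of only the flag/refcount bookkeeping, using C11 atomics; the WAITING bit value, model_wait() and model_wake_one() are invented for the sketch, and the real code additionally manages an llist of closures and calls closure_set_waiting()/closure_put_after_sub().

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Invented flag bit; the real CLOSURE_WAITING value differs. */
    #define WAITING (1 << 16)

    struct model_closure {
        atomic_int remaining; /* low bits: refcount, high bit: waiting flag */
    };

    /* Models closure_wait(): refuse if already waiting, else flag + ref. */
    static bool model_wait(struct model_closure *cl)
    {
        if (atomic_load(&cl->remaining) & WAITING)
            return false;

        atomic_fetch_add(&cl->remaining, WAITING + 1);
        return true;
    }

    /* Models the per-closure part of __closure_wake_up(). */
    static void model_wake_one(struct model_closure *cl)
    {
        atomic_fetch_sub(&cl->remaining, WAITING + 1);
        /* the real code then runs closure_put_after_sub() on the result */
    }

    int main(void)
    {
        struct model_closure cl = { .remaining = ATOMIC_VAR_INIT(1) };

        printf("wait: %d (remaining=0x%x)\n", model_wait(&cl),
               (unsigned) atomic_load(&cl.remaining));
        printf("wait again: %d\n", model_wait(&cl));
        model_wake_one(&cl);
        printf("after wake: remaining=0x%x\n",
               (unsigned) atomic_load(&cl.remaining));
        return 0;
    }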
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9762f1be3304..7ef7461912be 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -72,30 +72,6 @@ | |||
72 | * closure - _always_ use continue_at(). Doing so consistently will help | 72 | * closure - _always_ use continue_at(). Doing so consistently will help |
73 | * eliminate an entire class of particularly pernicious races. | 73 | * eliminate an entire class of particularly pernicious races. |
74 | * | 74 | * |
75 | * For a closure to wait on an arbitrary event, we need to introduce waitlists: | ||
76 | * | ||
77 | * struct closure_waitlist list; | ||
78 | * closure_wait_event(list, cl, condition); | ||
79 | * closure_wake_up(wait_list); | ||
80 | * | ||
81 | * These work analagously to wait_event() and wake_up() - except that instead of | ||
82 | * operating on the current thread (for wait_event()) and lists of threads, they | ||
83 | * operate on an explicit closure and lists of closures. | ||
84 | * | ||
85 | * Because it's a closure we can now wait either synchronously or | ||
86 | * asynchronously. closure_wait_event() returns the current value of the | ||
87 | * condition, and if it returned false continue_at() or closure_sync() can be | ||
88 | * used to wait for it to become true. | ||
89 | * | ||
90 | * It's useful for waiting on things when you can't sleep in the context in | ||
91 | * which you must check the condition (perhaps a spinlock held, or you might be | ||
92 | * beneath generic_make_request() - in which case you can't sleep on IO). | ||
93 | * | ||
94 | * closure_wait_event() will wait either synchronously or asynchronously, | ||
95 | * depending on whether the closure is in blocking mode or not. You can pick a | ||
96 | * mode explicitly with closure_wait_event_sync() and | ||
97 | * closure_wait_event_async(), which do just what you might expect. | ||
98 | * | ||
99 | * Lastly, you might have a wait list dedicated to a specific event, and have no | 75 | * Lastly, you might have a wait list dedicated to a specific event, and have no |
100 | * need for specifying the condition - you just want to wait until someone runs | 76 | * need for specifying the condition - you just want to wait until someone runs |
101 | * closure_wake_up() on the appropriate wait list. In that case, just use | 77 | * closure_wake_up() on the appropriate wait list. In that case, just use |
@@ -121,40 +97,6 @@ | |||
121 | * All this implies that a closure should typically be embedded in a particular | 97 | * All this implies that a closure should typically be embedded in a particular |
122 | * struct (which its refcount will normally control the lifetime of), and that | 98 | * struct (which its refcount will normally control the lifetime of), and that |
123 | * struct can very much be thought of as a stack frame. | 99 | * struct can very much be thought of as a stack frame. |
124 | * | ||
125 | * Locking: | ||
126 | * | ||
127 | * Closures are based on work items but they can be thought of as more like | ||
128 | * threads - in that like threads and unlike work items they have a well | ||
129 | * defined lifetime; they are created (with closure_init()) and eventually | ||
130 | * complete after a continue_at(cl, NULL, NULL). | ||
131 | * | ||
132 | * Suppose you've got some larger structure with a closure embedded in it that's | ||
133 | * used for periodically doing garbage collection. You only want one garbage | ||
134 | * collection happening at a time, so the natural thing to do is protect it with | ||
135 | * a lock. However, it's difficult to use a lock protecting a closure correctly | ||
136 | * because the unlock should come after the last continue_to() (additionally, if | ||
137 | * you're using the closure asynchronously a mutex won't work since a mutex has | ||
138 | * to be unlocked by the same process that locked it). | ||
139 | * | ||
140 | * So to make it less error prone and more efficient, we also have the ability | ||
141 | * to use closures as locks: | ||
142 | * | ||
143 | * closure_init_unlocked(); | ||
144 | * closure_trylock(); | ||
145 | * | ||
146 | * That's all we need for trylock() - the last closure_put() implicitly unlocks | ||
147 | * it for you. But for closure_lock(), we also need a wait list: | ||
148 | * | ||
149 | * struct closure_with_waitlist frobnicator_cl; | ||
150 | * | ||
151 | * closure_init_unlocked(&frobnicator_cl); | ||
152 | * closure_lock(&frobnicator_cl); | ||
153 | * | ||
154 | * A closure_with_waitlist embeds a closure and a wait list - much like struct | ||
155 | * delayed_work embeds a work item and a timer_list. The important thing is, use | ||
156 | * it exactly like you would a regular closure and closure_put() will magically | ||
157 | * handle everything for you. | ||
158 | */ | 100 | */ |
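The closure-as-lock API described in the removed comment above is dropped by this patch; its one non-trivial user, the journal, switches to an explicit flag plus closure_call() (see the journal.c hunk near the end of this diff). A rough sketch of that replacement pattern - io_in_flight and journal_write_unlocked() are taken from that hunk, the wrapper and the rest of the names are illustrative only:

	static void foo_journal_try_write(struct foo_journal *j,
					  struct closure *parent)
	{
		spin_lock(&j->lock);

		if (!j->io_in_flight) {
			j->io_in_flight = 1;
			/*
			 * closure_call() initializes j->io and runs
			 * journal_write_unlocked(), which is responsible for
			 * dropping j->lock itself (the no-op path uses
			 * journal_write_unlock() as a destructor to clear
			 * io_in_flight and unlock).
			 */
			closure_call(&j->io, journal_write_unlocked,
				     NULL, parent);
		} else {
			spin_unlock(&j->lock);
		}
	}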
159 | 101 | ||
160 | struct closure; | 102 | struct closure; |
@@ -164,12 +106,6 @@ struct closure_waitlist { | |||
164 | struct llist_head list; | 106 | struct llist_head list; |
165 | }; | 107 | }; |
166 | 108 | ||
167 | enum closure_type { | ||
168 | TYPE_closure = 0, | ||
169 | TYPE_closure_with_waitlist = 1, | ||
170 | MAX_CLOSURE_TYPE = 1, | ||
171 | }; | ||
172 | |||
173 | enum closure_state { | 109 | enum closure_state { |
174 | /* | 110 | /* |
175 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by | 111 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by |
@@ -224,8 +160,6 @@ struct closure { | |||
224 | 160 | ||
225 | atomic_t remaining; | 161 | atomic_t remaining; |
226 | 162 | ||
227 | enum closure_type type; | ||
228 | |||
229 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 163 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
230 | #define CLOSURE_MAGIC_DEAD 0xc054dead | 164 | #define CLOSURE_MAGIC_DEAD 0xc054dead |
231 | #define CLOSURE_MAGIC_ALIVE 0xc054a11e | 165 | #define CLOSURE_MAGIC_ALIVE 0xc054a11e |
@@ -237,34 +171,12 @@ struct closure { | |||
237 | #endif | 171 | #endif |
238 | }; | 172 | }; |
239 | 173 | ||
240 | struct closure_with_waitlist { | ||
241 | struct closure cl; | ||
242 | struct closure_waitlist wait; | ||
243 | }; | ||
244 | |||
245 | extern unsigned invalid_closure_type(void); | ||
246 | |||
247 | #define __CLOSURE_TYPE(cl, _t) \ | ||
248 | __builtin_types_compatible_p(typeof(cl), struct _t) \ | ||
249 | ? TYPE_ ## _t : \ | ||
250 | |||
251 | #define __closure_type(cl) \ | ||
252 | ( \ | ||
253 | __CLOSURE_TYPE(cl, closure) \ | ||
254 | __CLOSURE_TYPE(cl, closure_with_waitlist) \ | ||
255 | invalid_closure_type() \ | ||
256 | ) | ||
257 | |||
258 | void closure_sub(struct closure *cl, int v); | 174 | void closure_sub(struct closure *cl, int v); |
259 | void closure_put(struct closure *cl); | 175 | void closure_put(struct closure *cl); |
260 | void __closure_wake_up(struct closure_waitlist *list); | 176 | void __closure_wake_up(struct closure_waitlist *list); |
261 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); | 177 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); |
262 | void closure_sync(struct closure *cl); | 178 | void closure_sync(struct closure *cl); |
263 | 179 | ||
264 | bool closure_trylock(struct closure *cl, struct closure *parent); | ||
265 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
266 | struct closure_waitlist *wait_list); | ||
267 | |||
268 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 180 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
269 | 181 | ||
270 | void closure_debug_init(void); | 182 | void closure_debug_init(void); |
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl) | |||
293 | #endif | 205 | #endif |
294 | } | 206 | } |
295 | 207 | ||
296 | static inline void closure_get(struct closure *cl) | 208 | static inline void closure_set_waiting(struct closure *cl, unsigned long f) |
297 | { | 209 | { |
298 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 210 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
299 | BUG_ON((atomic_inc_return(&cl->remaining) & | 211 | cl->waiting_on = f; |
300 | CLOSURE_REMAINING_MASK) <= 1); | ||
301 | #else | ||
302 | atomic_inc(&cl->remaining); | ||
303 | #endif | 212 | #endif |
304 | } | 213 | } |
305 | 214 | ||
306 | static inline void closure_set_stopped(struct closure *cl) | 215 | static inline void __closure_end_sleep(struct closure *cl) |
307 | { | 216 | { |
308 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); | 217 | __set_current_state(TASK_RUNNING); |
218 | |||
219 | if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | ||
220 | atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | ||
309 | } | 221 | } |
310 | 222 | ||
311 | static inline bool closure_is_unlocked(struct closure *cl) | 223 | static inline void __closure_start_sleep(struct closure *cl) |
312 | { | 224 | { |
313 | return atomic_read(&cl->remaining) == -1; | 225 | closure_set_ip(cl); |
226 | cl->task = current; | ||
227 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
228 | |||
229 | if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||
230 | atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||
314 | } | 231 | } |
315 | 232 | ||
316 | static inline void do_closure_init(struct closure *cl, struct closure *parent, | 233 | static inline void closure_set_stopped(struct closure *cl) |
317 | bool running) | ||
318 | { | 234 | { |
319 | cl->parent = parent; | 235 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); |
320 | if (parent) | 236 | } |
321 | closure_get(parent); | ||
322 | |||
323 | if (running) { | ||
324 | closure_debug_create(cl); | ||
325 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||
326 | } else | ||
327 | atomic_set(&cl->remaining, -1); | ||
328 | 237 | ||
238 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||
239 | struct workqueue_struct *wq) | ||
240 | { | ||
241 | BUG_ON(object_is_on_stack(cl)); | ||
329 | closure_set_ip(cl); | 242 | closure_set_ip(cl); |
243 | cl->fn = fn; | ||
244 | cl->wq = wq; | ||
245 | /* between atomic_dec() in closure_put() */ | ||
246 | smp_mb__before_atomic_dec(); | ||
330 | } | 247 | } |
331 | 248 | ||
332 | /* | 249 | static inline void closure_queue(struct closure *cl) |
333 | * Hack to get at the embedded closure if there is one, by doing an unsafe cast: | 250 | { |
334 | * the result of __closure_type() is thrown away, it's used merely for type | 251 | struct workqueue_struct *wq = cl->wq; |
335 | * checking. | 252 | if (wq) { |
336 | */ | 253 | INIT_WORK(&cl->work, cl->work.func); |
337 | #define __to_internal_closure(cl) \ | 254 | BUG_ON(!queue_work(wq, &cl->work)); |
338 | ({ \ | 255 | } else |
339 | BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ | 256 | cl->fn(cl); |
340 | (struct closure *) cl; \ | 257 | } |
341 | }) | ||
342 | |||
343 | #define closure_init_type(cl, parent, running) \ | ||
344 | do { \ | ||
345 | struct closure *_cl = __to_internal_closure(cl); \ | ||
346 | _cl->type = __closure_type(*(cl)); \ | ||
347 | do_closure_init(_cl, parent, running); \ | ||
348 | } while (0) | ||
349 | 258 | ||
350 | /** | 259 | /** |
351 | * __closure_init() - Initialize a closure, skipping the memset() | 260 | * closure_get - increment a closure's refcount |
352 | * | ||
353 | * May be used instead of closure_init() when memory has already been zeroed. | ||
354 | */ | 261 | */ |
355 | #define __closure_init(cl, parent) \ | 262 | static inline void closure_get(struct closure *cl) |
356 | closure_init_type(cl, parent, true) | 263 | { |
264 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
265 | BUG_ON((atomic_inc_return(&cl->remaining) & | ||
266 | CLOSURE_REMAINING_MASK) <= 1); | ||
267 | #else | ||
268 | atomic_inc(&cl->remaining); | ||
269 | #endif | ||
270 | } | ||
357 | 271 | ||
358 | /** | 272 | /** |
359 | * closure_init() - Initialize a closure, setting the refcount to 1 | 273 | * closure_init - Initialize a closure, setting the refcount to 1 |
360 | * @cl: closure to initialize | 274 | * @cl: closure to initialize |
361 | * @parent: parent of the new closure. cl will take a refcount on it for its | 275 | * @parent: parent of the new closure. cl will take a refcount on it for its |
362 | * lifetime; may be NULL. | 276 | * lifetime; may be NULL. |
363 | */ | 277 | */ |
364 | #define closure_init(cl, parent) \ | 278 | static inline void closure_init(struct closure *cl, struct closure *parent) |
365 | do { \ | ||
366 | memset((cl), 0, sizeof(*(cl))); \ | ||
367 | __closure_init(cl, parent); \ | ||
368 | } while (0) | ||
369 | |||
370 | static inline void closure_init_stack(struct closure *cl) | ||
371 | { | 279 | { |
372 | memset(cl, 0, sizeof(struct closure)); | 280 | memset(cl, 0, sizeof(struct closure)); |
373 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); | 281 | cl->parent = parent; |
374 | } | 282 | if (parent) |
375 | 283 | closure_get(parent); | |
376 | /** | ||
377 | * closure_init_unlocked() - Initialize a closure but leave it unlocked. | ||
378 | * @cl: closure to initialize | ||
379 | * | ||
380 | * For when the closure will be used as a lock. The closure may not be used | ||
381 | * until after a closure_lock() or closure_trylock(). | ||
382 | */ | ||
383 | #define closure_init_unlocked(cl) \ | ||
384 | do { \ | ||
385 | memset((cl), 0, sizeof(*(cl))); \ | ||
386 | closure_init_type(cl, NULL, false); \ | ||
387 | } while (0) | ||
388 | |||
389 | /** | ||
390 | * closure_lock() - lock and initialize a closure. | ||
391 | * @cl: the closure to lock | ||
392 | * @parent: the new parent for this closure | ||
393 | * | ||
394 | * The closure must be of one of the types that has a waitlist (otherwise we | ||
395 | * wouldn't be able to sleep on contention). | ||
396 | * | ||
397 | * @parent has exactly the same meaning as in closure_init(); if non null, the | ||
398 | * closure will take a reference on @parent which will be released when it is | ||
399 | * unlocked. | ||
400 | */ | ||
401 | #define closure_lock(cl, parent) \ | ||
402 | __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) | ||
403 | 284 | ||
404 | static inline void __closure_end_sleep(struct closure *cl) | 285 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); |
405 | { | ||
406 | __set_current_state(TASK_RUNNING); | ||
407 | 286 | ||
408 | if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | 287 | closure_debug_create(cl); |
409 | atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | 288 | closure_set_ip(cl); |
410 | } | 289 | } |
411 | 290 | ||
412 | static inline void __closure_start_sleep(struct closure *cl) | 291 | static inline void closure_init_stack(struct closure *cl) |
413 | { | 292 | { |
414 | closure_set_ip(cl); | 293 | memset(cl, 0, sizeof(struct closure)); |
415 | cl->task = current; | 294 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); |
416 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
417 | |||
418 | if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||
419 | atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||
420 | } | 295 | } |
421 | 296 | ||
422 | /** | 297 | /** |
423 | * closure_wake_up() - wake up all closures on a wait list. | 298 | * closure_wake_up - wake up all closures on a wait list. |
424 | */ | 299 | */ |
425 | static inline void closure_wake_up(struct closure_waitlist *list) | 300 | static inline void closure_wake_up(struct closure_waitlist *list) |
426 | { | 301 | { |
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list) | |||
428 | __closure_wake_up(list); | 303 | __closure_wake_up(list); |
429 | } | 304 | } |
430 | 305 | ||
431 | /* | 306 | /** |
432 | * Wait on an event, synchronously or asynchronously - analogous to wait_event() | 307 | * continue_at - jump to another function with barrier |
433 | * but for closures. | 308 | * |
434 | * | 309 | * After @cl is no longer waiting on anything (i.e. all outstanding refs have |
435 | * The loop is oddly structured so as to avoid a race; we must check the | 310 | * been dropped with closure_put()), it will resume execution at @fn running out |
436 | * condition again after we've added ourself to the waitlist. We know if we were | 311 | * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). |
437 | * already on the waitlist because closure_wait() returns false; thus, we only | 312 | * |
438 | * schedule or break if closure_wait() returns false. If it returns true, we | 313 | * NOTE: This macro expands to a return in the calling function! |
439 | * just loop again - rechecking the condition. | 314 | * |
440 | * | 315 | * This is because after calling continue_at() you no longer have a ref on @cl, |
441 | * The __closure_wake_up() is necessary because we may race with the event | 316 | * and whatever @cl owns may be freed out from under you - a running closure fn |
442 | * becoming true; i.e. we see event false -> wait -> recheck condition, but the | 317 | * has a ref on its own closure which continue_at() drops. |
443 | * thread that made the event true may have called closure_wake_up() before we | ||
444 | * added ourself to the wait list. | ||
445 | * | ||
446 | * We have to call closure_sync() at the end instead of just | ||
447 | * __closure_end_sleep() because a different thread might've called | ||
448 | * closure_wake_up() before us and gotten preempted before they dropped the | ||
449 | * refcount on our closure. If this was a stack allocated closure, that would be | ||
450 | * bad. | ||
451 | */ | 318 | */ |
452 | #define closure_wait_event(list, cl, condition) \ | ||
453 | ({ \ | ||
454 | typeof(condition) ret; \ | ||
455 | \ | ||
456 | while (1) { \ | ||
457 | ret = (condition); \ | ||
458 | if (ret) { \ | ||
459 | __closure_wake_up(list); \ | ||
460 | closure_sync(cl); \ | ||
461 | break; \ | ||
462 | } \ | ||
463 | \ | ||
464 | __closure_start_sleep(cl); \ | ||
465 | \ | ||
466 | if (!closure_wait(list, cl)) \ | ||
467 | schedule(); \ | ||
468 | } \ | ||
469 | \ | ||
470 | ret; \ | ||
471 | }) | ||
472 | |||
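With closure_wait_event() removed by this patch, callers open-code the pattern it implemented: register on the wait list under the same lock the waker takes, drop the lock, then block in closure_sync(). A minimal sketch under those assumptions (foo_dev, its lock, io_in_flight flag and idle_wait list are hypothetical, not from this patch):

	static void foo_wait_until_idle(struct foo_dev *d)
	{
		struct closure cl;

		closure_init_stack(&cl);

		spin_lock(&d->lock);
		if (d->io_in_flight) {
			/* takes a ref on cl; closure_wake_up() drops it */
			closure_wait(&d->idle_wait, &cl);
			spin_unlock(&d->lock);

			/* sleeps until that ref has been dropped */
			closure_sync(&cl);
		} else {
			spin_unlock(&d->lock);
		}
	}

The waker clears d->io_in_flight under d->lock and then calls closure_wake_up(&d->idle_wait), which drops every parked waiter's ref and lets their closure_sync() return; holding the lock across the check-and-wait is what prevents the missed-wakeup race the removed comment above used to handle.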
473 | static inline void closure_queue(struct closure *cl) | ||
474 | { | ||
475 | struct workqueue_struct *wq = cl->wq; | ||
476 | if (wq) { | ||
477 | INIT_WORK(&cl->work, cl->work.func); | ||
478 | BUG_ON(!queue_work(wq, &cl->work)); | ||
479 | } else | ||
480 | cl->fn(cl); | ||
481 | } | ||
482 | |||
483 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||
484 | struct workqueue_struct *wq) | ||
485 | { | ||
486 | BUG_ON(object_is_on_stack(cl)); | ||
487 | closure_set_ip(cl); | ||
488 | cl->fn = fn; | ||
489 | cl->wq = wq; | ||
490 | /* between atomic_dec() in closure_put() */ | ||
491 | smp_mb__before_atomic_dec(); | ||
492 | } | ||
493 | |||
494 | #define continue_at(_cl, _fn, _wq) \ | 319 | #define continue_at(_cl, _fn, _wq) \ |
495 | do { \ | 320 | do { \ |
496 | set_closure_fn(_cl, _fn, _wq); \ | 321 | set_closure_fn(_cl, _fn, _wq); \ |
@@ -498,8 +323,28 @@ do { \ | |||
498 | return; \ | 323 | return; \ |
499 | } while (0) | 324 | } while (0) |
500 | 325 | ||
326 | /** | ||
327 | * closure_return - finish execution of a closure | ||
328 | * | ||
329 | * This is used to indicate that @cl is finished: when all outstanding refs on | ||
330 | * @cl have been dropped @cl's ref on its parent closure (as passed to | ||
331 | * closure_init()) will be dropped, if one was specified - thus this can be | ||
332 | * thought of as returning to the parent closure. | ||
333 | */ | ||
501 | #define closure_return(_cl) continue_at((_cl), NULL, NULL) | 334 | #define closure_return(_cl) continue_at((_cl), NULL, NULL) |
502 | 335 | ||
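A minimal sketch of the lifecycle continue_at() and closure_return() document - a closure embedded in a hypothetical foo_gc structure hops to a completion function once its IO refs are dropped, then returns to its parent. foo_gc and foo_gc_scan() are illustrative and not part of this patch:

	struct foo_gc {
		struct closure	cl;	/* embedded: the struct is the "stack frame" */
		unsigned	moved;
	};

	static void foo_gc_scan(struct foo_gc *gc);	/* assumed: issues IO, takes refs */

	static void foo_gc_finish(struct closure *cl)
	{
		struct foo_gc *gc = container_of(cl, struct foo_gc, cl);

		pr_debug("gc done, moved %u buckets", gc->moved);
		closure_return(cl);	/* drops the ref on the parent from closure_init() */
	}

	static void foo_gc_start(struct foo_gc *gc, struct closure *parent)
	{
		closure_init(&gc->cl, parent);

		foo_gc_scan(gc);	/* any IO issued here takes refs via closure_get() */

		/* once the last ref is dropped, run foo_gc_finish() out of system_wq */
		continue_at(&gc->cl, foo_gc_finish, system_wq);
	}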
336 | /** | ||
337 | * continue_at_nobarrier - jump to another function without barrier | ||
338 | * | ||
339 | * Causes @fn to be executed out of @cl, in @wq context (or called directly if | ||
340 | * @wq is NULL). | ||
341 | * | ||
342 | * NOTE: like continue_at(), this macro expands to a return in the caller! | ||
343 | * | ||
344 | * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, | ||
345 | * thus it's not safe to touch anything protected by @cl after a | ||
346 | * continue_at_nobarrier(). | ||
347 | */ | ||
503 | #define continue_at_nobarrier(_cl, _fn, _wq) \ | 348 | #define continue_at_nobarrier(_cl, _fn, _wq) \ |
504 | do { \ | 349 | do { \ |
505 | set_closure_fn(_cl, _fn, _wq); \ | 350 | set_closure_fn(_cl, _fn, _wq); \ |
@@ -507,6 +352,15 @@ do { \ | |||
507 | return; \ | 352 | return; \ |
508 | } while (0) | 353 | } while (0) |
509 | 354 | ||
355 | /** | ||
356 | * closure_return_with_destructor - finish execution of a closure, with destructor | ||
357 | * | ||
358 | * Works like closure_return(), except @destructor will be called when all | ||
359 | * outstanding refs on @cl have been dropped; @destructor may be used to safely | ||
360 | * free the memory occupied by @cl, and it is called with the ref on the parent | ||
361 | * closure still held - so @destructor could safely return an item to a | ||
362 | * freelist protected by @cl's parent. | ||
363 | */ | ||
510 | #define closure_return_with_destructor(_cl, _destructor) \ | 364 | #define closure_return_with_destructor(_cl, _destructor) \ |
511 | do { \ | 365 | do { \ |
512 | set_closure_fn(_cl, _destructor, NULL); \ | 366 | set_closure_fn(_cl, _destructor, NULL); \ |
@@ -514,6 +368,13 @@ do { \ | |||
514 | return; \ | 368 | return; \ |
515 | } while (0) | 369 | } while (0) |
516 | 370 | ||
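And a sketch of the destructor variant, for a closure living in memory that must be freed only once every ref is gone (foo_req and its mempool are hypothetical):

	struct foo_req {
		struct closure	 cl;
		mempool_t	*pool;
	};

	static void foo_req_free(struct closure *cl)
	{
		struct foo_req *req = container_of(cl, struct foo_req, cl);

		/* the parent's ref is still held here, per the comment above */
		mempool_free(req, req->pool);
	}

	static void foo_req_done(struct closure *cl)
	{
		/* ... complete the request ... */
		closure_return_with_destructor(cl, foo_req_free);
	}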
371 | /** | ||
372 | * closure_call - execute @fn out of a new, uninitialized closure | ||
373 | * | ||
374 | * Typically used when running out of one closure, and we want to run @fn | ||
375 | * asynchronously out of a new closure - @parent will then wait for @cl to | ||
376 | * finish. | ||
377 | */ | ||
517 | static inline void closure_call(struct closure *cl, closure_fn fn, | 378 | static inline void closure_call(struct closure *cl, closure_fn fn, |
518 | struct workqueue_struct *wq, | 379 | struct workqueue_struct *wq, |
519 | struct closure *parent) | 380 | struct closure *parent) |
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn, | |||
522 | continue_at_nobarrier(cl, fn, wq); | 383 | continue_at_nobarrier(cl, fn, wq); |
523 | } | 384 | } |
524 | 385 | ||
525 | static inline void closure_trylock_call(struct closure *cl, closure_fn fn, | ||
526 | struct workqueue_struct *wq, | ||
527 | struct closure *parent) | ||
528 | { | ||
529 | if (closure_trylock(cl, parent)) | ||
530 | continue_at_nobarrier(cl, fn, wq); | ||
531 | } | ||
532 | |||
533 | #endif /* _LINUX_CLOSURE_H */ | 386 | #endif /* _LINUX_CLOSURE_H */ |
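closure_call() is what call sites in this series now use where closure_trylock_call() used to sit (the journal hunk below is one). A sketch of running work asynchronously under a parent closure; foo_dev, flush_cl and foo_write_dirty_state() are illustrative names only:

	static void foo_flush(struct closure *cl)
	{
		struct foo_dev *d = container_of(cl, struct foo_dev, flush_cl);

		foo_write_dirty_state(d);
		closure_return(cl);	/* drops the ref taken on the parent */
	}

	static void foo_start_flush(struct foo_dev *d)
	{
		/*
		 * d->flush_cl may be uninitialized here: closure_call()
		 * initializes it, takes a ref on &d->cl (the parent), and
		 * runs foo_flush() out of system_wq.
		 */
		closure_call(&d->flush_cl, foo_flush, system_wq, &d->cl);
	}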
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 03cb4d114e16..8b1f1d5c1819 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,6 +8,7 @@ | |||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "btree.h" | 9 | #include "btree.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | #include "extents.h" | ||
11 | 12 | ||
12 | #include <linux/console.h> | 13 | #include <linux/console.h> |
13 | #include <linux/debugfs.h> | 14 | #include <linux/debugfs.h> |
@@ -17,156 +18,88 @@ | |||
17 | 18 | ||
18 | static struct dentry *debug; | 19 | static struct dentry *debug; |
19 | 20 | ||
20 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||
21 | { | ||
22 | unsigned i; | ||
23 | |||
24 | for (i = 0; i < KEY_PTRS(k); i++) | ||
25 | if (ptr_available(c, k, i)) { | ||
26 | struct cache *ca = PTR_CACHE(c, k, i); | ||
27 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
28 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
29 | |||
30 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||
31 | return "bad, length too big"; | ||
32 | if (bucket < ca->sb.first_bucket) | ||
33 | return "bad, short offset"; | ||
34 | if (bucket >= ca->sb.nbuckets) | ||
35 | return "bad, offset past end of device"; | ||
36 | if (ptr_stale(c, k, i)) | ||
37 | return "stale"; | ||
38 | } | ||
39 | |||
40 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
41 | return "bad, null key"; | ||
42 | if (!KEY_PTRS(k)) | ||
43 | return "bad, no pointers"; | ||
44 | if (!KEY_SIZE(k)) | ||
45 | return "zeroed key"; | ||
46 | return ""; | ||
47 | } | ||
48 | |||
49 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) | ||
50 | { | ||
51 | unsigned i = 0; | ||
52 | char *out = buf, *end = buf + size; | ||
53 | |||
54 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | ||
55 | |||
56 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | ||
57 | |||
58 | if (KEY_PTRS(k)) | ||
59 | while (1) { | ||
60 | p("%llu:%llu gen %llu", | ||
61 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||
62 | |||
63 | if (++i == KEY_PTRS(k)) | ||
64 | break; | ||
65 | |||
66 | p(", "); | ||
67 | } | ||
68 | |||
69 | p("]"); | ||
70 | |||
71 | if (KEY_DIRTY(k)) | ||
72 | p(" dirty"); | ||
73 | if (KEY_CSUM(k)) | ||
74 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||
75 | #undef p | ||
76 | return out - buf; | ||
77 | } | ||
78 | |||
79 | #ifdef CONFIG_BCACHE_DEBUG | 21 | #ifdef CONFIG_BCACHE_DEBUG |
80 | 22 | ||
81 | static void dump_bset(struct btree *b, struct bset *i) | 23 | #define for_each_written_bset(b, start, i) \ |
82 | { | 24 | for (i = (start); \ |
83 | struct bkey *k, *next; | 25 | (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ |
84 | unsigned j; | 26 | i->seq == (start)->seq; \ |
85 | char buf[80]; | 27 | i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ |
86 | 28 | block_bytes(b->c)) | |
87 | for (k = i->start; k < end(i); k = next) { | ||
88 | next = bkey_next(k); | ||
89 | |||
90 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
91 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | ||
92 | (uint64_t *) k - i->d, i->keys, buf); | ||
93 | |||
94 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
95 | size_t n = PTR_BUCKET_NR(b->c, k, j); | ||
96 | printk(" bucket %zu", n); | ||
97 | |||
98 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||
99 | printk(" prio %i", | ||
100 | PTR_BUCKET(b->c, k, j)->prio); | ||
101 | } | ||
102 | 29 | ||
103 | printk(" %s\n", bch_ptr_status(b->c, k)); | 30 | void bch_btree_verify(struct btree *b) |
104 | |||
105 | if (next < end(i) && | ||
106 | bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) | ||
107 | printk(KERN_ERR "Key skipped backwards\n"); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | static void bch_dump_bucket(struct btree *b) | ||
112 | { | ||
113 | unsigned i; | ||
114 | |||
115 | console_lock(); | ||
116 | for (i = 0; i <= b->nsets; i++) | ||
117 | dump_bset(b, b->sets[i].data); | ||
118 | console_unlock(); | ||
119 | } | ||
120 | |||
121 | void bch_btree_verify(struct btree *b, struct bset *new) | ||
122 | { | 31 | { |
123 | struct btree *v = b->c->verify_data; | 32 | struct btree *v = b->c->verify_data; |
124 | struct closure cl; | 33 | struct bset *ondisk, *sorted, *inmemory; |
125 | closure_init_stack(&cl); | 34 | struct bio *bio; |
126 | 35 | ||
127 | if (!b->c->verify) | 36 | if (!b->c->verify || !b->c->verify_ondisk) |
128 | return; | 37 | return; |
129 | 38 | ||
130 | closure_wait_event(&b->io.wait, &cl, | 39 | down(&b->io_mutex); |
131 | atomic_read(&b->io.cl.remaining) == -1); | ||
132 | |||
133 | mutex_lock(&b->c->verify_lock); | 40 | mutex_lock(&b->c->verify_lock); |
134 | 41 | ||
42 | ondisk = b->c->verify_ondisk; | ||
43 | sorted = b->c->verify_data->keys.set->data; | ||
44 | inmemory = b->keys.set->data; | ||
45 | |||
135 | bkey_copy(&v->key, &b->key); | 46 | bkey_copy(&v->key, &b->key); |
136 | v->written = 0; | 47 | v->written = 0; |
137 | v->level = b->level; | 48 | v->level = b->level; |
49 | v->keys.ops = b->keys.ops; | ||
50 | |||
51 | bio = bch_bbio_alloc(b->c); | ||
52 | bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; | ||
53 | bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); | ||
54 | bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; | ||
55 | bch_bio_map(bio, sorted); | ||
138 | 56 | ||
139 | bch_btree_node_read(v); | 57 | submit_bio_wait(REQ_META|READ_SYNC, bio); |
140 | closure_wait_event(&v->io.wait, &cl, | 58 | bch_bbio_free(bio, b->c); |
141 | atomic_read(&b->io.cl.remaining) == -1); | ||
142 | 59 | ||
143 | if (new->keys != v->sets[0].data->keys || | 60 | memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); |
144 | memcmp(new->start, | 61 | |
145 | v->sets[0].data->start, | 62 | bch_btree_node_read_done(v); |
146 | (void *) end(new) - (void *) new->start)) { | 63 | sorted = v->keys.set->data; |
147 | unsigned i, j; | 64 | |
65 | if (inmemory->keys != sorted->keys || | ||
66 | memcmp(inmemory->start, | ||
67 | sorted->start, | ||
68 | (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { | ||
69 | struct bset *i; | ||
70 | unsigned j; | ||
148 | 71 | ||
149 | console_lock(); | 72 | console_lock(); |
150 | 73 | ||
151 | printk(KERN_ERR "*** original memory node:\n"); | 74 | printk(KERN_ERR "*** in memory:\n"); |
152 | for (i = 0; i <= b->nsets; i++) | 75 | bch_dump_bset(&b->keys, inmemory, 0); |
153 | dump_bset(b, b->sets[i].data); | ||
154 | 76 | ||
155 | printk(KERN_ERR "*** sorted memory node:\n"); | 77 | printk(KERN_ERR "*** read back in:\n"); |
156 | dump_bset(b, new); | 78 | bch_dump_bset(&v->keys, sorted, 0); |
157 | 79 | ||
158 | printk(KERN_ERR "*** on disk node:\n"); | 80 | for_each_written_bset(b, ondisk, i) { |
159 | dump_bset(v, v->sets[0].data); | 81 | unsigned block = ((void *) i - (void *) ondisk) / |
82 | block_bytes(b->c); | ||
83 | |||
84 | printk(KERN_ERR "*** on disk block %u:\n", block); | ||
85 | bch_dump_bset(&b->keys, i, block); | ||
86 | } | ||
160 | 87 | ||
161 | for (j = 0; j < new->keys; j++) | 88 | printk(KERN_ERR "*** block %zu not written\n", |
162 | if (new->d[j] != v->sets[0].data->d[j]) | 89 | ((void *) i - (void *) ondisk) / block_bytes(b->c)); |
90 | |||
91 | for (j = 0; j < inmemory->keys; j++) | ||
92 | if (inmemory->d[j] != sorted->d[j]) | ||
163 | break; | 93 | break; |
164 | 94 | ||
95 | printk(KERN_ERR "b->written %u\n", b->written); | ||
96 | |||
165 | console_unlock(); | 97 | console_unlock(); |
166 | panic("verify failed at %u\n", j); | 98 | panic("verify failed at %u\n", j); |
167 | } | 99 | } |
168 | 100 | ||
169 | mutex_unlock(&b->c->verify_lock); | 101 | mutex_unlock(&b->c->verify_lock); |
102 | up(&b->io_mutex); | ||
170 | } | 103 | } |
171 | 104 | ||
172 | void bch_data_verify(struct cached_dev *dc, struct bio *bio) | 105 | void bch_data_verify(struct cached_dev *dc, struct bio *bio) |
@@ -207,74 +140,6 @@ out_put: | |||
207 | bio_put(check); | 140 | bio_put(check); |
208 | } | 141 | } |
209 | 142 | ||
210 | int __bch_count_data(struct btree *b) | ||
211 | { | ||
212 | unsigned ret = 0; | ||
213 | struct btree_iter iter; | ||
214 | struct bkey *k; | ||
215 | |||
216 | if (!b->level) | ||
217 | for_each_key(b, k, &iter) | ||
218 | ret += KEY_SIZE(k); | ||
219 | return ret; | ||
220 | } | ||
221 | |||
222 | void __bch_check_keys(struct btree *b, const char *fmt, ...) | ||
223 | { | ||
224 | va_list args; | ||
225 | struct bkey *k, *p = NULL; | ||
226 | struct btree_iter iter; | ||
227 | const char *err; | ||
228 | |||
229 | for_each_key(b, k, &iter) { | ||
230 | if (!b->level) { | ||
231 | err = "Keys out of order"; | ||
232 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) | ||
233 | goto bug; | ||
234 | |||
235 | if (bch_ptr_invalid(b, k)) | ||
236 | continue; | ||
237 | |||
238 | err = "Overlapping keys"; | ||
239 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) | ||
240 | goto bug; | ||
241 | } else { | ||
242 | if (bch_ptr_bad(b, k)) | ||
243 | continue; | ||
244 | |||
245 | err = "Duplicate keys"; | ||
246 | if (p && !bkey_cmp(p, k)) | ||
247 | goto bug; | ||
248 | } | ||
249 | p = k; | ||
250 | } | ||
251 | |||
252 | err = "Key larger than btree node key"; | ||
253 | if (p && bkey_cmp(p, &b->key) > 0) | ||
254 | goto bug; | ||
255 | |||
256 | return; | ||
257 | bug: | ||
258 | bch_dump_bucket(b); | ||
259 | |||
260 | va_start(args, fmt); | ||
261 | vprintk(fmt, args); | ||
262 | va_end(args); | ||
263 | |||
264 | panic("bcache error: %s:\n", err); | ||
265 | } | ||
266 | |||
267 | void bch_btree_iter_next_check(struct btree_iter *iter) | ||
268 | { | ||
269 | struct bkey *k = iter->data->k, *next = bkey_next(k); | ||
270 | |||
271 | if (next < iter->data->end && | ||
272 | bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { | ||
273 | bch_dump_bucket(iter->b); | ||
274 | panic("Key skipped backwards\n"); | ||
275 | } | ||
276 | } | ||
277 | |||
278 | #endif | 143 | #endif |
279 | 144 | ||
280 | #ifdef CONFIG_DEBUG_FS | 145 | #ifdef CONFIG_DEBUG_FS |
@@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | |||
321 | if (!w) | 186 | if (!w) |
322 | break; | 187 | break; |
323 | 188 | ||
324 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); | 189 | bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); |
325 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | 190 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); |
326 | bch_keybuf_del(&i->keys, w); | 191 | bch_keybuf_del(&i->keys, w); |
327 | } | 192 | } |
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 2ede60e31874..1f63c195d247 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -1,47 +1,30 @@ | |||
1 | #ifndef _BCACHE_DEBUG_H | 1 | #ifndef _BCACHE_DEBUG_H |
2 | #define _BCACHE_DEBUG_H | 2 | #define _BCACHE_DEBUG_H |
3 | 3 | ||
4 | /* Btree/bkey debug printing */ | 4 | struct bio; |
5 | 5 | struct cached_dev; | |
6 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); | 6 | struct cache_set; |
7 | 7 | ||
8 | #ifdef CONFIG_BCACHE_DEBUG | 8 | #ifdef CONFIG_BCACHE_DEBUG |
9 | 9 | ||
10 | void bch_btree_verify(struct btree *, struct bset *); | 10 | void bch_btree_verify(struct btree *); |
11 | void bch_data_verify(struct cached_dev *, struct bio *); | 11 | void bch_data_verify(struct cached_dev *, struct bio *); |
12 | int __bch_count_data(struct btree *); | ||
13 | void __bch_check_keys(struct btree *, const char *, ...); | ||
14 | void bch_btree_iter_next_check(struct btree_iter *); | ||
15 | 12 | ||
16 | #define EBUG_ON(cond) BUG_ON(cond) | ||
17 | #define expensive_debug_checks(c) ((c)->expensive_debug_checks) | 13 | #define expensive_debug_checks(c) ((c)->expensive_debug_checks) |
18 | #define key_merging_disabled(c) ((c)->key_merging_disabled) | 14 | #define key_merging_disabled(c) ((c)->key_merging_disabled) |
19 | #define bypass_torture_test(d) ((d)->bypass_torture_test) | 15 | #define bypass_torture_test(d) ((d)->bypass_torture_test) |
20 | 16 | ||
21 | #else /* DEBUG */ | 17 | #else /* DEBUG */ |
22 | 18 | ||
23 | static inline void bch_btree_verify(struct btree *b, struct bset *i) {} | 19 | static inline void bch_btree_verify(struct btree *b) {} |
24 | static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} | 20 | static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} |
25 | static inline int __bch_count_data(struct btree *b) { return -1; } | ||
26 | static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} | ||
27 | static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} | ||
28 | 21 | ||
29 | #define EBUG_ON(cond) do { if (cond); } while (0) | ||
30 | #define expensive_debug_checks(c) 0 | 22 | #define expensive_debug_checks(c) 0 |
31 | #define key_merging_disabled(c) 0 | 23 | #define key_merging_disabled(c) 0 |
32 | #define bypass_torture_test(d) 0 | 24 | #define bypass_torture_test(d) 0 |
33 | 25 | ||
34 | #endif | 26 | #endif |
35 | 27 | ||
36 | #define bch_count_data(b) \ | ||
37 | (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) | ||
38 | |||
39 | #define bch_check_keys(b, ...) \ | ||
40 | do { \ | ||
41 | if (expensive_debug_checks((b)->c)) \ | ||
42 | __bch_check_keys(b, __VA_ARGS__); \ | ||
43 | } while (0) | ||
44 | |||
45 | #ifdef CONFIG_DEBUG_FS | 28 | #ifdef CONFIG_DEBUG_FS |
46 | void bch_debug_init_cache_set(struct cache_set *); | 29 | void bch_debug_init_cache_set(struct cache_set *); |
47 | #else | 30 | #else |
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 000000000000..c3ead586dc27
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,616 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | ||
3 | * | ||
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | ||
5 | * All allocation is done in buckets, which should match the erase block size | ||
6 | * of the device. | ||
7 | * | ||
8 | * Buckets containing cached data are kept on a heap sorted by priority; | ||
9 | * bucket priority is increased on cache hit, and periodically all the buckets | ||
10 | * on the heap have their priority scaled down. This currently is just used as | ||
11 | * an LRU but in the future should allow for more intelligent heuristics. | ||
12 | * | ||
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | ||
14 | * counter. Garbage collection is used to remove stale pointers. | ||
15 | * | ||
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | ||
17 | * as keys are inserted we only sort the pages that have not yet been written. | ||
18 | * When garbage collection is run, we resort the entire node. | ||
19 | * | ||
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | ||
21 | */ | ||
22 | |||
23 | #include "bcache.h" | ||
24 | #include "btree.h" | ||
25 | #include "debug.h" | ||
26 | #include "extents.h" | ||
27 | #include "writeback.h" | ||
28 | |||
29 | static void sort_key_next(struct btree_iter *iter, | ||
30 | struct btree_iter_set *i) | ||
31 | { | ||
32 | i->k = bkey_next(i->k); | ||
33 | |||
34 | if (i->k == i->end) | ||
35 | *i = iter->data[--iter->used]; | ||
36 | } | ||
37 | |||
38 | static bool bch_key_sort_cmp(struct btree_iter_set l, | ||
39 | struct btree_iter_set r) | ||
40 | { | ||
41 | int64_t c = bkey_cmp(l.k, r.k); | ||
42 | |||
43 | return c ? c > 0 : l.k < r.k; | ||
44 | } | ||
45 | |||
46 | static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
47 | { | ||
48 | unsigned i; | ||
49 | |||
50 | for (i = 0; i < KEY_PTRS(k); i++) | ||
51 | if (ptr_available(c, k, i)) { | ||
52 | struct cache *ca = PTR_CACHE(c, k, i); | ||
53 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
54 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
55 | |||
56 | if (KEY_SIZE(k) + r > c->sb.bucket_size || | ||
57 | bucket < ca->sb.first_bucket || | ||
58 | bucket >= ca->sb.nbuckets) | ||
59 | return true; | ||
60 | } | ||
61 | |||
62 | return false; | ||
63 | } | ||
64 | |||
65 | /* Common among btree and extent ptrs */ | ||
66 | |||
67 | static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||
68 | { | ||
69 | unsigned i; | ||
70 | |||
71 | for (i = 0; i < KEY_PTRS(k); i++) | ||
72 | if (ptr_available(c, k, i)) { | ||
73 | struct cache *ca = PTR_CACHE(c, k, i); | ||
74 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
75 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
76 | |||
77 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||
78 | return "bad, length too big"; | ||
79 | if (bucket < ca->sb.first_bucket) | ||
80 | return "bad, short offset"; | ||
81 | if (bucket >= ca->sb.nbuckets) | ||
82 | return "bad, offset past end of device"; | ||
83 | if (ptr_stale(c, k, i)) | ||
84 | return "stale"; | ||
85 | } | ||
86 | |||
87 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
88 | return "bad, null key"; | ||
89 | if (!KEY_PTRS(k)) | ||
90 | return "bad, no pointers"; | ||
91 | if (!KEY_SIZE(k)) | ||
92 | return "zeroed key"; | ||
93 | return ""; | ||
94 | } | ||
95 | |||
96 | void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) | ||
97 | { | ||
98 | unsigned i = 0; | ||
99 | char *out = buf, *end = buf + size; | ||
100 | |||
101 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | ||
102 | |||
103 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); | ||
104 | |||
105 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
106 | if (i) | ||
107 | p(", "); | ||
108 | |||
109 | if (PTR_DEV(k, i) == PTR_CHECK_DEV) | ||
110 | p("check dev"); | ||
111 | else | ||
112 | p("%llu:%llu gen %llu", PTR_DEV(k, i), | ||
113 | PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||
114 | } | ||
115 | |||
116 | p("]"); | ||
117 | |||
118 | if (KEY_DIRTY(k)) | ||
119 | p(" dirty"); | ||
120 | if (KEY_CSUM(k)) | ||
121 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||
122 | #undef p | ||
123 | } | ||
124 | |||
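For reference, the format built up by the p() calls above renders an extent as inode:start, length, then one dev:offset/gen triple per pointer, plus flags - along these lines (values made up):

	5:4096 len 16 -> [0:20480 gen 3, 1:53248 gen 7] dirty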
125 | static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) | ||
126 | { | ||
127 | struct btree *b = container_of(keys, struct btree, keys); | ||
128 | unsigned j; | ||
129 | char buf[80]; | ||
130 | |||
131 | bch_extent_to_text(buf, sizeof(buf), k); | ||
132 | printk(" %s", buf); | ||
133 | |||
134 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
135 | size_t n = PTR_BUCKET_NR(b->c, k, j); | ||
136 | printk(" bucket %zu", n); | ||
137 | |||
138 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||
139 | printk(" prio %i", | ||
140 | PTR_BUCKET(b->c, k, j)->prio); | ||
141 | } | ||
142 | |||
143 | printk(" %s\n", bch_ptr_status(b->c, k)); | ||
144 | } | ||
145 | |||
146 | /* Btree ptrs */ | ||
147 | |||
148 | bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
149 | { | ||
150 | char buf[80]; | ||
151 | |||
152 | if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) | ||
153 | goto bad; | ||
154 | |||
155 | if (__ptr_invalid(c, k)) | ||
156 | goto bad; | ||
157 | |||
158 | return false; | ||
159 | bad: | ||
160 | bch_extent_to_text(buf, sizeof(buf), k); | ||
161 | cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); | ||
162 | return true; | ||
163 | } | ||
164 | |||
165 | static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) | ||
166 | { | ||
167 | struct btree *b = container_of(bk, struct btree, keys); | ||
168 | return __bch_btree_ptr_invalid(b->c, k); | ||
169 | } | ||
170 | |||
171 | static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) | ||
172 | { | ||
173 | unsigned i; | ||
174 | char buf[80]; | ||
175 | struct bucket *g; | ||
176 | |||
177 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
178 | for (i = 0; i < KEY_PTRS(k); i++) | ||
179 | if (ptr_available(b->c, k, i)) { | ||
180 | g = PTR_BUCKET(b->c, k, i); | ||
181 | |||
182 | if (KEY_DIRTY(k) || | ||
183 | g->prio != BTREE_PRIO || | ||
184 | (b->c->gc_mark_valid && | ||
185 | GC_MARK(g) != GC_MARK_METADATA)) | ||
186 | goto err; | ||
187 | } | ||
188 | |||
189 | mutex_unlock(&b->c->bucket_lock); | ||
190 | } | ||
191 | |||
192 | return false; | ||
193 | err: | ||
194 | mutex_unlock(&b->c->bucket_lock); | ||
195 | bch_extent_to_text(buf, sizeof(buf), k); | ||
196 | btree_bug(b, | ||
197 | "inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
198 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | ||
199 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
200 | return true; | ||
201 | } | ||
202 | |||
203 | static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) | ||
204 | { | ||
205 | struct btree *b = container_of(bk, struct btree, keys); | ||
206 | unsigned i; | ||
207 | |||
208 | if (!bkey_cmp(k, &ZERO_KEY) || | ||
209 | !KEY_PTRS(k) || | ||
210 | bch_ptr_invalid(bk, k)) | ||
211 | return true; | ||
212 | |||
213 | for (i = 0; i < KEY_PTRS(k); i++) | ||
214 | if (!ptr_available(b->c, k, i) || | ||
215 | ptr_stale(b->c, k, i)) | ||
216 | return true; | ||
217 | |||
218 | if (expensive_debug_checks(b->c) && | ||
219 | btree_ptr_bad_expensive(b, k)) | ||
220 | return true; | ||
221 | |||
222 | return false; | ||
223 | } | ||
224 | |||
225 | static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, | ||
226 | struct bkey *insert, | ||
227 | struct btree_iter *iter, | ||
228 | struct bkey *replace_key) | ||
229 | { | ||
230 | struct btree *b = container_of(bk, struct btree, keys); | ||
231 | |||
232 | if (!KEY_OFFSET(insert)) | ||
233 | btree_current_write(b)->prio_blocked++; | ||
234 | |||
235 | return false; | ||
236 | } | ||
237 | |||
238 | const struct btree_keys_ops bch_btree_keys_ops = { | ||
239 | .sort_cmp = bch_key_sort_cmp, | ||
240 | .insert_fixup = bch_btree_ptr_insert_fixup, | ||
241 | .key_invalid = bch_btree_ptr_invalid, | ||
242 | .key_bad = bch_btree_ptr_bad, | ||
243 | .key_to_text = bch_extent_to_text, | ||
244 | .key_dump = bch_bkey_dump, | ||
245 | }; | ||
246 | |||
247 | /* Extents */ | ||
248 | |||
249 | /* | ||
250 | * Returns true if l > r - unless l == r, in which case returns true if l is | ||
251 | * older than r. | ||
252 | * | ||
253 | * Necessary for btree_sort_fixup() - if there are multiple keys that compare | ||
254 | * equal in different sets, we have to process them newest to oldest. | ||
255 | */ | ||
256 | static bool bch_extent_sort_cmp(struct btree_iter_set l, | ||
257 | struct btree_iter_set r) | ||
258 | { | ||
259 | int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | ||
260 | |||
261 | return c ? c > 0 : l.k < r.k; | ||
262 | } | ||
263 | |||
264 | static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, | ||
265 | struct bkey *tmp) | ||
266 | { | ||
267 | while (iter->used > 1) { | ||
268 | struct btree_iter_set *top = iter->data, *i = top + 1; | ||
269 | |||
270 | if (iter->used > 2 && | ||
271 | bch_extent_sort_cmp(i[0], i[1])) | ||
272 | i++; | ||
273 | |||
274 | if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) | ||
275 | break; | ||
276 | |||
277 | if (!KEY_SIZE(i->k)) { | ||
278 | sort_key_next(iter, i); | ||
279 | heap_sift(iter, i - top, bch_extent_sort_cmp); | ||
280 | continue; | ||
281 | } | ||
282 | |||
283 | if (top->k > i->k) { | ||
284 | if (bkey_cmp(top->k, i->k) >= 0) | ||
285 | sort_key_next(iter, i); | ||
286 | else | ||
287 | bch_cut_front(top->k, i->k); | ||
288 | |||
289 | heap_sift(iter, i - top, bch_extent_sort_cmp); | ||
290 | } else { | ||
291 | /* can't happen because of comparison func */ | ||
292 | BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); | ||
293 | |||
294 | if (bkey_cmp(i->k, top->k) < 0) { | ||
295 | bkey_copy(tmp, top->k); | ||
296 | |||
297 | bch_cut_back(&START_KEY(i->k), tmp); | ||
298 | bch_cut_front(i->k, top->k); | ||
299 | heap_sift(iter, 0, bch_extent_sort_cmp); | ||
300 | |||
301 | return tmp; | ||
302 | } else { | ||
303 | bch_cut_back(&START_KEY(i->k), top->k); | ||
304 | } | ||
305 | } | ||
306 | } | ||
307 | |||
308 | return NULL; | ||
309 | } | ||
310 | |||
311 | static bool bch_extent_insert_fixup(struct btree_keys *b, | ||
312 | struct bkey *insert, | ||
313 | struct btree_iter *iter, | ||
314 | struct bkey *replace_key) | ||
315 | { | ||
316 | struct cache_set *c = container_of(b, struct btree, keys)->c; | ||
317 | |||
318 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | ||
319 | { | ||
320 | if (KEY_DIRTY(k)) | ||
321 | bcache_dev_sectors_dirty_add(c, KEY_INODE(k), | ||
322 | offset, -sectors); | ||
323 | } | ||
324 | |||
325 | uint64_t old_offset; | ||
326 | unsigned old_size, sectors_found = 0; | ||
327 | |||
328 | BUG_ON(!KEY_OFFSET(insert)); | ||
329 | BUG_ON(!KEY_SIZE(insert)); | ||
330 | |||
331 | while (1) { | ||
332 | struct bkey *k = bch_btree_iter_next(iter); | ||
333 | if (!k) | ||
334 | break; | ||
335 | |||
336 | if (bkey_cmp(&START_KEY(k), insert) >= 0) { | ||
337 | if (KEY_SIZE(k)) | ||
338 | break; | ||
339 | else | ||
340 | continue; | ||
341 | } | ||
342 | |||
343 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||
344 | continue; | ||
345 | |||
346 | old_offset = KEY_START(k); | ||
347 | old_size = KEY_SIZE(k); | ||
348 | |||
349 | /* | ||
350 | * We might overlap with 0 size extents; we can't skip these | ||
351 | * because if they're in the set we're inserting to we have to | ||
352 | * adjust them so they don't overlap with the key we're | ||
353 | * inserting. But we don't want to check them for replace | ||
354 | * operations. | ||
355 | */ | ||
356 | |||
357 | if (replace_key && KEY_SIZE(k)) { | ||
358 | /* | ||
359 | * k might have been split since we inserted/found the | ||
360 | * key we're replacing | ||
361 | */ | ||
362 | unsigned i; | ||
363 | uint64_t offset = KEY_START(k) - | ||
364 | KEY_START(replace_key); | ||
365 | |||
366 | /* But it must be a subset of the replace key */ | ||
367 | if (KEY_START(k) < KEY_START(replace_key) || | ||
368 | KEY_OFFSET(k) > KEY_OFFSET(replace_key)) | ||
369 | goto check_failed; | ||
370 | |||
371 | /* We didn't find a key that we were supposed to */ | ||
372 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||
373 | goto check_failed; | ||
374 | |||
375 | if (!bch_bkey_equal_header(k, replace_key)) | ||
376 | goto check_failed; | ||
377 | |||
378 | /* skip past gen */ | ||
379 | offset <<= 8; | ||
380 | |||
381 | BUG_ON(!KEY_PTRS(replace_key)); | ||
382 | |||
383 | for (i = 0; i < KEY_PTRS(replace_key); i++) | ||
384 | if (k->ptr[i] != replace_key->ptr[i] + offset) | ||
385 | goto check_failed; | ||
386 | |||
387 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||
388 | } | ||
389 | |||
390 | if (bkey_cmp(insert, k) < 0 && | ||
391 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||
392 | /* | ||
393 | * We overlapped in the middle of an existing key: that | ||
394 | * means we have to split the old key. But we have to do | ||
395 | * slightly different things depending on whether the | ||
396 | * old key has been written out yet. | ||
397 | */ | ||
398 | |||
399 | struct bkey *top; | ||
400 | |||
401 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | ||
402 | |||
403 | if (bkey_written(b, k)) { | ||
404 | /* | ||
405 | * We insert a new key to cover the top of the | ||
406 | * old key, and the old key is modified in place | ||
407 | * to represent the bottom split. | ||
408 | * | ||
409 | * It's completely arbitrary whether the new key | ||
410 | * is the top or the bottom, but it has to match | ||
411 | * up with what btree_sort_fixup() does - it | ||
412 | * doesn't check for this kind of overlap, it | ||
413 | * depends on us inserting a new key for the top | ||
414 | * here. | ||
415 | */ | ||
416 | top = bch_bset_search(b, bset_tree_last(b), | ||
417 | insert); | ||
418 | bch_bset_insert(b, top, k); | ||
419 | } else { | ||
420 | BKEY_PADDED(key) temp; | ||
421 | bkey_copy(&temp.key, k); | ||
422 | bch_bset_insert(b, k, &temp.key); | ||
423 | top = bkey_next(k); | ||
424 | } | ||
425 | |||
426 | bch_cut_front(insert, top); | ||
427 | bch_cut_back(&START_KEY(insert), k); | ||
428 | bch_bset_fix_invalidated_key(b, k); | ||
429 | goto out; | ||
430 | } | ||
431 | |||
432 | if (bkey_cmp(insert, k) < 0) { | ||
433 | bch_cut_front(insert, k); | ||
434 | } else { | ||
435 | if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) | ||
436 | old_offset = KEY_START(insert); | ||
437 | |||
438 | if (bkey_written(b, k) && | ||
439 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||
440 | /* | ||
441 | * Completely overwrote, so we don't have to | ||
442 | * invalidate the binary search tree | ||
443 | */ | ||
444 | bch_cut_front(k, k); | ||
445 | } else { | ||
446 | __bch_cut_back(&START_KEY(insert), k); | ||
447 | bch_bset_fix_invalidated_key(b, k); | ||
448 | } | ||
449 | } | ||
450 | |||
451 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | ||
452 | } | ||
453 | |||
454 | check_failed: | ||
455 | if (replace_key) { | ||
456 | if (!sectors_found) { | ||
457 | return true; | ||
458 | } else if (sectors_found < KEY_SIZE(insert)) { | ||
459 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||
460 | (KEY_SIZE(insert) - sectors_found)); | ||
461 | SET_KEY_SIZE(insert, sectors_found); | ||
462 | } | ||
463 | } | ||
464 | out: | ||
465 | if (KEY_DIRTY(insert)) | ||
466 | bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), | ||
467 | KEY_START(insert), | ||
468 | KEY_SIZE(insert)); | ||
469 | |||
470 | return false; | ||
471 | } | ||
472 | |||
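A worked example of the middle-overlap case handled above, using illustrative sector numbers:

	existing k:  covers 0..16 (already written, so bkey_written() is true)
	insert:      covers 4..12

	bch_bset_search()/bch_bset_insert() add a copy of k as "top";
	bch_cut_front(insert, top) trims top to 12..16, and
	bch_cut_back(&START_KEY(insert), k) trims the original to 0..4 -
	so the node ends up holding 0..4, the new 4..12, and 12..16.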
473 | static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) | ||
474 | { | ||
475 | struct btree *b = container_of(bk, struct btree, keys); | ||
476 | char buf[80]; | ||
477 | |||
478 | if (!KEY_SIZE(k)) | ||
479 | return true; | ||
480 | |||
481 | if (KEY_SIZE(k) > KEY_OFFSET(k)) | ||
482 | goto bad; | ||
483 | |||
484 | if (__ptr_invalid(b->c, k)) | ||
485 | goto bad; | ||
486 | |||
487 | return false; | ||
488 | bad: | ||
489 | bch_extent_to_text(buf, sizeof(buf), k); | ||
490 | cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); | ||
491 | return true; | ||
492 | } | ||
493 | |||
494 | static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, | ||
495 | unsigned ptr) | ||
496 | { | ||
497 | struct bucket *g = PTR_BUCKET(b->c, k, ptr); | ||
498 | char buf[80]; | ||
499 | |||
500 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
501 | if (b->c->gc_mark_valid && | ||
502 | ((GC_MARK(g) != GC_MARK_DIRTY && | ||
503 | KEY_DIRTY(k)) || | ||
504 | GC_MARK(g) == GC_MARK_METADATA)) | ||
505 | goto err; | ||
506 | |||
507 | if (g->prio == BTREE_PRIO) | ||
508 | goto err; | ||
509 | |||
510 | mutex_unlock(&b->c->bucket_lock); | ||
511 | } | ||
512 | |||
513 | return false; | ||
514 | err: | ||
515 | mutex_unlock(&b->c->bucket_lock); | ||
516 | bch_extent_to_text(buf, sizeof(buf), k); | ||
517 | btree_bug(b, | ||
518 | "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
519 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | ||
520 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
521 | return true; | ||
522 | } | ||
523 | |||
524 | static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) | ||
525 | { | ||
526 | struct btree *b = container_of(bk, struct btree, keys); | ||
527 | struct bucket *g; | ||
528 | unsigned i, stale; | ||
529 | |||
530 | if (!KEY_PTRS(k) || | ||
531 | bch_extent_invalid(bk, k)) | ||
532 | return true; | ||
533 | |||
534 | for (i = 0; i < KEY_PTRS(k); i++) | ||
535 | if (!ptr_available(b->c, k, i)) | ||
536 | return true; | ||
537 | |||
538 | if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) | ||
539 | return false; | ||
540 | |||
541 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
542 | g = PTR_BUCKET(b->c, k, i); | ||
543 | stale = ptr_stale(b->c, k, i); | ||
544 | |||
545 | btree_bug_on(stale > 96, b, | ||
546 | "key too stale: %i, need_gc %u", | ||
547 | stale, b->c->need_gc); | ||
548 | |||
549 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||
550 | b, "stale dirty pointer"); | ||
551 | |||
552 | if (stale) | ||
553 | return true; | ||
554 | |||
555 | if (expensive_debug_checks(b->c) && | ||
556 | bch_extent_bad_expensive(b, k, i)) | ||
557 | return true; | ||
558 | } | ||
559 | |||
560 | return false; | ||
561 | } | ||
562 | |||
563 | static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | ||
564 | { | ||
565 | return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | ||
566 | ~((uint64_t)1 << 63); | ||
567 | } | ||
568 | |||
569 | static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) | ||
570 | { | ||
571 | struct btree *b = container_of(bk, struct btree, keys); | ||
572 | unsigned i; | ||
573 | |||
574 | if (key_merging_disabled(b->c)) | ||
575 | return false; | ||
576 | |||
577 | for (i = 0; i < KEY_PTRS(l); i++) | ||
578 | if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || | ||
579 | PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) | ||
580 | return false; | ||
581 | |||
582 | /* Keys with no pointers aren't restricted to one bucket and could | ||
583 | * overflow KEY_SIZE | ||
584 | */ | ||
585 | if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { | ||
586 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); | ||
587 | SET_KEY_SIZE(l, USHRT_MAX); | ||
588 | |||
589 | bch_cut_front(l, r); | ||
590 | return false; | ||
591 | } | ||
592 | |||
593 | if (KEY_CSUM(l)) { | ||
594 | if (KEY_CSUM(r)) | ||
595 | l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); | ||
596 | else | ||
597 | SET_KEY_CSUM(l, 0); | ||
598 | } | ||
599 | |||
600 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); | ||
601 | SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); | ||
602 | |||
603 | return true; | ||
604 | } | ||
605 | |||
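A worked example of a successful merge (illustrative numbers): two adjacent extents of the same inode whose pointers are physically contiguous within one bucket collapse into a single key:

	l:  5:0 len 8  -> [0:1024 gen 2]
	r:  5:8 len 8  -> [0:1032 gen 2]      (1032 == 1024 + KEY_SIZE(l))

	merged l:  5:0 len 16 -> [0:1024 gen 2]

If KEY_SIZE(l) + KEY_SIZE(r) would exceed USHRT_MAX, l is instead grown to exactly USHRT_MAX sectors, bch_cut_front() moves r's start up to l's new end, and the merge reports failure so both keys stay in the node.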
606 | const struct btree_keys_ops bch_extent_keys_ops = { | ||
607 | .sort_cmp = bch_extent_sort_cmp, | ||
608 | .sort_fixup = bch_extent_sort_fixup, | ||
609 | .insert_fixup = bch_extent_insert_fixup, | ||
610 | .key_invalid = bch_extent_invalid, | ||
611 | .key_bad = bch_extent_bad, | ||
612 | .key_merge = bch_extent_merge, | ||
613 | .key_to_text = bch_extent_to_text, | ||
614 | .key_dump = bch_bkey_dump, | ||
615 | .is_extents = true, | ||
616 | }; | ||
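The two ops tables this file exports split the old hardcoded behaviour by node type: internal btree nodes use bch_btree_keys_ops (whole-key compares, no merging), leaves use bch_extent_keys_ops (overlap trimming, merging, is_extents). The actual hook-up happens in btree.c rather than in this file; conceptually it amounts to the following, shown here only as a sketch:

	b->keys.ops = b->level
		? &bch_btree_keys_ops	/* internal node: keys are btree pointers */
		: &bch_extent_keys_ops;	/* leaf: keys are extents */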
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 000000000000..e4e23409782d
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@ | |||
1 | #ifndef _BCACHE_EXTENTS_H | ||
2 | #define _BCACHE_EXTENTS_H | ||
3 | |||
4 | extern const struct btree_keys_ops bch_btree_keys_ops; | ||
5 | extern const struct btree_keys_ops bch_extent_keys_ops; | ||
6 | |||
7 | struct bkey; | ||
8 | struct cache_set; | ||
9 | |||
10 | void bch_extent_to_text(char *, size_t, const struct bkey *); | ||
11 | bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); | ||
12 | |||
13 | #endif /* _BCACHE_EXTENTS_H */ | ||
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7eafdf09a0ae..18039affc306 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, | |||
44 | 44 | ||
45 | closure_init_stack(&cl); | 45 | closure_init_stack(&cl); |
46 | 46 | ||
47 | pr_debug("reading %llu", (uint64_t) bucket); | 47 | pr_debug("reading %u", bucket_index); |
48 | 48 | ||
49 | while (offset < ca->sb.bucket_size) { | 49 | while (offset < ca->sb.bucket_size) { |
50 | reread: left = ca->sb.bucket_size - offset; | 50 | reread: left = ca->sb.bucket_size - offset; |
51 | len = min_t(unsigned, left, PAGE_SECTORS * 8); | 51 | len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); |
52 | 52 | ||
53 | bio_reset(bio); | 53 | bio_reset(bio); |
54 | bio->bi_iter.bi_sector = bucket + offset; | 54 | bio->bi_iter.bi_sector = bucket + offset; |
@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset; | |||
74 | struct list_head *where; | 74 | struct list_head *where; |
75 | size_t blocks, bytes = set_bytes(j); | 75 | size_t blocks, bytes = set_bytes(j); |
76 | 76 | ||
77 | if (j->magic != jset_magic(&ca->sb)) | 77 | if (j->magic != jset_magic(&ca->sb)) { |
78 | pr_debug("%u: bad magic", bucket_index); | ||
78 | return ret; | 79 | return ret; |
80 | } | ||
79 | 81 | ||
80 | if (bytes > left << 9) | 82 | if (bytes > left << 9 || |
83 | bytes > PAGE_SIZE << JSET_BITS) { | ||
84 | pr_info("%u: too big, %zu bytes, offset %u", | ||
85 | bucket_index, bytes, offset); | ||
81 | return ret; | 86 | return ret; |
87 | } | ||
82 | 88 | ||
83 | if (bytes > len << 9) | 89 | if (bytes > len << 9) |
84 | goto reread; | 90 | goto reread; |
85 | 91 | ||
86 | if (j->csum != csum_set(j)) | 92 | if (j->csum != csum_set(j)) { |
93 | pr_info("%u: bad csum, %zu bytes, offset %u", | ||
94 | bucket_index, bytes, offset); | ||
87 | return ret; | 95 | return ret; |
96 | } | ||
88 | 97 | ||
89 | blocks = set_blocks(j, ca->set); | 98 | blocks = set_blocks(j, block_bytes(ca->set)); |
90 | 99 | ||
91 | while (!list_empty(list)) { | 100 | while (!list_empty(list)) { |
92 | i = list_first_entry(list, | 101 | i = list_first_entry(list, |
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) | |||
275 | } | 284 | } |
276 | 285 | ||
277 | for (k = i->j.start; | 286 | for (k = i->j.start; |
278 | k < end(&i->j); | 287 | k < bset_bkey_last(&i->j); |
279 | k = bkey_next(k)) { | 288 | k = bkey_next(k)) { |
280 | unsigned j; | 289 | unsigned j; |
281 | 290 | ||
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) | |||
313 | n, i->j.seq - 1, start, end); | 322 | n, i->j.seq - 1, start, end); |
314 | 323 | ||
315 | for (k = i->j.start; | 324 | for (k = i->j.start; |
316 | k < end(&i->j); | 325 | k < bset_bkey_last(&i->j); |
317 | k = bkey_next(k)) { | 326 | k = bkey_next(k)) { |
318 | trace_bcache_journal_replay_key(k); | 327 | trace_bcache_journal_replay_key(k); |
319 | 328 | ||
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl) | |||
555 | continue_at_nobarrier(cl, journal_write, system_wq); | 564 | continue_at_nobarrier(cl, journal_write, system_wq); |
556 | } | 565 | } |
557 | 566 | ||
567 | static void journal_write_unlock(struct closure *cl) | ||
568 | { | ||
569 | struct cache_set *c = container_of(cl, struct cache_set, journal.io); | ||
570 | |||
571 | c->journal.io_in_flight = 0; | ||
572 | spin_unlock(&c->journal.lock); | ||
573 | } | ||
574 | |||
558 | static void journal_write_unlocked(struct closure *cl) | 575 | static void journal_write_unlocked(struct closure *cl) |
559 | __releases(c->journal.lock) | 576 | __releases(c->journal.lock) |
560 | { | 577 | { |
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl) | |||
562 | struct cache *ca; | 579 | struct cache *ca; |
563 | struct journal_write *w = c->journal.cur; | 580 | struct journal_write *w = c->journal.cur; |
564 | struct bkey *k = &c->journal.key; | 581 | struct bkey *k = &c->journal.key; |
565 | unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; | 582 | unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * |
583 | c->sb.block_size; | ||
566 | 584 | ||
567 | struct bio *bio; | 585 | struct bio *bio; |
568 | struct bio_list list; | 586 | struct bio_list list; |
569 | bio_list_init(&list); | 587 | bio_list_init(&list); |
570 | 588 | ||
571 | if (!w->need_write) { | 589 | if (!w->need_write) { |
572 | /* | 590 | closure_return_with_destructor(cl, journal_write_unlock); |
573 | * XXX: have to unlock closure before we unlock journal lock, | ||
574 | * else we race with bch_journal(). But this way we race | ||
575 | * against cache set unregister. Doh. | ||
576 | */ | ||
577 | set_closure_fn(cl, NULL, NULL); | ||
578 | closure_sub(cl, CLOSURE_RUNNING + 1); | ||
579 | spin_unlock(&c->journal.lock); | ||
580 | return; | ||
581 | } else if (journal_full(&c->journal)) { | 591 | } else if (journal_full(&c->journal)) { |
582 | journal_reclaim(c); | 592 | journal_reclaim(c); |
583 | spin_unlock(&c->journal.lock); | 593 | spin_unlock(&c->journal.lock); |
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) | |||
586 | continue_at(cl, journal_write, system_wq); | 596 | continue_at(cl, journal_write, system_wq); |
587 | } | 597 | } |
588 | 598 | ||
589 | c->journal.blocks_free -= set_blocks(w->data, c); | 599 | c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); |
590 | 600 | ||
591 | w->data->btree_level = c->root->level; | 601 | w->data->btree_level = c->root->level; |
592 | 602 | ||
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c) | |||
653 | 663 | ||
654 | w->need_write = true; | 664 | w->need_write = true; |
655 | 665 | ||
656 | if (closure_trylock(cl, &c->cl)) | 666 | if (!c->journal.io_in_flight) { |
657 | journal_write_unlocked(cl); | 667 | c->journal.io_in_flight = 1; |
658 | else | 668 | closure_call(cl, journal_write_unlocked, NULL, &c->cl); |
669 | } else { | ||
659 | spin_unlock(&c->journal.lock); | 670 | spin_unlock(&c->journal.lock); |
671 | } | ||
660 | } | 672 | } |
661 | 673 | ||
662 | static struct journal_write *journal_wait_for_write(struct cache_set *c, | 674 | static struct journal_write *journal_wait_for_write(struct cache_set *c, |
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
664 | { | 676 | { |
665 | size_t sectors; | 677 | size_t sectors; |
666 | struct closure cl; | 678 | struct closure cl; |
679 | bool wait = false; | ||
667 | 680 | ||
668 | closure_init_stack(&cl); | 681 | closure_init_stack(&cl); |
669 | 682 | ||
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
673 | struct journal_write *w = c->journal.cur; | 686 | struct journal_write *w = c->journal.cur; |
674 | 687 | ||
675 | sectors = __set_blocks(w->data, w->data->keys + nkeys, | 688 | sectors = __set_blocks(w->data, w->data->keys + nkeys, |
676 | c) * c->sb.block_size; | 689 | block_bytes(c)) * c->sb.block_size; |
677 | 690 | ||
678 | if (sectors <= min_t(size_t, | 691 | if (sectors <= min_t(size_t, |
679 | c->journal.blocks_free * c->sb.block_size, | 692 | c->journal.blocks_free * c->sb.block_size, |
680 | PAGE_SECTORS << JSET_BITS)) | 693 | PAGE_SECTORS << JSET_BITS)) |
681 | return w; | 694 | return w; |
682 | 695 | ||
683 | /* XXX: tracepoint */ | 696 | if (wait) |
697 | closure_wait(&c->journal.wait, &cl); | ||
698 | |||
684 | if (!journal_full(&c->journal)) { | 699 | if (!journal_full(&c->journal)) { |
685 | trace_bcache_journal_entry_full(c); | 700 | if (wait) |
701 | trace_bcache_journal_entry_full(c); | ||
686 | 702 | ||
687 | /* | 703 | /* |
688 | * XXX: If we were inserting so many keys that they | 704 | * XXX: If we were inserting so many keys that they |
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
692 | */ | 708 | */ |
693 | BUG_ON(!w->data->keys); | 709 | BUG_ON(!w->data->keys); |
694 | 710 | ||
695 | closure_wait(&w->wait, &cl); | ||
696 | journal_try_write(c); /* unlocks */ | 711 | journal_try_write(c); /* unlocks */ |
697 | } else { | 712 | } else { |
698 | trace_bcache_journal_full(c); | 713 | if (wait) |
714 | trace_bcache_journal_full(c); | ||
699 | 715 | ||
700 | closure_wait(&c->journal.wait, &cl); | ||
701 | journal_reclaim(c); | 716 | journal_reclaim(c); |
702 | spin_unlock(&c->journal.lock); | 717 | spin_unlock(&c->journal.lock); |
703 | 718 | ||
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
706 | 721 | ||
707 | closure_sync(&cl); | 722 | closure_sync(&cl); |
708 | spin_lock(&c->journal.lock); | 723 | spin_lock(&c->journal.lock); |
724 | wait = true; | ||
709 | } | 725 | } |
710 | } | 726 | } |
711 | 727 | ||
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c, | |||
736 | 752 | ||
737 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); | 753 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); |
738 | 754 | ||
739 | memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); | 755 | memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); |
740 | w->data->keys += bch_keylist_nkeys(keys); | 756 | w->data->keys += bch_keylist_nkeys(keys); |
741 | 757 | ||
742 | ret = &fifo_back(&c->journal.pin); | 758 | ret = &fifo_back(&c->journal.pin); |
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c) | |||
780 | { | 796 | { |
781 | struct journal *j = &c->journal; | 797 | struct journal *j = &c->journal; |
782 | 798 | ||
783 | closure_init_unlocked(&j->io); | ||
784 | spin_lock_init(&j->lock); | 799 | spin_lock_init(&j->lock); |
785 | INIT_DELAYED_WORK(&j->work, journal_write_work); | 800 | INIT_DELAYED_WORK(&j->work, journal_write_work); |
786 | 801 | ||
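The journal.c hunks above tighten validation of journal entries read from disk (bad magic, oversized sets and checksum failures are now logged with the bucket index and offset) and replace the closure_trylock()-based serialization of journal writes with an io_in_flight flag guarded by journal.lock. A minimal sketch of that flag pattern, assuming nothing beyond the spinlock API; the demo_* names below are illustrative, not bcache symbols (in bcache itself the flag is cleared in journal_write_unlock() with journal.lock already held):

    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_journal {
            spinlock_t lock;
            int io_in_flight;       /* nonzero while a journal write is outstanding */
    };

    /* Try to claim the single in-flight slot, as journal_try_write() does. */
    static bool demo_journal_try_start(struct demo_journal *j)
    {
            bool claimed = false;

            spin_lock(&j->lock);
            if (!j->io_in_flight) {
                    j->io_in_flight = 1;    /* we own the write now */
                    claimed = true;
            }
            spin_unlock(&j->lock);

            return claimed;
    }

    /* Called when the write completes, so the next writer can start. */
    static void demo_journal_write_done(struct demo_journal *j)
    {
            spin_lock(&j->lock);
            j->io_in_flight = 0;
            spin_unlock(&j->lock);
    }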
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index a6472fda94b2..9180c4465075 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -104,6 +104,7 @@ struct journal { | |||
104 | /* used when waiting because the journal was full */ | 104 | /* used when waiting because the journal was full */ |
105 | struct closure_waitlist wait; | 105 | struct closure_waitlist wait; |
106 | struct closure io; | 106 | struct closure io; |
107 | int io_in_flight; | ||
107 | struct delayed_work work; | 108 | struct delayed_work work; |
108 | 109 | ||
109 | /* Number of blocks free in the bucket(s) we're currently writing to */ | 110 | /* Number of blocks free in the bucket(s) we're currently writing to */ |
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 052bd24d24b4..9eb60d102de8 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c) | |||
211 | for_each_cache(ca, c, i) { | 211 | for_each_cache(ca, c, i) { |
212 | unsigned sectors_to_move = 0; | 212 | unsigned sectors_to_move = 0; |
213 | unsigned reserve_sectors = ca->sb.bucket_size * | 213 | unsigned reserve_sectors = ca->sb.bucket_size * |
214 | min(fifo_used(&ca->free), ca->free.size / 2); | 214 | fifo_used(&ca->free[RESERVE_MOVINGGC]); |
215 | 215 | ||
216 | ca->heap.used = 0; | 216 | ca->heap.used = 0; |
217 | 217 | ||
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index c906571997d7..72cd213f213f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl) | |||
254 | closure_return(cl); | 254 | closure_return(cl); |
255 | } | 255 | } |
256 | 256 | ||
257 | static int bch_keylist_realloc(struct keylist *l, unsigned u64s, | ||
258 | struct cache_set *c) | ||
259 | { | ||
260 | size_t oldsize = bch_keylist_nkeys(l); | ||
261 | size_t newsize = oldsize + u64s; | ||
262 | |||
263 | /* | ||
264 | * The journalling code doesn't handle the case where the keys to insert | ||
265 | * is bigger than an empty write: If we just return -ENOMEM here, | ||
266 | * bio_insert() and bio_invalidate() will insert the keys created so far | ||
267 | * and finish the rest when the keylist is empty. | ||
268 | */ | ||
269 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | ||
270 | return -ENOMEM; | ||
271 | |||
272 | return __bch_keylist_realloc(l, u64s); | ||
273 | } | ||
274 | |||
257 | static void bch_data_invalidate(struct closure *cl) | 275 | static void bch_data_invalidate(struct closure *cl) |
258 | { | 276 | { |
259 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); | 277 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
@@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl) | |||
266 | unsigned sectors = min(bio_sectors(bio), | 284 | unsigned sectors = min(bio_sectors(bio), |
267 | 1U << (KEY_SIZE_BITS - 1)); | 285 | 1U << (KEY_SIZE_BITS - 1)); |
268 | 286 | ||
269 | if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) | 287 | if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) |
270 | goto out; | 288 | goto out; |
271 | 289 | ||
272 | bio->bi_iter.bi_sector += sectors; | 290 | bio->bi_iter.bi_sector += sectors; |
@@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl) | |||
356 | 374 | ||
357 | /* 1 for the device pointer and 1 for the chksum */ | 375 | /* 1 for the device pointer and 1 for the chksum */ |
358 | if (bch_keylist_realloc(&op->insert_keys, | 376 | if (bch_keylist_realloc(&op->insert_keys, |
359 | 1 + (op->csum ? 1 : 0), | 377 | 3 + (op->csum ? 1 : 0), |
360 | op->c)) | 378 | op->c)) |
361 | continue_at(cl, bch_data_insert_keys, bcache_wq); | 379 | continue_at(cl, bch_data_insert_keys, bcache_wq); |
362 | 380 | ||
@@ -596,14 +614,12 @@ struct search { | |||
596 | /* Stack frame for bio_complete */ | 614 | /* Stack frame for bio_complete */ |
597 | struct closure cl; | 615 | struct closure cl; |
598 | 616 | ||
599 | struct bcache_device *d; | ||
600 | |||
601 | struct bbio bio; | 617 | struct bbio bio; |
602 | struct bio *orig_bio; | 618 | struct bio *orig_bio; |
603 | struct bio *cache_miss; | 619 | struct bio *cache_miss; |
620 | struct bcache_device *d; | ||
604 | 621 | ||
605 | unsigned insert_bio_sectors; | 622 | unsigned insert_bio_sectors; |
606 | |||
607 | unsigned recoverable:1; | 623 | unsigned recoverable:1; |
608 | unsigned write:1; | 624 | unsigned write:1; |
609 | unsigned read_dirty_data:1; | 625 | unsigned read_dirty_data:1; |
@@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error) | |||
629 | 645 | ||
630 | if (error) | 646 | if (error) |
631 | s->iop.error = error; | 647 | s->iop.error = error; |
632 | else if (ptr_stale(s->iop.c, &b->key, 0)) { | 648 | else if (!KEY_DIRTY(&b->key) && |
649 | ptr_stale(s->iop.c, &b->key, 0)) { | ||
633 | atomic_long_inc(&s->iop.c->cache_read_races); | 650 | atomic_long_inc(&s->iop.c->cache_read_races); |
634 | s->iop.error = -EINTR; | 651 | s->iop.error = -EINTR; |
635 | } | 652 | } |
@@ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl) | |||
710 | { | 727 | { |
711 | struct search *s = container_of(cl, struct search, iop.cl); | 728 | struct search *s = container_of(cl, struct search, iop.cl); |
712 | struct bio *bio = &s->bio.bio; | 729 | struct bio *bio = &s->bio.bio; |
730 | int ret; | ||
731 | |||
732 | bch_btree_op_init(&s->op, -1); | ||
713 | 733 | ||
714 | int ret = bch_btree_map_keys(&s->op, s->iop.c, | 734 | ret = bch_btree_map_keys(&s->op, s->iop.c, |
715 | &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), | 735 | &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), |
716 | cache_lookup_fn, MAP_END_KEY); | 736 | cache_lookup_fn, MAP_END_KEY); |
717 | if (ret == -EAGAIN) | 737 | if (ret == -EAGAIN) |
718 | continue_at(cl, cache_lookup, bcache_wq); | 738 | continue_at(cl, cache_lookup, bcache_wq); |
719 | 739 | ||
@@ -754,12 +774,12 @@ static void bio_complete(struct search *s) | |||
754 | } | 774 | } |
755 | } | 775 | } |
756 | 776 | ||
757 | static void do_bio_hook(struct search *s) | 777 | static void do_bio_hook(struct search *s, struct bio *orig_bio) |
758 | { | 778 | { |
759 | struct bio *bio = &s->bio.bio; | 779 | struct bio *bio = &s->bio.bio; |
760 | 780 | ||
761 | bio_init(bio); | 781 | bio_init(bio); |
762 | __bio_clone_fast(bio, s->orig_bio); | 782 | __bio_clone_fast(bio, orig_bio); |
763 | bio->bi_end_io = request_endio; | 783 | bio->bi_end_io = request_endio; |
764 | bio->bi_private = &s->cl; | 784 | bio->bi_private = &s->cl; |
765 | 785 | ||
@@ -778,26 +798,32 @@ static void search_free(struct closure *cl) | |||
778 | mempool_free(s, s->d->c->search); | 798 | mempool_free(s, s->d->c->search); |
779 | } | 799 | } |
780 | 800 | ||
781 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | 801 | static inline struct search *search_alloc(struct bio *bio, |
802 | struct bcache_device *d) | ||
782 | { | 803 | { |
783 | struct search *s; | 804 | struct search *s; |
784 | 805 | ||
785 | s = mempool_alloc(d->c->search, GFP_NOIO); | 806 | s = mempool_alloc(d->c->search, GFP_NOIO); |
786 | memset(s, 0, offsetof(struct search, iop.insert_keys)); | ||
787 | 807 | ||
788 | __closure_init(&s->cl, NULL); | 808 | closure_init(&s->cl, NULL); |
809 | do_bio_hook(s, bio); | ||
789 | 810 | ||
790 | s->iop.inode = d->id; | ||
791 | s->iop.c = d->c; | ||
792 | s->d = d; | ||
793 | s->op.lock = -1; | ||
794 | s->iop.write_point = hash_long((unsigned long) current, 16); | ||
795 | s->orig_bio = bio; | 811 | s->orig_bio = bio; |
796 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 812 | s->cache_miss = NULL; |
797 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | 813 | s->d = d; |
798 | s->recoverable = 1; | 814 | s->recoverable = 1; |
815 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | ||
816 | s->read_dirty_data = 0; | ||
799 | s->start_time = jiffies; | 817 | s->start_time = jiffies; |
800 | do_bio_hook(s); | 818 | |
819 | s->iop.c = d->c; | ||
820 | s->iop.bio = NULL; | ||
821 | s->iop.inode = d->id; | ||
822 | s->iop.write_point = hash_long((unsigned long) current, 16); | ||
823 | s->iop.write_prio = 0; | ||
824 | s->iop.error = 0; | ||
825 | s->iop.flags = 0; | ||
826 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | ||
801 | 827 | ||
802 | return s; | 828 | return s; |
803 | } | 829 | } |
@@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl) | |||
843 | trace_bcache_read_retry(s->orig_bio); | 869 | trace_bcache_read_retry(s->orig_bio); |
844 | 870 | ||
845 | s->iop.error = 0; | 871 | s->iop.error = 0; |
846 | do_bio_hook(s); | 872 | do_bio_hook(s, s->orig_bio); |
847 | 873 | ||
848 | /* XXX: invalidate cache */ | 874 | /* XXX: invalidate cache */ |
849 | 875 | ||
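The request.c hunks above wrap __bch_keylist_realloc() so the in-flight keylist can never outgrow a single empty journal write; when the cap is hit, the insert path flushes the keys accumulated so far and continues, which is why bch_data_invalidate() and bch_data_insert_start() now pass the full size in u64s of the key they are about to append (two words of bkey header, plus one per pointer, plus one if a checksum is stored). A sketch of the bound itself, with the jset header size passed in rather than taken from bcache's struct jset; the demo_* name is illustrative only:

    #include <linux/types.h>

    /*
     * Illustrative only: the real check lives in bch_keylist_realloc() above
     * and derives the sizes from block_bytes(c) and struct jset.
     */
    static inline bool demo_keys_fit_one_write(size_t cur_u64s, size_t extra_u64s,
                                               size_t block_bytes,
                                               size_t jset_header_bytes)
    {
            size_t bytes = (cur_u64s + extra_u64s) * sizeof(u64);

            /* an empty journal write holds one block minus its jset header */
            return bytes <= block_bytes - jset_header_bytes;
    }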
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 2cd65bf073c2..39f21dbedc38 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -13,17 +13,22 @@ struct data_insert_op { | |||
13 | uint16_t write_prio; | 13 | uint16_t write_prio; |
14 | short error; | 14 | short error; |
15 | 15 | ||
16 | unsigned bypass:1; | 16 | union { |
17 | unsigned writeback:1; | 17 | uint16_t flags; |
18 | unsigned flush_journal:1; | ||
19 | unsigned csum:1; | ||
20 | 18 | ||
21 | unsigned replace:1; | 19 | struct { |
22 | unsigned replace_collision:1; | 20 | unsigned bypass:1; |
21 | unsigned writeback:1; | ||
22 | unsigned flush_journal:1; | ||
23 | unsigned csum:1; | ||
23 | 24 | ||
24 | unsigned insert_data_done:1; | 25 | unsigned replace:1; |
26 | unsigned replace_collision:1; | ||
27 | |||
28 | unsigned insert_data_done:1; | ||
29 | }; | ||
30 | }; | ||
25 | 31 | ||
26 | /* Anything past this point won't get zeroed in search_alloc() */ | ||
27 | struct keylist insert_keys; | 32 | struct keylist insert_keys; |
28 | BKEY_PADDED(replace_key); | 33 | BKEY_PADDED(replace_key); |
29 | }; | 34 | }; |
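The request.h change above folds the data_insert_op state bits into an anonymous union with a single uint16_t, so search_alloc() can reset every flag with one assignment (s->iop.flags = 0) instead of relying on the removed "anything past this point won't get zeroed" memset convention. A stripped-down sketch of the idiom, mirroring only a few of the real bits; the demo_* names are made up:

    #include <linux/types.h>

    struct demo_op {
            union {
                    u16 flags;                      /* whole-word view */
                    struct {
                            unsigned bypass:1;
                            unsigned writeback:1;
                            unsigned flush_journal:1;
                    };
            };
    };

    static inline void demo_op_reset(struct demo_op *op)
    {
            op->flags = 0;          /* clears every bit sharing storage with flags */
    }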
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 93d593f957f6..24a3a1546caa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -9,6 +9,7 @@ | |||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "extents.h" | ||
12 | #include "request.h" | 13 | #include "request.h" |
13 | #include "writeback.h" | 14 | #include "writeback.h" |
14 | 15 | ||
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error) | |||
225 | struct cached_dev *dc = bio->bi_private; | 226 | struct cached_dev *dc = bio->bi_private; |
226 | /* XXX: error checking */ | 227 | /* XXX: error checking */ |
227 | 228 | ||
228 | closure_put(&dc->sb_write.cl); | 229 | closure_put(&dc->sb_write); |
229 | } | 230 | } |
230 | 231 | ||
231 | static void __write_super(struct cache_sb *sb, struct bio *bio) | 232 | static void __write_super(struct cache_sb *sb, struct bio *bio) |
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) | |||
263 | submit_bio(REQ_WRITE, bio); | 264 | submit_bio(REQ_WRITE, bio); |
264 | } | 265 | } |
265 | 266 | ||
267 | static void bch_write_bdev_super_unlock(struct closure *cl) | ||
268 | { | ||
269 | struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); | ||
270 | |||
271 | up(&dc->sb_write_mutex); | ||
272 | } | ||
273 | |||
266 | void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | 274 | void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) |
267 | { | 275 | { |
268 | struct closure *cl = &dc->sb_write.cl; | 276 | struct closure *cl = &dc->sb_write; |
269 | struct bio *bio = &dc->sb_bio; | 277 | struct bio *bio = &dc->sb_bio; |
270 | 278 | ||
271 | closure_lock(&dc->sb_write, parent); | 279 | down(&dc->sb_write_mutex); |
280 | closure_init(cl, parent); | ||
272 | 281 | ||
273 | bio_reset(bio); | 282 | bio_reset(bio); |
274 | bio->bi_bdev = dc->bdev; | 283 | bio->bi_bdev = dc->bdev; |
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | |||
278 | closure_get(cl); | 287 | closure_get(cl); |
279 | __write_super(&dc->sb, bio); | 288 | __write_super(&dc->sb, bio); |
280 | 289 | ||
281 | closure_return(cl); | 290 | closure_return_with_destructor(cl, bch_write_bdev_super_unlock); |
282 | } | 291 | } |
283 | 292 | ||
284 | static void write_super_endio(struct bio *bio, int error) | 293 | static void write_super_endio(struct bio *bio, int error) |
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error) | |||
286 | struct cache *ca = bio->bi_private; | 295 | struct cache *ca = bio->bi_private; |
287 | 296 | ||
288 | bch_count_io_errors(ca, error, "writing superblock"); | 297 | bch_count_io_errors(ca, error, "writing superblock"); |
289 | closure_put(&ca->set->sb_write.cl); | 298 | closure_put(&ca->set->sb_write); |
299 | } | ||
300 | |||
301 | static void bcache_write_super_unlock(struct closure *cl) | ||
302 | { | ||
303 | struct cache_set *c = container_of(cl, struct cache_set, sb_write); | ||
304 | |||
305 | up(&c->sb_write_mutex); | ||
290 | } | 306 | } |
291 | 307 | ||
292 | void bcache_write_super(struct cache_set *c) | 308 | void bcache_write_super(struct cache_set *c) |
293 | { | 309 | { |
294 | struct closure *cl = &c->sb_write.cl; | 310 | struct closure *cl = &c->sb_write; |
295 | struct cache *ca; | 311 | struct cache *ca; |
296 | unsigned i; | 312 | unsigned i; |
297 | 313 | ||
298 | closure_lock(&c->sb_write, &c->cl); | 314 | down(&c->sb_write_mutex); |
315 | closure_init(cl, &c->cl); | ||
299 | 316 | ||
300 | c->sb.seq++; | 317 | c->sb.seq++; |
301 | 318 | ||
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c) | |||
317 | __write_super(&ca->sb, bio); | 334 | __write_super(&ca->sb, bio); |
318 | } | 335 | } |
319 | 336 | ||
320 | closure_return(cl); | 337 | closure_return_with_destructor(cl, bcache_write_super_unlock); |
321 | } | 338 | } |
322 | 339 | ||
323 | /* UUID io */ | 340 | /* UUID io */ |
@@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c) | |||
325 | static void uuid_endio(struct bio *bio, int error) | 342 | static void uuid_endio(struct bio *bio, int error) |
326 | { | 343 | { |
327 | struct closure *cl = bio->bi_private; | 344 | struct closure *cl = bio->bi_private; |
328 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); | 345 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write); |
329 | 346 | ||
330 | cache_set_err_on(error, c, "accessing uuids"); | 347 | cache_set_err_on(error, c, "accessing uuids"); |
331 | bch_bbio_free(bio, c); | 348 | bch_bbio_free(bio, c); |
332 | closure_put(cl); | 349 | closure_put(cl); |
333 | } | 350 | } |
334 | 351 | ||
352 | static void uuid_io_unlock(struct closure *cl) | ||
353 | { | ||
354 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write); | ||
355 | |||
356 | up(&c->uuid_write_mutex); | ||
357 | } | ||
358 | |||
335 | static void uuid_io(struct cache_set *c, unsigned long rw, | 359 | static void uuid_io(struct cache_set *c, unsigned long rw, |
336 | struct bkey *k, struct closure *parent) | 360 | struct bkey *k, struct closure *parent) |
337 | { | 361 | { |
338 | struct closure *cl = &c->uuid_write.cl; | 362 | struct closure *cl = &c->uuid_write; |
339 | struct uuid_entry *u; | 363 | struct uuid_entry *u; |
340 | unsigned i; | 364 | unsigned i; |
341 | char buf[80]; | 365 | char buf[80]; |
342 | 366 | ||
343 | BUG_ON(!parent); | 367 | BUG_ON(!parent); |
344 | closure_lock(&c->uuid_write, parent); | 368 | down(&c->uuid_write_mutex); |
369 | closure_init(cl, parent); | ||
345 | 370 | ||
346 | for (i = 0; i < KEY_PTRS(k); i++) { | 371 | for (i = 0; i < KEY_PTRS(k); i++) { |
347 | struct bio *bio = bch_bbio_alloc(c); | 372 | struct bio *bio = bch_bbio_alloc(c); |
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
359 | break; | 384 | break; |
360 | } | 385 | } |
361 | 386 | ||
362 | bch_bkey_to_text(buf, sizeof(buf), k); | 387 | bch_extent_to_text(buf, sizeof(buf), k); |
363 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); | 388 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); |
364 | 389 | ||
365 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) | 390 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) |
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
368 | u - c->uuids, u->uuid, u->label, | 393 | u - c->uuids, u->uuid, u->label, |
369 | u->first_reg, u->last_reg, u->invalidated); | 394 | u->first_reg, u->last_reg, u->invalidated); |
370 | 395 | ||
371 | closure_return(cl); | 396 | closure_return_with_destructor(cl, uuid_io_unlock); |
372 | } | 397 | } |
373 | 398 | ||
374 | static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) | 399 | static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) |
375 | { | 400 | { |
376 | struct bkey *k = &j->uuid_bucket; | 401 | struct bkey *k = &j->uuid_bucket; |
377 | 402 | ||
378 | if (bch_btree_ptr_invalid(c, k)) | 403 | if (__bch_btree_ptr_invalid(c, k)) |
379 | return "bad uuid pointer"; | 404 | return "bad uuid pointer"; |
380 | 405 | ||
381 | bkey_copy(&c->uuid_bucket, k); | 406 | bkey_copy(&c->uuid_bucket, k); |
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c) | |||
420 | 445 | ||
421 | lockdep_assert_held(&bch_register_lock); | 446 | lockdep_assert_held(&bch_register_lock); |
422 | 447 | ||
423 | if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) | 448 | if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) |
424 | return 1; | 449 | return 1; |
425 | 450 | ||
426 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); | 451 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); |
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca) | |||
538 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), | 563 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), |
539 | &ca->meta_sectors_written); | 564 | &ca->meta_sectors_written); |
540 | 565 | ||
541 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), | 566 | //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), |
542 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | 567 | // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); |
543 | 568 | ||
544 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { | 569 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { |
545 | long bucket; | 570 | long bucket; |
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca) | |||
558 | p->magic = pset_magic(&ca->sb); | 583 | p->magic = pset_magic(&ca->sb); |
559 | p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); | 584 | p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); |
560 | 585 | ||
561 | bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); | 586 | bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); |
562 | BUG_ON(bucket == -1); | 587 | BUG_ON(bucket == -1); |
563 | 588 | ||
564 | mutex_unlock(&ca->set->bucket_lock); | 589 | mutex_unlock(&ca->set->bucket_lock); |
@@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1098 | set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); | 1123 | set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); |
1099 | kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); | 1124 | kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); |
1100 | INIT_WORK(&dc->detach, cached_dev_detach_finish); | 1125 | INIT_WORK(&dc->detach, cached_dev_detach_finish); |
1101 | closure_init_unlocked(&dc->sb_write); | 1126 | sema_init(&dc->sb_write_mutex, 1); |
1102 | INIT_LIST_HEAD(&dc->io_lru); | 1127 | INIT_LIST_HEAD(&dc->io_lru); |
1103 | spin_lock_init(&dc->io_lock); | 1128 | spin_lock_init(&dc->io_lock); |
1104 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); | 1129 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); |
@@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1110 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); | 1135 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); |
1111 | } | 1136 | } |
1112 | 1137 | ||
1138 | dc->disk.stripe_size = q->limits.io_opt >> 9; | ||
1139 | |||
1140 | if (dc->disk.stripe_size) | ||
1141 | dc->partial_stripes_expensive = | ||
1142 | q->limits.raid_partial_stripes_expensive; | ||
1143 | |||
1113 | ret = bcache_device_init(&dc->disk, block_size, | 1144 | ret = bcache_device_init(&dc->disk, block_size, |
1114 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); | 1145 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); |
1115 | if (ret) | 1146 | if (ret) |
@@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl) | |||
1321 | if (ca) | 1352 | if (ca) |
1322 | kobject_put(&ca->kobj); | 1353 | kobject_put(&ca->kobj); |
1323 | 1354 | ||
1355 | bch_bset_sort_state_free(&c->sort); | ||
1324 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | 1356 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); |
1325 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); | ||
1326 | 1357 | ||
1327 | if (c->bio_split) | 1358 | if (c->bio_split) |
1328 | bioset_free(c->bio_split); | 1359 | bioset_free(c->bio_split); |
@@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1447 | c->block_bits = ilog2(sb->block_size); | 1478 | c->block_bits = ilog2(sb->block_size); |
1448 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); | 1479 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); |
1449 | 1480 | ||
1450 | c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; | 1481 | c->btree_pages = bucket_pages(c); |
1451 | if (c->btree_pages > BTREE_MAX_PAGES) | 1482 | if (c->btree_pages > BTREE_MAX_PAGES) |
1452 | c->btree_pages = max_t(int, c->btree_pages / 4, | 1483 | c->btree_pages = max_t(int, c->btree_pages / 4, |
1453 | BTREE_MAX_PAGES); | 1484 | BTREE_MAX_PAGES); |
1454 | 1485 | ||
1455 | c->sort_crit_factor = int_sqrt(c->btree_pages); | 1486 | sema_init(&c->sb_write_mutex, 1); |
1456 | |||
1457 | closure_init_unlocked(&c->sb_write); | ||
1458 | mutex_init(&c->bucket_lock); | 1487 | mutex_init(&c->bucket_lock); |
1459 | init_waitqueue_head(&c->try_wait); | 1488 | init_waitqueue_head(&c->try_wait); |
1460 | init_waitqueue_head(&c->bucket_wait); | 1489 | init_waitqueue_head(&c->bucket_wait); |
1461 | closure_init_unlocked(&c->uuid_write); | 1490 | sema_init(&c->uuid_write_mutex, 1); |
1462 | mutex_init(&c->sort_lock); | ||
1463 | 1491 | ||
1464 | spin_lock_init(&c->sort_time.lock); | ||
1465 | spin_lock_init(&c->btree_gc_time.lock); | 1492 | spin_lock_init(&c->btree_gc_time.lock); |
1466 | spin_lock_init(&c->btree_split_time.lock); | 1493 | spin_lock_init(&c->btree_split_time.lock); |
1467 | spin_lock_init(&c->btree_read_time.lock); | 1494 | spin_lock_init(&c->btree_read_time.lock); |
@@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1489 | bucket_pages(c))) || | 1516 | bucket_pages(c))) || |
1490 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || | 1517 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || |
1491 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 1518 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
1492 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || | ||
1493 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | 1519 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || |
1494 | bch_journal_alloc(c) || | 1520 | bch_journal_alloc(c) || |
1495 | bch_btree_cache_alloc(c) || | 1521 | bch_btree_cache_alloc(c) || |
1496 | bch_open_buckets_alloc(c)) | 1522 | bch_open_buckets_alloc(c) || |
1523 | bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) | ||
1497 | goto err; | 1524 | goto err; |
1498 | 1525 | ||
1499 | c->congested_read_threshold_us = 2000; | 1526 | c->congested_read_threshold_us = 2000; |
@@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c) | |||
1549 | k = &j->btree_root; | 1576 | k = &j->btree_root; |
1550 | 1577 | ||
1551 | err = "bad btree root"; | 1578 | err = "bad btree root"; |
1552 | if (bch_btree_ptr_invalid(c, k)) | 1579 | if (__bch_btree_ptr_invalid(c, k)) |
1553 | goto err; | 1580 | goto err; |
1554 | 1581 | ||
1555 | err = "error reading btree root"; | 1582 | err = "error reading btree root"; |
@@ -1743,6 +1770,7 @@ err: | |||
1743 | void bch_cache_release(struct kobject *kobj) | 1770 | void bch_cache_release(struct kobject *kobj) |
1744 | { | 1771 | { |
1745 | struct cache *ca = container_of(kobj, struct cache, kobj); | 1772 | struct cache *ca = container_of(kobj, struct cache, kobj); |
1773 | unsigned i; | ||
1746 | 1774 | ||
1747 | if (ca->set) | 1775 | if (ca->set) |
1748 | ca->set->cache[ca->sb.nr_this_dev] = NULL; | 1776 | ca->set->cache[ca->sb.nr_this_dev] = NULL; |
@@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj) | |||
1756 | free_heap(&ca->heap); | 1784 | free_heap(&ca->heap); |
1757 | free_fifo(&ca->unused); | 1785 | free_fifo(&ca->unused); |
1758 | free_fifo(&ca->free_inc); | 1786 | free_fifo(&ca->free_inc); |
1759 | free_fifo(&ca->free); | 1787 | |
1788 | for (i = 0; i < RESERVE_NR; i++) | ||
1789 | free_fifo(&ca->free[i]); | ||
1760 | 1790 | ||
1761 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) | 1791 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) |
1762 | put_page(ca->sb_bio.bi_io_vec[0].bv_page); | 1792 | put_page(ca->sb_bio.bi_io_vec[0].bv_page); |
@@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1782 | ca->journal.bio.bi_max_vecs = 8; | 1812 | ca->journal.bio.bi_max_vecs = 8; |
1783 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; | 1813 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; |
1784 | 1814 | ||
1785 | free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; | 1815 | free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; |
1786 | free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); | ||
1787 | 1816 | ||
1788 | if (!init_fifo(&ca->free, free, GFP_KERNEL) || | 1817 | if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || |
1818 | !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || | ||
1819 | !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || | ||
1820 | !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || | ||
1789 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || | 1821 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || |
1790 | !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || | 1822 | !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || |
1791 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || | 1823 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || |
@@ -2030,7 +2062,8 @@ static void bcache_exit(void) | |||
2030 | kobject_put(bcache_kobj); | 2062 | kobject_put(bcache_kobj); |
2031 | if (bcache_wq) | 2063 | if (bcache_wq) |
2032 | destroy_workqueue(bcache_wq); | 2064 | destroy_workqueue(bcache_wq); |
2033 | unregister_blkdev(bcache_major, "bcache"); | 2065 | if (bcache_major) |
2066 | unregister_blkdev(bcache_major, "bcache"); | ||
2034 | unregister_reboot_notifier(&reboot); | 2067 | unregister_reboot_notifier(&reboot); |
2035 | } | 2068 | } |
2036 | 2069 | ||
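Two patterns recur in the super.c hunks above: the old closure_lock()/closure_init_unlocked() serialization is replaced by an ordinary semaphore taken before closure_init() and released from a destructor passed to closure_return_with_destructor(), and the single ca->free FIFO becomes an array of per-purpose reserves (RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC, RESERVE_NONE), which also removes the freelist_percent knob. A minimal sketch of the first pattern, assuming the closure API used in this diff; the demo_* names are not bcache symbols:

    #include <linux/kernel.h>
    #include <linux/semaphore.h>
    #include "closure.h"

    struct demo_sb_writer {
            struct semaphore        lock;
            struct closure          cl;
    };

    static void demo_sb_write_unlock(struct closure *cl)
    {
            struct demo_sb_writer *w = container_of(cl, struct demo_sb_writer, cl);

            up(&w->lock);                   /* next writer may start */
    }

    static void demo_sb_write(struct demo_sb_writer *w, struct closure *parent)
    {
            down(&w->lock);                 /* serialize writers */
            closure_init(&w->cl, parent);

            /* ... submit bios holding references taken with closure_get(&w->cl) ... */

            closure_return_with_destructor(&w->cl, demo_sb_write_unlock);
    }

The destructor runs only once the closure's references are dropped, so the semaphore stays held for the whole I/O, roughly the window the old closure lock covered.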
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a1f85612f0b3..c6ab69333a6d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test); | |||
102 | rw_attribute(key_merging_disabled); | 102 | rw_attribute(key_merging_disabled); |
103 | rw_attribute(gc_always_rewrite); | 103 | rw_attribute(gc_always_rewrite); |
104 | rw_attribute(expensive_debug_checks); | 104 | rw_attribute(expensive_debug_checks); |
105 | rw_attribute(freelist_percent); | ||
106 | rw_attribute(cache_replacement_policy); | 105 | rw_attribute(cache_replacement_policy); |
107 | rw_attribute(btree_shrinker_disabled); | 106 | rw_attribute(btree_shrinker_disabled); |
108 | rw_attribute(copy_gc_enabled); | 107 | rw_attribute(copy_gc_enabled); |
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = { | |||
401 | }; | 400 | }; |
402 | KTYPE(bch_flash_dev); | 401 | KTYPE(bch_flash_dev); |
403 | 402 | ||
403 | struct bset_stats_op { | ||
404 | struct btree_op op; | ||
405 | size_t nodes; | ||
406 | struct bset_stats stats; | ||
407 | }; | ||
408 | |||
409 | static int btree_bset_stats(struct btree_op *b_op, struct btree *b) | ||
410 | { | ||
411 | struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); | ||
412 | |||
413 | op->nodes++; | ||
414 | bch_btree_keys_stats(&b->keys, &op->stats); | ||
415 | |||
416 | return MAP_CONTINUE; | ||
417 | } | ||
418 | |||
419 | int bch_bset_print_stats(struct cache_set *c, char *buf) | ||
420 | { | ||
421 | struct bset_stats_op op; | ||
422 | int ret; | ||
423 | |||
424 | memset(&op, 0, sizeof(op)); | ||
425 | bch_btree_op_init(&op.op, -1); | ||
426 | |||
427 | ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); | ||
428 | if (ret < 0) | ||
429 | return ret; | ||
430 | |||
431 | return snprintf(buf, PAGE_SIZE, | ||
432 | "btree nodes: %zu\n" | ||
433 | "written sets: %zu\n" | ||
434 | "unwritten sets: %zu\n" | ||
435 | "written key bytes: %zu\n" | ||
436 | "unwritten key bytes: %zu\n" | ||
437 | "floats: %zu\n" | ||
438 | "failed: %zu\n", | ||
439 | op.nodes, | ||
440 | op.stats.sets_written, op.stats.sets_unwritten, | ||
441 | op.stats.bytes_written, op.stats.bytes_unwritten, | ||
442 | op.stats.floats, op.stats.failed); | ||
443 | } | ||
444 | |||
404 | SHOW(__bch_cache_set) | 445 | SHOW(__bch_cache_set) |
405 | { | 446 | { |
406 | unsigned root_usage(struct cache_set *c) | 447 | unsigned root_usage(struct cache_set *c) |
@@ -419,7 +460,7 @@ lock_root: | |||
419 | rw_lock(false, b, b->level); | 460 | rw_lock(false, b, b->level); |
420 | } while (b != c->root); | 461 | } while (b != c->root); |
421 | 462 | ||
422 | for_each_key_filter(b, k, &iter, bch_ptr_bad) | 463 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) |
423 | bytes += bkey_bytes(k); | 464 | bytes += bkey_bytes(k); |
424 | 465 | ||
425 | rw_unlock(false, b); | 466 | rw_unlock(false, b); |
@@ -434,7 +475,7 @@ lock_root: | |||
434 | 475 | ||
435 | mutex_lock(&c->bucket_lock); | 476 | mutex_lock(&c->bucket_lock); |
436 | list_for_each_entry(b, &c->btree_cache, list) | 477 | list_for_each_entry(b, &c->btree_cache, list) |
437 | ret += 1 << (b->page_order + PAGE_SHIFT); | 478 | ret += 1 << (b->keys.page_order + PAGE_SHIFT); |
438 | 479 | ||
439 | mutex_unlock(&c->bucket_lock); | 480 | mutex_unlock(&c->bucket_lock); |
440 | return ret; | 481 | return ret; |
@@ -491,7 +532,7 @@ lock_root: | |||
491 | 532 | ||
492 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); | 533 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); |
493 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); | 534 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); |
494 | sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); | 535 | sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); |
495 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); | 536 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); |
496 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); | 537 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); |
497 | 538 | ||
@@ -711,9 +752,6 @@ SHOW(__bch_cache) | |||
711 | sysfs_print(io_errors, | 752 | sysfs_print(io_errors, |
712 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | 753 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); |
713 | 754 | ||
714 | sysfs_print(freelist_percent, ca->free.size * 100 / | ||
715 | ((size_t) ca->sb.nbuckets)); | ||
716 | |||
717 | if (attr == &sysfs_cache_replacement_policy) | 755 | if (attr == &sysfs_cache_replacement_policy) |
718 | return bch_snprint_string_list(buf, PAGE_SIZE, | 756 | return bch_snprint_string_list(buf, PAGE_SIZE, |
719 | cache_replacement_policies, | 757 | cache_replacement_policies, |
@@ -820,32 +858,6 @@ STORE(__bch_cache) | |||
820 | } | 858 | } |
821 | } | 859 | } |
822 | 860 | ||
823 | if (attr == &sysfs_freelist_percent) { | ||
824 | DECLARE_FIFO(long, free); | ||
825 | long i; | ||
826 | size_t p = strtoul_or_return(buf); | ||
827 | |||
828 | p = clamp_t(size_t, | ||
829 | ((size_t) ca->sb.nbuckets * p) / 100, | ||
830 | roundup_pow_of_two(ca->sb.nbuckets) >> 9, | ||
831 | ca->sb.nbuckets / 2); | ||
832 | |||
833 | if (!init_fifo_exact(&free, p, GFP_KERNEL)) | ||
834 | return -ENOMEM; | ||
835 | |||
836 | mutex_lock(&ca->set->bucket_lock); | ||
837 | |||
838 | fifo_move(&free, &ca->free); | ||
839 | fifo_swap(&free, &ca->free); | ||
840 | |||
841 | mutex_unlock(&ca->set->bucket_lock); | ||
842 | |||
843 | while (fifo_pop(&free, i)) | ||
844 | atomic_dec(&ca->buckets[i].pin); | ||
845 | |||
846 | free_fifo(&free); | ||
847 | } | ||
848 | |||
849 | if (attr == &sysfs_clear_stats) { | 861 | if (attr == &sysfs_clear_stats) { |
850 | atomic_long_set(&ca->sectors_written, 0); | 862 | atomic_long_set(&ca->sectors_written, 0); |
851 | atomic_long_set(&ca->btree_sectors_written, 0); | 863 | atomic_long_set(&ca->btree_sectors_written, 0); |
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = { | |||
869 | &sysfs_metadata_written, | 881 | &sysfs_metadata_written, |
870 | &sysfs_io_errors, | 882 | &sysfs_io_errors, |
871 | &sysfs_clear_stats, | 883 | &sysfs_clear_stats, |
872 | &sysfs_freelist_percent, | ||
873 | &sysfs_cache_replacement_policy, | 884 | &sysfs_cache_replacement_policy, |
874 | NULL | 885 | NULL |
875 | }; | 886 | }; |
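With bch_bset_print_stats() moved here, the per-node statistics are gathered through the generic bch_btree_map_nodes() interface: a private op embeds struct btree_op and the callback recovers it with container_of(). The same idiom applied to a made-up node counter (only the btree_op and map calls are real bcache interfaces; demo_* is illustrative):

    #include <linux/kernel.h>
    #include "btree.h"

    struct demo_count_op {
            struct btree_op op;     /* embedded so container_of() can recover us */
            size_t          nodes;
    };

    static int demo_count_fn(struct btree_op *b_op, struct btree *b)
    {
            struct demo_count_op *op = container_of(b_op, struct demo_count_op, op);

            op->nodes++;
            return MAP_CONTINUE;    /* visit the next node */
    }

    /*
     * Usage, following bch_bset_print_stats() above:
     *      bch_btree_op_init(&op.op, -1);
     *      bch_btree_map_nodes(&op.op, c, &ZERO_KEY, demo_count_fn);
     */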
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1030c6020e98..ac7d0d1f70d7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -2,6 +2,7 @@ | |||
2 | #ifndef _BCACHE_UTIL_H | 2 | #ifndef _BCACHE_UTIL_H |
3 | #define _BCACHE_UTIL_H | 3 | #define _BCACHE_UTIL_H |
4 | 4 | ||
5 | #include <linux/blkdev.h> | ||
5 | #include <linux/errno.h> | 6 | #include <linux/errno.h> |
6 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
7 | #include <linux/llist.h> | 8 | #include <linux/llist.h> |
@@ -17,11 +18,13 @@ struct closure; | |||
17 | 18 | ||
18 | #ifdef CONFIG_BCACHE_DEBUG | 19 | #ifdef CONFIG_BCACHE_DEBUG |
19 | 20 | ||
21 | #define EBUG_ON(cond) BUG_ON(cond) | ||
20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | 22 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) |
21 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) | 23 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) |
22 | 24 | ||
23 | #else /* DEBUG */ | 25 | #else /* DEBUG */ |
24 | 26 | ||
27 | #define EBUG_ON(cond) do { if (cond); } while (0) | ||
25 | #define atomic_dec_bug(v) atomic_dec(v) | 28 | #define atomic_dec_bug(v) atomic_dec(v) |
26 | #define atomic_inc_bug(v, i) atomic_inc(v) | 29 | #define atomic_inc_bug(v, i) atomic_inc(v) |
27 | 30 | ||
@@ -391,6 +394,11 @@ struct time_stats { | |||
391 | 394 | ||
392 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); | 395 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); |
393 | 396 | ||
397 | static inline unsigned local_clock_us(void) | ||
398 | { | ||
399 | return local_clock() >> 10; | ||
400 | } | ||
401 | |||
394 | #define NSEC_PER_ns 1L | 402 | #define NSEC_PER_ns 1L |
395 | #define NSEC_PER_us NSEC_PER_USEC | 403 | #define NSEC_PER_us NSEC_PER_USEC |
396 | #define NSEC_PER_ms NSEC_PER_MSEC | 404 | #define NSEC_PER_ms NSEC_PER_MSEC |
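util.h picks up two small helpers used elsewhere in this series: EBUG_ON(), which is a full BUG_ON() under CONFIG_BCACHE_DEBUG and otherwise drops the BUG while keeping the condition visible to the compiler, and local_clock_us(), which approximates microseconds by shifting the nanosecond clock right by 10 (dividing by 1024 rather than 1000). A usage sketch for EBUG_ON(); the function below is made up and assumes the bkey definitions from bcache.h:

    #include "bcache.h"

    /* Expensive invariant checks cost nothing in non-debug builds. */
    static void demo_check_key(struct bkey *k)
    {
            EBUG_ON(!KEY_SIZE(k));  /* fires only with CONFIG_BCACHE_DEBUG=y */

            /* ... normal processing ... */
    }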