author    Jens Axboe <axboe@kernel.dk>  2013-07-02 02:32:57 -0400
committer Jens Axboe <axboe@kernel.dk>  2013-07-02 02:32:57 -0400
commit    d0e3d0238d83b05d7846c7281524e0f814633dbd
tree      9b3750123faf4975bde744bb9da9219fb697e374
parent    5f0e5afa0de4522abb3ea7d1369039b94e740ec5
parent    8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7
Merge branch 'bcache-for-3.11' of git://evilpiepirate.org/~kent/linux-bcache into for-3.11/drivers
-rw-r--r--  Documentation/bcache.txt       |  47
-rw-r--r--  MAINTAINERS                    |   2
-rw-r--r--  drivers/md/bcache/alloc.c      |  44
-rw-r--r--  drivers/md/bcache/bcache.h     |  56
-rw-r--r--  drivers/md/bcache/bset.c       |  56
-rw-r--r--  drivers/md/bcache/bset.h       |   4
-rw-r--r--  drivers/md/bcache/btree.c      | 447
-rw-r--r--  drivers/md/bcache/btree.h      |  35
-rw-r--r--  drivers/md/bcache/debug.c      | 178
-rw-r--r--  drivers/md/bcache/debug.h      |  11
-rw-r--r--  drivers/md/bcache/io.c         |  68
-rw-r--r--  drivers/md/bcache/journal.c    |  18
-rw-r--r--  drivers/md/bcache/movinggc.c   |  24
-rw-r--r--  drivers/md/bcache/request.c    | 189
-rw-r--r--  drivers/md/bcache/request.h    |   2
-rw-r--r--  drivers/md/bcache/super.c      | 131
-rw-r--r--  drivers/md/bcache/sysfs.c      |  66
-rw-r--r--  drivers/md/bcache/trace.c      |  47
-rw-r--r--  drivers/md/bcache/util.c       |  17
-rw-r--r--  drivers/md/bcache/util.h       |   6
-rw-r--r--  drivers/md/bcache/writeback.c  | 133
-rw-r--r--  drivers/md/bcache/writeback.h  |  64
-rw-r--r--  include/trace/events/bcache.h  | 381
23 files changed, 1119 insertions(+), 907 deletions(-)
diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
index b3a7e7d384f6..32b6c3189d98 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't
46have to manually attach: 46have to manually attach:
47 make-bcache -B /dev/sda /dev/sdb -C /dev/sdc 47 make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
48 48
49To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: 49bcache-tools now ships udev rules, and bcache devices are known to the kernel
50immediately. Without udev, you can manually register devices like this:
50 51
51 echo /dev/sdb > /sys/fs/bcache/register 52 echo /dev/sdb > /sys/fs/bcache/register
52 echo /dev/sdc > /sys/fs/bcache/register 53 echo /dev/sdc > /sys/fs/bcache/register
53 54
54To register your bcache devices automatically, you could add something like 55Registering the backing device makes the bcache device show up in /dev; you can
55this to an init script: 56now format it and use it as normal. But the first time using a new bcache
57device, it'll be running in passthrough mode until you attach it to a cache.
58See the section on attaching.
56 59
57 echo /dev/sd* > /sys/fs/bcache/register_quiet 60The devices show up as:
58 61
59It'll look for bcache superblocks and ignore everything that doesn't have one. 62 /dev/bcache<N>
60 63
61Registering the backing device makes the bcache show up in /dev; you can now 64As well as (with udev):
62format it and use it as normal. But the first time using a new bcache device,
63it'll be running in passthrough mode until you attach it to a cache. See the
64section on attaching.
65 65
66The devices show up at /dev/bcacheN, and can be controlled via sysfs from 66 /dev/bcache/by-uuid/<uuid>
67/sys/block/bcacheN/bcache: 67 /dev/bcache/by-label/<label>
68
69To get started:
68 70
69 mkfs.ext4 /dev/bcache0 71 mkfs.ext4 /dev/bcache0
70 mount /dev/bcache0 /mnt 72 mount /dev/bcache0 /mnt
71 73
74You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache .
75
72Cache devices are managed as sets; multiple caches per set isn't supported yet 76Cache devices are managed as sets; multiple caches per set isn't supported yet
73but will allow for mirroring of metadata and dirty data in the future. Your new 77but will allow for mirroring of metadata and dirty data in the future. Your new
74cache set shows up as /sys/fs/bcache/<UUID> 78cache set shows up as /sys/fs/bcache/<UUID>
@@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing
80device to a cache set is done thusly, with the UUID of the cache set in 84device to a cache set is done thusly, with the UUID of the cache set in
81/sys/fs/bcache: 85/sys/fs/bcache:
82 86
83 echo <UUID> > /sys/block/bcache0/bcache/attach 87 echo <CSET-UUID> > /sys/block/bcache0/bcache/attach
84 88
85This only has to be done once. The next time you reboot, just reregister all 89This only has to be done once. The next time you reboot, just reregister all
86your bcache devices. If a backing device has data in a cache somewhere, the 90your bcache devices. If a backing device has data in a cache somewhere, the
87/dev/bcache# device won't be created until the cache shows up - particularly 91/dev/bcache<N> device won't be created until the cache shows up - particularly
88important if you have writeback caching turned on. 92important if you have writeback caching turned on.
89 93
90If you're booting up and your cache device is gone and never coming back, you 94If you're booting up and your cache device is gone and never coming back, you
@@ -181,7 +185,7 @@ want for getting the best possible numbers when benchmarking.
181 185
182 In practice this isn't an issue because as soon as a write comes along it'll 186 In practice this isn't an issue because as soon as a write comes along it'll
183 cause the btree node to be split, and you need almost no write traffic for 187 cause the btree node to be split, and you need almost no write traffic for
184 this to not show up enough to be noticable (especially since bcache's btree 188 this to not show up enough to be noticeable (especially since bcache's btree
185 nodes are huge and index large regions of the device). But when you're 189 nodes are huge and index large regions of the device). But when you're
186 benchmarking, if you're trying to warm the cache by reading a bunch of data 190 benchmarking, if you're trying to warm the cache by reading a bunch of data
187 and there's no other traffic - that can be a problem. 191 and there's no other traffic - that can be a problem.
@@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking.
191 195
192SYSFS - BACKING DEVICE: 196SYSFS - BACKING DEVICE:
193 197
198Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
199(if attached) /sys/fs/bcache/<cset-uuid>/bdev*
200
194attach 201attach
195 Echo the UUID of a cache set to this file to enable caching. 202 Echo the UUID of a cache set to this file to enable caching.
196 203
@@ -222,7 +229,7 @@ running
222 it's in passthrough mode or caching). 229 it's in passthrough mode or caching).
223 230
224sequential_cutoff 231sequential_cutoff
225 A sequential IO will bypass the cache once it passes this threshhold; the 232 A sequential IO will bypass the cache once it passes this threshold; the
226 most recent 128 IOs are tracked so sequential IO can be detected even when 233 most recent 128 IOs are tracked so sequential IO can be detected even when
227 it isn't all done at once. 234 it isn't all done at once.
228 235
@@ -296,10 +303,12 @@ cache_miss_collisions
296 since the synchronization for cache misses was rewritten) 303 since the synchronization for cache misses was rewritten)
297 304
298cache_readaheads 305cache_readaheads
299 Count of times readahead occured. 306 Count of times readahead occurred.
300 307
301SYSFS - CACHE SET: 308SYSFS - CACHE SET:
302 309
310Available at /sys/fs/bcache/<cset-uuid>
311
303average_key_size 312average_key_size
304 Average data per key in the btree. 313 Average data per key in the btree.
305 314
@@ -362,7 +371,7 @@ unregister
362SYSFS - CACHE SET INTERNAL: 371SYSFS - CACHE SET INTERNAL:
363 372
364This directory also exposes timings for a number of internal operations, with 373This directory also exposes timings for a number of internal operations, with
365separate files for average duration, average frequency, last occurence and max 374separate files for average duration, average frequency, last occurrence and max
366duration: garbage collection, btree read, btree node sorts and btree splits. 375duration: garbage collection, btree read, btree node sorts and btree splits.
367 376
368active_journal_entries 377active_journal_entries
@@ -390,6 +399,8 @@ trigger_gc
390 399
391SYSFS - CACHE DEVICE: 400SYSFS - CACHE DEVICE:
392 401
402Available at /sys/block/<cdev>/bcache
403
393block_size 404block_size
394 Minimum granularity of writes - should match hardware sector size. 405 Minimum granularity of writes - should match hardware sector size.
395 406
@@ -417,7 +428,7 @@ freelist_percent
417 space. 428 space.
418 429
419io_errors 430io_errors
420 Number of errors that have occured, decayed by io_error_halflife. 431 Number of errors that have occurred, decayed by io_error_halflife.
421 432
422metadata_written 433metadata_written
423 Sum of all non data writes (btree writes and all other metadata). 434 Sum of all non data writes (btree writes and all other metadata).
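Editorial aside on the documentation changes above: registering and attaching are nothing more than writes to sysfs files, so the echo commands in the new text can be done from any language. A minimal userspace sketch in C of the same sequence follows; it assumes the sysfs layout described above, and the device paths and cache-set UUID are placeholders.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret;

	if (fd < 0) {
		perror(path);
		return -1;
	}

	ret = write(fd, val, strlen(val));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	/* Register the backing and cache devices (equivalent to the echos above) */
	sysfs_write("/sys/fs/bcache/register", "/dev/sdb");
	sysfs_write("/sys/fs/bcache/register", "/dev/sdc");

	/* Attach the backing device to the cache set (placeholder UUID) */
	sysfs_write("/sys/block/bcache0/bcache/attach",
		    "f1234567-89ab-cdef-0123-456789abcdef");
	return 0;
}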
diff --git a/MAINTAINERS b/MAINTAINERS
index bc3832a9db42..a0a76fb7323f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1621,7 +1621,7 @@ S: Maintained
1621F: drivers/net/hamradio/baycom* 1621F: drivers/net/hamradio/baycom*
1622 1622
1623BCACHE (BLOCK LAYER CACHE) 1623BCACHE (BLOCK LAYER CACHE)
1624M: Kent Overstreet <koverstreet@google.com> 1624M: Kent Overstreet <kmo@daterainc.com>
1625L: linux-bcache@vger.kernel.org 1625L: linux-bcache@vger.kernel.org
1626W: http://bcache.evilpiepirate.org 1626W: http://bcache.evilpiepirate.org
1627S: Maintained: 1627S: Maintained:
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08b..b54b73b9b2b7 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,7 +63,9 @@
63#include "bcache.h" 63#include "bcache.h"
64#include "btree.h" 64#include "btree.h"
65 65
66#include <linux/kthread.h>
66#include <linux/random.h> 67#include <linux/random.h>
68#include <trace/events/bcache.h>
67 69
68#define MAX_IN_FLIGHT_DISCARDS 8U 70#define MAX_IN_FLIGHT_DISCARDS 8U
69 71
@@ -151,7 +153,7 @@ static void discard_finish(struct work_struct *w)
151 mutex_unlock(&ca->set->bucket_lock); 153 mutex_unlock(&ca->set->bucket_lock);
152 154
153 closure_wake_up(&ca->set->bucket_wait); 155 closure_wake_up(&ca->set->bucket_wait);
154 wake_up(&ca->set->alloc_wait); 156 wake_up_process(ca->alloc_thread);
155 157
156 closure_put(&ca->set->cl); 158 closure_put(&ca->set->cl);
157} 159}
@@ -350,38 +352,31 @@ static void invalidate_buckets(struct cache *ca)
350 break; 352 break;
351 } 353 }
352 354
353 pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", 355 trace_bcache_alloc_invalidate(ca);
354 fifo_used(&ca->free), ca->free.size,
355 fifo_used(&ca->free_inc), ca->free_inc.size,
356 fifo_used(&ca->unused), ca->unused.size);
357} 356}
358 357
359#define allocator_wait(ca, cond) \ 358#define allocator_wait(ca, cond) \
360do { \ 359do { \
361 DEFINE_WAIT(__wait); \
362 \
363 while (1) { \ 360 while (1) { \
364 prepare_to_wait(&ca->set->alloc_wait, \ 361 set_current_state(TASK_INTERRUPTIBLE); \
365 &__wait, TASK_INTERRUPTIBLE); \
366 if (cond) \ 362 if (cond) \
367 break; \ 363 break; \
368 \ 364 \
369 mutex_unlock(&(ca)->set->bucket_lock); \ 365 mutex_unlock(&(ca)->set->bucket_lock); \
370 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ 366 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
371 finish_wait(&ca->set->alloc_wait, &__wait); \ 367 closure_put(&ca->set->cl); \
372 closure_return(cl); \ 368 return 0; \
373 } \ 369 } \
374 \ 370 \
375 schedule(); \ 371 schedule(); \
376 mutex_lock(&(ca)->set->bucket_lock); \ 372 mutex_lock(&(ca)->set->bucket_lock); \
377 } \ 373 } \
378 \ 374 __set_current_state(TASK_RUNNING); \
379 finish_wait(&ca->set->alloc_wait, &__wait); \
380} while (0) 375} while (0)
381 376
382void bch_allocator_thread(struct closure *cl) 377static int bch_allocator_thread(void *arg)
383{ 378{
384 struct cache *ca = container_of(cl, struct cache, alloc); 379 struct cache *ca = arg;
385 380
386 mutex_lock(&ca->set->bucket_lock); 381 mutex_lock(&ca->set->bucket_lock);
387 382
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
442{ 437{
443 long r = -1; 438 long r = -1;
444again: 439again:
445 wake_up(&ca->set->alloc_wait); 440 wake_up_process(ca->alloc_thread);
446 441
447 if (fifo_used(&ca->free) > ca->watermark[watermark] && 442 if (fifo_used(&ca->free) > ca->watermark[watermark] &&
448 fifo_pop(&ca->free, r)) { 443 fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
476 return r; 471 return r;
477 } 472 }
478 473
479 pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", 474 trace_bcache_alloc_fail(ca);
480 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
481 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
482 475
483 if (cl) { 476 if (cl) {
484 closure_wait(&ca->set->bucket_wait, cl); 477 closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
552 545
553/* Init */ 546/* Init */
554 547
548int bch_cache_allocator_start(struct cache *ca)
549{
550 ca->alloc_thread = kthread_create(bch_allocator_thread,
551 ca, "bcache_allocator");
552 if (IS_ERR(ca->alloc_thread))
553 return PTR_ERR(ca->alloc_thread);
554
555 closure_get(&ca->set->cl);
556 wake_up_process(ca->alloc_thread);
557
558 return 0;
559}
560
555void bch_cache_allocator_exit(struct cache *ca) 561void bch_cache_allocator_exit(struct cache *ca)
556{ 562{
557 struct discard *d; 563 struct discard *d;
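Editorial aside on the alloc.c changes above: the closure/waitqueue-based allocator is replaced by a dedicated kernel thread; bch_cache_allocator_start() creates it with kthread_create(), allocator_wait() now sleeps via set_current_state()/schedule(), and callers wake it with wake_up_process(). A stripped-down sketch of that pattern, with placeholder names (my_alloc_*) rather than the bcache structures, looks roughly like this; the real code also holds bucket_lock around the condition check.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/types.h>

struct my_alloc {
	struct task_struct	*thread;
	bool			work_available;
};

static int my_alloc_thread(void *arg)
{
	struct my_alloc *a = arg;

	while (!kthread_should_stop()) {
		/* Equivalent of allocator_wait(): sleep until there is work */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!a->work_available) {
			schedule();
			continue;
		}
		__set_current_state(TASK_RUNNING);

		/* ... invalidate buckets, refill free lists, etc. ... */
		a->work_available = false;
	}
	return 0;
}

static int my_alloc_start(struct my_alloc *a)
{
	a->thread = kthread_create(my_alloc_thread, a, "my_allocator");
	if (IS_ERR(a->thread))
		return PTR_ERR(a->thread);

	/* kthread_create() leaves the thread stopped; kick it once */
	wake_up_process(a->thread);
	return 0;
}

/* Producers then set work_available and call wake_up_process(a->thread). */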
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4ab..342ba86c6e4f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -178,7 +178,6 @@
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179 179
180#include <linux/bio.h> 180#include <linux/bio.h>
181#include <linux/blktrace_api.h>
182#include <linux/kobject.h> 181#include <linux/kobject.h>
183#include <linux/list.h> 182#include <linux/list.h>
184#include <linux/mutex.h> 183#include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
388typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); 387typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
389 388
390struct keybuf { 389struct keybuf {
391 keybuf_pred_fn *key_predicate;
392
393 struct bkey last_scanned; 390 struct bkey last_scanned;
394 spinlock_t lock; 391 spinlock_t lock;
395 392
@@ -438,8 +435,10 @@ struct bcache_device {
438 /* If nonzero, we're detaching/unregistering from cache set */ 435 /* If nonzero, we're detaching/unregistering from cache set */
439 atomic_t detaching; 436 atomic_t detaching;
440 437
441 atomic_long_t sectors_dirty; 438 uint64_t nr_stripes;
442 unsigned long sectors_dirty_gc; 439 unsigned stripe_size_bits;
440 atomic_t *stripe_sectors_dirty;
441
443 unsigned long sectors_dirty_last; 442 unsigned long sectors_dirty_last;
444 long sectors_dirty_derivative; 443 long sectors_dirty_derivative;
445 444
@@ -531,6 +530,7 @@ struct cached_dev {
531 unsigned sequential_merge:1; 530 unsigned sequential_merge:1;
532 unsigned verify:1; 531 unsigned verify:1;
533 532
533 unsigned partial_stripes_expensive:1;
534 unsigned writeback_metadata:1; 534 unsigned writeback_metadata:1;
535 unsigned writeback_running:1; 535 unsigned writeback_running:1;
536 unsigned char writeback_percent; 536 unsigned char writeback_percent;
@@ -565,8 +565,7 @@ struct cache {
565 565
566 unsigned watermark[WATERMARK_MAX]; 566 unsigned watermark[WATERMARK_MAX];
567 567
568 struct closure alloc; 568 struct task_struct *alloc_thread;
569 struct workqueue_struct *alloc_workqueue;
570 569
571 struct closure prio; 570 struct closure prio;
572 struct prio_set *disk_buckets; 571 struct prio_set *disk_buckets;
@@ -703,9 +702,6 @@ struct cache_set {
703 /* For the btree cache */ 702 /* For the btree cache */
704 struct shrinker shrink; 703 struct shrinker shrink;
705 704
706 /* For the allocator itself */
707 wait_queue_head_t alloc_wait;
708
709 /* For the btree cache and anything allocation related */ 705 /* For the btree cache and anything allocation related */
710 struct mutex bucket_lock; 706 struct mutex bucket_lock;
711 707
@@ -823,10 +819,9 @@ struct cache_set {
823 819
824 /* 820 /*
825 * A btree node on disk could have too many bsets for an iterator to fit 821 * A btree node on disk could have too many bsets for an iterator to fit
826 * on the stack - this is a single element mempool for btree_read_work() 822 * on the stack - have to dynamically allocate them
827 */ 823 */
828 struct mutex fill_lock; 824 mempool_t *fill_iter;
829 struct btree_iter *fill_iter;
830 825
831 /* 826 /*
832 * btree_sort() is a merge sort and requires temporary space - single 827 * btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +829,7 @@ struct cache_set {
834 */ 829 */
835 struct mutex sort_lock; 830 struct mutex sort_lock;
836 struct bset *sort; 831 struct bset *sort;
832 unsigned sort_crit_factor;
837 833
838 /* List of buckets we're currently writing data to */ 834 /* List of buckets we're currently writing data to */
839 struct list_head data_buckets; 835 struct list_head data_buckets;
@@ -906,8 +902,6 @@ static inline unsigned local_clock_us(void)
906 return local_clock() >> 10; 902 return local_clock() >> 10;
907} 903}
908 904
909#define MAX_BSETS 4U
910
911#define BTREE_PRIO USHRT_MAX 905#define BTREE_PRIO USHRT_MAX
912#define INITIAL_PRIO 32768 906#define INITIAL_PRIO 32768
913 907
@@ -1112,23 +1106,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1112 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 1106 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1113} 1107}
1114 1108
1115/* Blktrace macros */
1116
1117#define blktrace_msg(c, fmt, ...) \
1118do { \
1119 struct request_queue *q = bdev_get_queue(c->bdev); \
1120 if (q) \
1121 blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
1122} while (0)
1123
1124#define blktrace_msg_all(s, fmt, ...) \
1125do { \
1126 struct cache *_c; \
1127 unsigned i; \
1128 for_each_cache(_c, (s), i) \
1129 blktrace_msg(_c, fmt, ##__VA_ARGS__); \
1130} while (0)
1131
1132static inline void cached_dev_put(struct cached_dev *dc) 1109static inline void cached_dev_put(struct cached_dev *dc)
1133{ 1110{
1134 if (atomic_dec_and_test(&dc->count)) 1111 if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1150,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
1173 static struct kobj_attribute ksysfs_##n = \ 1150 static struct kobj_attribute ksysfs_##n = \
1174 __ATTR(n, S_IWUSR|S_IRUSR, show, store) 1151 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1175 1152
1176/* Forward declarations */ 1153static inline void wake_up_allocators(struct cache_set *c)
1154{
1155 struct cache *ca;
1156 unsigned i;
1157
1158 for_each_cache(ca, c, i)
1159 wake_up_process(ca->alloc_thread);
1160}
1177 1161
1178void bch_writeback_queue(struct cached_dev *); 1162/* Forward declarations */
1179void bch_writeback_add(struct cached_dev *, unsigned);
1180 1163
1181void bch_count_io_errors(struct cache *, int, const char *); 1164void bch_count_io_errors(struct cache *, int, const char *);
1182void bch_bbio_count_io_errors(struct cache_set *, struct bio *, 1165void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1176,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
1193uint8_t bch_inc_gen(struct cache *, struct bucket *); 1176uint8_t bch_inc_gen(struct cache *, struct bucket *);
1194void bch_rescale_priorities(struct cache_set *, int); 1177void bch_rescale_priorities(struct cache_set *, int);
1195bool bch_bucket_add_unused(struct cache *, struct bucket *); 1178bool bch_bucket_add_unused(struct cache *, struct bucket *);
1196void bch_allocator_thread(struct closure *);
1197 1179
1198long bch_bucket_alloc(struct cache *, unsigned, struct closure *); 1180long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
1199void bch_bucket_free(struct cache_set *, struct bkey *); 1181void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1223,9 @@ void bch_cache_set_stop(struct cache_set *);
1241struct cache_set *bch_cache_set_alloc(struct cache_sb *); 1223struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1242void bch_btree_cache_free(struct cache_set *); 1224void bch_btree_cache_free(struct cache_set *);
1243int bch_btree_cache_alloc(struct cache_set *); 1225int bch_btree_cache_alloc(struct cache_set *);
1244void bch_cached_dev_writeback_init(struct cached_dev *);
1245void bch_moving_init_cache_set(struct cache_set *); 1226void bch_moving_init_cache_set(struct cache_set *);
1246 1227
1228int bch_cache_allocator_start(struct cache *ca);
1247void bch_cache_allocator_exit(struct cache *ca); 1229void bch_cache_allocator_exit(struct cache *ca);
1248int bch_cache_allocator_init(struct cache *ca); 1230int bch_cache_allocator_init(struct cache *ca);
1249 1231
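Editorial aside on the bcache.h changes above: the single per-device sectors_dirty counter is replaced by a per-stripe array (nr_stripes, stripe_size_bits, stripe_sectors_dirty), which the later btree.c hunks update through bcache_dev_sectors_dirty_add(). A rough standalone illustration of that kind of per-stripe accounting follows; the names and the add-only walk are assumptions for the example, not the bcache implementation.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct dev_dirty {
	uint64_t	nr_stripes;		/* number of stripes on the device */
	unsigned	stripe_size_bits;	/* log2 of the stripe size, in sectors */
	atomic_int	*stripe_sectors_dirty;	/* one dirty-sector count per stripe */
};

/* Mark [offset, offset + sectors) dirty, split across the stripes it touches */
static void dev_sectors_dirty_add(struct dev_dirty *d, uint64_t offset,
				  unsigned sectors)
{
	while (sectors) {
		uint64_t stripe = offset >> d->stripe_size_bits;
		uint64_t stripe_size = 1ULL << d->stripe_size_bits;
		uint64_t in_stripe = stripe_size - (offset & (stripe_size - 1));
		unsigned n = sectors < in_stripe ? sectors : (unsigned) in_stripe;

		atomic_fetch_add(&d->stripe_sectors_dirty[stripe], (int) n);

		offset += n;
		sectors -= n;
	}
}

int main(void)
{
	struct dev_dirty d = {
		.nr_stripes = 16,
		.stripe_size_bits = 3,	/* 8-sector stripes, just for the demo */
	};
	d.stripe_sectors_dirty = calloc(d.nr_stripes, sizeof(atomic_int));

	dev_sectors_dirty_add(&d, 5, 10);	/* spans stripes 0 and 1 */
	printf("stripe 0: %d, stripe 1: %d\n",
	       atomic_load(&d.stripe_sectors_dirty[0]),
	       atomic_load(&d.stripe_sectors_dirty[1]));
	free(d.stripe_sectors_dirty);
	return 0;
}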
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index cb4578a327b9..a0f190ac17a4 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) 78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
79{ 79{
80 unsigned i; 80 unsigned i;
81 char buf[80];
81 82
82 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) 83 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
83 goto bad; 84 goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
102 103
103 return false; 104 return false;
104bad: 105bad:
105 cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); 106 bch_bkey_to_text(buf, sizeof(buf), k);
107 cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
106 return true; 108 return true;
107} 109}
108 110
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
162#ifdef CONFIG_BCACHE_EDEBUG 164#ifdef CONFIG_BCACHE_EDEBUG
163bug: 165bug:
164 mutex_unlock(&b->c->bucket_lock); 166 mutex_unlock(&b->c->bucket_lock);
165 btree_bug(b, 167
168 {
169 char buf[80];
170
171 bch_bkey_to_text(buf, sizeof(buf), k);
172 btree_bug(b,
166"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 173"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
167 pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), 174 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
168 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 175 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
176 }
169 return true; 177 return true;
170#endif 178#endif
171} 179}
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
1084 new->sets->size = 0; 1092 new->sets->size = 0;
1085} 1093}
1086 1094
1095#define SORT_CRIT (4096 / sizeof(uint64_t))
1096
1087void bch_btree_sort_lazy(struct btree *b) 1097void bch_btree_sort_lazy(struct btree *b)
1088{ 1098{
1089 if (b->nsets) { 1099 unsigned crit = SORT_CRIT;
1090 unsigned i, j, keys = 0, total; 1100 int i;
1091 1101
1092 for (i = 0; i <= b->nsets; i++) 1102 /* Don't sort if nothing to do */
1093 keys += b->sets[i].data->keys; 1103 if (!b->nsets)
1094 1104 goto out;
1095 total = keys;
1096 1105
1097 for (j = 0; j < b->nsets; j++) { 1106 /* If not a leaf node, always sort */
1098 if (keys * 2 < total || 1107 if (b->level) {
1099 keys < 1000) { 1108 bch_btree_sort(b);
1100 bch_btree_sort_partial(b, j); 1109 return;
1101 return; 1110 }
1102 }
1103 1111
1104 keys -= b->sets[j].data->keys; 1112 for (i = b->nsets - 1; i >= 0; --i) {
1105 } 1113 crit *= b->c->sort_crit_factor;
1106 1114
1107 /* Must sort if b->nsets == 3 or we'll overflow */ 1115 if (b->sets[i].data->keys < crit) {
1108 if (b->nsets >= (MAX_BSETS - 1) - b->level) { 1116 bch_btree_sort_partial(b, i);
1109 bch_btree_sort(b);
1110 return; 1117 return;
1111 } 1118 }
1112 } 1119 }
1113 1120
1121 /* Sort if we'd overflow */
1122 if (b->nsets + 1 == MAX_BSETS) {
1123 bch_btree_sort(b);
1124 return;
1125 }
1126
1127out:
1114 bset_build_written_tree(b); 1128 bset_build_written_tree(b);
1115} 1129}
1116 1130
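Editorial aside on the bset.c changes above: the rewritten bch_btree_sort_lazy() decides which in-memory set to merge from by walking the sets newest to oldest and growing a threshold geometrically (SORT_CRIT multiplied by sort_crit_factor per level). The standalone C sketch below mirrors that decision in isolation; the helper name and the crit_factor value in the example are made up, not taken from the kernel code.

#include <stdio.h>

#define SORT_CRIT (4096 / sizeof(unsigned long long))

/* Return the set index to partially sort from, or -1 for no partial sort */
static int lazy_sort_start(const unsigned *set_keys, int nsets,
			   unsigned crit_factor)
{
	unsigned crit = SORT_CRIT;
	int i;

	for (i = nsets - 1; i >= 0; --i) {
		crit *= crit_factor;
		if (set_keys[i] < crit)
			return i;	/* merge sets i..nsets into one */
	}
	return -1;			/* every set is still big enough */
}

int main(void)
{
	unsigned keys[] = { 5000, 900, 200 };	/* keys per bset, oldest first */

	printf("partial sort from set %d\n",
	       lazy_sort_start(keys, 3, 3));	/* crit_factor of 3 is illustrative */
	return 0;
}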
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 57a9cff41546..ae115a253d73 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,6 +1,8 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h>
5
4/* 6/*
5 * BKEYS: 7 * BKEYS:
6 * 8 *
@@ -142,6 +144,8 @@
142 144
143/* Btree key comparison/iteration */ 145/* Btree key comparison/iteration */
144 146
147#define MAX_BSETS 4U
148
145struct btree_iter { 149struct btree_iter {
146 size_t size, used; 150 size_t size, used;
147 struct btree_iter_set { 151 struct btree_iter_set {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..15b58239c683 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "request.h" 26#include "request.h"
27#include "writeback.h"
27 28
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/bitops.h> 30#include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
134 return crc ^ 0xffffffffffffffffULL; 135 return crc ^ 0xffffffffffffffffULL;
135} 136}
136 137
137static void btree_bio_endio(struct bio *bio, int error) 138static void bch_btree_node_read_done(struct btree *b)
138{ 139{
139 struct closure *cl = bio->bi_private;
140 struct btree *b = container_of(cl, struct btree, io.cl);
141
142 if (error)
143 set_btree_node_io_error(b);
144
145 bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
146 ? "writing btree" : "reading btree");
147 closure_put(cl);
148}
149
150static void btree_bio_init(struct btree *b)
151{
152 BUG_ON(b->bio);
153 b->bio = bch_bbio_alloc(b->c);
154
155 b->bio->bi_end_io = btree_bio_endio;
156 b->bio->bi_private = &b->io.cl;
157}
158
159void bch_btree_read_done(struct closure *cl)
160{
161 struct btree *b = container_of(cl, struct btree, io.cl);
162 struct bset *i = b->sets[0].data;
163 struct btree_iter *iter = b->c->fill_iter;
164 const char *err = "bad btree header"; 140 const char *err = "bad btree header";
165 BUG_ON(b->nsets || b->written); 141 struct bset *i = b->sets[0].data;
166 142 struct btree_iter *iter;
167 bch_bbio_free(b->bio, b->c);
168 b->bio = NULL;
169 143
170 mutex_lock(&b->c->fill_lock); 144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
171 iter->used = 0; 146 iter->used = 0;
172 147
173 if (btree_node_io_error(b) || 148 if (!i->seq)
174 !i->seq)
175 goto err; 149 goto err;
176 150
177 for (; 151 for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
228 if (b->written < btree_blocks(b)) 202 if (b->written < btree_blocks(b))
229 bch_bset_init_next(b); 203 bch_bset_init_next(b);
230out: 204out:
231 205 mempool_free(iter, b->c->fill_iter);
232 mutex_unlock(&b->c->fill_lock); 206 return;
233
234 spin_lock(&b->c->btree_read_time_lock);
235 bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
236 spin_unlock(&b->c->btree_read_time_lock);
237
238 smp_wmb(); /* read_done is our write lock */
239 set_btree_node_read_done(b);
240
241 closure_return(cl);
242err: 207err:
243 set_btree_node_io_error(b); 208 set_btree_node_io_error(b);
244 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
247 goto out; 212 goto out;
248} 213}
249 214
250void bch_btree_read(struct btree *b) 215static void btree_node_read_endio(struct bio *bio, int error)
251{ 216{
252 BUG_ON(b->nsets || b->written); 217 struct closure *cl = bio->bi_private;
218 closure_put(cl);
219}
253 220
254 if (!closure_trylock(&b->io.cl, &b->c->cl)) 221void bch_btree_node_read(struct btree *b)
255 BUG(); 222{
223 uint64_t start_time = local_clock();
224 struct closure cl;
225 struct bio *bio;
226
227 trace_bcache_btree_read(b);
228
229 closure_init_stack(&cl);
256 230
257 b->io_start_time = local_clock(); 231 bio = bch_bbio_alloc(b->c);
232 bio->bi_rw = REQ_META|READ_SYNC;
233 bio->bi_size = KEY_SIZE(&b->key) << 9;
234 bio->bi_end_io = btree_node_read_endio;
235 bio->bi_private = &cl;
258 236
259 btree_bio_init(b); 237 bch_bio_map(bio, b->sets[0].data);
260 b->bio->bi_rw = REQ_META|READ_SYNC;
261 b->bio->bi_size = KEY_SIZE(&b->key) << 9;
262 238
263 bch_bio_map(b->bio, b->sets[0].data); 239 bch_submit_bbio(bio, b->c, &b->key, 0);
240 closure_sync(&cl);
264 241
265 pr_debug("%s", pbtree(b)); 242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
266 trace_bcache_btree_read(b->bio); 243 set_btree_node_io_error(b);
267 bch_submit_bbio(b->bio, b->c, &b->key, 0); 244
245 bch_bbio_free(bio, b->c);
246
247 if (btree_node_io_error(b))
248 goto err;
268 249
269 continue_at(&b->io.cl, bch_btree_read_done, system_wq); 250 bch_btree_node_read_done(b);
251
252 spin_lock(&b->c->btree_read_time_lock);
253 bch_time_stats_update(&b->c->btree_read_time, start_time);
254 spin_unlock(&b->c->btree_read_time_lock);
255
256 return;
257err:
258 bch_cache_set_error(b->c, "io error reading bucket %lu",
259 PTR_BUCKET_NR(b->c, &b->key, 0));
270} 260}
271 261
272static void btree_complete_write(struct btree *b, struct btree_write *w) 262static void btree_complete_write(struct btree *b, struct btree_write *w)
273{ 263{
274 if (w->prio_blocked && 264 if (w->prio_blocked &&
275 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
276 wake_up(&b->c->alloc_wait); 266 wake_up_allocators(b->c);
277 267
278 if (w->journal) { 268 if (w->journal) {
279 atomic_dec_bug(w->journal); 269 atomic_dec_bug(w->journal);
280 __closure_wake_up(&b->c->journal.wait); 270 __closure_wake_up(&b->c->journal.wait);
281 } 271 }
282 272
283 if (w->owner)
284 closure_put(w->owner);
285
286 w->prio_blocked = 0; 273 w->prio_blocked = 0;
287 w->journal = NULL; 274 w->journal = NULL;
288 w->owner = NULL;
289} 275}
290 276
291static void __btree_write_done(struct closure *cl) 277static void __btree_node_write_done(struct closure *cl)
292{ 278{
293 struct btree *b = container_of(cl, struct btree, io.cl); 279 struct btree *b = container_of(cl, struct btree, io.cl);
294 struct btree_write *w = btree_prev_write(b); 280 struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
304 closure_return(cl); 290 closure_return(cl);
305} 291}
306 292
307static void btree_write_done(struct closure *cl) 293static void btree_node_write_done(struct closure *cl)
308{ 294{
309 struct btree *b = container_of(cl, struct btree, io.cl); 295 struct btree *b = container_of(cl, struct btree, io.cl);
310 struct bio_vec *bv; 296 struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
313 __bio_for_each_segment(bv, b->bio, n, 0) 299 __bio_for_each_segment(bv, b->bio, n, 0)
314 __free_page(bv->bv_page); 300 __free_page(bv->bv_page);
315 301
316 __btree_write_done(cl); 302 __btree_node_write_done(cl);
303}
304
305static void btree_node_write_endio(struct bio *bio, int error)
306{
307 struct closure *cl = bio->bi_private;
308 struct btree *b = container_of(cl, struct btree, io.cl);
309
310 if (error)
311 set_btree_node_io_error(b);
312
313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
314 closure_put(cl);
317} 315}
318 316
319static void do_btree_write(struct btree *b) 317static void do_btree_node_write(struct btree *b)
320{ 318{
321 struct closure *cl = &b->io.cl; 319 struct closure *cl = &b->io.cl;
322 struct bset *i = b->sets[b->nsets].data; 320 struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
325 i->version = BCACHE_BSET_VERSION; 323 i->version = BCACHE_BSET_VERSION;
326 i->csum = btree_csum_set(b, i); 324 i->csum = btree_csum_set(b, i);
327 325
328 btree_bio_init(b); 326 BUG_ON(b->bio);
329 b->bio->bi_rw = REQ_META|WRITE_SYNC; 327 b->bio = bch_bbio_alloc(b->c);
330 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 328
329 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
331 bch_bio_map(b->bio, i); 333 bch_bio_map(b->bio, i);
332 334
335 /*
336 * If we're appending to a leaf node, we don't technically need FUA -
337 * this write just needs to be persisted before the next journal write,
338 * which will be marked FLUSH|FUA.
339 *
340 * Similarly if we're writing a new btree root - the pointer is going to
341 * be in the next journal entry.
342 *
343 * But if we're writing a new btree node (that isn't a root) or
344 * appending to a non leaf btree node, we need either FUA or a flush
345 * when we write the parent with the new pointer. FUA is cheaper than a
346 * flush, and writes appending to leaf nodes aren't blocking anything so
347 * just make all btree node writes FUA to keep things sane.
348 */
349
333 bkey_copy(&k.key, &b->key); 350 bkey_copy(&k.key, &b->key);
334 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
335 352
336 if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { 353 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
337 int j; 354 int j;
338 struct bio_vec *bv; 355 struct bio_vec *bv;
339 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
342 memcpy(page_address(bv->bv_page), 359 memcpy(page_address(bv->bv_page),
343 base + j * PAGE_SIZE, PAGE_SIZE); 360 base + j * PAGE_SIZE, PAGE_SIZE);
344 361
345 trace_bcache_btree_write(b->bio);
346 bch_submit_bbio(b->bio, b->c, &k.key, 0); 362 bch_submit_bbio(b->bio, b->c, &k.key, 0);
347 363
348 continue_at(cl, btree_write_done, NULL); 364 continue_at(cl, btree_node_write_done, NULL);
349 } else { 365 } else {
350 b->bio->bi_vcnt = 0; 366 b->bio->bi_vcnt = 0;
351 bch_bio_map(b->bio, i); 367 bch_bio_map(b->bio, i);
352 368
353 trace_bcache_btree_write(b->bio);
354 bch_submit_bbio(b->bio, b->c, &k.key, 0); 369 bch_submit_bbio(b->bio, b->c, &k.key, 0);
355 370
356 closure_sync(cl); 371 closure_sync(cl);
357 __btree_write_done(cl); 372 __btree_node_write_done(cl);
358 } 373 }
359} 374}
360 375
361static void __btree_write(struct btree *b) 376void bch_btree_node_write(struct btree *b, struct closure *parent)
362{ 377{
363 struct bset *i = b->sets[b->nsets].data; 378 struct bset *i = b->sets[b->nsets].data;
364 379
380 trace_bcache_btree_write(b);
381
365 BUG_ON(current->bio_list); 382 BUG_ON(current->bio_list);
383 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i);
366 387
367 closure_lock(&b->io, &b->c->cl);
368 cancel_delayed_work(&b->work); 388 cancel_delayed_work(&b->work);
369 389
390 /* If caller isn't waiting for write, parent refcount is cache set */
391 closure_lock(&b->io, parent ?: &b->c->cl);
392
370 clear_bit(BTREE_NODE_dirty, &b->flags); 393 clear_bit(BTREE_NODE_dirty, &b->flags);
371 change_bit(BTREE_NODE_write_idx, &b->flags); 394 change_bit(BTREE_NODE_write_idx, &b->flags);
372 395
373 bch_check_key_order(b, i); 396 do_btree_node_write(b);
374 BUG_ON(b->written && !i->keys);
375
376 do_btree_write(b);
377
378 pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
379 397
380 b->written += set_blocks(i, b->c); 398 b->written += set_blocks(i, b->c);
381 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
387 bch_bset_init_next(b); 405 bch_bset_init_next(b);
388} 406}
389 407
390static void btree_write_work(struct work_struct *w) 408static void btree_node_write_work(struct work_struct *w)
391{ 409{
392 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 410 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
393 411
394 down_write(&b->lock); 412 rw_lock(true, b, b->level);
395 413
396 if (btree_node_dirty(b)) 414 if (btree_node_dirty(b))
397 __btree_write(b); 415 bch_btree_node_write(b, NULL);
398 up_write(&b->lock); 416 rw_unlock(true, b);
399} 417}
400 418
401void bch_btree_write(struct btree *b, bool now, struct btree_op *op) 419static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
402{ 420{
403 struct bset *i = b->sets[b->nsets].data; 421 struct bset *i = b->sets[b->nsets].data;
404 struct btree_write *w = btree_current_write(b); 422 struct btree_write *w = btree_current_write(b);
405 423
406 BUG_ON(b->written && 424 BUG_ON(!b->written);
407 (b->written >= btree_blocks(b) || 425 BUG_ON(!i->keys);
408 i->seq != b->sets[0].data->seq ||
409 !i->keys));
410 426
411 if (!btree_node_dirty(b)) { 427 if (!btree_node_dirty(b))
412 set_btree_node_dirty(b); 428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
413 queue_delayed_work(btree_io_wq, &b->work,
414 msecs_to_jiffies(30000));
415 }
416 429
417 w->prio_blocked += b->prio_blocked; 430 set_btree_node_dirty(b);
418 b->prio_blocked = 0;
419 431
420 if (op && op->journal && !b->level) { 432 if (op && op->journal) {
421 if (w->journal && 433 if (w->journal &&
422 journal_pin_cmp(b->c, w, op)) { 434 journal_pin_cmp(b->c, w, op)) {
423 atomic_dec_bug(w->journal); 435 atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
430 } 442 }
431 } 443 }
432 444
433 if (current->bio_list)
434 return;
435
436 /* Force write if set is too big */ 445 /* Force write if set is too big */
437 if (now || 446 if (set_bytes(i) > PAGE_SIZE - 48 &&
438 b->level || 447 !current->bio_list)
439 set_bytes(i) > PAGE_SIZE - 48) { 448 bch_btree_node_write(b, NULL);
440 if (op && now) {
441 /* Must wait on multiple writes */
442 BUG_ON(w->owner);
443 w->owner = &op->cl;
444 closure_get(&op->cl);
445 }
446
447 __btree_write(b);
448 }
449 BUG_ON(!b->written);
450} 449}
451 450
452/* 451/*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
559 init_rwsem(&b->lock); 558 init_rwsem(&b->lock);
560 lockdep_set_novalidate_class(&b->lock); 559 lockdep_set_novalidate_class(&b->lock);
561 INIT_LIST_HEAD(&b->list); 560 INIT_LIST_HEAD(&b->list);
562 INIT_DELAYED_WORK(&b->work, btree_write_work); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
563 b->c = c; 562 b->c = c;
564 closure_init_unlocked(&b->io); 563 closure_init_unlocked(&b->io);
565 564
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
582 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
583 582
584 if (cl && btree_node_dirty(b)) 583 if (cl && btree_node_dirty(b))
585 bch_btree_write(b, true, NULL); 584 bch_btree_node_write(b, NULL);
586 585
587 if (cl) 586 if (cl)
588 closure_wait_event_async(&b->io.wait, cl, 587 closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
623 else if (!mutex_trylock(&c->bucket_lock)) 622 else if (!mutex_trylock(&c->bucket_lock))
624 return -1; 623 return -1;
625 624
625 /*
626 * It's _really_ critical that we don't free too many btree nodes - we
627 * have to always leave ourselves a reserve. The reserve is how we
628 * guarantee that allocating memory for a new btree node can always
629 * succeed, so that inserting keys into the btree can always succeed and
630 * IO can always make forward progress:
631 */
626 nr /= c->btree_pages; 632 nr /= c->btree_pages;
627 nr = min_t(unsigned long, nr, mca_can_free(c)); 633 nr = min_t(unsigned long, nr, mca_can_free(c));
628 634
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
766 int ret = -ENOMEM; 772 int ret = -ENOMEM;
767 struct btree *i; 773 struct btree *i;
768 774
775 trace_bcache_btree_cache_cannibalize(c);
776
769 if (!cl) 777 if (!cl)
770 return ERR_PTR(-ENOMEM); 778 return ERR_PTR(-ENOMEM);
771 779
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
784 return ERR_PTR(-EAGAIN); 792 return ERR_PTR(-EAGAIN);
785 } 793 }
786 794
787 /* XXX: tracepoint */
788 c->try_harder = cl; 795 c->try_harder = cl;
789 c->try_harder_start = local_clock(); 796 c->try_harder_start = local_clock();
790retry: 797retry:
@@ -905,6 +912,9 @@ retry:
905 b = mca_find(c, k); 912 b = mca_find(c, k);
906 913
907 if (!b) { 914 if (!b) {
915 if (current->bio_list)
916 return ERR_PTR(-EAGAIN);
917
908 mutex_lock(&c->bucket_lock); 918 mutex_lock(&c->bucket_lock);
909 b = mca_alloc(c, k, level, &op->cl); 919 b = mca_alloc(c, k, level, &op->cl);
910 mutex_unlock(&c->bucket_lock); 920 mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
914 if (IS_ERR(b)) 924 if (IS_ERR(b))
915 return b; 925 return b;
916 926
917 bch_btree_read(b); 927 bch_btree_node_read(b);
918 928
919 if (!write) 929 if (!write)
920 downgrade_write(&b->lock); 930 downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
937 for (; i <= b->nsets; i++) 947 for (; i <= b->nsets; i++)
938 prefetch(b->sets[i].data); 948 prefetch(b->sets[i].data);
939 949
940 if (!closure_wait_event(&b->io.wait, &op->cl, 950 if (btree_node_io_error(b)) {
941 btree_node_read_done(b))) {
942 rw_unlock(write, b); 951 rw_unlock(write, b);
943 b = ERR_PTR(-EAGAIN); 952 return ERR_PTR(-EIO);
944 } else if (btree_node_io_error(b)) { 953 }
945 rw_unlock(write, b); 954
946 b = ERR_PTR(-EIO); 955 BUG_ON(!b->written);
947 } else
948 BUG_ON(!b->written);
949 956
950 return b; 957 return b;
951} 958}
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
959 mutex_unlock(&c->bucket_lock); 966 mutex_unlock(&c->bucket_lock);
960 967
961 if (!IS_ERR_OR_NULL(b)) { 968 if (!IS_ERR_OR_NULL(b)) {
962 bch_btree_read(b); 969 bch_btree_node_read(b);
963 rw_unlock(true, b); 970 rw_unlock(true, b);
964 } 971 }
965} 972}
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
970{ 977{
971 unsigned i; 978 unsigned i;
972 979
980 trace_bcache_btree_node_free(b);
981
973 /* 982 /*
974 * The BUG_ON() in btree_node_get() implies that we must have a write 983 * The BUG_ON() in btree_node_get() implies that we must have a write
975 * lock on parent to free or even invalidate a node 984 * lock on parent to free or even invalidate a node
976 */ 985 */
977 BUG_ON(op->lock <= b->level); 986 BUG_ON(op->lock <= b->level);
978 BUG_ON(b == b->c->root); 987 BUG_ON(b == b->c->root);
979 pr_debug("bucket %s", pbtree(b));
980 988
981 if (btree_node_dirty(b)) 989 if (btree_node_dirty(b))
982 btree_complete_write(b, btree_current_write(b)); 990 btree_complete_write(b, btree_current_write(b));
983 clear_bit(BTREE_NODE_dirty, &b->flags); 991 clear_bit(BTREE_NODE_dirty, &b->flags);
984 992
985 if (b->prio_blocked &&
986 !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
987 wake_up(&b->c->alloc_wait);
988
989 b->prio_blocked = 0;
990
991 cancel_delayed_work(&b->work); 993 cancel_delayed_work(&b->work);
992 994
993 mutex_lock(&b->c->bucket_lock); 995 mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
1028 goto retry; 1030 goto retry;
1029 } 1031 }
1030 1032
1031 set_btree_node_read_done(b);
1032 b->accessed = 1; 1033 b->accessed = 1;
1033 bch_bset_init_next(b); 1034 bch_bset_init_next(b);
1034 1035
1035 mutex_unlock(&c->bucket_lock); 1036 mutex_unlock(&c->bucket_lock);
1037
1038 trace_bcache_btree_node_alloc(b);
1036 return b; 1039 return b;
1037err_free: 1040err_free:
1038 bch_bucket_free(c, &k.key); 1041 bch_bucket_free(c, &k.key);
1039 __bkey_put(c, &k.key); 1042 __bkey_put(c, &k.key);
1040err: 1043err:
1041 mutex_unlock(&c->bucket_lock); 1044 mutex_unlock(&c->bucket_lock);
1045
1046 trace_bcache_btree_node_alloc_fail(b);
1042 return b; 1047 return b;
1043} 1048}
1044 1049
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1137 gc->nkeys++; 1142 gc->nkeys++;
1138 1143
1139 gc->data += KEY_SIZE(k); 1144 gc->data += KEY_SIZE(k);
1140 if (KEY_DIRTY(k)) { 1145 if (KEY_DIRTY(k))
1141 gc->dirty += KEY_SIZE(k); 1146 gc->dirty += KEY_SIZE(k);
1142 if (d)
1143 d->sectors_dirty_gc += KEY_SIZE(k);
1144 }
1145 } 1147 }
1146 1148
1147 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1166 1168
1167 if (!IS_ERR_OR_NULL(n)) { 1169 if (!IS_ERR_OR_NULL(n)) {
1168 swap(b, n); 1170 swap(b, n);
1171 __bkey_put(b->c, &b->key);
1169 1172
1170 memcpy(k->ptr, b->key.ptr, 1173 memcpy(k->ptr, b->key.ptr,
1171 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1174 sizeof(uint64_t) * KEY_PTRS(&b->key));
1172 1175
1173 __bkey_put(b->c, &b->key);
1174 atomic_inc(&b->c->prio_blocked);
1175 b->prio_blocked++;
1176
1177 btree_node_free(n, op); 1176 btree_node_free(n, op);
1178 up_write(&n->lock); 1177 up_write(&n->lock);
1179 } 1178 }
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1278 btree_node_free(r->b, op); 1277 btree_node_free(r->b, op);
1279 up_write(&r->b->lock); 1278 up_write(&r->b->lock);
1280 1279
1281 pr_debug("coalesced %u nodes", nodes); 1280 trace_bcache_btree_gc_coalesce(nodes);
1282 1281
1283 gc->nodes--; 1282 gc->nodes--;
1284 nodes--; 1283 nodes--;
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1293 void write(struct btree *r) 1292 void write(struct btree *r)
1294 { 1293 {
1295 if (!r->written) 1294 if (!r->written)
1296 bch_btree_write(r, true, op); 1295 bch_btree_node_write(r, &op->cl);
1297 else if (btree_node_dirty(r)) { 1296 else if (btree_node_dirty(r))
1298 BUG_ON(btree_current_write(r)->owner); 1297 bch_btree_node_write(r, writes);
1299 btree_current_write(r)->owner = writes;
1300 closure_get(writes);
1301
1302 bch_btree_write(r, true, NULL);
1303 }
1304 1298
1305 up_write(&r->lock); 1299 up_write(&r->lock);
1306 } 1300 }
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1386 ret = btree_gc_recurse(b, op, writes, gc); 1380 ret = btree_gc_recurse(b, op, writes, gc);
1387 1381
1388 if (!b->written || btree_node_dirty(b)) { 1382 if (!b->written || btree_node_dirty(b)) {
1389 atomic_inc(&b->c->prio_blocked); 1383 bch_btree_node_write(b, n ? &op->cl : NULL);
1390 b->prio_blocked++;
1391 bch_btree_write(b, true, n ? op : NULL);
1392 } 1384 }
1393 1385
1394 if (!IS_ERR_OR_NULL(n)) { 1386 if (!IS_ERR_OR_NULL(n)) {
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
1405{ 1397{
1406 struct cache *ca; 1398 struct cache *ca;
1407 struct bucket *b; 1399 struct bucket *b;
1408 struct bcache_device **d;
1409 unsigned i; 1400 unsigned i;
1410 1401
1411 if (!c->gc_mark_valid) 1402 if (!c->gc_mark_valid)
@@ -1423,12 +1414,6 @@ static void btree_gc_start(struct cache_set *c)
1423 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1424 } 1415 }
1425 1416
1426 for (d = c->devices;
1427 d < c->devices + c->nr_uuids;
1428 d++)
1429 if (*d)
1430 (*d)->sectors_dirty_gc = 0;
1431
1432 mutex_unlock(&c->bucket_lock); 1417 mutex_unlock(&c->bucket_lock);
1433} 1418}
1434 1419
@@ -1437,7 +1422,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1437 size_t available = 0; 1422 size_t available = 0;
1438 struct bucket *b; 1423 struct bucket *b;
1439 struct cache *ca; 1424 struct cache *ca;
1440 struct bcache_device **d;
1441 unsigned i; 1425 unsigned i;
1442 1426
1443 mutex_lock(&c->bucket_lock); 1427 mutex_lock(&c->bucket_lock);
@@ -1480,22 +1464,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1480 } 1464 }
1481 } 1465 }
1482 1466
1483 for (d = c->devices;
1484 d < c->devices + c->nr_uuids;
1485 d++)
1486 if (*d) {
1487 unsigned long last =
1488 atomic_long_read(&((*d)->sectors_dirty));
1489 long difference = (*d)->sectors_dirty_gc - last;
1490
1491 pr_debug("sectors dirty off by %li", difference);
1492
1493 (*d)->sectors_dirty_last += difference;
1494
1495 atomic_long_set(&((*d)->sectors_dirty),
1496 (*d)->sectors_dirty_gc);
1497 }
1498
1499 mutex_unlock(&c->bucket_lock); 1467 mutex_unlock(&c->bucket_lock);
1500 return available; 1468 return available;
1501} 1469}
@@ -1508,10 +1476,9 @@ static void bch_btree_gc(struct closure *cl)
1508 struct gc_stat stats; 1476 struct gc_stat stats;
1509 struct closure writes; 1477 struct closure writes;
1510 struct btree_op op; 1478 struct btree_op op;
1511
1512 uint64_t start_time = local_clock(); 1479 uint64_t start_time = local_clock();
1513 trace_bcache_gc_start(c->sb.set_uuid); 1480
1514 blktrace_msg_all(c, "Starting gc"); 1481 trace_bcache_gc_start(c);
1515 1482
1516 memset(&stats, 0, sizeof(struct gc_stat)); 1483 memset(&stats, 0, sizeof(struct gc_stat));
1517 closure_init_stack(&writes); 1484 closure_init_stack(&writes);
@@ -1520,14 +1487,14 @@ static void bch_btree_gc(struct closure *cl)
1520 1487
1521 btree_gc_start(c); 1488 btree_gc_start(c);
1522 1489
1490 atomic_inc(&c->prio_blocked);
1491
1523 ret = btree_root(gc_root, c, &op, &writes, &stats); 1492 ret = btree_root(gc_root, c, &op, &writes, &stats);
1524 closure_sync(&op.cl); 1493 closure_sync(&op.cl);
1525 closure_sync(&writes); 1494 closure_sync(&writes);
1526 1495
1527 if (ret) { 1496 if (ret) {
1528 blktrace_msg_all(c, "Stopped gc");
1529 pr_warn("gc failed!"); 1497 pr_warn("gc failed!");
1530
1531 continue_at(cl, bch_btree_gc, bch_gc_wq); 1498 continue_at(cl, bch_btree_gc, bch_gc_wq);
1532 } 1499 }
1533 1500
@@ -1537,6 +1504,9 @@ static void bch_btree_gc(struct closure *cl)
1537 1504
1538 available = bch_btree_gc_finish(c); 1505 available = bch_btree_gc_finish(c);
1539 1506
1507 atomic_dec(&c->prio_blocked);
1508 wake_up_allocators(c);
1509
1540 bch_time_stats_update(&c->btree_gc_time, start_time); 1510 bch_time_stats_update(&c->btree_gc_time, start_time);
1541 1511
1542 stats.key_bytes *= sizeof(uint64_t); 1512 stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1514,8 @@ static void bch_btree_gc(struct closure *cl)
1544 stats.data <<= 9; 1514 stats.data <<= 9;
1545 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1546 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1547 blktrace_msg_all(c, "Finished gc");
1548 1517
1549 trace_bcache_gc_end(c->sb.set_uuid); 1518 trace_bcache_gc_end(c);
1550 wake_up(&c->alloc_wait);
1551 1519
1552 continue_at(cl, bch_moving_gc, bch_gc_wq); 1520 continue_at(cl, bch_moving_gc, bch_gc_wq);
1553} 1521}
@@ -1654,14 +1622,14 @@ static bool fix_overlapping_extents(struct btree *b,
1654 struct btree_iter *iter, 1622 struct btree_iter *iter,
1655 struct btree_op *op) 1623 struct btree_op *op)
1656{ 1624{
1657 void subtract_dirty(struct bkey *k, int sectors) 1625 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1658 { 1626 {
1659 struct bcache_device *d = b->c->devices[KEY_INODE(k)]; 1627 if (KEY_DIRTY(k))
1660 1628 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1661 if (KEY_DIRTY(k) && d) 1629 offset, -sectors);
1662 atomic_long_sub(sectors, &d->sectors_dirty);
1663 } 1630 }
1664 1631
1632 uint64_t old_offset;
1665 unsigned old_size, sectors_found = 0; 1633 unsigned old_size, sectors_found = 0;
1666 1634
1667 while (1) { 1635 while (1) {
@@ -1673,6 +1641,7 @@ static bool fix_overlapping_extents(struct btree *b,
1673 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1641 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1674 continue; 1642 continue;
1675 1643
1644 old_offset = KEY_START(k);
1676 old_size = KEY_SIZE(k); 1645 old_size = KEY_SIZE(k);
1677 1646
1678 /* 1647 /*
@@ -1728,7 +1697,7 @@ static bool fix_overlapping_extents(struct btree *b,
1728 1697
1729 struct bkey *top; 1698 struct bkey *top;
1730 1699
1731 subtract_dirty(k, KEY_SIZE(insert)); 1700 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1732 1701
1733 if (bkey_written(b, k)) { 1702 if (bkey_written(b, k)) {
1734 /* 1703 /*
@@ -1775,7 +1744,7 @@ static bool fix_overlapping_extents(struct btree *b,
1775 } 1744 }
1776 } 1745 }
1777 1746
1778 subtract_dirty(k, old_size - KEY_SIZE(k)); 1747 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1779 } 1748 }
1780 1749
1781check_failed: 1750check_failed:
@@ -1798,7 +1767,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1798{ 1767{
1799 struct bset *i = b->sets[b->nsets].data; 1768 struct bset *i = b->sets[b->nsets].data;
1800 struct bkey *m, *prev; 1769 struct bkey *m, *prev;
1801 const char *status = "insert"; 1770 unsigned status = BTREE_INSERT_STATUS_INSERT;
1802 1771
1803 BUG_ON(bkey_cmp(k, &b->key) > 0); 1772 BUG_ON(bkey_cmp(k, &b->key) > 0);
1804 BUG_ON(b->level && !KEY_PTRS(k)); 1773 BUG_ON(b->level && !KEY_PTRS(k));
@@ -1831,17 +1800,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1831 goto insert; 1800 goto insert;
1832 1801
1833 /* prev is in the tree, if we merge we're done */ 1802 /* prev is in the tree, if we merge we're done */
1834 status = "back merging"; 1803 status = BTREE_INSERT_STATUS_BACK_MERGE;
1835 if (prev && 1804 if (prev &&
1836 bch_bkey_try_merge(b, prev, k)) 1805 bch_bkey_try_merge(b, prev, k))
1837 goto merged; 1806 goto merged;
1838 1807
1839 status = "overwrote front"; 1808 status = BTREE_INSERT_STATUS_OVERWROTE;
1840 if (m != end(i) && 1809 if (m != end(i) &&
1841 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1810 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1842 goto copy; 1811 goto copy;
1843 1812
1844 status = "front merge"; 1813 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1845 if (m != end(i) && 1814 if (m != end(i) &&
1846 bch_bkey_try_merge(b, k, m)) 1815 bch_bkey_try_merge(b, k, m))
1847 goto copy; 1816 goto copy;
@@ -1851,21 +1820,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1851insert: shift_keys(b, m, k); 1820insert: shift_keys(b, m, k);
1852copy: bkey_copy(m, k); 1821copy: bkey_copy(m, k);
1853merged: 1822merged:
1854 bch_check_keys(b, "%s for %s at %s: %s", status, 1823 if (KEY_DIRTY(k))
1855 op_type(op), pbtree(b), pkey(k)); 1824 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1856 bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, 1825 KEY_START(k), KEY_SIZE(k));
1857 op_type(op), pbtree(b), pkey(k)); 1826
1827 bch_check_keys(b, "%u for %s", status, op_type(op));
1858 1828
1859 if (b->level && !KEY_OFFSET(k)) 1829 if (b->level && !KEY_OFFSET(k))
1860 b->prio_blocked++; 1830 btree_current_write(b)->prio_blocked++;
1861 1831
1862 pr_debug("%s for %s at %s: %s", status, 1832 trace_bcache_btree_insert_key(b, k, op->type, status);
1863 op_type(op), pbtree(b), pkey(k));
1864 1833
1865 return true; 1834 return true;
1866} 1835}
1867 1836
1868bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1837static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1869{ 1838{
1870 bool ret = false; 1839 bool ret = false;
1871 struct bkey *k; 1840 struct bkey *k;
@@ -1896,7 +1865,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1896 should_split(b)) 1865 should_split(b))
1897 goto out; 1866 goto out;
1898 1867
1899 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); 1868 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
1900 1869
1901 SET_KEY_PTRS(&op->replace, 1); 1870 SET_KEY_PTRS(&op->replace, 1);
1902 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1871 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@@ -1907,7 +1876,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1907 1876
1908 BUG_ON(op->type != BTREE_INSERT); 1877 BUG_ON(op->type != BTREE_INSERT);
1909 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1878 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1910 bch_btree_write(b, false, NULL);
1911 ret = true; 1879 ret = true;
1912out: 1880out:
1913 downgrade_write(&b->lock); 1881 downgrade_write(&b->lock);
@@ -1929,12 +1897,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
1929 1897
1930 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1898 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1931 1899
1932 pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
1933 pbtree(b), n1->sets[0].data->keys);
1934
1935 if (split) { 1900 if (split) {
1936 unsigned keys = 0; 1901 unsigned keys = 0;
1937 1902
1903 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1904
1938 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1905 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1939 if (IS_ERR(n2)) 1906 if (IS_ERR(n2))
1940 goto err_free1; 1907 goto err_free1;
@@ -1967,18 +1934,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
1967 bkey_copy_key(&n2->key, &b->key); 1934 bkey_copy_key(&n2->key, &b->key);
1968 1935
1969 bch_keylist_add(&op->keys, &n2->key); 1936 bch_keylist_add(&op->keys, &n2->key);
1970 bch_btree_write(n2, true, op); 1937 bch_btree_node_write(n2, &op->cl);
1971 rw_unlock(true, n2); 1938 rw_unlock(true, n2);
1972 } else 1939 } else {
1940 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1941
1973 bch_btree_insert_keys(n1, op); 1942 bch_btree_insert_keys(n1, op);
1943 }
1974 1944
1975 bch_keylist_add(&op->keys, &n1->key); 1945 bch_keylist_add(&op->keys, &n1->key);
1976 bch_btree_write(n1, true, op); 1946 bch_btree_node_write(n1, &op->cl);
1977 1947
1978 if (n3) { 1948 if (n3) {
1979 bkey_copy_key(&n3->key, &MAX_KEY); 1949 bkey_copy_key(&n3->key, &MAX_KEY);
1980 bch_btree_insert_keys(n3, op); 1950 bch_btree_insert_keys(n3, op);
1981 bch_btree_write(n3, true, op); 1951 bch_btree_node_write(n3, &op->cl);
1982 1952
1983 closure_sync(&op->cl); 1953 closure_sync(&op->cl);
1984 bch_btree_set_root(n3); 1954 bch_btree_set_root(n3);
@@ -2082,8 +2052,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2082 2052
2083 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2053 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2084 2054
2085 if (bch_btree_insert_keys(b, op)) 2055 if (bch_btree_insert_keys(b, op)) {
2086 bch_btree_write(b, false, op); 2056 if (!b->level)
2057 bch_btree_leaf_dirty(b, op);
2058 else
2059 bch_btree_node_write(b, &op->cl);
2060 }
2087 } 2061 }
2088 2062
2089 return 0; 2063 return 0;
@@ -2140,6 +2114,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2140void bch_btree_set_root(struct btree *b) 2114void bch_btree_set_root(struct btree *b)
2141{ 2115{
2142 unsigned i; 2116 unsigned i;
2117 struct closure cl;
2118
2119 closure_init_stack(&cl);
2120
2121 trace_bcache_btree_set_root(b);
2143 2122
2144 BUG_ON(!b->written); 2123 BUG_ON(!b->written);
2145 2124
@@ -2153,8 +2132,8 @@ void bch_btree_set_root(struct btree *b)
2153 b->c->root = b; 2132 b->c->root = b;
2154 __bkey_put(b->c, &b->key); 2133 __bkey_put(b->c, &b->key);
2155 2134
2156 bch_journal_meta(b->c, NULL); 2135 bch_journal_meta(b->c, &cl);
2157 pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); 2136 closure_sync(&cl);
2158} 2137}
2159 2138
2160/* Cache lookup */ 2139/* Cache lookup */
@@ -2215,9 +2194,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2215 KEY_OFFSET(k) - bio->bi_sector); 2194 KEY_OFFSET(k) - bio->bi_sector);
2216 2195
2217 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2196 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2218 if (!n)
2219 return -EAGAIN;
2220
2221 if (n == bio) 2197 if (n == bio)
2222 op->lookup_done = true; 2198 op->lookup_done = true;
2223 2199
@@ -2240,7 +2216,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2240 n->bi_end_io = bch_cache_read_endio; 2216 n->bi_end_io = bch_cache_read_endio;
2241 n->bi_private = &s->cl; 2217 n->bi_private = &s->cl;
2242 2218
2243 trace_bcache_cache_hit(n);
2244 __bch_submit_bbio(n, b->c); 2219 __bch_submit_bbio(n, b->c);
2245 } 2220 }
2246 2221
@@ -2257,9 +2232,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2257 struct btree_iter iter; 2232 struct btree_iter iter;
2258 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2233 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2259 2234
2260 pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
2261 (uint64_t) bio->bi_sector);
2262
2263 do { 2235 do {
2264 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2236 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2265 if (!k) { 2237 if (!k) {
@@ -2303,7 +2275,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2303} 2275}
2304 2276
2305static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2277static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2306 struct keybuf *buf, struct bkey *end) 2278 struct keybuf *buf, struct bkey *end,
2279 keybuf_pred_fn *pred)
2307{ 2280{
2308 struct btree_iter iter; 2281 struct btree_iter iter;
2309 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2282 bch_btree_iter_init(b, &iter, &buf->last_scanned);
@@ -2322,11 +2295,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2322 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2295 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2323 break; 2296 break;
2324 2297
2325 if (buf->key_predicate(buf, k)) { 2298 if (pred(buf, k)) {
2326 struct keybuf_key *w; 2299 struct keybuf_key *w;
2327 2300
2328 pr_debug("%s", pkey(k));
2329
2330 spin_lock(&buf->lock); 2301 spin_lock(&buf->lock);
2331 2302
2332 w = array_alloc(&buf->freelist); 2303 w = array_alloc(&buf->freelist);
@@ -2343,7 +2314,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2343 if (!k) 2314 if (!k)
2344 break; 2315 break;
2345 2316
2346 btree(refill_keybuf, k, b, op, buf, end); 2317 btree(refill_keybuf, k, b, op, buf, end, pred);
2347 /* 2318 /*
2348 * Might get an error here, but can't really do anything 2319 * Might get an error here, but can't really do anything
2349 * and it'll get logged elsewhere. Just read what we 2320 * and it'll get logged elsewhere. Just read what we
@@ -2361,7 +2332,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2361} 2332}
2362 2333
2363void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2334void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2364 struct bkey *end) 2335 struct bkey *end, keybuf_pred_fn *pred)
2365{ 2336{
2366 struct bkey start = buf->last_scanned; 2337 struct bkey start = buf->last_scanned;
2367 struct btree_op op; 2338 struct btree_op op;
@@ -2369,7 +2340,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2369 2340
2370 cond_resched(); 2341 cond_resched();
2371 2342
2372 btree_root(refill_keybuf, c, &op, buf, end); 2343 btree_root(refill_keybuf, c, &op, buf, end, pred);
2373 closure_sync(&op.cl); 2344 closure_sync(&op.cl);
2374 2345
2375 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2346 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@@ -2455,7 +2426,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2455 2426
2456struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2427struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2457 struct keybuf *buf, 2428 struct keybuf *buf,
2458 struct bkey *end) 2429 struct bkey *end,
2430 keybuf_pred_fn *pred)
2459{ 2431{
2460 struct keybuf_key *ret; 2432 struct keybuf_key *ret;
2461 2433
@@ -2469,15 +2441,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2469 break; 2441 break;
2470 } 2442 }
2471 2443
2472 bch_refill_keybuf(c, buf, end); 2444 bch_refill_keybuf(c, buf, end, pred);
2473 } 2445 }
2474 2446
2475 return ret; 2447 return ret;
2476} 2448}
2477 2449
2478void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) 2450void bch_keybuf_init(struct keybuf *buf)
2479{ 2451{
2480 buf->key_predicate = fn;
2481 buf->last_scanned = MAX_KEY; 2452 buf->last_scanned = MAX_KEY;
2482 buf->keys = RB_ROOT; 2453 buf->keys = RB_ROOT;
2483 2454
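
For illustration only (not from the patch): the keybuf no longer stores its filter at init time; bch_refill_keybuf() and bch_keybuf_next_rescan() take a keybuf_pred_fn from the caller, so the same keybuf can be scanned with different predicates (the diff shows moving_pred for moving GC and dump_pred for the debugfs dump). A minimal userspace sketch of the callback-as-argument shape; the types and names below are stand-ins, not bcache code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct key { uint64_t inode, offset; bool dirty; };

/* stand-in for keybuf_pred_fn: decide whether a key is interesting */
typedef bool (pred_fn)(const struct key *k);

static bool dirty_pred(const struct key *k)  { return k->dirty; }
static bool inode0_pred(const struct key *k) { return k->inode == 0; }

/* stand-in for the refill loop: collect the keys the predicate accepts */
static size_t refill(const struct key *keys, size_t n, pred_fn *pred,
                     const struct key **out, size_t out_max)
{
    size_t found = 0;

    for (size_t i = 0; i < n && found < out_max; i++)
        if (pred(&keys[i]))
            out[found++] = &keys[i];
    return found;
}

int main(void)
{
    struct key keys[] = { { 0, 16, true }, { 1, 32, false }, { 0, 64, false } };
    const struct key *hits[8];

    /* one buffer, a different predicate on each call */
    printf("dirty keys:   %zu\n", refill(keys, 3, dirty_pred, hits, 8));
    printf("inode 0 keys: %zu\n", refill(keys, 3, inode0_pred, hits, 8));
    return 0;
}
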
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af4a7092a28c..3333d3723633 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,7 +102,6 @@
102#include "debug.h" 102#include "debug.h"
103 103
104struct btree_write { 104struct btree_write {
105 struct closure *owner;
106 atomic_t *journal; 105 atomic_t *journal;
107 106
108 /* If btree_split() frees a btree node, it writes a new pointer to that 107 /* If btree_split() frees a btree node, it writes a new pointer to that
@@ -142,16 +141,12 @@ struct btree {
142 */ 141 */
143 struct bset_tree sets[MAX_BSETS]; 142 struct bset_tree sets[MAX_BSETS];
144 143
145 /* Used to refcount bio splits, also protects b->bio */ 144 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 145 struct closure_with_waitlist io;
147 146
148 /* Gets transferred to w->prio_blocked - see the comment there */
149 int prio_blocked;
150
151 struct list_head list; 147 struct list_head list;
152 struct delayed_work work; 148 struct delayed_work work;
153 149
154 uint64_t io_start_time;
155 struct btree_write writes[2]; 150 struct btree_write writes[2];
156 struct bio *bio; 151 struct bio *bio;
157}; 152};
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
164{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ 159{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
165 160
166enum btree_flags { 161enum btree_flags {
167 BTREE_NODE_read_done,
168 BTREE_NODE_io_error, 162 BTREE_NODE_io_error,
169 BTREE_NODE_dirty, 163 BTREE_NODE_dirty,
170 BTREE_NODE_write_idx, 164 BTREE_NODE_write_idx,
171}; 165};
172 166
173BTREE_FLAG(read_done);
174BTREE_FLAG(io_error); 167BTREE_FLAG(io_error);
175BTREE_FLAG(dirty); 168BTREE_FLAG(dirty);
176BTREE_FLAG(write_idx); 169BTREE_FLAG(write_idx);
@@ -278,6 +271,13 @@ struct btree_op {
278 BKEY_PADDED(replace); 271 BKEY_PADDED(replace);
279}; 272};
280 273
274enum {
275 BTREE_INSERT_STATUS_INSERT,
276 BTREE_INSERT_STATUS_BACK_MERGE,
277 BTREE_INSERT_STATUS_OVERWROTE,
278 BTREE_INSERT_STATUS_FRONT_MERGE,
279};
280
281void bch_btree_op_init_stack(struct btree_op *); 281void bch_btree_op_init_stack(struct btree_op *);
282 282
283static inline void rw_lock(bool w, struct btree *b, int level) 283static inline void rw_lock(bool w, struct btree *b, int level)
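
For illustration only (not from the patch): the insert status handed to trace_bcache_btree_insert_key() is now this enum instead of ad-hoc strings. If a human-readable form is wanted again (debug output, trace post-processing), the usual pattern is a name table indexed by the enum; a small userspace sketch, with the strings chosen here rather than taken from bcache:

#include <stdio.h>

enum {
    BTREE_INSERT_STATUS_INSERT,
    BTREE_INSERT_STATUS_BACK_MERGE,
    BTREE_INSERT_STATUS_OVERWROTE,
    BTREE_INSERT_STATUS_FRONT_MERGE,
};

/* the name table is an illustration only; the tracepoint records the int */
static const char *insert_status_name(unsigned s)
{
    static const char *names[] = {
        [BTREE_INSERT_STATUS_INSERT]      = "insert",
        [BTREE_INSERT_STATUS_BACK_MERGE]  = "back merge",
        [BTREE_INSERT_STATUS_OVERWROTE]   = "overwrote",
        [BTREE_INSERT_STATUS_FRONT_MERGE] = "front merge",
    };

    return s < sizeof(names) / sizeof(names[0]) ? names[s] : "unknown";
}

int main(void)
{
    printf("%s\n", insert_status_name(BTREE_INSERT_STATUS_BACK_MERGE));
    return 0;
}
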
@@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
293#ifdef CONFIG_BCACHE_EDEBUG 293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i; 294 unsigned i;
295 295
296 if (w && 296 if (w && b->key.ptr[0])
297 b->key.ptr[0] &&
298 btree_node_read_done(b))
299 for (i = 0; i <= b->nsets; i++) 297 for (i = 0; i <= b->nsets; i++)
300 bch_check_key_order(b, b->sets[i].data); 298 bch_check_key_order(b, b->sets[i].data);
301#endif 299#endif
@@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
370 > btree_blocks(b)); 368 > btree_blocks(b));
371} 369}
372 370
373void bch_btree_read_done(struct closure *); 371void bch_btree_node_read(struct btree *);
374void bch_btree_read(struct btree *); 372void bch_btree_node_write(struct btree *, struct closure *);
375void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
376 373
377void bch_cannibalize_unlock(struct cache_set *, struct closure *); 374void bch_cannibalize_unlock(struct cache_set *, struct closure *);
378void bch_btree_set_root(struct btree *); 375void bch_btree_set_root(struct btree *);
@@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
380struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, 377struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
381 int, struct btree_op *); 378 int, struct btree_op *);
382 379
383bool bch_btree_insert_keys(struct btree *, struct btree_op *);
384bool bch_btree_insert_check_key(struct btree *, struct btree_op *, 380bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
385 struct bio *); 381 struct bio *);
386int bch_btree_insert(struct btree_op *, struct cache_set *); 382int bch_btree_insert(struct btree_op *, struct cache_set *);
@@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
393int bch_btree_check(struct cache_set *, struct btree_op *); 389int bch_btree_check(struct cache_set *, struct btree_op *);
394uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 390uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
395 391
396void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); 392void bch_keybuf_init(struct keybuf *);
397void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); 393void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
394 keybuf_pred_fn *);
398bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 395bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
399 struct bkey *); 396 struct bkey *);
400void bch_keybuf_del(struct keybuf *, struct keybuf_key *); 397void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
401struct keybuf_key *bch_keybuf_next(struct keybuf *); 398struct keybuf_key *bch_keybuf_next(struct keybuf *);
402struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, 399struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
403 struct keybuf *, struct bkey *); 400 struct bkey *, keybuf_pred_fn *);
404 401
405#endif 402#endif
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..88e6411eab4f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
47 return ""; 47 return "";
48} 48}
49 49
50struct keyprint_hack bch_pkey(const struct bkey *k) 50int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
51{ 51{
52 unsigned i = 0; 52 unsigned i = 0;
53 struct keyprint_hack r; 53 char *out = buf, *end = buf + size;
54 char *out = r.s, *end = r.s + KEYHACK_SIZE;
55 54
56#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) 55#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
57 56
@@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
75 if (KEY_CSUM(k)) 74 if (KEY_CSUM(k))
76 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
77#undef p 76#undef p
78 return r; 77 return out - buf;
79} 78}
80 79
81struct keyprint_hack bch_pbtree(const struct btree *b) 80int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
82{ 81{
83 struct keyprint_hack r; 82 return scnprintf(buf, size, "%zu level %i/%i",
84 83 PTR_BUCKET_NR(b->c, &b->key, 0),
85 snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), 84 b->level, b->c->root ? b->c->root->level : -1);
86 b->level, b->c->root ? b->c->root->level : -1);
87 return r;
88} 85}
89 86
90#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) 87#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
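
For illustration only (not from the patch): the keyprint_hack struct returned by value is gone; bch_bkey_to_text() and bch_btree_to_text() write into a caller-supplied buffer and return the formatted length, scnprintf-style. A userspace sketch of the same bounded-append pattern; vsnprintf's return value is clamped here to mimic scnprintf, and the key layout is invented for the example:

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>

struct key { uint64_t inode, offset, size; int dirty; };

/* bounded append: the running length never exceeds the buffer size,
 * roughly what the kernel's scnprintf() guarantees */
static size_t append(char *buf, size_t size, size_t used, const char *fmt, ...)
{
    va_list args;
    int n;

    if (used >= size)
        return used;

    va_start(args, fmt);
    n = vsnprintf(buf + used, size - used, fmt, args);
    va_end(args);

    if (n < 0)
        return used;
    used += n;
    return used < size ? used : size - 1;
}

static size_t key_to_text(char *buf, size_t size, const struct key *k)
{
    size_t len = 0;

    len = append(buf, size, len, "%llu:%llu len %llu",
                 (unsigned long long) k->inode,
                 (unsigned long long) k->offset,
                 (unsigned long long) k->size);
    if (k->dirty)
        len = append(buf, size, len, " dirty");
    return len;
}

int main(void)
{
    char buf[80];
    struct key k = { 1, 4096, 8, 1 };
    size_t len = key_to_text(buf, sizeof(buf), &k);

    printf("%s (%zu bytes)\n", buf, len);
    return 0;
}
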
@@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
100{ 97{
101 struct bkey *k; 98 struct bkey *k;
102 unsigned j; 99 unsigned j;
100 char buf[80];
103 101
104 for (k = i->start; k < end(i); k = bkey_next(k)) { 102 for (k = i->start; k < end(i); k = bkey_next(k)) {
103 bch_bkey_to_text(buf, sizeof(buf), k);
105 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
106 (uint64_t *) k - i->d, i->keys, pkey(k)); 105 (uint64_t *) k - i->d, i->keys, buf);
107 106
108 for (j = 0; j < KEY_PTRS(k); j++) { 107 for (j = 0; j < KEY_PTRS(k); j++) {
109 size_t n = PTR_BUCKET_NR(b->c, k, j); 108 size_t n = PTR_BUCKET_NR(b->c, k, j);
@@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
144 v->written = 0; 143 v->written = 0;
145 v->level = b->level; 144 v->level = b->level;
146 145
147 bch_btree_read(v); 146 bch_btree_node_read(v);
148 closure_wait_event(&v->io.wait, &cl, 147 closure_wait_event(&v->io.wait, &cl,
149 atomic_read(&b->io.cl.remaining) == -1); 148 atomic_read(&b->io.cl.remaining) == -1);
150 149
@@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
200 if (!check) 199 if (!check)
201 return; 200 return;
202 201
203 if (bch_bio_alloc_pages(check, GFP_NOIO)) 202 if (bio_alloc_pages(check, GFP_NOIO))
204 goto out_put; 203 goto out_put;
205 204
206 check->bi_rw = READ_SYNC; 205 check->bi_rw = READ_SYNC;
@@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
252 va_list args) 251 va_list args)
253{ 252{
254 unsigned i; 253 unsigned i;
254 char buf[80];
255 255
256 console_lock(); 256 console_lock();
257 257
@@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
262 262
263 console_unlock(); 263 console_unlock();
264 264
265 panic("at %s\n", pbtree(b)); 265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf);
266} 267}
267 268
268void bch_check_key_order_msg(struct btree *b, struct bset *i, 269void bch_check_key_order_msg(struct btree *b, struct bset *i,
@@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
337{ 338{
338 struct dump_iterator *i = file->private_data; 339 struct dump_iterator *i = file->private_data;
339 ssize_t ret = 0; 340 ssize_t ret = 0;
341 char kbuf[80];
340 342
341 while (size) { 343 while (size) {
342 struct keybuf_key *w; 344 struct keybuf_key *w;
@@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
355 if (i->bytes) 357 if (i->bytes)
356 break; 358 break;
357 359
358 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); 360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
359 if (!w) 361 if (!w)
360 break; 362 break;
361 363
362 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); 364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
363 bch_keybuf_del(&i->keys, w); 366 bch_keybuf_del(&i->keys, w);
364 } 367 }
365 368
@@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
377 380
378 file->private_data = i; 381 file->private_data = i;
379 i->c = c; 382 i->c = c;
380 bch_keybuf_init(&i->keys, dump_pred); 383 bch_keybuf_init(&i->keys);
381 i->keys.last_scanned = KEY(0, 0, 0); 384 i->keys.last_scanned = KEY(0, 0, 0);
382 385
383 return 0; 386 return 0;
@@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)
409 412
410#endif 413#endif
411 414
412/* Fuzz tester has rotted: */
413#if 0
414
415static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
416 const char *buffer, size_t size)
417{
418 void dump(struct btree *b)
419 {
420 struct bset *i;
421
422 for (i = b->sets[0].data;
423 index(i, b) < btree_blocks(b) &&
424 i->seq == b->sets[0].data->seq;
425 i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
426 dump_bset(b, i);
427 }
428
429 struct cache_sb *sb;
430 struct cache_set *c;
431 struct btree *all[3], *b, *fill, *orig;
432 int j;
433
434 struct btree_op op;
435 bch_btree_op_init_stack(&op);
436
437 sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
438 if (!sb)
439 return -ENOMEM;
440
441 sb->bucket_size = 128;
442 sb->block_size = 4;
443
444 c = bch_cache_set_alloc(sb);
445 if (!c)
446 return -ENOMEM;
447
448 for (j = 0; j < 3; j++) {
449 BUG_ON(list_empty(&c->btree_cache));
450 all[j] = list_first_entry(&c->btree_cache, struct btree, list);
451 list_del_init(&all[j]->list);
452
453 all[j]->key = KEY(0, 0, c->sb.bucket_size);
454 bkey_copy_key(&all[j]->key, &MAX_KEY);
455 }
456
457 b = all[0];
458 fill = all[1];
459 orig = all[2];
460
461 while (1) {
462 for (j = 0; j < 3; j++)
463 all[j]->written = all[j]->nsets = 0;
464
465 bch_bset_init_next(b);
466
467 while (1) {
468 struct bset *i = write_block(b);
469 struct bkey *k = op.keys.top;
470 unsigned rand;
471
472 bkey_init(k);
473 rand = get_random_int();
474
475 op.type = rand & 1
476 ? BTREE_INSERT
477 : BTREE_REPLACE;
478 rand >>= 1;
479
480 SET_KEY_SIZE(k, bucket_remainder(c, rand));
481 rand >>= c->bucket_bits;
482 rand &= 1024 * 512 - 1;
483 rand += c->sb.bucket_size;
484 SET_KEY_OFFSET(k, rand);
485#if 0
486 SET_KEY_PTRS(k, 1);
487#endif
488 bch_keylist_push(&op.keys);
489 bch_btree_insert_keys(b, &op);
490
491 if (should_split(b) ||
492 set_blocks(i, b->c) !=
493 __set_blocks(i, i->keys + 15, b->c)) {
494 i->csum = csum_set(i);
495
496 memcpy(write_block(fill),
497 i, set_bytes(i));
498
499 b->written += set_blocks(i, b->c);
500 fill->written = b->written;
501 if (b->written == btree_blocks(b))
502 break;
503
504 bch_btree_sort_lazy(b);
505 bch_bset_init_next(b);
506 }
507 }
508
509 memcpy(orig->sets[0].data,
510 fill->sets[0].data,
511 btree_bytes(c));
512
513 bch_btree_sort(b);
514 fill->written = 0;
515 bch_btree_read_done(&fill->io.cl);
516
517 if (b->sets[0].data->keys != fill->sets[0].data->keys ||
518 memcmp(b->sets[0].data->start,
519 fill->sets[0].data->start,
520 b->sets[0].data->keys * sizeof(uint64_t))) {
521 struct bset *i = b->sets[0].data;
522 struct bkey *k, *l;
523
524 for (k = i->start,
525 l = fill->sets[0].data->start;
526 k < end(i);
527 k = bkey_next(k), l = bkey_next(l))
528 if (bkey_cmp(k, l) ||
529 KEY_SIZE(k) != KEY_SIZE(l))
530 pr_err("key %zi differs: %s != %s",
531 (uint64_t *) k - i->d,
532 pkey(k), pkey(l));
533
534 for (j = 0; j < 3; j++) {
535 pr_err("**** Set %i ****", j);
536 dump(all[j]);
537 }
538 panic("\n");
539 }
540
541 pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
542 }
543}
544
545kobj_attribute_write(fuzz, btree_fuzz);
546#endif
547
548void bch_debug_exit(void) 415void bch_debug_exit(void)
549{ 416{
550 if (!IS_ERR_OR_NULL(debug)) 417 if (!IS_ERR_OR_NULL(debug))
@@ -554,11 +421,6 @@ void bch_debug_exit(void)
554int __init bch_debug_init(struct kobject *kobj) 421int __init bch_debug_init(struct kobject *kobj)
555{ 422{
556 int ret = 0; 423 int ret = 0;
557#if 0
558 ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
559 if (ret)
560 return ret;
561#endif
562 424
563 debug = debugfs_create_dir("bcache", NULL); 425 debug = debugfs_create_dir("bcache", NULL);
564 return ret; 426 return ret;
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index f9378a218148..1c39b5a2489b 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -3,15 +3,8 @@
3 3
4/* Btree/bkey debug printing */ 4/* Btree/bkey debug printing */
5 5
6#define KEYHACK_SIZE 80 6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
7struct keyprint_hack { 7int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
8 char s[KEYHACK_SIZE];
9};
10
11struct keyprint_hack bch_pkey(const struct bkey *k);
12struct keyprint_hack bch_pbtree(const struct btree *b);
13#define pkey(k) (&bch_pkey(k).s[0])
14#define pbtree(b) (&bch_pbtree(b).s[0])
15 8
16#ifdef CONFIG_BCACHE_EDEBUG 9#ifdef CONFIG_BCACHE_EDEBUG
17 10
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 48efd4dea645..9056632995b1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -9,6 +9,8 @@
9#include "bset.h" 9#include "bset.h"
10#include "debug.h" 10#include "debug.h"
11 11
12#include <linux/blkdev.h>
13
12static void bch_bi_idx_hack_endio(struct bio *bio, int error) 14static void bch_bi_idx_hack_endio(struct bio *bio, int error)
13{ 15{
14 struct bio *p = bio->bi_private; 16 struct bio *p = bio->bi_private;
@@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
66 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a 68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
67 * bvec boundry; it is the caller's responsibility to ensure that @bio is not 69 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
68 * freed before the split. 70 * freed before the split.
69 *
70 * If bch_bio_split() is running under generic_make_request(), it's not safe to
71 * allocate more than one bio from the same bio set. Therefore, if it is running
72 * under generic_make_request() it masks out __GFP_WAIT when doing the
73 * allocation. The caller must check for failure if there's any possibility of
74 * it being called from under generic_make_request(); it is then the caller's
75 * responsibility to retry from a safe context (by e.g. punting to workqueue).
76 */ 71 */
77struct bio *bch_bio_split(struct bio *bio, int sectors, 72struct bio *bch_bio_split(struct bio *bio, int sectors,
78 gfp_t gfp, struct bio_set *bs) 73 gfp_t gfp, struct bio_set *bs)
@@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
83 78
84 BUG_ON(sectors <= 0); 79 BUG_ON(sectors <= 0);
85 80
86 /*
87 * If we're being called from underneath generic_make_request() and we
88 * already allocated any bios from this bio set, we risk deadlock if we
89 * use the mempool. So instead, we possibly fail and let the caller punt
90 * to workqueue or somesuch and retry in a safe context.
91 */
92 if (current->bio_list)
93 gfp &= ~__GFP_WAIT;
94
95 if (sectors >= bio_sectors(bio)) 81 if (sectors >= bio_sectors(bio))
96 return bio; 82 return bio;
97 83
98 if (bio->bi_rw & REQ_DISCARD) { 84 if (bio->bi_rw & REQ_DISCARD) {
99 ret = bio_alloc_bioset(gfp, 1, bs); 85 ret = bio_alloc_bioset(gfp, 1, bs);
86 if (!ret)
87 return NULL;
100 idx = 0; 88 idx = 0;
101 goto out; 89 goto out;
102 } 90 }
@@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
160 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 148 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
161 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
162 queue_max_segments(q)); 150 queue_max_segments(q));
163 struct bio_vec *bv, *end = bio_iovec(bio) +
164 min_t(int, bio_segments(bio), max_segments);
165 151
166 if (bio->bi_rw & REQ_DISCARD) 152 if (bio->bi_rw & REQ_DISCARD)
167 return min(ret, q->limits.max_discard_sectors); 153 return min(ret, q->limits.max_discard_sectors);
168 154
169 if (bio_segments(bio) > max_segments || 155 if (bio_segments(bio) > max_segments ||
170 q->merge_bvec_fn) { 156 q->merge_bvec_fn) {
157 struct bio_vec *bv;
158 int i, seg = 0;
159
171 ret = 0; 160 ret = 0;
172 161
173 for (bv = bio_iovec(bio); bv < end; bv++) { 162 bio_for_each_segment(bv, bio, i) {
174 struct bvec_merge_data bvm = { 163 struct bvec_merge_data bvm = {
175 .bi_bdev = bio->bi_bdev, 164 .bi_bdev = bio->bi_bdev,
176 .bi_sector = bio->bi_sector, 165 .bi_sector = bio->bi_sector,
@@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
178 .bi_rw = bio->bi_rw, 167 .bi_rw = bio->bi_rw,
179 }; 168 };
180 169
170 if (seg == max_segments)
171 break;
172
181 if (q->merge_bvec_fn && 173 if (q->merge_bvec_fn &&
182 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
183 break; 175 break;
184 176
177 seg++;
185 ret += bv->bv_len >> 9; 178 ret += bv->bv_len >> 9;
186 } 179 }
187 } 180 }
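
For illustration only (not from the patch): rather than pre-computing an end pointer from bio_segments(), the loop now visits segments with bio_for_each_segment() and stops once max_segments have been accepted or the queue's merge_bvec_fn refuses one. The shape of the sector count, sketched over a plain array in userspace:

#include <stdio.h>

/* stand-in for a bio segment: just a byte length */
struct seg { unsigned len; };

static unsigned max_sectors(const struct seg *segs, unsigned nsegs,
                            unsigned max_segments)
{
    unsigned ret = 0, seg = 0;

    for (unsigned i = 0; i < nsegs; i++) {
        if (seg == max_segments)
            break;              /* the device can't take more segments */

        /* a real queue may also veto the segment via merge_bvec_fn */
        seg++;
        ret += segs[i].len >> 9;
    }

    return ret;
}

int main(void)
{
    struct seg segs[] = { { 4096 }, { 4096 }, { 8192 }, { 4096 } };

    /* only the first two segments count: 8192 bytes = 16 sectors */
    printf("%u sectors\n", max_sectors(segs, 4, 2));
    return 0;
}
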
@@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
218 closure_put(cl); 211 closure_put(cl);
219} 212}
220 213
221static void __bch_bio_submit_split(struct closure *cl)
222{
223 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
224 struct bio *bio = s->bio, *n;
225
226 do {
227 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
228 GFP_NOIO, s->p->bio_split);
229 if (!n)
230 continue_at(cl, __bch_bio_submit_split, system_wq);
231
232 n->bi_end_io = bch_bio_submit_split_endio;
233 n->bi_private = cl;
234
235 closure_get(cl);
236 bch_generic_make_request_hack(n);
237 } while (n != bio);
238
239 continue_at(cl, bch_bio_submit_split_done, NULL);
240}
241
242void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) 214void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
243{ 215{
244 struct bio_split_hook *s; 216 struct bio_split_hook *s;
217 struct bio *n;
245 218
246 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) 219 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
247 goto submit; 220 goto submit;
@@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
250 goto submit; 223 goto submit;
251 224
252 s = mempool_alloc(p->bio_split_hook, GFP_NOIO); 225 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
226 closure_init(&s->cl, NULL);
253 227
254 s->bio = bio; 228 s->bio = bio;
255 s->p = p; 229 s->p = p;
@@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
257 s->bi_private = bio->bi_private; 231 s->bi_private = bio->bi_private;
258 bio_get(bio); 232 bio_get(bio);
259 233
260 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); 234 do {
261 return; 235 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split);
237
238 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl;
240
241 closure_get(&s->cl);
242 bch_generic_make_request_hack(n);
243 } while (n != bio);
244
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
262submit: 246submit:
263 bch_generic_make_request_hack(bio); 247 bch_generic_make_request_hack(bio);
264} 248}
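
For illustration only (not from the patch): with bch_bio_split() no longer able to fail here, the separate __bch_bio_submit_split() closure function is folded into bch_generic_make_request() as a plain loop that peels off max-sized chunks until the final piece is the original bio. The control flow, modelled on sector counts only:

#include <stdio.h>

/* peel chunks of at most max_sectors off a request of total_sectors,
 * mirroring the do { n = split(); submit(n); } while (n != bio) loop */
static void submit_split(unsigned total_sectors, unsigned max_sectors)
{
    unsigned remaining = total_sectors;

    do {
        unsigned n = remaining < max_sectors ? remaining : max_sectors;

        printf("submit %u sectors\n", n);
        remaining -= n;
    } while (remaining);
}

int main(void)
{
    submit_split(100, 32);      /* 32 + 32 + 32 + 4 */
    return 0;
}
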
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..4b250667bb7f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -9,6 +9,8 @@
9#include "debug.h" 9#include "debug.h"
10#include "request.h" 10#include "request.h"
11 11
12#include <trace/events/bcache.h>
13
12/* 14/*
13 * Journal replay/recovery: 15 * Journal replay/recovery:
14 * 16 *
@@ -300,7 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
300 for (k = i->j.start; 302 for (k = i->j.start;
301 k < end(&i->j); 303 k < end(&i->j);
302 k = bkey_next(k)) { 304 k = bkey_next(k)) {
303 pr_debug("%s", pkey(k)); 305 trace_bcache_journal_replay_key(k);
306
304 bkey_copy(op->keys.top, k); 307 bkey_copy(op->keys.top, k);
305 bch_keylist_push(&op->keys); 308 bch_keylist_push(&op->keys);
306 309
@@ -384,7 +387,7 @@ out:
384 return; 387 return;
385found: 388found:
386 if (btree_node_dirty(best)) 389 if (btree_node_dirty(best))
387 bch_btree_write(best, true, NULL); 390 bch_btree_node_write(best, NULL);
388 rw_unlock(true, best); 391 rw_unlock(true, best);
389} 392}
390 393
@@ -617,7 +620,7 @@ static void journal_write_unlocked(struct closure *cl)
617 bio_reset(bio); 620 bio_reset(bio);
618 bio->bi_sector = PTR_OFFSET(k, i); 621 bio->bi_sector = PTR_OFFSET(k, i);
619 bio->bi_bdev = ca->bdev; 622 bio->bi_bdev = ca->bdev;
620 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; 623 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
621 bio->bi_size = sectors << 9; 624 bio->bi_size = sectors << 9;
622 625
623 bio->bi_end_io = journal_write_endio; 626 bio->bi_end_io = journal_write_endio;
@@ -712,7 +715,8 @@ void bch_journal(struct closure *cl)
712 spin_lock(&c->journal.lock); 715 spin_lock(&c->journal.lock);
713 716
714 if (journal_full(&c->journal)) { 717 if (journal_full(&c->journal)) {
715 /* XXX: tracepoint */ 718 trace_bcache_journal_full(c);
719
716 closure_wait(&c->journal.wait, cl); 720 closure_wait(&c->journal.wait, cl);
717 721
718 journal_reclaim(c); 722 journal_reclaim(c);
@@ -728,13 +732,15 @@ void bch_journal(struct closure *cl)
728 732
729 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || 733 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
730 b > c->journal.blocks_free) { 734 b > c->journal.blocks_free) {
731 /* XXX: If we were inserting so many keys that they won't fit in 735 trace_bcache_journal_entry_full(c);
736
737 /*
738 * XXX: If we were inserting so many keys that they won't fit in
732 * an _empty_ journal write, we'll deadlock. For now, handle 739 * an _empty_ journal write, we'll deadlock. For now, handle
733 * this in bch_keylist_realloc() - but something to think about. 740 * this in bch_keylist_realloc() - but something to think about.
734 */ 741 */
735 BUG_ON(!w->data->keys); 742 BUG_ON(!w->data->keys);
736 743
737 /* XXX: tracepoint */
738 BUG_ON(!closure_wait(&w->wait, cl)); 744 BUG_ON(!closure_wait(&w->wait, cl));
739 745
740 closure_flush(&c->journal.io); 746 closure_flush(&c->journal.io);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 8589512c972e..1a3b4f4786c3 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -9,6 +9,8 @@
9#include "debug.h" 9#include "debug.h"
10#include "request.h" 10#include "request.h"
11 11
12#include <trace/events/bcache.h>
13
12struct moving_io { 14struct moving_io {
13 struct keybuf_key *w; 15 struct keybuf_key *w;
14 struct search s; 16 struct search s;
@@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
44{ 46{
45 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 47 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
46 struct bio *bio = &io->bio.bio; 48 struct bio *bio = &io->bio.bio;
47 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); 49 struct bio_vec *bv;
50 int i;
48 51
49 while (bv-- != bio->bi_io_vec) 52 bio_for_each_segment_all(bv, bio, i)
50 __free_page(bv->bv_page); 53 __free_page(bv->bv_page);
51 54
52 pr_debug("%s %s", io->s.op.insert_collision 55 if (io->s.op.insert_collision)
53 ? "collision moving" : "moved", 56 trace_bcache_gc_copy_collision(&io->w->key);
54 pkey(&io->w->key));
55 57
56 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
57 59
@@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
94 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct moving_io *io = container_of(s, struct moving_io, s);
95 97
96 if (!s->error) { 98 if (!s->error) {
97 trace_bcache_write_moving(&io->bio.bio);
98
99 moving_init(io); 99 moving_init(io);
100 100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
@@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
122 struct moving_io *io = container_of(s, struct moving_io, s); 122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio; 123 struct bio *bio = &io->bio.bio;
124 124
125 trace_bcache_read_moving(bio);
126 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
127 126
128 continue_at(cl, write_moving, bch_gc_wq); 127 continue_at(cl, write_moving, bch_gc_wq);
@@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
138 /* XXX: if we error, background writeback could stall indefinitely */ 137 /* XXX: if we error, background writeback could stall indefinitely */
139 138
140 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { 139 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
141 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); 140 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
141 &MAX_KEY, moving_pred);
142 if (!w) 142 if (!w)
143 break; 143 break;
144 144
@@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
159 bio->bi_rw = READ; 159 bio->bi_rw = READ;
160 bio->bi_end_io = read_moving_endio; 160 bio->bi_end_io = read_moving_endio;
161 161
162 if (bch_bio_alloc_pages(bio, GFP_KERNEL)) 162 if (bio_alloc_pages(bio, GFP_KERNEL))
163 goto err; 163 goto err;
164 164
165 pr_debug("%s", pkey(&w->key)); 165 trace_bcache_gc_copy(&w->key);
166 166
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
168 168
@@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)
250 250
251void bch_moving_init_cache_set(struct cache_set *c) 251void bch_moving_init_cache_set(struct cache_set *c)
252{ 252{
253 bch_keybuf_init(&c->moving_gc_keys, moving_pred); 253 bch_keybuf_init(&c->moving_gc_keys);
254} 254}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e5ff12e52d5b..b6e74d3c8faf 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -10,6 +10,7 @@
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "request.h" 12#include "request.h"
13#include "writeback.h"
13 14
14#include <linux/cgroup.h> 15#include <linux/cgroup.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -21,8 +22,6 @@
21 22
22#define CUTOFF_CACHE_ADD 95 23#define CUTOFF_CACHE_ADD 95
23#define CUTOFF_CACHE_READA 90 24#define CUTOFF_CACHE_READA 90
24#define CUTOFF_WRITEBACK 50
25#define CUTOFF_WRITEBACK_SYNC 75
26 25
27struct kmem_cache *bch_search_cache; 26struct kmem_cache *bch_search_cache;
28 27
@@ -510,10 +509,6 @@ static void bch_insert_data_loop(struct closure *cl)
510 goto err; 509 goto err;
511 510
512 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 511 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
513 if (!n) {
514 __bkey_put(op->c, k);
515 continue_at(cl, bch_insert_data_loop, bcache_wq);
516 }
517 512
518 n->bi_end_io = bch_insert_data_endio; 513 n->bi_end_io = bch_insert_data_endio;
519 n->bi_private = cl; 514 n->bi_private = cl;
@@ -530,10 +525,9 @@ static void bch_insert_data_loop(struct closure *cl)
530 if (KEY_CSUM(k)) 525 if (KEY_CSUM(k))
531 bio_csum(n, k); 526 bio_csum(n, k);
532 527
533 pr_debug("%s", pkey(k)); 528 trace_bcache_cache_insert(k);
534 bch_keylist_push(&op->keys); 529 bch_keylist_push(&op->keys);
535 530
536 trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
537 n->bi_rw |= REQ_WRITE; 531 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0); 532 bch_submit_bbio(n, op->c, k, 0);
539 } while (n != bio); 533 } while (n != bio);
@@ -784,11 +778,8 @@ static void request_read_error(struct closure *cl)
784 int i; 778 int i;
785 779
786 if (s->recoverable) { 780 if (s->recoverable) {
787 /* The cache read failed, but we can retry from the backing 781 /* Retry from the backing device: */
788 * device. 782 trace_bcache_read_retry(s->orig_bio);
789 */
790 pr_debug("recovering at sector %llu",
791 (uint64_t) s->orig_bio->bi_sector);
792 783
793 s->error = 0; 784 s->error = 0;
794 bv = s->bio.bio.bi_io_vec; 785 bv = s->bio.bio.bi_io_vec;
@@ -806,7 +797,6 @@ static void request_read_error(struct closure *cl)
806 797
807 /* XXX: invalidate cache */ 798 /* XXX: invalidate cache */
808 799
809 trace_bcache_read_retry(&s->bio.bio);
810 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 800 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
811 } 801 }
812 802
@@ -827,53 +817,13 @@ static void request_read_done(struct closure *cl)
827 */ 817 */
828 818
829 if (s->op.cache_bio) { 819 if (s->op.cache_bio) {
830 struct bio_vec *src, *dst;
831 unsigned src_offset, dst_offset, bytes;
832 void *dst_ptr;
833
834 bio_reset(s->op.cache_bio); 820 bio_reset(s->op.cache_bio);
835 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 821 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
836 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 822 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
837 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 823 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
838 bch_bio_map(s->op.cache_bio, NULL); 824 bch_bio_map(s->op.cache_bio, NULL);
839 825
840 src = bio_iovec(s->op.cache_bio); 826 bio_copy_data(s->cache_miss, s->op.cache_bio);
841 dst = bio_iovec(s->cache_miss);
842 src_offset = src->bv_offset;
843 dst_offset = dst->bv_offset;
844 dst_ptr = kmap(dst->bv_page);
845
846 while (1) {
847 if (dst_offset == dst->bv_offset + dst->bv_len) {
848 kunmap(dst->bv_page);
849 dst++;
850 if (dst == bio_iovec_idx(s->cache_miss,
851 s->cache_miss->bi_vcnt))
852 break;
853
854 dst_offset = dst->bv_offset;
855 dst_ptr = kmap(dst->bv_page);
856 }
857
858 if (src_offset == src->bv_offset + src->bv_len) {
859 src++;
860 if (src == bio_iovec_idx(s->op.cache_bio,
861 s->op.cache_bio->bi_vcnt))
862 BUG();
863
864 src_offset = src->bv_offset;
865 }
866
867 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
868 src->bv_offset + src->bv_len - src_offset);
869
870 memcpy(dst_ptr + dst_offset,
871 page_address(src->bv_page) + src_offset,
872 bytes);
873
874 src_offset += bytes;
875 dst_offset += bytes;
876 }
877 827
878 bio_put(s->cache_miss); 828 bio_put(s->cache_miss);
879 s->cache_miss = NULL; 829 s->cache_miss = NULL;
@@ -899,6 +849,7 @@ static void request_read_done_bh(struct closure *cl)
899 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 849 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
900 850
901 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 851 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
852 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
902 853
903 if (s->error) 854 if (s->error)
904 continue_at_nobarrier(cl, request_read_error, bcache_wq); 855 continue_at_nobarrier(cl, request_read_error, bcache_wq);
@@ -917,9 +868,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
917 struct bio *miss; 868 struct bio *miss;
918 869
919 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 870 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
920 if (!miss)
921 return -EAGAIN;
922
923 if (miss == bio) 871 if (miss == bio)
924 s->op.lookup_done = true; 872 s->op.lookup_done = true;
925 873
@@ -938,8 +886,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
938 reada = min(dc->readahead >> 9, 886 reada = min(dc->readahead >> 9,
939 sectors - bio_sectors(miss)); 887 sectors - bio_sectors(miss));
940 888
941 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) 889 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
942 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); 890 reada = bdev_sectors(miss->bi_bdev) -
891 bio_end_sector(miss);
943 } 892 }
944 893
945 s->cache_bio_sectors = bio_sectors(miss) + reada; 894 s->cache_bio_sectors = bio_sectors(miss) + reada;
@@ -963,13 +912,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
963 goto out_put; 912 goto out_put;
964 913
965 bch_bio_map(s->op.cache_bio, NULL); 914 bch_bio_map(s->op.cache_bio, NULL);
966 if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 915 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
967 goto out_put; 916 goto out_put;
968 917
969 s->cache_miss = miss; 918 s->cache_miss = miss;
970 bio_get(s->op.cache_bio); 919 bio_get(s->op.cache_bio);
971 920
972 trace_bcache_cache_miss(s->orig_bio);
973 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 921 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
974 922
975 return ret; 923 return ret;
@@ -1002,24 +950,13 @@ static void cached_dev_write_complete(struct closure *cl)
1002 cached_dev_bio_complete(cl); 950 cached_dev_bio_complete(cl);
1003} 951}
1004 952
1005static bool should_writeback(struct cached_dev *dc, struct bio *bio)
1006{
1007 unsigned threshold = (bio->bi_rw & REQ_SYNC)
1008 ? CUTOFF_WRITEBACK_SYNC
1009 : CUTOFF_WRITEBACK;
1010
1011 return !atomic_read(&dc->disk.detaching) &&
1012 cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
1013 dc->disk.c->gc_stats.in_use < threshold;
1014}
1015
1016static void request_write(struct cached_dev *dc, struct search *s) 953static void request_write(struct cached_dev *dc, struct search *s)
1017{ 954{
1018 struct closure *cl = &s->cl; 955 struct closure *cl = &s->cl;
1019 struct bio *bio = &s->bio.bio; 956 struct bio *bio = &s->bio.bio;
1020 struct bkey start, end; 957 struct bkey start, end;
1021 start = KEY(dc->disk.id, bio->bi_sector, 0); 958 start = KEY(dc->disk.id, bio->bi_sector, 0);
1022 end = KEY(dc->disk.id, bio_end(bio), 0); 959 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1023 960
1024 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 961 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1025 962
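
For illustration only (not from the patch): should_writeback() and the CUTOFF_WRITEBACK* constants leave request.c; the replacement lives with the writeback code and also takes the cache mode and the would-skip decision as arguments. A hedged userspace sketch of the basic shape of the old check, using the thresholds removed above; the helper in writeback.h may apply further criteria that are not visible in this hunk:

#include <stdbool.h>
#include <stdio.h>

#define CUTOFF_WRITEBACK        50
#define CUTOFF_WRITEBACK_SYNC   75

enum cache_mode { CACHE_MODE_WRITETHROUGH, CACHE_MODE_WRITEBACK };

/* sketch only: writeback is allowed while the cache isn't too full,
 * with a higher ceiling for REQ_SYNC writes */
static bool should_writeback(enum cache_mode mode, bool detaching,
                             bool sync, unsigned cache_in_use_percent)
{
    unsigned threshold = sync ? CUTOFF_WRITEBACK_SYNC : CUTOFF_WRITEBACK;

    return !detaching &&
        mode == CACHE_MODE_WRITEBACK &&
        cache_in_use_percent < threshold;
}

int main(void)
{
    printf("%d\n", should_writeback(CACHE_MODE_WRITEBACK, false, false, 40));
    printf("%d\n", should_writeback(CACHE_MODE_WRITEBACK, false, false, 60));
    printf("%d\n", should_writeback(CACHE_MODE_WRITEBACK, false, true, 60));
    return 0;
}
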
@@ -1034,22 +971,37 @@ static void request_write(struct cached_dev *dc, struct search *s)
1034 if (bio->bi_rw & REQ_DISCARD) 971 if (bio->bi_rw & REQ_DISCARD)
1035 goto skip; 972 goto skip;
1036 973
974 if (should_writeback(dc, s->orig_bio,
975 cache_mode(dc, bio),
976 s->op.skip)) {
977 s->op.skip = false;
978 s->writeback = true;
979 }
980
1037 if (s->op.skip) 981 if (s->op.skip)
1038 goto skip; 982 goto skip;
1039 983
1040 if (should_writeback(dc, s->orig_bio)) 984 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
1041 s->writeback = true;
1042 985
1043 if (!s->writeback) { 986 if (!s->writeback) {
1044 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 987 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1045 dc->disk.bio_split); 988 dc->disk.bio_split);
1046 989
1047 trace_bcache_writethrough(s->orig_bio);
1048 closure_bio_submit(bio, cl, s->d); 990 closure_bio_submit(bio, cl, s->d);
1049 } else { 991 } else {
1050 s->op.cache_bio = bio; 992 bch_writeback_add(dc);
1051 trace_bcache_writeback(s->orig_bio); 993
1052 bch_writeback_add(dc, bio_sectors(bio)); 994 if (s->op.flush_journal) {
995 /* Also need to send a flush to the backing device */
996 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
997 dc->disk.bio_split);
998
999 bio->bi_size = 0;
1000 bio->bi_vcnt = 0;
1001 closure_bio_submit(bio, cl, s->d);
1002 } else {
1003 s->op.cache_bio = bio;
1004 }
1053 } 1005 }
1054out: 1006out:
1055 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1007 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
@@ -1058,7 +1010,6 @@ skip:
1058 s->op.skip = true; 1010 s->op.skip = true;
1059 s->op.cache_bio = s->orig_bio; 1011 s->op.cache_bio = s->orig_bio;
1060 bio_get(s->op.cache_bio); 1012 bio_get(s->op.cache_bio);
1061 trace_bcache_write_skip(s->orig_bio);
1062 1013
1063 if ((bio->bi_rw & REQ_DISCARD) && 1014 if ((bio->bi_rw & REQ_DISCARD) &&
1064 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1015 !blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1088,9 +1039,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
1088 1039
1089/* Cached devices - read & write stuff */ 1040/* Cached devices - read & write stuff */
1090 1041
1091int bch_get_congested(struct cache_set *c) 1042unsigned bch_get_congested(struct cache_set *c)
1092{ 1043{
1093 int i; 1044 int i;
1045 long rand;
1094 1046
1095 if (!c->congested_read_threshold_us && 1047 if (!c->congested_read_threshold_us &&
1096 !c->congested_write_threshold_us) 1048 !c->congested_write_threshold_us)
@@ -1106,7 +1058,13 @@ int bch_get_congested(struct cache_set *c)
1106 1058
1107 i += CONGESTED_MAX; 1059 i += CONGESTED_MAX;
1108 1060
1109 return i <= 0 ? 1 : fract_exp_two(i, 6); 1061 if (i > 0)
1062 i = fract_exp_two(i, 6);
1063
1064 rand = get_random_int();
1065 i -= bitmap_weight(&rand, BITS_PER_LONG);
1066
1067 return i > 0 ? i : 1;
1110} 1068}
1111 1069
1112static void add_sequential(struct task_struct *t) 1070static void add_sequential(struct task_struct *t)
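
For illustration only (not from the patch): bch_get_congested() now returns an unsigned value, applies fract_exp_two() only when the combined congestion delta is positive, and subtracts the popcount of a random word so the cutoff dithers rather than being a hard edge. A userspace sketch of the arithmetic; fract_exp_two() below is a piecewise-linear 2^(x/64) approximation written for this example, not copied from util.h, and the function and parameter names are stand-ins:

#include <stdio.h>
#include <stdlib.h>

/* rough piecewise-linear approximation of 2^(x / 2^fract_bits);
 * written for this sketch, the kernel helper may differ in detail */
static unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
    unsigned fract = x & ((1U << fract_bits) - 1);

    x >>= fract_bits;
    x = 1U << x;
    return x + ((x * fract) >> fract_bits);
}

static unsigned get_congested(int i)
{
    if (i > 0)
        i = fract_exp_two(i, 6);

    /* subtract a small random amount (popcount of a random word) so
     * requests near the threshold aren't all treated the same */
    i -= __builtin_popcount((unsigned) rand());

    return i > 0 ? i : 1;
}

int main(void)
{
    srand(42);
    for (int i = -10; i <= 130; i += 70)
        printf("congested(%d) = %u\n", i, get_congested(i));
    return 0;
}
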
@@ -1126,10 +1084,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
1126{ 1084{
1127 struct cache_set *c = s->op.c; 1085 struct cache_set *c = s->op.c;
1128 struct bio *bio = &s->bio.bio; 1086 struct bio *bio = &s->bio.bio;
1129
1130 long rand;
1131 int cutoff = bch_get_congested(c);
1132 unsigned mode = cache_mode(dc, bio); 1087 unsigned mode = cache_mode(dc, bio);
1088 unsigned sectors, congested = bch_get_congested(c);
1133 1089
1134 if (atomic_read(&dc->disk.detaching) || 1090 if (atomic_read(&dc->disk.detaching) ||
1135 c->gc_stats.in_use > CUTOFF_CACHE_ADD || 1091 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -1147,17 +1103,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
1147 goto skip; 1103 goto skip;
1148 } 1104 }
1149 1105
1150 if (!cutoff) { 1106 if (!congested && !dc->sequential_cutoff)
1151 cutoff = dc->sequential_cutoff >> 9; 1107 goto rescale;
1152 1108
1153 if (!cutoff) 1109 if (!congested &&
1154 goto rescale; 1110 mode == CACHE_MODE_WRITEBACK &&
1155 1111 (bio->bi_rw & REQ_WRITE) &&
1156 if (mode == CACHE_MODE_WRITEBACK && 1112 (bio->bi_rw & REQ_SYNC))
1157 (bio->bi_rw & REQ_WRITE) && 1113 goto rescale;
1158 (bio->bi_rw & REQ_SYNC))
1159 goto rescale;
1160 }
1161 1114
1162 if (dc->sequential_merge) { 1115 if (dc->sequential_merge) {
1163 struct io *i; 1116 struct io *i;
@@ -1177,7 +1130,7 @@ found:
1177 if (i->sequential + bio->bi_size > i->sequential) 1130 if (i->sequential + bio->bi_size > i->sequential)
1178 i->sequential += bio->bi_size; 1131 i->sequential += bio->bi_size;
1179 1132
1180 i->last = bio_end(bio); 1133 i->last = bio_end_sector(bio);
1181 i->jiffies = jiffies + msecs_to_jiffies(5000); 1134 i->jiffies = jiffies + msecs_to_jiffies(5000);
1182 s->task->sequential_io = i->sequential; 1135 s->task->sequential_io = i->sequential;
1183 1136
@@ -1192,12 +1145,19 @@ found:
1192 add_sequential(s->task); 1145 add_sequential(s->task);
1193 } 1146 }
1194 1147
1195 rand = get_random_int(); 1148 sectors = max(s->task->sequential_io,
1196 cutoff -= bitmap_weight(&rand, BITS_PER_LONG); 1149 s->task->sequential_io_avg) >> 9;
1197 1150
1198 if (cutoff <= (int) (max(s->task->sequential_io, 1151 if (dc->sequential_cutoff &&
1199 s->task->sequential_io_avg) >> 9)) 1152 sectors >= dc->sequential_cutoff >> 9) {
1153 trace_bcache_bypass_sequential(s->orig_bio);
1200 goto skip; 1154 goto skip;
1155 }
1156
1157 if (congested && sectors >= congested) {
1158 trace_bcache_bypass_congested(s->orig_bio);
1159 goto skip;
1160 }
1201 1161
1202rescale: 1162rescale:
1203 bch_rescale_priorities(c, bio_sectors(bio)); 1163 bch_rescale_priorities(c, bio_sectors(bio));
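
For illustration only (not from the patch): the bypass decision is now two explicit checks with their own tracepoints, one for I/O that looks sequential past dc->sequential_cutoff and one for a congested cache. A userspace sketch of the two tests; the parameter names are stand-ins for the values computed above:

#include <stdbool.h>
#include <stdio.h>

/* sectors: larger of the task's current and average sequential I/O, in sectors
 * cutoff_bytes: the configured sequential cutoff
 * congested: value returned by the congestion check (0 means not congested) */
static bool should_bypass(unsigned sectors, unsigned cutoff_bytes,
                          unsigned congested)
{
    if (cutoff_bytes && sectors >= cutoff_bytes >> 9) {
        printf("bypass: sequential\n");
        return true;
    }

    if (congested && sectors >= congested) {
        printf("bypass: congested\n");
        return true;
    }

    return false;
}

int main(void)
{
    /* 8 MiB of sequential I/O against a 4 MiB cutoff: bypass the cache */
    should_bypass((8 << 20) >> 9, 4 << 20, 0);

    /* small I/O, but the cache is congested down to 8 sectors: bypass */
    should_bypass(16, 0, 8);

    /* otherwise the request is cached */
    printf("%d\n", should_bypass(16, 4 << 20, 0));
    return 0;
}
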
@@ -1288,30 +1248,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
1288static int flash_dev_cache_miss(struct btree *b, struct search *s, 1248static int flash_dev_cache_miss(struct btree *b, struct search *s,
1289 struct bio *bio, unsigned sectors) 1249 struct bio *bio, unsigned sectors)
1290{ 1250{
1251 struct bio_vec *bv;
1252 int i;
1253
1291 /* Zero fill bio */ 1254 /* Zero fill bio */
1292 1255
1293 while (bio->bi_idx != bio->bi_vcnt) { 1256 bio_for_each_segment(bv, bio, i) {
1294 struct bio_vec *bv = bio_iovec(bio);
1295 unsigned j = min(bv->bv_len >> 9, sectors); 1257 unsigned j = min(bv->bv_len >> 9, sectors);
1296 1258
1297 void *p = kmap(bv->bv_page); 1259 void *p = kmap(bv->bv_page);
1298 memset(p + bv->bv_offset, 0, j << 9); 1260 memset(p + bv->bv_offset, 0, j << 9);
1299 kunmap(bv->bv_page); 1261 kunmap(bv->bv_page);
1300 1262
1301 bv->bv_len -= j << 9; 1263 sectors -= j;
1302 bv->bv_offset += j << 9;
1303
1304 if (bv->bv_len)
1305 return 0;
1306
1307 bio->bi_sector += j;
1308 bio->bi_size -= j << 9;
1309
1310 bio->bi_idx++;
1311 sectors -= j;
1312 } 1264 }
1313 1265
1314 s->op.lookup_done = true; 1266 bio_advance(bio, min(sectors << 9, bio->bi_size));
1267
1268 if (!bio->bi_size)
1269 s->op.lookup_done = true;
1315 1270
1316 return 0; 1271 return 0;
1317} 1272}
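
For illustration only (not from the patch): the zero-fill path no longer edits bv_len, bv_offset and bi_idx by hand; it zeroes each segment via bio_for_each_segment() and then calls bio_advance() once for however many sectors were filled. Sketched over plain buffers in userspace; the segment struct is a stand-in for a bio_vec:

#include <stdio.h>
#include <string.h>

struct seg { unsigned char *page; unsigned offset, len; };

static unsigned zero_fill(struct seg *segs, unsigned nsegs, unsigned sectors)
{
    unsigned filled = 0;

    for (unsigned i = 0; i < nsegs; i++) {
        /* zero at most 'sectors' worth of this segment */
        unsigned j = segs[i].len >> 9;

        if (j > sectors)
            j = sectors;
        memset(segs[i].page + segs[i].offset, 0, j << 9);
        sectors -= j;
        filled += j;
    }

    /* the caller would now advance the bio by 'filled' sectors and mark
     * the lookup done once nothing is left */
    return filled;
}

int main(void)
{
    unsigned char a[4096], b[4096];
    struct seg segs[] = { { a, 0, sizeof(a) }, { b, 0, sizeof(b) } };

    memset(a, 0xff, sizeof(a));
    memset(b, 0xff, sizeof(b));
    printf("zeroed %u sectors\n", zero_fill(segs, 2, 12));
    printf("a[0]=%u b[0]=%u b[2048]=%u\n", a[0], b[0], b[2048]);
    return 0;
}
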
@@ -1338,8 +1293,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1338 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1293 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1339 } else if (bio_has_data(bio) || s->op.skip) { 1294 } else if (bio_has_data(bio) || s->op.skip) {
1340 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1295 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1341 &KEY(d->id, bio->bi_sector, 0), 1296 &KEY(d->id, bio->bi_sector, 0),
1342 &KEY(d->id, bio_end(bio), 0)); 1297 &KEY(d->id, bio_end_sector(bio), 0));
1343 1298
1344 s->writeback = true; 1299 s->writeback = true;
1345 s->op.cache_bio = bio; 1300 s->op.cache_bio = bio;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 254d9ab5707c..57dc4784f4f4 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -30,7 +30,7 @@ struct search {
30}; 30};
31 31
32void bch_cache_read_endio(struct bio *, int); 32void bch_cache_read_endio(struct bio *, int);
33int bch_get_congested(struct cache_set *); 33unsigned bch_get_congested(struct cache_set *);
34void bch_insert_data(struct closure *cl); 34void bch_insert_data(struct closure *cl);
35void bch_btree_insert_async(struct closure *); 35void bch_btree_insert_async(struct closure *);
36void bch_cache_read_endio(struct bio *, int); 36void bch_cache_read_endio(struct bio *, int);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f88e2b653a3f..cff2d182dfb0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,7 +10,9 @@
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "request.h" 12#include "request.h"
13#include "writeback.h"
13 14
15#include <linux/blkdev.h>
14#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
15#include <linux/debugfs.h> 17#include <linux/debugfs.h>
16#include <linux/genhd.h> 18#include <linux/genhd.h>
@@ -342,6 +344,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
342 struct closure *cl = &c->uuid_write.cl; 344 struct closure *cl = &c->uuid_write.cl;
343 struct uuid_entry *u; 345 struct uuid_entry *u;
344 unsigned i; 346 unsigned i;
347 char buf[80];
345 348
346 BUG_ON(!parent); 349 BUG_ON(!parent);
347 closure_lock(&c->uuid_write, parent); 350 closure_lock(&c->uuid_write, parent);
@@ -362,8 +365,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
362 break; 365 break;
363 } 366 }
364 367
365 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", 368 bch_bkey_to_text(buf, sizeof(buf), k);
366 pkey(&c->uuid_bucket)); 369 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
367 370
368 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 371 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
369 if (!bch_is_zero(u->uuid, 16)) 372 if (!bch_is_zero(u->uuid, 16))
@@ -543,7 +546,6 @@ void bch_prio_write(struct cache *ca)
543 546
544 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 547 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
545 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 548 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
546 blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
547 549
548 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 550 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
549 long bucket; 551 long bucket;
@@ -743,13 +745,35 @@ static void bcache_device_free(struct bcache_device *d)
743 mempool_destroy(d->unaligned_bvec); 745 mempool_destroy(d->unaligned_bvec);
744 if (d->bio_split) 746 if (d->bio_split)
745 bioset_free(d->bio_split); 747 bioset_free(d->bio_split);
748 if (is_vmalloc_addr(d->stripe_sectors_dirty))
749 vfree(d->stripe_sectors_dirty);
750 else
751 kfree(d->stripe_sectors_dirty);
746 752
747 closure_debug_destroy(&d->cl); 753 closure_debug_destroy(&d->cl);
748} 754}
749 755
750static int bcache_device_init(struct bcache_device *d, unsigned block_size) 756static int bcache_device_init(struct bcache_device *d, unsigned block_size,
757 sector_t sectors)
751{ 758{
752 struct request_queue *q; 759 struct request_queue *q;
760 size_t n;
761
762 if (!d->stripe_size_bits)
763 d->stripe_size_bits = 31;
764
765 d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
766 d->stripe_size_bits;
767
768 if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
769 return -ENOMEM;
770
771 n = d->nr_stripes * sizeof(atomic_t);
772 d->stripe_sectors_dirty = n < PAGE_SIZE << 6
773 ? kzalloc(n, GFP_KERNEL)
774 : vzalloc(n);
775 if (!d->stripe_sectors_dirty)
776 return -ENOMEM;
753 777
754 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 778 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
755 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, 779 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
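
For illustration only (not from the patch): bcache_device_init() now receives the device size and allocates a per-stripe dirty-sector array from it, switching from kzalloc() to vzalloc() once the array grows past roughly 64 pages. The sizing arithmetic, sketched in userspace with calloc() standing in for both allocators:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u

/* round sectors up to a whole number of stripes, then count the stripes */
static size_t nr_stripes(uint64_t sectors, unsigned stripe_size_bits)
{
    uint64_t stripe = 1ULL << stripe_size_bits;

    return (sectors + stripe - 1) / stripe;
}

int main(void)
{
    uint64_t sectors = 976773168;       /* example backing device, ~465 GiB */
    unsigned stripe_size_bits = 31;     /* default used when none is set */
    size_t n = nr_stripes(sectors, stripe_size_bits);
    size_t bytes = n * sizeof(int);     /* one counter per stripe; atomic_t in the kernel */

    /* kzalloc() for small arrays, vzalloc() past ~64 pages in the patch;
     * userspace just allocates either way */
    int *dirty = calloc(n, sizeof(int));

    printf("%zu stripes, %zu bytes, %s allocation\n", n, bytes,
           bytes < PAGE_SIZE * 64 ? "kzalloc-style" : "vzalloc-style");
    free(dirty);
    return 0;
}
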
@@ -759,6 +783,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
759 !(q = blk_alloc_queue(GFP_KERNEL))) 783 !(q = blk_alloc_queue(GFP_KERNEL)))
760 return -ENOMEM; 784 return -ENOMEM;
761 785
786 set_capacity(d->disk, sectors);
762 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); 787 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
763 788
764 d->disk->major = bcache_major; 789 d->disk->major = bcache_major;
@@ -800,6 +825,17 @@ static void calc_cached_dev_sectors(struct cache_set *c)
800void bch_cached_dev_run(struct cached_dev *dc) 825void bch_cached_dev_run(struct cached_dev *dc)
801{ 826{
802 struct bcache_device *d = &dc->disk; 827 struct bcache_device *d = &dc->disk;
828 char buf[SB_LABEL_SIZE + 1];
829 char *env[] = {
830 "DRIVER=bcache",
831 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
832 NULL,
833 NULL,
834 };
835
836 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
837 buf[SB_LABEL_SIZE] = '\0';
838 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
803 839
804 if (atomic_xchg(&dc->running, 1)) 840 if (atomic_xchg(&dc->running, 1))
805 return; 841 return;
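
For illustration only (not from the patch): bch_cached_dev_run() now builds a KOBJ_CHANGE uevent carrying the cached device's UUID and label, replacing the old #if 0 SYMLINK hack, so userspace can identify the device. The string assembly, sketched with asprintf() in place of kasprintf(); the UUID and label values are made up, and the label size constant is defined here only for the example:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SB_LABEL_SIZE 32    /* for the example; the real constant is in bcache.h */

int main(void)
{
    /* the label from the superblock need not be NUL terminated, so copy it
     * into a buffer one byte larger first, as the patch does */
    char raw_label[SB_LABEL_SIZE] = "fast-ssd";
    char buf[SB_LABEL_SIZE + 1];
    char *env[4] = { "DRIVER=bcache", NULL, NULL, NULL };

    memcpy(buf, raw_label, SB_LABEL_SIZE);
    buf[SB_LABEL_SIZE] = '\0';

    /* asprintf() stands in for kasprintf(); the UUID is a made-up example */
    if (asprintf(&env[1], "CACHED_UUID=%s", "0123-example-uuid") < 0 ||
        asprintf(&env[2], "CACHED_LABEL=%s", buf) < 0)
        return 1;

    for (int i = 0; env[i]; i++)
        puts(env[i]);       /* the kernel hands these to kobject_uevent_env() */

    free(env[1]);
    free(env[2]);
    return 0;
}
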
@@ -816,10 +852,12 @@ void bch_cached_dev_run(struct cached_dev *dc)
816 852
817 add_disk(d->disk); 853 add_disk(d->disk);
818 bd_link_disk_holder(dc->bdev, dc->disk.disk); 854 bd_link_disk_holder(dc->bdev, dc->disk.disk);
819#if 0 855 /* won't show up in the uevent file, use udevadm monitor -e instead
820 char *env[] = { "SYMLINK=label" , NULL }; 856 * only class / kset properties are persistent */
821 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); 857 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
822#endif 858 kfree(env[1]);
859 kfree(env[2]);
860
823 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || 861 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
824 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) 862 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
825 pr_debug("error creating sysfs link"); 863 pr_debug("error creating sysfs link");
@@ -960,6 +998,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
960 atomic_set(&dc->count, 1); 998 atomic_set(&dc->count, 1);
961 999
962 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { 1000 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1001 bch_sectors_dirty_init(dc);
963 atomic_set(&dc->has_dirty, 1); 1002 atomic_set(&dc->has_dirty, 1);
964 atomic_inc(&dc->count); 1003 atomic_inc(&dc->count);
965 bch_writeback_queue(dc); 1004 bch_writeback_queue(dc);
@@ -1045,7 +1084,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1045 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1084 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1046 } 1085 }
1047 1086
1048 ret = bcache_device_init(&dc->disk, block_size); 1087 ret = bcache_device_init(&dc->disk, block_size,
1088 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1049 if (ret) 1089 if (ret)
1050 return ret; 1090 return ret;
1051 1091
@@ -1144,11 +1184,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1144 1184
1145 kobject_init(&d->kobj, &bch_flash_dev_ktype); 1185 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1146 1186
1147 if (bcache_device_init(d, block_bytes(c))) 1187 if (bcache_device_init(d, block_bytes(c), u->sectors))
1148 goto err; 1188 goto err;
1149 1189
1150 bcache_device_attach(d, c, u - c->uuids); 1190 bcache_device_attach(d, c, u - c->uuids);
1151 set_capacity(d->disk, u->sectors);
1152 bch_flash_dev_request_init(d); 1191 bch_flash_dev_request_init(d);
1153 add_disk(d->disk); 1192 add_disk(d->disk);
1154 1193
@@ -1255,9 +1294,10 @@ static void cache_set_free(struct closure *cl)
1255 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1294 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1256 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); 1295 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1257 1296
1258 kfree(c->fill_iter);
1259 if (c->bio_split) 1297 if (c->bio_split)
1260 bioset_free(c->bio_split); 1298 bioset_free(c->bio_split);
1299 if (c->fill_iter)
1300 mempool_destroy(c->fill_iter);
1261 if (c->bio_meta) 1301 if (c->bio_meta)
1262 mempool_destroy(c->bio_meta); 1302 mempool_destroy(c->bio_meta);
1263 if (c->search) 1303 if (c->search)
@@ -1282,7 +1322,7 @@ static void cache_set_flush(struct closure *cl)
1282 1322
1283 /* Shut down allocator threads */ 1323 /* Shut down allocator threads */
1284 set_bit(CACHE_SET_STOPPING_2, &c->flags); 1324 set_bit(CACHE_SET_STOPPING_2, &c->flags);
1285 wake_up(&c->alloc_wait); 1325 wake_up_allocators(c);
1286 1326
1287 bch_cache_accounting_destroy(&c->accounting); 1327 bch_cache_accounting_destroy(&c->accounting);
1288 1328
@@ -1295,7 +1335,7 @@ static void cache_set_flush(struct closure *cl)
1295 /* Should skip this if we're unregistering because of an error */ 1335 /* Should skip this if we're unregistering because of an error */
1296 list_for_each_entry(b, &c->btree_cache, list) 1336 list_for_each_entry(b, &c->btree_cache, list)
1297 if (btree_node_dirty(b)) 1337 if (btree_node_dirty(b))
1298 bch_btree_write(b, true, NULL); 1338 bch_btree_node_write(b, NULL);
1299 1339
1300 closure_return(cl); 1340 closure_return(cl);
1301} 1341}
@@ -1373,9 +1413,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1373 c->btree_pages = max_t(int, c->btree_pages / 4, 1413 c->btree_pages = max_t(int, c->btree_pages / 4,
1374 BTREE_MAX_PAGES); 1414 BTREE_MAX_PAGES);
1375 1415
1376 init_waitqueue_head(&c->alloc_wait); 1416 c->sort_crit_factor = int_sqrt(c->btree_pages);
1417
1377 mutex_init(&c->bucket_lock); 1418 mutex_init(&c->bucket_lock);
1378 mutex_init(&c->fill_lock);
1379 mutex_init(&c->sort_lock); 1419 mutex_init(&c->sort_lock);
1380 spin_lock_init(&c->sort_time_lock); 1420 spin_lock_init(&c->sort_time_lock);
1381 closure_init_unlocked(&c->sb_write); 1421 closure_init_unlocked(&c->sb_write);
@@ -1401,8 +1441,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1401 !(c->bio_meta = mempool_create_kmalloc_pool(2, 1441 !(c->bio_meta = mempool_create_kmalloc_pool(2,
1402 sizeof(struct bbio) + sizeof(struct bio_vec) * 1442 sizeof(struct bbio) + sizeof(struct bio_vec) *
1403 bucket_pages(c))) || 1443 bucket_pages(c))) ||
1444 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1404 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1445 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1405 !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
1406 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || 1446 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1407 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1447 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1408 bch_journal_alloc(c) || 1448 bch_journal_alloc(c) ||
@@ -1410,8 +1450,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1410 bch_open_buckets_alloc(c)) 1450 bch_open_buckets_alloc(c))
1411 goto err; 1451 goto err;
1412 1452
1413 c->fill_iter->size = sb->bucket_size / sb->block_size;
1414
1415 c->congested_read_threshold_us = 2000; 1453 c->congested_read_threshold_us = 2000;
1416 c->congested_write_threshold_us = 20000; 1454 c->congested_write_threshold_us = 20000;
1417 c->error_limit = 8 << IO_ERROR_SHIFT; 1455 c->error_limit = 8 << IO_ERROR_SHIFT;
@@ -1496,9 +1534,10 @@ static void run_cache_set(struct cache_set *c)
1496 */ 1534 */
1497 bch_journal_next(&c->journal); 1535 bch_journal_next(&c->journal);
1498 1536
1537 err = "error starting allocator thread";
1499 for_each_cache(ca, c, i) 1538 for_each_cache(ca, c, i)
1500 closure_call(&ca->alloc, bch_allocator_thread, 1539 if (bch_cache_allocator_start(ca))
1501 system_wq, &c->cl); 1540 goto err;
1502 1541
1503 /* 1542 /*
1504 * First place it's safe to allocate: btree_check() and 1543 * First place it's safe to allocate: btree_check() and
@@ -1531,17 +1570,16 @@ static void run_cache_set(struct cache_set *c)
1531 1570
1532 bch_btree_gc_finish(c); 1571 bch_btree_gc_finish(c);
1533 1572
1573 err = "error starting allocator thread";
1534 for_each_cache(ca, c, i) 1574 for_each_cache(ca, c, i)
1535 closure_call(&ca->alloc, bch_allocator_thread, 1575 if (bch_cache_allocator_start(ca))
1536 ca->alloc_workqueue, &c->cl); 1576 goto err;
1537 1577
1538 mutex_lock(&c->bucket_lock); 1578 mutex_lock(&c->bucket_lock);
1539 for_each_cache(ca, c, i) 1579 for_each_cache(ca, c, i)
1540 bch_prio_write(ca); 1580 bch_prio_write(ca);
1541 mutex_unlock(&c->bucket_lock); 1581 mutex_unlock(&c->bucket_lock);
1542 1582
1543 wake_up(&c->alloc_wait);
1544
1545 err = "cannot allocate new UUID bucket"; 1583 err = "cannot allocate new UUID bucket";
1546 if (__uuid_write(c)) 1584 if (__uuid_write(c))
1547 goto err_unlock_gc; 1585 goto err_unlock_gc;
@@ -1552,7 +1590,7 @@ static void run_cache_set(struct cache_set *c)
1552 goto err_unlock_gc; 1590 goto err_unlock_gc;
1553 1591
1554 bkey_copy_key(&c->root->key, &MAX_KEY); 1592 bkey_copy_key(&c->root->key, &MAX_KEY);
1555 bch_btree_write(c->root, true, &op); 1593 bch_btree_node_write(c->root, &op.cl);
1556 1594
1557 bch_btree_set_root(c->root); 1595 bch_btree_set_root(c->root);
1558 rw_unlock(true, c->root); 1596 rw_unlock(true, c->root);
@@ -1673,9 +1711,6 @@ void bch_cache_release(struct kobject *kobj)
1673 1711
1674 bio_split_pool_free(&ca->bio_split_hook); 1712 bio_split_pool_free(&ca->bio_split_hook);
1675 1713
1676 if (ca->alloc_workqueue)
1677 destroy_workqueue(ca->alloc_workqueue);
1678
1679 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); 1714 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1680 kfree(ca->prio_buckets); 1715 kfree(ca->prio_buckets);
1681 vfree(ca->buckets); 1716 vfree(ca->buckets);
@@ -1723,7 +1758,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1723 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 1758 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1724 2, GFP_KERNEL)) || 1759 2, GFP_KERNEL)) ||
1725 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || 1760 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1726 !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
1727 bio_split_pool_init(&ca->bio_split_hook)) 1761 bio_split_pool_init(&ca->bio_split_hook))
1728 return -ENOMEM; 1762 return -ENOMEM;
1729 1763
@@ -1786,6 +1820,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1786kobj_attribute_write(register, register_bcache); 1820kobj_attribute_write(register, register_bcache);
1787kobj_attribute_write(register_quiet, register_bcache); 1821kobj_attribute_write(register_quiet, register_bcache);
1788 1822
1823static bool bch_is_open_backing(struct block_device *bdev) {
1824 struct cache_set *c, *tc;
1825 struct cached_dev *dc, *t;
1826
1827 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1828 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1829 if (dc->bdev == bdev)
1830 return true;
1831 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1832 if (dc->bdev == bdev)
1833 return true;
1834 return false;
1835}
1836
1837static bool bch_is_open_cache(struct block_device *bdev) {
1838 struct cache_set *c, *tc;
1839 struct cache *ca;
1840 unsigned i;
1841
1842 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1843 for_each_cache(ca, c, i)
1844 if (ca->bdev == bdev)
1845 return true;
1846 return false;
1847}
1848
1849static bool bch_is_open(struct block_device *bdev) {
1850 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
1851}
1852
1789static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, 1853static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1790 const char *buffer, size_t size) 1854 const char *buffer, size_t size)
1791{ 1855{
@@ -1810,8 +1874,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1810 FMODE_READ|FMODE_WRITE|FMODE_EXCL, 1874 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1811 sb); 1875 sb);
1812 if (IS_ERR(bdev)) { 1876 if (IS_ERR(bdev)) {
1813 if (bdev == ERR_PTR(-EBUSY)) 1877 if (bdev == ERR_PTR(-EBUSY)) {
1814 err = "device busy"; 1878 bdev = lookup_bdev(strim(path));
1879 if (!IS_ERR(bdev) && bch_is_open(bdev))
1880 err = "device already registered";
1881 else
1882 err = "device busy";
1883 }
1815 goto err; 1884 goto err;
1816 } 1885 }
1817 1886
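
For context, the EBUSY handling added to register_bcache() above boils down to a membership check: when the exclusive open fails, look the block device up among the backing and cache devices bcache already holds, and report "device already registered" instead of a generic "device busy". A rough userspace analogue (not kernel code; a plain string table stands in for the cached_dev and cache_set lists):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* devices this hypothetical instance already holds open exclusively */
    static const char *registered[] = { "/dev/sdb", "/dev/sdc" };

    static bool is_open(const char *path)
    {
        size_t i;

        for (i = 0; i < sizeof(registered) / sizeof(registered[0]); i++)
            if (!strcmp(registered[i], path))
                return true;
        return false;
    }

    int main(void)
    {
        const char *path = "/dev/sdb";

        /* pretend the exclusive open just failed with EBUSY */
        printf("%s: %s\n", path,
               is_open(path) ? "device already registered" : "device busy");
        return 0;
    }
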
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4d9cca47e4c6..dd3f00a42729 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -9,7 +9,9 @@
9#include "sysfs.h" 9#include "sysfs.h"
10#include "btree.h" 10#include "btree.h"
11#include "request.h" 11#include "request.h"
12#include "writeback.h"
12 13
14#include <linux/blkdev.h>
13#include <linux/sort.h> 15#include <linux/sort.h>
14 16
15static const char * const cache_replacement_policies[] = { 17static const char * const cache_replacement_policies[] = {
@@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
79rw_attribute(writeback_rate_d_smooth); 81rw_attribute(writeback_rate_d_smooth);
80read_attribute(writeback_rate_debug); 82read_attribute(writeback_rate_debug);
81 83
84read_attribute(stripe_size);
85read_attribute(partial_stripes_expensive);
86
82rw_attribute(synchronous); 87rw_attribute(synchronous);
83rw_attribute(journal_delay_ms); 88rw_attribute(journal_delay_ms);
84rw_attribute(discard); 89rw_attribute(discard);
@@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
127 char derivative[20]; 132 char derivative[20];
128 char target[20]; 133 char target[20];
129 bch_hprint(dirty, 134 bch_hprint(dirty,
130 atomic_long_read(&dc->disk.sectors_dirty) << 9); 135 bcache_dev_sectors_dirty(&dc->disk) << 9);
131 bch_hprint(derivative, dc->writeback_rate_derivative << 9); 136 bch_hprint(derivative, dc->writeback_rate_derivative << 9);
132 bch_hprint(target, dc->writeback_rate_target << 9); 137 bch_hprint(target, dc->writeback_rate_target << 9);
133 138
@@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
143 } 148 }
144 149
145 sysfs_hprint(dirty_data, 150 sysfs_hprint(dirty_data,
146 atomic_long_read(&dc->disk.sectors_dirty) << 9); 151 bcache_dev_sectors_dirty(&dc->disk) << 9);
152
153 sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9);
154 var_printf(partial_stripes_expensive, "%u");
147 155
148 var_printf(sequential_merge, "%i"); 156 var_printf(sequential_merge, "%i");
149 var_hprint(sequential_cutoff); 157 var_hprint(sequential_cutoff);
@@ -170,6 +178,7 @@ STORE(__cached_dev)
170 disk.kobj); 178 disk.kobj);
171 unsigned v = size; 179 unsigned v = size;
172 struct cache_set *c; 180 struct cache_set *c;
181 struct kobj_uevent_env *env;
173 182
174#define d_strtoul(var) sysfs_strtoul(var, dc->var) 183#define d_strtoul(var) sysfs_strtoul(var, dc->var)
175#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) 184#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
@@ -214,6 +223,7 @@ STORE(__cached_dev)
214 } 223 }
215 224
216 if (attr == &sysfs_label) { 225 if (attr == &sysfs_label) {
 226 /* note: newlines in the written label are preserved */
217 memcpy(dc->sb.label, buf, SB_LABEL_SIZE); 227 memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
218 bch_write_bdev_super(dc, NULL); 228 bch_write_bdev_super(dc, NULL);
219 if (dc->disk.c) { 229 if (dc->disk.c) {
@@ -221,6 +231,13 @@ STORE(__cached_dev)
221 buf, SB_LABEL_SIZE); 231 buf, SB_LABEL_SIZE);
222 bch_uuid_write(dc->disk.c); 232 bch_uuid_write(dc->disk.c);
223 } 233 }
234 env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
235 add_uevent_var(env, "DRIVER=bcache");
236 add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
237 add_uevent_var(env, "CACHED_LABEL=%s", buf);
238 kobject_uevent_env(
239 &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
240 kfree(env);
224 } 241 }
225 242
226 if (attr == &sysfs_attach) { 243 if (attr == &sysfs_attach) {
@@ -284,6 +301,8 @@ static struct attribute *bch_cached_dev_files[] = {
284 &sysfs_writeback_rate_d_smooth, 301 &sysfs_writeback_rate_d_smooth,
285 &sysfs_writeback_rate_debug, 302 &sysfs_writeback_rate_debug,
286 &sysfs_dirty_data, 303 &sysfs_dirty_data,
304 &sysfs_stripe_size,
305 &sysfs_partial_stripes_expensive,
287 &sysfs_sequential_cutoff, 306 &sysfs_sequential_cutoff,
288 &sysfs_sequential_merge, 307 &sysfs_sequential_merge,
289 &sysfs_clear_stats, 308 &sysfs_clear_stats,
@@ -665,12 +684,10 @@ SHOW(__bch_cache)
665 int cmp(const void *l, const void *r) 684 int cmp(const void *l, const void *r)
666 { return *((uint16_t *) r) - *((uint16_t *) l); } 685 { return *((uint16_t *) r) - *((uint16_t *) l); }
667 686
668 /* Number of quantiles we compute */
669 const unsigned nq = 31;
670
671 size_t n = ca->sb.nbuckets, i, unused, btree; 687 size_t n = ca->sb.nbuckets, i, unused, btree;
672 uint64_t sum = 0; 688 uint64_t sum = 0;
673 uint16_t q[nq], *p, *cached; 689 /* Compute 31 quantiles */
690 uint16_t q[31], *p, *cached;
674 ssize_t ret; 691 ssize_t ret;
675 692
676 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); 693 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
@@ -703,26 +720,29 @@ SHOW(__bch_cache)
703 if (n) 720 if (n)
704 do_div(sum, n); 721 do_div(sum, n);
705 722
706 for (i = 0; i < nq; i++) 723 for (i = 0; i < ARRAY_SIZE(q); i++)
707 q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; 724 q[i] = INITIAL_PRIO - cached[n * (i + 1) /
725 (ARRAY_SIZE(q) + 1)];
708 726
709 vfree(p); 727 vfree(p);
710 728
711 ret = snprintf(buf, PAGE_SIZE, 729 ret = scnprintf(buf, PAGE_SIZE,
712 "Unused: %zu%%\n" 730 "Unused: %zu%%\n"
713 "Metadata: %zu%%\n" 731 "Metadata: %zu%%\n"
714 "Average: %llu\n" 732 "Average: %llu\n"
715 "Sectors per Q: %zu\n" 733 "Sectors per Q: %zu\n"
716 "Quantiles: [", 734 "Quantiles: [",
717 unused * 100 / (size_t) ca->sb.nbuckets, 735 unused * 100 / (size_t) ca->sb.nbuckets,
718 btree * 100 / (size_t) ca->sb.nbuckets, sum, 736 btree * 100 / (size_t) ca->sb.nbuckets, sum,
719 n * ca->sb.bucket_size / (nq + 1)); 737 n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
720 738
721 for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) 739 for (i = 0; i < ARRAY_SIZE(q); i++)
722 ret += snprintf(buf + ret, PAGE_SIZE - ret, 740 ret += scnprintf(buf + ret, PAGE_SIZE - ret,
723 i < nq - 1 ? "%u " : "%u]\n", q[i]); 741 "%u ", q[i]);
724 742 ret--;
725 buf[PAGE_SIZE - 1] = '\0'; 743
744 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
745
726 return ret; 746 return ret;
727 } 747 }
728 748
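
The rewritten quantile report above replaces the nq constant and snprintf() with ARRAY_SIZE() and scnprintf(), but the sampling itself is unchanged: with the per-bucket priorities sorted, the i-th of the 31 quantiles is the element at index n * (i + 1) / 32. A small userspace sketch of that sampling and of the trailing-space trick (illustrative only; the kernel version additionally subtracts each sample from INITIAL_PRIO):

    #include <stddef.h>
    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    int main(void)
    {
        unsigned cached[64];                 /* stand-in for the sorted bucket priorities */
        unsigned q[31];
        char buf[256];
        size_t n = ARRAY_SIZE(cached), i;
        int ret = 0;

        for (i = 0; i < n; i++)              /* pretend these are sorted priorities */
            cached[i] = (unsigned)(1000 - i);

        /* pick 31 evenly spaced samples, exactly as the patched loop does */
        for (i = 0; i < ARRAY_SIZE(q); i++)
            q[i] = cached[n * (i + 1) / (ARRAY_SIZE(q) + 1)];

        ret += snprintf(buf + ret, sizeof(buf) - ret, "Quantiles: [");
        for (i = 0; i < ARRAY_SIZE(q); i++)
            ret += snprintf(buf + ret, sizeof(buf) - ret, "%u ", q[i]);
        ret--;                               /* drop the trailing space, like the ret-- above */
        ret += snprintf(buf + ret, sizeof(buf) - ret, "]\n");

        fputs(buf, stdout);
        return 0;
    }
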
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index 983f9bb411bc..f7b6c197f90f 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -2,6 +2,7 @@
2#include "btree.h" 2#include "btree.h"
3#include "request.h" 3#include "request.h"
4 4
5#include <linux/blktrace_api.h>
5#include <linux/module.h> 6#include <linux/module.h>
6 7
7#define CREATE_TRACE_POINTS 8#define CREATE_TRACE_POINTS
@@ -9,18 +10,44 @@
9 10
10EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); 11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); 12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); 13
13EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); 14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); 15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); 19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
16EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); 20
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); 21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); 22
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
27
28EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
29
19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); 30EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
20EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); 31EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); 32
22EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); 33EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); 34EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); 35EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
36
37EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); 38EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); 39EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
40EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
41EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
42
43EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
44
45EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
46EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
47EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
48
49EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
50EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
51
52EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
53EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..98eb81159a22 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
228 } 228 }
229} 229}
230 230
231int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
232{
233 int i;
234 struct bio_vec *bv;
235
236 bio_for_each_segment(bv, bio, i) {
237 bv->bv_page = alloc_page(gfp);
238 if (!bv->bv_page) {
239 while (bv-- != bio->bi_io_vec + bio->bi_idx)
240 __free_page(bv->bv_page);
241 return -ENOMEM;
242 }
243 }
244
245 return 0;
246}
247
248/* 231/*
249 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 232 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
250 * use permitted, subject to terms of PostgreSQL license; see.) 233 * use permitted, subject to terms of PostgreSQL license; see.)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..1ae2a73ad85f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,8 +15,6 @@
15 15
16struct closure; 16struct closure;
17 17
18#include <trace/events/bcache.h>
19
20#ifdef CONFIG_BCACHE_EDEBUG 18#ifdef CONFIG_BCACHE_EDEBUG
21 19
22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
@@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
566 return x; 564 return x;
567} 565}
568 566
569#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
570
571void bch_bio_map(struct bio *bio, void *base); 567void bch_bio_map(struct bio *bio, void *base);
572 568
573int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
574
575static inline sector_t bdev_sectors(struct block_device *bdev) 569static inline sector_t bdev_sectors(struct block_device *bdev)
576{ 570{
577 return bdev->bd_inode->i_size >> 9; 571 return bdev->bd_inode->i_size >> 9;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..22cbff551628 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -9,6 +9,9 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "writeback.h"
13
14#include <trace/events/bcache.h>
12 15
13static struct workqueue_struct *dirty_wq; 16static struct workqueue_struct *dirty_wq;
14 17
@@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
36 39
37 int change = 0; 40 int change = 0;
38 int64_t error; 41 int64_t error;
39 int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); 42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
40 int64_t derivative = dirty - dc->disk.sectors_dirty_last; 43 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
41 44
42 dc->disk.sectors_dirty_last = dirty; 45 dc->disk.sectors_dirty_last = dirty;
@@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
105 return KEY_DIRTY(k); 108 return KEY_DIRTY(k);
106} 109}
107 110
111static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
112{
113 uint64_t stripe;
114 unsigned nr_sectors = KEY_SIZE(k);
115 struct cached_dev *dc = container_of(buf, struct cached_dev,
116 writeback_keys);
117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
118
119 if (!KEY_DIRTY(k))
120 return false;
121
122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
123 while (1) {
124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
125 stripe_size)
126 return false;
127
128 if (nr_sectors <= stripe_size)
129 return true;
130
131 nr_sectors -= stripe_size;
132 stripe++;
133 }
134}
135
108static void dirty_init(struct keybuf_key *w) 136static void dirty_init(struct keybuf_key *w)
109{ 137{
110 struct dirty_io *io = w->private; 138 struct dirty_io *io = w->private;
@@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
149 searched_from_start = true; 177 searched_from_start = true;
150 } 178 }
151 179
152 bch_refill_keybuf(dc->disk.c, buf, &end); 180 if (dc->partial_stripes_expensive) {
181 uint64_t i;
182
183 for (i = 0; i < dc->disk.nr_stripes; i++)
184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
185 1 << dc->disk.stripe_size_bits)
186 goto full_stripes;
187
188 goto normal_refill;
189full_stripes:
190 bch_refill_keybuf(dc->disk.c, buf, &end,
191 dirty_full_stripe_pred);
192 } else {
193normal_refill:
194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
195 }
153 196
154 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { 197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
155 /* Searched the entire btree - delay awhile */ 198 /* Searched the entire btree - delay awhile */
@@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
181 } 224 }
182} 225}
183 226
184void bch_writeback_add(struct cached_dev *dc, unsigned sectors) 227void bch_writeback_add(struct cached_dev *dc)
185{ 228{
186 atomic_long_add(sectors, &dc->disk.sectors_dirty);
187
188 if (!atomic_read(&dc->has_dirty) && 229 if (!atomic_read(&dc->has_dirty) &&
189 !atomic_xchg(&dc->has_dirty, 1)) { 230 !atomic_xchg(&dc->has_dirty, 1)) {
190 atomic_inc(&dc->count); 231 atomic_inc(&dc->count);
@@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
203 } 244 }
204} 245}
205 246
247void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
248 uint64_t offset, int nr_sectors)
249{
250 struct bcache_device *d = c->devices[inode];
251 unsigned stripe_size, stripe_offset;
252 uint64_t stripe;
253
254 if (!d)
255 return;
256
257 stripe_size = 1 << d->stripe_size_bits;
258 stripe = offset >> d->stripe_size_bits;
259 stripe_offset = offset & (stripe_size - 1);
260
261 while (nr_sectors) {
262 int s = min_t(unsigned, abs(nr_sectors),
263 stripe_size - stripe_offset);
264
265 if (nr_sectors < 0)
266 s = -s;
267
268 atomic_add(s, d->stripe_sectors_dirty + stripe);
269 nr_sectors -= s;
270 stripe_offset = 0;
271 stripe++;
272 }
273}
274
206/* Background writeback - IO loop */ 275/* Background writeback - IO loop */
207 276
208static void dirty_io_destructor(struct closure *cl) 277static void dirty_io_destructor(struct closure *cl)
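
bcache_dev_sectors_dirty_add() above spreads a signed sector count across the per-stripe counters, clamping each step to what is left of the current stripe. A minimal userspace sketch of the same splitting (not kernel code; plain long counters instead of atomic_t, and a deliberately tiny 16-sector stripe so the split is visible):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define STRIPE_SIZE_BITS 4
    #define NR_STRIPES       8

    static long stripe_sectors_dirty[NR_STRIPES];

    static void sectors_dirty_add(uint64_t offset, int nr_sectors)
    {
        unsigned stripe_size = 1 << STRIPE_SIZE_BITS;
        uint64_t stripe = offset >> STRIPE_SIZE_BITS;
        unsigned stripe_offset = offset & (stripe_size - 1);

        while (nr_sectors) {
            /* take at most the rest of the current stripe per iteration */
            int s = abs(nr_sectors);

            if ((unsigned)s > stripe_size - stripe_offset)
                s = stripe_size - stripe_offset;
            if (nr_sectors < 0)
                s = -s;

            stripe_sectors_dirty[stripe] += s;
            nr_sectors -= s;
            stripe_offset = 0;
            stripe++;
        }
    }

    int main(void)
    {
        int i;

        sectors_dirty_add(10, 40);    /* dirty 40 sectors starting at sector 10 */
        sectors_dirty_add(20, -8);    /* later, 8 of them are written back */

        for (i = 0; i < NR_STRIPES; i++)
            printf("stripe %d: %ld dirty sectors\n", i, stripe_sectors_dirty[i]);
        return 0;
    }
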
@@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
216 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 285 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
217 struct keybuf_key *w = io->bio.bi_private; 286 struct keybuf_key *w = io->bio.bi_private;
218 struct cached_dev *dc = io->dc; 287 struct cached_dev *dc = io->dc;
219 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); 288 struct bio_vec *bv;
289 int i;
220 290
221 while (bv-- != io->bio.bi_io_vec) 291 bio_for_each_segment_all(bv, &io->bio, i)
222 __free_page(bv->bv_page); 292 __free_page(bv->bv_page);
223 293
224 /* This is kind of a dumb way of signalling errors. */ 294 /* This is kind of a dumb way of signalling errors. */
@@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
236 for (i = 0; i < KEY_PTRS(&w->key); i++) 306 for (i = 0; i < KEY_PTRS(&w->key); i++)
237 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 307 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
238 308
239 pr_debug("clearing %s", pkey(&w->key));
240 bch_btree_insert(&op, dc->disk.c); 309 bch_btree_insert(&op, dc->disk.c);
241 closure_sync(&op.cl); 310 closure_sync(&op.cl);
242 311
312 if (op.insert_collision)
313 trace_bcache_writeback_collision(&w->key);
314
243 atomic_long_inc(op.insert_collision 315 atomic_long_inc(op.insert_collision
244 ? &dc->disk.c->writeback_keys_failed 316 ? &dc->disk.c->writeback_keys_failed
245 : &dc->disk.c->writeback_keys_done); 317 : &dc->disk.c->writeback_keys_done);
@@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
275 io->bio.bi_bdev = io->dc->bdev; 347 io->bio.bi_bdev = io->dc->bdev;
276 io->bio.bi_end_io = dirty_endio; 348 io->bio.bi_end_io = dirty_endio;
277 349
278 trace_bcache_write_dirty(&io->bio);
279 closure_bio_submit(&io->bio, cl, &io->dc->disk); 350 closure_bio_submit(&io->bio, cl, &io->dc->disk);
280 351
281 continue_at(cl, write_dirty_finish, dirty_wq); 352 continue_at(cl, write_dirty_finish, dirty_wq);
@@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
296{ 367{
297 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 368 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
298 369
299 trace_bcache_read_dirty(&io->bio);
300 closure_bio_submit(&io->bio, cl, &io->dc->disk); 370 closure_bio_submit(&io->bio, cl, &io->dc->disk);
301 371
302 continue_at(cl, write_dirty, dirty_wq); 372 continue_at(cl, write_dirty, dirty_wq);
@@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
349 io->bio.bi_rw = READ; 419 io->bio.bi_rw = READ;
350 io->bio.bi_end_io = read_dirty_endio; 420 io->bio.bi_end_io = read_dirty_endio;
351 421
352 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) 422 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
353 goto err_free; 423 goto err_free;
354 424
355 pr_debug("%s", pkey(&w->key)); 425 trace_bcache_writeback(&w->key);
356 426
357 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
358 428
@@ -375,12 +445,49 @@ err:
375 refill_dirty(cl); 445 refill_dirty(cl);
376} 446}
377 447
448/* Init */
449
450static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
451 struct cached_dev *dc)
452{
453 struct bkey *k;
454 struct btree_iter iter;
455
456 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
457 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
458 if (!b->level) {
459 if (KEY_INODE(k) > dc->disk.id)
460 break;
461
462 if (KEY_DIRTY(k))
463 bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
464 KEY_START(k),
465 KEY_SIZE(k));
466 } else {
467 btree(sectors_dirty_init, k, b, op, dc);
468 if (KEY_INODE(k) > dc->disk.id)
469 break;
470
471 cond_resched();
472 }
473
474 return 0;
475}
476
477void bch_sectors_dirty_init(struct cached_dev *dc)
478{
479 struct btree_op op;
480
481 bch_btree_op_init_stack(&op);
482 btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
483}
484
378void bch_cached_dev_writeback_init(struct cached_dev *dc) 485void bch_cached_dev_writeback_init(struct cached_dev *dc)
379{ 486{
380 closure_init_unlocked(&dc->writeback); 487 closure_init_unlocked(&dc->writeback);
381 init_rwsem(&dc->writeback_lock); 488 init_rwsem(&dc->writeback_lock);
382 489
383 bch_keybuf_init(&dc->writeback_keys, dirty_pred); 490 bch_keybuf_init(&dc->writeback_keys);
384 491
385 dc->writeback_metadata = true; 492 dc->writeback_metadata = true;
386 dc->writeback_running = true; 493 dc->writeback_running = true;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
new file mode 100644
index 000000000000..c91f61bb95b6
--- /dev/null
+++ b/drivers/md/bcache/writeback.h
@@ -0,0 +1,64 @@
1#ifndef _BCACHE_WRITEBACK_H
2#define _BCACHE_WRITEBACK_H
3
4#define CUTOFF_WRITEBACK 40
5#define CUTOFF_WRITEBACK_SYNC 70
6
7static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
8{
9 uint64_t i, ret = 0;
10
11 for (i = 0; i < d->nr_stripes; i++)
12 ret += atomic_read(d->stripe_sectors_dirty + i);
13
14 return ret;
15}
16
17static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
18 uint64_t offset,
19 unsigned nr_sectors)
20{
21 uint64_t stripe = offset >> d->stripe_size_bits;
22
23 while (1) {
24 if (atomic_read(d->stripe_sectors_dirty + stripe))
25 return true;
26
27 if (nr_sectors <= 1 << d->stripe_size_bits)
28 return false;
29
30 nr_sectors -= 1 << d->stripe_size_bits;
31 stripe++;
32 }
33}
34
35static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
36 unsigned cache_mode, bool would_skip)
37{
38 unsigned in_use = dc->disk.c->gc_stats.in_use;
39
40 if (cache_mode != CACHE_MODE_WRITEBACK ||
41 atomic_read(&dc->disk.detaching) ||
42 in_use > CUTOFF_WRITEBACK_SYNC)
43 return false;
44
45 if (dc->partial_stripes_expensive &&
46 bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
47 bio_sectors(bio)))
48 return true;
49
50 if (would_skip)
51 return false;
52
53 return bio->bi_rw & REQ_SYNC ||
54 in_use <= CUTOFF_WRITEBACK;
55}
56
57void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
58void bch_writeback_queue(struct cached_dev *);
59void bch_writeback_add(struct cached_dev *);
60
61void bch_sectors_dirty_init(struct cached_dev *dc);
62void bch_cached_dev_writeback_init(struct cached_dev *);
63
64#endif
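
The new writeback.h helpers drive the partial_stripes_expensive heuristic: should_writeback() forces writeback mode whenever a write touches a stripe that already has dirty data, which bcache_dev_stripe_dirty() determines by walking every stripe the request covers. A compact userspace sketch of that walk (illustrative only; scaled-down stripe size and plain counters instead of atomic_t):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define STRIPE_SIZE_BITS 4
    #define NR_STRIPES       8

    /* stripe 1 already holds 8 dirty sectors in this made-up example */
    static long stripe_sectors_dirty[NR_STRIPES] = { 0, 8, 0, 0, 0, 0, 0, 0 };

    static bool stripe_dirty(uint64_t offset, unsigned nr_sectors)
    {
        uint64_t stripe = offset >> STRIPE_SIZE_BITS;

        while (1) {
            if (stripe_sectors_dirty[stripe])
                return true;              /* some dirty data in a covered stripe */
            if (nr_sectors <= 1U << STRIPE_SIZE_BITS)
                return false;             /* request ends in this (clean) stripe */
            nr_sectors -= 1U << STRIPE_SIZE_BITS;
            stripe++;
        }
    }

    int main(void)
    {
        /* 20 sectors at offset 0 span stripes 0 and 1; stripe 1 is dirty */
        printf("0+20 -> %s\n", stripe_dirty(0, 20) ? "dirty" : "clean");
        /* 8 sectors at offset 32 stay inside clean stripe 2 */
        printf("32+8 -> %s\n", stripe_dirty(32, 8) ? "dirty" : "clean");
        return 0;
    }
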
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 3cc5a0b278c3..5ebda976ea93 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -9,9 +9,7 @@
9struct search; 9struct search;
10 10
11DECLARE_EVENT_CLASS(bcache_request, 11DECLARE_EVENT_CLASS(bcache_request,
12
13 TP_PROTO(struct search *s, struct bio *bio), 12 TP_PROTO(struct search *s, struct bio *bio),
14
15 TP_ARGS(s, bio), 13 TP_ARGS(s, bio),
16 14
17 TP_STRUCT__entry( 15 TP_STRUCT__entry(
@@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request,
22 __field(dev_t, orig_sector ) 20 __field(dev_t, orig_sector )
23 __field(unsigned int, nr_sector ) 21 __field(unsigned int, nr_sector )
24 __array(char, rwbs, 6 ) 22 __array(char, rwbs, 6 )
25 __array(char, comm, TASK_COMM_LEN )
26 ), 23 ),
27 24
28 TP_fast_assign( 25 TP_fast_assign(
@@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request,
33 __entry->orig_sector = bio->bi_sector - 16; 30 __entry->orig_sector = bio->bi_sector - 16;
34 __entry->nr_sector = bio->bi_size >> 9; 31 __entry->nr_sector = bio->bi_size >> 9;
35 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 32 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
36 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
37 ), 33 ),
38 34
39 TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", 35 TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
40 MAJOR(__entry->dev), MINOR(__entry->dev), 36 MAJOR(__entry->dev), MINOR(__entry->dev),
41 __entry->rwbs, 37 __entry->rwbs, (unsigned long long)__entry->sector,
42 (unsigned long long)__entry->sector, 38 __entry->nr_sector, __entry->orig_major, __entry->orig_minor,
43 __entry->nr_sector, __entry->comm,
44 __entry->orig_major, __entry->orig_minor,
45 (unsigned long long)__entry->orig_sector) 39 (unsigned long long)__entry->orig_sector)
46); 40);
47 41
48DEFINE_EVENT(bcache_request, bcache_request_start, 42DECLARE_EVENT_CLASS(bkey,
43 TP_PROTO(struct bkey *k),
44 TP_ARGS(k),
49 45
50 TP_PROTO(struct search *s, struct bio *bio), 46 TP_STRUCT__entry(
47 __field(u32, size )
48 __field(u32, inode )
49 __field(u64, offset )
50 __field(bool, dirty )
51 ),
51 52
52 TP_ARGS(s, bio) 53 TP_fast_assign(
54 __entry->inode = KEY_INODE(k);
55 __entry->offset = KEY_OFFSET(k);
56 __entry->size = KEY_SIZE(k);
57 __entry->dirty = KEY_DIRTY(k);
58 ),
59
60 TP_printk("%u:%llu len %u dirty %u", __entry->inode,
61 __entry->offset, __entry->size, __entry->dirty)
53); 62);
54 63
55DEFINE_EVENT(bcache_request, bcache_request_end, 64DECLARE_EVENT_CLASS(btree_node,
65 TP_PROTO(struct btree *b),
66 TP_ARGS(b),
67
68 TP_STRUCT__entry(
69 __field(size_t, bucket )
70 ),
56 71
72 TP_fast_assign(
73 __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
74 ),
75
76 TP_printk("bucket %zu", __entry->bucket)
77);
78
79/* request.c */
80
81DEFINE_EVENT(bcache_request, bcache_request_start,
57 TP_PROTO(struct search *s, struct bio *bio), 82 TP_PROTO(struct search *s, struct bio *bio),
83 TP_ARGS(s, bio)
84);
58 85
86DEFINE_EVENT(bcache_request, bcache_request_end,
87 TP_PROTO(struct search *s, struct bio *bio),
59 TP_ARGS(s, bio) 88 TP_ARGS(s, bio)
60); 89);
61 90
62DECLARE_EVENT_CLASS(bcache_bio, 91DECLARE_EVENT_CLASS(bcache_bio,
63
64 TP_PROTO(struct bio *bio), 92 TP_PROTO(struct bio *bio),
65
66 TP_ARGS(bio), 93 TP_ARGS(bio),
67 94
68 TP_STRUCT__entry( 95 TP_STRUCT__entry(
@@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio,
70 __field(sector_t, sector ) 97 __field(sector_t, sector )
71 __field(unsigned int, nr_sector ) 98 __field(unsigned int, nr_sector )
72 __array(char, rwbs, 6 ) 99 __array(char, rwbs, 6 )
73 __array(char, comm, TASK_COMM_LEN )
74 ), 100 ),
75 101
76 TP_fast_assign( 102 TP_fast_assign(
@@ -78,191 +104,328 @@ DECLARE_EVENT_CLASS(bcache_bio,
78 __entry->sector = bio->bi_sector; 104 __entry->sector = bio->bi_sector;
79 __entry->nr_sector = bio->bi_size >> 9; 105 __entry->nr_sector = bio->bi_size >> 9;
80 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 106 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
81 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
82 ), 107 ),
83 108
84 TP_printk("%d,%d %s %llu + %u [%s]", 109 TP_printk("%d,%d %s %llu + %u",
85 MAJOR(__entry->dev), MINOR(__entry->dev), 110 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
86 __entry->rwbs, 111 (unsigned long long)__entry->sector, __entry->nr_sector)
87 (unsigned long long)__entry->sector,
88 __entry->nr_sector, __entry->comm)
89); 112);
90 113
91 114DEFINE_EVENT(bcache_bio, bcache_bypass_sequential,
92DEFINE_EVENT(bcache_bio, bcache_passthrough,
93
94 TP_PROTO(struct bio *bio), 115 TP_PROTO(struct bio *bio),
116 TP_ARGS(bio)
117);
95 118
119DEFINE_EVENT(bcache_bio, bcache_bypass_congested,
120 TP_PROTO(struct bio *bio),
96 TP_ARGS(bio) 121 TP_ARGS(bio)
97); 122);
98 123
99DEFINE_EVENT(bcache_bio, bcache_cache_hit, 124TRACE_EVENT(bcache_read,
125 TP_PROTO(struct bio *bio, bool hit, bool bypass),
126 TP_ARGS(bio, hit, bypass),
100 127
101 TP_PROTO(struct bio *bio), 128 TP_STRUCT__entry(
129 __field(dev_t, dev )
130 __field(sector_t, sector )
131 __field(unsigned int, nr_sector )
132 __array(char, rwbs, 6 )
133 __field(bool, cache_hit )
134 __field(bool, bypass )
135 ),
102 136
103 TP_ARGS(bio) 137 TP_fast_assign(
138 __entry->dev = bio->bi_bdev->bd_dev;
139 __entry->sector = bio->bi_sector;
140 __entry->nr_sector = bio->bi_size >> 9;
141 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
142 __entry->cache_hit = hit;
143 __entry->bypass = bypass;
144 ),
145
146 TP_printk("%d,%d %s %llu + %u hit %u bypass %u",
147 MAJOR(__entry->dev), MINOR(__entry->dev),
148 __entry->rwbs, (unsigned long long)__entry->sector,
149 __entry->nr_sector, __entry->cache_hit, __entry->bypass)
104); 150);
105 151
106DEFINE_EVENT(bcache_bio, bcache_cache_miss, 152TRACE_EVENT(bcache_write,
153 TP_PROTO(struct bio *bio, bool writeback, bool bypass),
154 TP_ARGS(bio, writeback, bypass),
107 155
108 TP_PROTO(struct bio *bio), 156 TP_STRUCT__entry(
157 __field(dev_t, dev )
158 __field(sector_t, sector )
159 __field(unsigned int, nr_sector )
160 __array(char, rwbs, 6 )
161 __field(bool, writeback )
162 __field(bool, bypass )
163 ),
109 164
110 TP_ARGS(bio) 165 TP_fast_assign(
166 __entry->dev = bio->bi_bdev->bd_dev;
167 __entry->sector = bio->bi_sector;
168 __entry->nr_sector = bio->bi_size >> 9;
169 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
170 __entry->writeback = writeback;
171 __entry->bypass = bypass;
172 ),
173
174 TP_printk("%d,%d %s %llu + %u hit %u bypass %u",
175 MAJOR(__entry->dev), MINOR(__entry->dev),
176 __entry->rwbs, (unsigned long long)__entry->sector,
177 __entry->nr_sector, __entry->writeback, __entry->bypass)
111); 178);
112 179
113DEFINE_EVENT(bcache_bio, bcache_read_retry, 180DEFINE_EVENT(bcache_bio, bcache_read_retry,
114
115 TP_PROTO(struct bio *bio), 181 TP_PROTO(struct bio *bio),
116
117 TP_ARGS(bio) 182 TP_ARGS(bio)
118); 183);
119 184
120DEFINE_EVENT(bcache_bio, bcache_writethrough, 185DEFINE_EVENT(bkey, bcache_cache_insert,
186 TP_PROTO(struct bkey *k),
187 TP_ARGS(k)
188);
121 189
122 TP_PROTO(struct bio *bio), 190/* Journal */
123 191
124 TP_ARGS(bio) 192DECLARE_EVENT_CLASS(cache_set,
125); 193 TP_PROTO(struct cache_set *c),
194 TP_ARGS(c),
126 195
127DEFINE_EVENT(bcache_bio, bcache_writeback, 196 TP_STRUCT__entry(
197 __array(char, uuid, 16 )
198 ),
128 199
129 TP_PROTO(struct bio *bio), 200 TP_fast_assign(
201 memcpy(__entry->uuid, c->sb.set_uuid, 16);
202 ),
130 203
131 TP_ARGS(bio) 204 TP_printk("%pU", __entry->uuid)
132); 205);
133 206
134DEFINE_EVENT(bcache_bio, bcache_write_skip, 207DEFINE_EVENT(bkey, bcache_journal_replay_key,
135 208 TP_PROTO(struct bkey *k),
136 TP_PROTO(struct bio *bio), 209 TP_ARGS(k)
210);
137 211
138 TP_ARGS(bio) 212DEFINE_EVENT(cache_set, bcache_journal_full,
213 TP_PROTO(struct cache_set *c),
214 TP_ARGS(c)
139); 215);
140 216
141DEFINE_EVENT(bcache_bio, bcache_btree_read, 217DEFINE_EVENT(cache_set, bcache_journal_entry_full,
218 TP_PROTO(struct cache_set *c),
219 TP_ARGS(c)
220);
142 221
222DEFINE_EVENT(bcache_bio, bcache_journal_write,
143 TP_PROTO(struct bio *bio), 223 TP_PROTO(struct bio *bio),
144
145 TP_ARGS(bio) 224 TP_ARGS(bio)
146); 225);
147 226
148DEFINE_EVENT(bcache_bio, bcache_btree_write, 227/* Btree */
149 228
150 TP_PROTO(struct bio *bio), 229DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize,
230 TP_PROTO(struct cache_set *c),
231 TP_ARGS(c)
232);
151 233
152 TP_ARGS(bio) 234DEFINE_EVENT(btree_node, bcache_btree_read,
235 TP_PROTO(struct btree *b),
236 TP_ARGS(b)
153); 237);
154 238
155DEFINE_EVENT(bcache_bio, bcache_write_dirty, 239TRACE_EVENT(bcache_btree_write,
240 TP_PROTO(struct btree *b),
241 TP_ARGS(b),
156 242
157 TP_PROTO(struct bio *bio), 243 TP_STRUCT__entry(
244 __field(size_t, bucket )
245 __field(unsigned, block )
246 __field(unsigned, keys )
247 ),
158 248
159 TP_ARGS(bio) 249 TP_fast_assign(
250 __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
251 __entry->block = b->written;
252 __entry->keys = b->sets[b->nsets].data->keys;
253 ),
254
255 TP_printk("bucket %zu", __entry->bucket)
160); 256);
161 257
162DEFINE_EVENT(bcache_bio, bcache_read_dirty, 258DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
259 TP_PROTO(struct btree *b),
260 TP_ARGS(b)
261);
163 262
164 TP_PROTO(struct bio *bio), 263DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
264 TP_PROTO(struct btree *b),
265 TP_ARGS(b)
266);
165 267
166 TP_ARGS(bio) 268DEFINE_EVENT(btree_node, bcache_btree_node_free,
269 TP_PROTO(struct btree *b),
270 TP_ARGS(b)
167); 271);
168 272
169DEFINE_EVENT(bcache_bio, bcache_write_moving, 273TRACE_EVENT(bcache_btree_gc_coalesce,
274 TP_PROTO(unsigned nodes),
275 TP_ARGS(nodes),
170 276
171 TP_PROTO(struct bio *bio), 277 TP_STRUCT__entry(
278 __field(unsigned, nodes )
279 ),
172 280
173 TP_ARGS(bio) 281 TP_fast_assign(
282 __entry->nodes = nodes;
283 ),
284
285 TP_printk("coalesced %u nodes", __entry->nodes)
174); 286);
175 287
176DEFINE_EVENT(bcache_bio, bcache_read_moving, 288DEFINE_EVENT(cache_set, bcache_gc_start,
289 TP_PROTO(struct cache_set *c),
290 TP_ARGS(c)
291);
177 292
178 TP_PROTO(struct bio *bio), 293DEFINE_EVENT(cache_set, bcache_gc_end,
294 TP_PROTO(struct cache_set *c),
295 TP_ARGS(c)
296);
179 297
180 TP_ARGS(bio) 298DEFINE_EVENT(bkey, bcache_gc_copy,
299 TP_PROTO(struct bkey *k),
300 TP_ARGS(k)
181); 301);
182 302
183DEFINE_EVENT(bcache_bio, bcache_journal_write, 303DEFINE_EVENT(bkey, bcache_gc_copy_collision,
304 TP_PROTO(struct bkey *k),
305 TP_ARGS(k)
306);
184 307
185 TP_PROTO(struct bio *bio), 308TRACE_EVENT(bcache_btree_insert_key,
309 TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status),
310 TP_ARGS(b, k, op, status),
186 311
187 TP_ARGS(bio) 312 TP_STRUCT__entry(
188); 313 __field(u64, btree_node )
314 __field(u32, btree_level )
315 __field(u32, inode )
316 __field(u64, offset )
317 __field(u32, size )
318 __field(u8, dirty )
319 __field(u8, op )
320 __field(u8, status )
321 ),
189 322
190DECLARE_EVENT_CLASS(bcache_cache_bio, 323 TP_fast_assign(
324 __entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0);
325 __entry->btree_level = b->level;
326 __entry->inode = KEY_INODE(k);
327 __entry->offset = KEY_OFFSET(k);
328 __entry->size = KEY_SIZE(k);
329 __entry->dirty = KEY_DIRTY(k);
330 __entry->op = op;
331 __entry->status = status;
332 ),
191 333
192 TP_PROTO(struct bio *bio, 334 TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u",
193 sector_t orig_sector, 335 __entry->status, __entry->op,
194 struct block_device* orig_bdev), 336 __entry->btree_node, __entry->btree_level,
337 __entry->inode, __entry->offset,
338 __entry->size, __entry->dirty)
339);
195 340
196 TP_ARGS(bio, orig_sector, orig_bdev), 341DECLARE_EVENT_CLASS(btree_split,
342 TP_PROTO(struct btree *b, unsigned keys),
343 TP_ARGS(b, keys),
197 344
198 TP_STRUCT__entry( 345 TP_STRUCT__entry(
199 __field(dev_t, dev ) 346 __field(size_t, bucket )
200 __field(dev_t, orig_dev ) 347 __field(unsigned, keys )
201 __field(sector_t, sector )
202 __field(sector_t, orig_sector )
203 __field(unsigned int, nr_sector )
204 __array(char, rwbs, 6 )
205 __array(char, comm, TASK_COMM_LEN )
206 ), 348 ),
207 349
208 TP_fast_assign( 350 TP_fast_assign(
209 __entry->dev = bio->bi_bdev->bd_dev; 351 __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
210 __entry->orig_dev = orig_bdev->bd_dev; 352 __entry->keys = keys;
211 __entry->sector = bio->bi_sector;
212 __entry->orig_sector = orig_sector;
213 __entry->nr_sector = bio->bi_size >> 9;
214 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
215 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
216 ), 353 ),
217 354
218 TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", 355 TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys)
219 MAJOR(__entry->dev), MINOR(__entry->dev),
220 __entry->rwbs,
221 (unsigned long long)__entry->sector,
222 __entry->nr_sector, __entry->comm,
223 MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
224 (unsigned long long)__entry->orig_sector)
225); 356);
226 357
227DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, 358DEFINE_EVENT(btree_split, bcache_btree_node_split,
228 359 TP_PROTO(struct btree *b, unsigned keys),
229 TP_PROTO(struct bio *bio, 360 TP_ARGS(b, keys)
230 sector_t orig_sector, 361);
231 struct block_device *orig_bdev),
232 362
233 TP_ARGS(bio, orig_sector, orig_bdev) 363DEFINE_EVENT(btree_split, bcache_btree_node_compact,
364 TP_PROTO(struct btree *b, unsigned keys),
365 TP_ARGS(b, keys)
234); 366);
235 367
236DECLARE_EVENT_CLASS(bcache_gc, 368DEFINE_EVENT(btree_node, bcache_btree_set_root,
369 TP_PROTO(struct btree *b),
370 TP_ARGS(b)
371);
237 372
238 TP_PROTO(uint8_t *uuid), 373/* Allocator */
239 374
240 TP_ARGS(uuid), 375TRACE_EVENT(bcache_alloc_invalidate,
376 TP_PROTO(struct cache *ca),
377 TP_ARGS(ca),
241 378
242 TP_STRUCT__entry( 379 TP_STRUCT__entry(
243 __field(uint8_t *, uuid) 380 __field(unsigned, free )
381 __field(unsigned, free_inc )
382 __field(unsigned, free_inc_size )
383 __field(unsigned, unused )
244 ), 384 ),
245 385
246 TP_fast_assign( 386 TP_fast_assign(
247 __entry->uuid = uuid; 387 __entry->free = fifo_used(&ca->free);
388 __entry->free_inc = fifo_used(&ca->free_inc);
389 __entry->free_inc_size = ca->free_inc.size;
390 __entry->unused = fifo_used(&ca->unused);
248 ), 391 ),
249 392
250 TP_printk("%pU", __entry->uuid) 393 TP_printk("free %u free_inc %u/%u unused %u", __entry->free,
394 __entry->free_inc, __entry->free_inc_size, __entry->unused)
251); 395);
252 396
397TRACE_EVENT(bcache_alloc_fail,
398 TP_PROTO(struct cache *ca),
399 TP_ARGS(ca),
253 400
254DEFINE_EVENT(bcache_gc, bcache_gc_start, 401 TP_STRUCT__entry(
402 __field(unsigned, free )
403 __field(unsigned, free_inc )
404 __field(unsigned, unused )
405 __field(unsigned, blocked )
406 ),
255 407
256 TP_PROTO(uint8_t *uuid), 408 TP_fast_assign(
409 __entry->free = fifo_used(&ca->free);
410 __entry->free_inc = fifo_used(&ca->free_inc);
411 __entry->unused = fifo_used(&ca->unused);
412 __entry->blocked = atomic_read(&ca->set->prio_blocked);
413 ),
257 414
258 TP_ARGS(uuid) 415 TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free,
416 __entry->free_inc, __entry->unused, __entry->blocked)
259); 417);
260 418
261DEFINE_EVENT(bcache_gc, bcache_gc_end, 419/* Background writeback */
262 420
263 TP_PROTO(uint8_t *uuid), 421DEFINE_EVENT(bkey, bcache_writeback,
422 TP_PROTO(struct bkey *k),
423 TP_ARGS(k)
424);
264 425
265 TP_ARGS(uuid) 426DEFINE_EVENT(bkey, bcache_writeback_collision,
427 TP_PROTO(struct bkey *k),
428 TP_ARGS(k)
266); 429);
267 430
268#endif /* _TRACE_BCACHE_H */ 431#endif /* _TRACE_BCACHE_H */