author     Kent Overstreet <koverstreet@google.com>	2013-03-23 19:11:31 -0400
committer  Kent Overstreet <koverstreet@google.com>	2013-03-23 19:11:31 -0400
commit     cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch)
tree       c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers
parent     ea6749c705d9e629ed03c7336cc929fc6014b834 (diff)
bcache: A block layer cache
Does writethrough and writeback caching, handles unclean shutdown, and
has a bunch of other nifty features motivated by real world usage.

See the wiki at http://bcache.evilpiepirate.org for more.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig               2
-rw-r--r--  drivers/md/Makefile              1
-rw-r--r--  drivers/md/bcache/Kconfig       42
-rw-r--r--  drivers/md/bcache/Makefile       7
-rw-r--r--  drivers/md/bcache/alloc.c      583
-rw-r--r--  drivers/md/bcache/bcache.h    1232
-rw-r--r--  drivers/md/bcache/bset.c      1190
-rw-r--r--  drivers/md/bcache/bset.h       379
-rw-r--r--  drivers/md/bcache/btree.c     2503
-rw-r--r--  drivers/md/bcache/btree.h      405
-rw-r--r--  drivers/md/bcache/closure.c    348
-rw-r--r--  drivers/md/bcache/closure.h    670
-rw-r--r--  drivers/md/bcache/debug.c      563
-rw-r--r--  drivers/md/bcache/debug.h       54
-rw-r--r--  drivers/md/bcache/io.c         390
-rw-r--r--  drivers/md/bcache/journal.c    785
-rw-r--r--  drivers/md/bcache/journal.h    215
-rw-r--r--  drivers/md/bcache/movinggc.c   254
-rw-r--r--  drivers/md/bcache/request.c   1409
-rw-r--r--  drivers/md/bcache/request.h     62
-rw-r--r--  drivers/md/bcache/stats.c      245
-rw-r--r--  drivers/md/bcache/stats.h       58
-rw-r--r--  drivers/md/bcache/super.c     1941
-rw-r--r--  drivers/md/bcache/sysfs.c      817
-rw-r--r--  drivers/md/bcache/sysfs.h      110
-rw-r--r--  drivers/md/bcache/trace.c       26
-rw-r--r--  drivers/md/bcache/util.c       389
-rw-r--r--  drivers/md/bcache/util.h       589
-rw-r--r--  drivers/md/bcache/writeback.c  414
29 files changed, 15683 insertions(+), 0 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4d8d90b4fe78..3bfc8f1da9fe 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -174,6 +174,8 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+source "drivers/md/bcache/Kconfig"
+
 config BLK_DEV_DM
 	tristate "Device mapper support"
 	---help---
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 7ceeaefc0e95..1439fd4ad9b1 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
new file mode 100644
index 000000000000..05c220d05e23
--- /dev/null
+++ b/drivers/md/bcache/Kconfig
@@ -0,0 +1,42 @@
1
2config BCACHE
3 tristate "Block device as cache"
4 select CLOSURES
5 ---help---
6 Allows a block device to be used as cache for other devices; uses
7 a btree for indexing and the layout is optimized for SSDs.
8
9 See Documentation/bcache.txt for details.
10
11config BCACHE_DEBUG
12 bool "Bcache debugging"
13 depends on BCACHE
14 ---help---
15 Don't select this option unless you're a developer
16
17 Enables extra debugging tools (primarily a fuzz tester)
18
19config BCACHE_EDEBUG
20 bool "Extended runtime checks"
21 depends on BCACHE
22 ---help---
23 Don't select this option unless you're a developer
24
25 Enables extra runtime checks which significantly affect performance
26
27config BCACHE_CLOSURES_DEBUG
28 bool "Debug closures"
29 depends on BCACHE
30 select DEBUG_FS
31 ---help---
32 Keeps all active closures in a linked list and provides a debugfs
33 interface to list them, which makes it possible to see asynchronous
34 operations that get stuck.
35
36# cgroup code needs to be updated:
37#
38#config CGROUP_BCACHE
39# bool "Cgroup controls for bcache"
40# depends on BCACHE && BLK_CGROUP
41# ---help---
42# TODO
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
new file mode 100644
index 000000000000..0e9c82523be6
--- /dev/null
+++ b/drivers/md/bcache/Makefile
@@ -0,0 +1,7 @@
1
2obj-$(CONFIG_BCACHE) += bcache.o
3
4bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\
5 movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
6
7CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
new file mode 100644
index 000000000000..ed18115e078e
--- /dev/null
+++ b/drivers/md/bcache/alloc.c
@@ -0,0 +1,583 @@
1/*
2 * Primary bucket allocation code
3 *
4 * Copyright 2012 Google, Inc.
5 *
6 * Allocation in bcache is done in terms of buckets:
7 *
8 * Each bucket has an 8 bit gen associated with it; this gen corresponds to the gen in
9 * btree pointers - they must match for the pointer to be considered valid.
10 *
11 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
12 * bucket simply by incrementing its gen.
13 *
14 * The gens (along with the priorities; it's really the gens that are important but
15 * the code is named as if it's the priorities) are written in an arbitrary list
16 * of buckets on disk, with a pointer to them in the journal header.
17 *
18 * When we invalidate a bucket, we have to write its new gen to disk and wait
19 * for that write to complete before we use it - otherwise after a crash we
20 * could have pointers that appeared to be good but pointed to data that had
21 * been overwritten.
22 *
23 * Since the gens and priorities are all stored contiguously on disk, we can
24 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
25 * call prio_write(), and when prio_write() finishes we pull buckets off the
26 * free_inc list and optionally discard them.
27 *
28 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
29 * priorities and gens were being written before we could allocate. c->free is a
30 * smaller freelist, and buckets on that list are always ready to be used.
31 *
32 * If we've got discards enabled, that happens when a bucket moves from the
33 * free_inc list to the free list.
34 *
35 * There is another freelist, because sometimes we have buckets that we know
36 * have nothing pointing into them - these we can reuse without waiting for
37 * priorities to be rewritten. These come from freed btree nodes and buckets
38 * that garbage collection discovered no longer had valid keys pointing into
39 * them (because they were overwritten). That's the unused list - buckets on the
40 * unused list move to the free list, optionally being discarded in the process.
41 *
42 * It's also important to ensure that gens don't wrap around - with respect to
43 * either the oldest gen in the btree or the gen on disk. This is quite
44 * difficult to do in practice, but we explicitly guard against it anyway - if
45 * a bucket is in danger of wrapping around we simply skip invalidating it that
46 * time around, and we garbage collect or rewrite the priorities sooner than we
47 * would have otherwise.
48 *
49 * bch_bucket_alloc() allocates a single bucket from a specific cache.
50 *
51 * bch_bucket_alloc_set() allocates one or more buckets from different caches
52 * out of a cache set.
53 *
54 * free_some_buckets() drives all the processes described above. It's called
55 * from bch_bucket_alloc() and a few other places that need to make sure free
56 * buckets are ready.
57 *
58 * invalidate_buckets_(lru|fifo)() find buckets that are available to be
59 * invalidated, and then invalidate them and stick them on the free_inc list -
60 * in either lru or fifo order.
61 */
62
63#include "bcache.h"
64#include "btree.h"
65
66#include <linux/random.h>
67
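/*
 * Illustrative sketch, not part of the patch: the gen check described in the
 * header comment above amounts to comparing the 8 bit gen embedded in a
 * pointer with the gen of the bucket it points into. bch_ptr_is_current() is
 * a hypothetical helper name chosen for this example; PTR_GEN() and
 * PTR_BUCKET() are the real accessors from bcache.h.
 */
static inline bool bch_ptr_is_current(struct cache_set *c,
				      const struct bkey *k, unsigned ptr)
{
	/* A pointer goes stale the moment its bucket's gen is incremented */
	return PTR_GEN(k, ptr) == PTR_BUCKET(c, k, ptr)->gen;
}
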
68#define MAX_IN_FLIGHT_DISCARDS 8U
69
70/* Bucket heap / gen */
71
72uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
73{
74 uint8_t ret = ++b->gen;
75
76 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
77 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
78
79 if (CACHE_SYNC(&ca->set->sb)) {
80 ca->need_save_prio = max(ca->need_save_prio,
81 bucket_disk_gen(b));
82 WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
83 }
84
85 return ret;
86}
87
88void bch_rescale_priorities(struct cache_set *c, int sectors)
89{
90 struct cache *ca;
91 struct bucket *b;
92 unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
93 unsigned i;
94 int r;
95
96 atomic_sub(sectors, &c->rescale);
97
98 do {
99 r = atomic_read(&c->rescale);
100
101 if (r >= 0)
102 return;
103 } while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
104
105 mutex_lock(&c->bucket_lock);
106
107 c->min_prio = USHRT_MAX;
108
109 for_each_cache(ca, c, i)
110 for_each_bucket(b, ca)
111 if (b->prio &&
112 b->prio != BTREE_PRIO &&
113 !atomic_read(&b->pin)) {
114 b->prio--;
115 c->min_prio = min(c->min_prio, b->prio);
116 }
117
118 mutex_unlock(&c->bucket_lock);
119}
120
121/* Discard/TRIM */
122
123struct discard {
124 struct list_head list;
125 struct work_struct work;
126 struct cache *ca;
127 long bucket;
128
129 struct bio bio;
130 struct bio_vec bv;
131};
132
133static void discard_finish(struct work_struct *w)
134{
135 struct discard *d = container_of(w, struct discard, work);
136 struct cache *ca = d->ca;
137 char buf[BDEVNAME_SIZE];
138
139 if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
140 pr_notice("discard error on %s, disabling",
141 bdevname(ca->bdev, buf));
142 d->ca->discard = 0;
143 }
144
145 mutex_lock(&ca->set->bucket_lock);
146
147 fifo_push(&ca->free, d->bucket);
148 list_add(&d->list, &ca->discards);
149 atomic_dec(&ca->discards_in_flight);
150
151 mutex_unlock(&ca->set->bucket_lock);
152
153 closure_wake_up(&ca->set->bucket_wait);
154 wake_up(&ca->set->alloc_wait);
155
156 closure_put(&ca->set->cl);
157}
158
159static void discard_endio(struct bio *bio, int error)
160{
161 struct discard *d = container_of(bio, struct discard, bio);
162 schedule_work(&d->work);
163}
164
165static void do_discard(struct cache *ca, long bucket)
166{
167 struct discard *d = list_first_entry(&ca->discards,
168 struct discard, list);
169
170 list_del(&d->list);
171 d->bucket = bucket;
172
173 atomic_inc(&ca->discards_in_flight);
174 closure_get(&ca->set->cl);
175
176 bio_init(&d->bio);
177
178 d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket);
179 d->bio.bi_bdev = ca->bdev;
180 d->bio.bi_rw = REQ_WRITE|REQ_DISCARD;
181 d->bio.bi_max_vecs = 1;
182 d->bio.bi_io_vec = d->bio.bi_inline_vecs;
183 d->bio.bi_size = bucket_bytes(ca);
184 d->bio.bi_end_io = discard_endio;
185 bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
186
187 submit_bio(0, &d->bio);
188}
189
190/* Allocation */
191
192static inline bool can_inc_bucket_gen(struct bucket *b)
193{
194 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
195 bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
196}
197
198bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
199{
200 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
201
202 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
203 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
204 return false;
205
206 b->prio = 0;
207
208 if (can_inc_bucket_gen(b) &&
209 fifo_push(&ca->unused, b - ca->buckets)) {
210 atomic_inc(&b->pin);
211 return true;
212 }
213
214 return false;
215}
216
217static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
218{
219 return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
220 !atomic_read(&b->pin) &&
221 can_inc_bucket_gen(b);
222}
223
224static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
225{
226 bch_inc_gen(ca, b);
227 b->prio = INITIAL_PRIO;
228 atomic_inc(&b->pin);
229 fifo_push(&ca->free_inc, b - ca->buckets);
230}
231
232static void invalidate_buckets_lru(struct cache *ca)
233{
234 unsigned bucket_prio(struct bucket *b)
235 {
236 return ((unsigned) (b->prio - ca->set->min_prio)) *
237 GC_SECTORS_USED(b);
238 }
239
240 bool bucket_max_cmp(struct bucket *l, struct bucket *r)
241 {
242 return bucket_prio(l) < bucket_prio(r);
243 }
244
245 bool bucket_min_cmp(struct bucket *l, struct bucket *r)
246 {
247 return bucket_prio(l) > bucket_prio(r);
248 }
249
250 struct bucket *b;
251 ssize_t i;
252
253 ca->heap.used = 0;
254
255 for_each_bucket(b, ca) {
256 if (!can_invalidate_bucket(ca, b))
257 continue;
258
259 if (!GC_SECTORS_USED(b)) {
260 if (!bch_bucket_add_unused(ca, b))
261 return;
262 } else {
263 if (!heap_full(&ca->heap))
264 heap_add(&ca->heap, b, bucket_max_cmp);
265 else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
266 ca->heap.data[0] = b;
267 heap_sift(&ca->heap, 0, bucket_max_cmp);
268 }
269 }
270 }
271
272 if (ca->heap.used * 2 < ca->heap.size)
273 bch_queue_gc(ca->set);
274
275 for (i = ca->heap.used / 2 - 1; i >= 0; --i)
276 heap_sift(&ca->heap, i, bucket_min_cmp);
277
278 while (!fifo_full(&ca->free_inc)) {
279 if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
280 /* We don't want to be calling invalidate_buckets()
281 * multiple times when it can't do anything
282 */
283 ca->invalidate_needs_gc = 1;
284 bch_queue_gc(ca->set);
285 return;
286 }
287
288 invalidate_one_bucket(ca, b);
289 }
290}
291
292static void invalidate_buckets_fifo(struct cache *ca)
293{
294 struct bucket *b;
295 size_t checked = 0;
296
297 while (!fifo_full(&ca->free_inc)) {
298 if (ca->fifo_last_bucket < ca->sb.first_bucket ||
299 ca->fifo_last_bucket >= ca->sb.nbuckets)
300 ca->fifo_last_bucket = ca->sb.first_bucket;
301
302 b = ca->buckets + ca->fifo_last_bucket++;
303
304 if (can_invalidate_bucket(ca, b))
305 invalidate_one_bucket(ca, b);
306
307 if (++checked >= ca->sb.nbuckets) {
308 ca->invalidate_needs_gc = 1;
309 bch_queue_gc(ca->set);
310 return;
311 }
312 }
313}
314
315static void invalidate_buckets_random(struct cache *ca)
316{
317 struct bucket *b;
318 size_t checked = 0;
319
320 while (!fifo_full(&ca->free_inc)) {
321 size_t n;
322 get_random_bytes(&n, sizeof(n));
323
324 n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
325 n += ca->sb.first_bucket;
326
327 b = ca->buckets + n;
328
329 if (can_invalidate_bucket(ca, b))
330 invalidate_one_bucket(ca, b);
331
332 if (++checked >= ca->sb.nbuckets / 2) {
333 ca->invalidate_needs_gc = 1;
334 bch_queue_gc(ca->set);
335 return;
336 }
337 }
338}
339
340static void invalidate_buckets(struct cache *ca)
341{
342 if (ca->invalidate_needs_gc)
343 return;
344
345 switch (CACHE_REPLACEMENT(&ca->sb)) {
346 case CACHE_REPLACEMENT_LRU:
347 invalidate_buckets_lru(ca);
348 break;
349 case CACHE_REPLACEMENT_FIFO:
350 invalidate_buckets_fifo(ca);
351 break;
352 case CACHE_REPLACEMENT_RANDOM:
353 invalidate_buckets_random(ca);
354 break;
355 }
356}
357
358#define allocator_wait(ca, cond) \
359do { \
360 DEFINE_WAIT(__wait); \
361 \
362 while (!(cond)) { \
363 prepare_to_wait(&ca->set->alloc_wait, \
364 &__wait, TASK_INTERRUPTIBLE); \
365 \
366 mutex_unlock(&(ca)->set->bucket_lock); \
367 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
368 finish_wait(&ca->set->alloc_wait, &__wait); \
369 closure_return(cl); \
370 } \
371 \
372 schedule(); \
373 __set_current_state(TASK_RUNNING); \
374 mutex_lock(&(ca)->set->bucket_lock); \
375 } \
376 \
377 finish_wait(&ca->set->alloc_wait, &__wait); \
378} while (0)
379
380void bch_allocator_thread(struct closure *cl)
381{
382 struct cache *ca = container_of(cl, struct cache, alloc);
383
384 mutex_lock(&ca->set->bucket_lock);
385
386 while (1) {
387 while (1) {
388 long bucket;
389
390 if ((!atomic_read(&ca->set->prio_blocked) ||
391 !CACHE_SYNC(&ca->set->sb)) &&
392 !fifo_empty(&ca->unused))
393 fifo_pop(&ca->unused, bucket);
394 else if (!fifo_empty(&ca->free_inc))
395 fifo_pop(&ca->free_inc, bucket);
396 else
397 break;
398
399 allocator_wait(ca, (int) fifo_free(&ca->free) >
400 atomic_read(&ca->discards_in_flight));
401
402 if (ca->discard) {
403 allocator_wait(ca, !list_empty(&ca->discards));
404 do_discard(ca, bucket);
405 } else {
406 fifo_push(&ca->free, bucket);
407 closure_wake_up(&ca->set->bucket_wait);
408 }
409 }
410
411 allocator_wait(ca, ca->set->gc_mark_valid);
412 invalidate_buckets(ca);
413
414 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) ||
415 !CACHE_SYNC(&ca->set->sb));
416
417 if (CACHE_SYNC(&ca->set->sb) &&
418 (!fifo_empty(&ca->free_inc) ||
419 ca->need_save_prio > 64)) {
420 bch_prio_write(ca);
421 }
422 }
423}
424
425long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
426{
427 long r = -1;
428again:
429 wake_up(&ca->set->alloc_wait);
430
431 if (fifo_used(&ca->free) > ca->watermark[watermark] &&
432 fifo_pop(&ca->free, r)) {
433 struct bucket *b = ca->buckets + r;
434#ifdef CONFIG_BCACHE_EDEBUG
435 size_t iter;
436 long i;
437
438 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
439 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
440
441 fifo_for_each(i, &ca->free, iter)
442 BUG_ON(i == r);
443 fifo_for_each(i, &ca->free_inc, iter)
444 BUG_ON(i == r);
445 fifo_for_each(i, &ca->unused, iter)
446 BUG_ON(i == r);
447#endif
448 BUG_ON(atomic_read(&b->pin) != 1);
449
450 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
451
452 if (watermark <= WATERMARK_METADATA) {
453 SET_GC_MARK(b, GC_MARK_METADATA);
454 b->prio = BTREE_PRIO;
455 } else {
456 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
457 b->prio = INITIAL_PRIO;
458 }
459
460 return r;
461 }
462
463 pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
464 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
465 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
466
467 if (cl) {
468 closure_wait(&ca->set->bucket_wait, cl);
469
470 if (closure_blocking(cl)) {
471 mutex_unlock(&ca->set->bucket_lock);
472 closure_sync(cl);
473 mutex_lock(&ca->set->bucket_lock);
474 goto again;
475 }
476 }
477
478 return -1;
479}
480
481void bch_bucket_free(struct cache_set *c, struct bkey *k)
482{
483 unsigned i;
484
485 for (i = 0; i < KEY_PTRS(k); i++) {
486 struct bucket *b = PTR_BUCKET(c, k, i);
487
488 SET_GC_MARK(b, 0);
489 SET_GC_SECTORS_USED(b, 0);
490 bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
491 }
492}
493
494int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
495 struct bkey *k, int n, struct closure *cl)
496{
497 int i;
498
499 lockdep_assert_held(&c->bucket_lock);
500 BUG_ON(!n || n > c->caches_loaded || n > 8);
501
502 bkey_init(k);
503
504 /* sort by free space/prio of oldest data in caches */
505
506 for (i = 0; i < n; i++) {
507 struct cache *ca = c->cache_by_alloc[i];
508 long b = bch_bucket_alloc(ca, watermark, cl);
509
510 if (b == -1)
511 goto err;
512
513 k->ptr[i] = PTR(ca->buckets[b].gen,
514 bucket_to_sector(c, b),
515 ca->sb.nr_this_dev);
516
517 SET_KEY_PTRS(k, i + 1);
518 }
519
520 return 0;
521err:
522 bch_bucket_free(c, k);
523 __bkey_put(c, k);
524 return -1;
525}
526
527int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
528 struct bkey *k, int n, struct closure *cl)
529{
530 int ret;
531 mutex_lock(&c->bucket_lock);
532 ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
533 mutex_unlock(&c->bucket_lock);
534 return ret;
535}
536
537/* Init */
538
539void bch_cache_allocator_exit(struct cache *ca)
540{
541 struct discard *d;
542
543 while (!list_empty(&ca->discards)) {
544 d = list_first_entry(&ca->discards, struct discard, list);
545 cancel_work_sync(&d->work);
546 list_del(&d->list);
547 kfree(d);
548 }
549}
550
551int bch_cache_allocator_init(struct cache *ca)
552{
553 unsigned i;
554
555 /*
556 * Reserve:
557 * Prio/gen writes first
558 * Then 8 for btree allocations
559 * Then half for the moving garbage collector
560 */
561
562 ca->watermark[WATERMARK_PRIO] = 0;
563
564 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
565
566 ca->watermark[WATERMARK_MOVINGGC] = 8 +
567 ca->watermark[WATERMARK_METADATA];
568
569 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
570 ca->watermark[WATERMARK_MOVINGGC];
571
572 for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
573 struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
574 if (!d)
575 return -ENOMEM;
576
577 d->ca = ca;
578 INIT_WORK(&d->work, discard_finish);
579 list_add(&d->list, &ca->discards);
580 }
581
582 return 0;
583}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
new file mode 100644
index 000000000000..d01a553f63f3
--- /dev/null
+++ b/drivers/md/bcache/bcache.h
@@ -0,0 +1,1232 @@
1#ifndef _BCACHE_H
2#define _BCACHE_H
3
4/*
5 * SOME HIGH LEVEL CODE DOCUMENTATION:
6 *
7 * Bcache mostly works with cache sets, cache devices, and backing devices.
8 *
9 * Support for multiple cache devices hasn't quite been finished off yet, but
10 * it's about 95% plumbed through. A cache set and its cache devices are sort of
11 * like an md raid array and its component devices. Most of the code doesn't care
12 * about individual cache devices; the main abstraction is the cache set.
13 *
14 * Multiple cache devices is intended to give us the ability to mirror dirty
15 * cached data and metadata, without mirroring clean cached data.
16 *
17 * Backing devices are different, in that they have a lifetime independent of a
18 * cache set. When you register a newly formatted backing device it'll come up
19 * in passthrough mode, and then you can attach and detach a backing device from
20 * a cache set at runtime - while it's mounted and in use. Detaching implicitly
21 * invalidates any cached data for that backing device.
22 *
23 * A cache set can have multiple (many) backing devices attached to it.
24 *
25 * There's also flash only volumes - this is the reason for the distinction
26 * between struct cached_dev and struct bcache_device. A flash only volume
27 * works much like a bcache device that has a backing device, except the
28 * "cached" data is always dirty. The end result is that we get thin
29 * provisioning with very little additional code.
30 *
31 * Flash only volumes work but they're not production ready because the moving
32 * garbage collector needs more work. More on that later.
33 *
34 * BUCKETS/ALLOCATION:
35 *
36 * Bcache is primarily designed for caching, which means that in normal
37 * operation all of our available space will be allocated. Thus, we need an
38 * efficient way of deleting things from the cache so we can write new things to
39 * it.
40 *
41 * To do this, we first divide the cache device up into buckets. A bucket is the
42 * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
43 * works efficiently.
44 *
45 * Each bucket has a 16 bit priority, and an 8 bit generation associated with
46 * it. The gens and priorities for all the buckets are stored contiguously and
47 * packed on disk (in a linked list of buckets - aside from the superblock, all
48 * of bcache's metadata is stored in buckets).
49 *
50 * The priority is used to implement an LRU. We reset a bucket's priority when
51 * we allocate it or on a cache hit, and every so often we decrement the priority
52 * of each bucket. It could be used to implement something more sophisticated,
53 * if anyone ever gets around to it.
54 *
55 * The generation is used for invalidating buckets. Each pointer also has an 8
56 * bit generation embedded in it; for a pointer to be considered valid, its gen
57 * must match the gen of the bucket it points into. Thus, to reuse a bucket all
58 * we have to do is increment its gen (and write its new gen to disk; we batch
59 * this up).
60 *
61 * Bcache is entirely COW - we never write twice to a bucket, even buckets that
62 * contain metadata (including btree nodes).
63 *
64 * THE BTREE:
65 *
66 * Bcache is in large part designed around the btree.
67 *
68 * At a high level, the btree is just an index of key -> ptr tuples.
69 *
70 * Keys represent extents, and thus have a size field. Keys also have a variable
71 * number of pointers attached to them (potentially zero, which is handy for
72 * invalidating the cache).
73 *
74 * The key itself is an inode:offset pair. The inode number corresponds to a
75 * backing device or a flash only volume. The offset is the ending offset of the
76 * extent within the inode - not the starting offset; this makes lookups
77 * slightly more convenient.
78 *
79 * Pointers contain the cache device id, the offset on that device, and an 8 bit
80 * generation number. More on the gen later.
81 *
82 * Index lookups are not fully abstracted - cache lookups in particular are
83 * still somewhat mixed in with the btree code, but things are headed in that
84 * direction.
85 *
86 * Updates are fairly well abstracted, though. There are two different ways of
87 * updating the btree; insert and replace.
88 *
89 * BTREE_INSERT will just take a list of keys and insert them into the btree -
90 * overwriting (possibly only partially) any extents they overlap with. This is
91 * used to update the index after a write.
92 *
93 * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
94 * overwriting a key that matches another given key. This is used for inserting
95 * data into the cache after a cache miss, and for background writeback, and for
96 * the moving garbage collector.
97 *
98 * There is no "delete" operation; deleting things from the index is
99 * accomplished either by invalidating pointers (by incrementing a bucket's
100 * gen) or by inserting a key with 0 pointers - which will overwrite anything
101 * previously present at that location in the index.
102 *
103 * This means that there are always stale/invalid keys in the btree. They're
104 * filtered out by the code that iterates through a btree node, and removed when
105 * a btree node is rewritten.
106 *
107 * BTREE NODES:
108 *
109 * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
110 * free smaller than a bucket - so, that's how big our btree nodes are.
111 *
112 * (If buckets are really big we'll only use part of the bucket for a btree node
113 * - no less than 1/4th - but a bucket still contains no more than a single
114 * btree node. I'd actually like to change this, but for now we rely on the
115 * bucket's gen for deleting btree nodes when we rewrite/split a node.)
116 *
117 * Anyway, btree nodes are big - big enough to be inefficient with a textbook
118 * btree implementation.
119 *
120 * The way this is solved is that btree nodes are internally log structured; we
121 * can append new keys to an existing btree node without rewriting it. This
122 * means each set of keys we write is sorted, but the node is not.
123 *
124 * We maintain this log structure in memory - keeping 1Mb of keys sorted would
125 * be expensive, and we have to distinguish between the keys we have written and
126 * the keys we haven't. So to do a lookup in a btree node, we have to search
127 * each sorted set. But we do merge written sets together lazily, so the cost of
128 * these extra searches is quite low (normally most of the keys in a btree node
129 * will be in one big set, and then there'll be one or two sets that are much
130 * smaller).
131 *
132 * This log structure makes bcache's btree more of a hybrid between a
133 * conventional btree and a compacting data structure, with some of the
134 * advantages of both.
135 *
136 * GARBAGE COLLECTION:
137 *
138 * We can't just invalidate any bucket - it might contain dirty data or
139 * metadata. If it once contained dirty data, other writes might overwrite it
140 * later, leaving no valid pointers into that bucket in the index.
141 *
142 * Thus, the primary purpose of garbage collection is to find buckets to reuse.
143 * It also counts how much valid data each bucket currently contains, so that
144 * allocation can reuse buckets sooner when they've been mostly overwritten.
145 *
146 * It also does some things that are really internal to the btree
147 * implementation. If a btree node contains pointers that are stale by more than
148 * some threshold, it rewrites the btree node to avoid the bucket's generation
149 * wrapping around. It also merges adjacent btree nodes if they're empty enough.
150 *
151 * THE JOURNAL:
152 *
153 * Bcache's journal is not necessary for consistency; we always strictly
154 * order metadata writes so that the btree and everything else is consistent on
155 * disk in the event of an unclean shutdown, and in fact bcache had writeback
156 * caching (with recovery from unclean shutdown) before journalling was
157 * implemented.
158 *
159 * Rather, the journal is purely a performance optimization; we can't complete a
160 * write until we've updated the index on disk, otherwise the cache would be
161 * inconsistent in the event of an unclean shutdown. This means that without the
162 * journal, on random write workloads we constantly have to update all the leaf
163 * nodes in the btree, and those writes will be mostly empty (appending at most
164 * a few keys each) - highly inefficient in terms of amount of metadata writes,
165 * and it puts more strain on the various btree resorting/compacting code.
166 *
167 * The journal is just a log of keys we've inserted; on startup we just reinsert
168 * all the keys in the open journal entries. That means that when we're updating
169 * a node in the btree, we can wait until a 4k block of keys fills up before
170 * writing them out.
171 *
172 * For simplicity, we only journal updates to leaf nodes; updates to parent
173 * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
174 * the complexity to deal with journalling them (in particular, journal replay)
175 * - updates to non leaf nodes just happen synchronously (see btree_split()).
176 */
177
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179
180#include <linux/bio.h>
181#include <linux/blktrace_api.h>
182#include <linux/kobject.h>
183#include <linux/list.h>
184#include <linux/mutex.h>
185#include <linux/rbtree.h>
186#include <linux/rwsem.h>
187#include <linux/types.h>
188#include <linux/workqueue.h>
189
190#include "util.h"
191#include "closure.h"
192
193struct bucket {
194 atomic_t pin;
195 uint16_t prio;
196 uint8_t gen;
197 uint8_t disk_gen;
198 uint8_t last_gc; /* Most out of date gen in the btree */
199 uint8_t gc_gen;
200 uint16_t gc_mark;
201};
202
203/*
204 * I'd use bitfields for these, but I don't trust the compiler not to screw me
205 * as multiple threads touch struct bucket without locking
206 */
207
208BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
209#define GC_MARK_RECLAIMABLE 0
210#define GC_MARK_DIRTY 1
211#define GC_MARK_METADATA 2
212BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
213
214struct bkey {
215 uint64_t high;
216 uint64_t low;
217 uint64_t ptr[];
218};
219
220/* Enough for a key with 6 pointers */
221#define BKEY_PAD 8
222
223#define BKEY_PADDED(key) \
224 union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
225
226/* Version 1: Backing device
227 * Version 2: Seed pointer into btree node checksum
228 * Version 3: New UUID format
229 */
230#define BCACHE_SB_VERSION 3
231
232#define SB_SECTOR 8
233#define SB_SIZE 4096
234#define SB_LABEL_SIZE 32
235#define SB_JOURNAL_BUCKETS 256U
236/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
237#define MAX_CACHES_PER_SET 8
238
239#define BDEV_DATA_START 16 /* sectors */
240
241struct cache_sb {
242 uint64_t csum;
243 uint64_t offset; /* sector where this sb was written */
244 uint64_t version;
245#define CACHE_BACKING_DEV 1
246
247 uint8_t magic[16];
248
249 uint8_t uuid[16];
250 union {
251 uint8_t set_uuid[16];
252 uint64_t set_magic;
253 };
254 uint8_t label[SB_LABEL_SIZE];
255
256 uint64_t flags;
257 uint64_t seq;
258 uint64_t pad[8];
259
260 uint64_t nbuckets; /* device size */
261 uint16_t block_size; /* sectors */
262 uint16_t bucket_size; /* sectors */
263
264 uint16_t nr_in_set;
265 uint16_t nr_this_dev;
266
267 uint32_t last_mount; /* time_t */
268
269 uint16_t first_bucket;
270 union {
271 uint16_t njournal_buckets;
272 uint16_t keys;
273 };
274 uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
275};
276
277BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
278BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
279BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
280#define CACHE_REPLACEMENT_LRU 0U
281#define CACHE_REPLACEMENT_FIFO 1U
282#define CACHE_REPLACEMENT_RANDOM 2U
283
284BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
285#define CACHE_MODE_WRITETHROUGH 0U
286#define CACHE_MODE_WRITEBACK 1U
287#define CACHE_MODE_WRITEAROUND 2U
288#define CACHE_MODE_NONE 3U
289BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
290#define BDEV_STATE_NONE 0U
291#define BDEV_STATE_CLEAN 1U
292#define BDEV_STATE_DIRTY 2U
293#define BDEV_STATE_STALE 3U
294
295/* Version 1: Seed pointer into btree node checksum
296 */
297#define BCACHE_BSET_VERSION 1
298
299/*
300 * This is the on disk format for btree nodes - a btree node on disk is a list
301 * of these; within each set the keys are sorted
302 */
303struct bset {
304 uint64_t csum;
305 uint64_t magic;
306 uint64_t seq;
307 uint32_t version;
308 uint32_t keys;
309
310 union {
311 struct bkey start[0];
312 uint64_t d[0];
313 };
314};
315
316/*
317 * On disk format for priorities and gens - see super.c near prio_write() for
318 * more.
319 */
320struct prio_set {
321 uint64_t csum;
322 uint64_t magic;
323 uint64_t seq;
324 uint32_t version;
325 uint32_t pad;
326
327 uint64_t next_bucket;
328
329 struct bucket_disk {
330 uint16_t prio;
331 uint8_t gen;
332 } __attribute((packed)) data[];
333};
334
335struct uuid_entry {
336 union {
337 struct {
338 uint8_t uuid[16];
339 uint8_t label[32];
340 uint32_t first_reg;
341 uint32_t last_reg;
342 uint32_t invalidated;
343
344 uint32_t flags;
345 /* Size of flash only volumes */
346 uint64_t sectors;
347 };
348
349 uint8_t pad[128];
350 };
351};
352
353BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
354
355#include "journal.h"
356#include "stats.h"
357struct search;
358struct btree;
359struct keybuf;
360
361struct keybuf_key {
362 struct rb_node node;
363 BKEY_PADDED(key);
364 void *private;
365};
366
367typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
368
369struct keybuf {
370 keybuf_pred_fn *key_predicate;
371
372 struct bkey last_scanned;
373 spinlock_t lock;
374
375 /*
376 * Beginning and end of range in rb tree - so that we can skip taking
377 * lock and checking the rb tree when we need to check for overlapping
378 * keys.
379 */
380 struct bkey start;
381 struct bkey end;
382
383 struct rb_root keys;
384
385#define KEYBUF_NR 100
386 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
387};
388
389struct bio_split_pool {
390 struct bio_set *bio_split;
391 mempool_t *bio_split_hook;
392};
393
394struct bio_split_hook {
395 struct closure cl;
396 struct bio_split_pool *p;
397 struct bio *bio;
398 bio_end_io_t *bi_end_io;
399 void *bi_private;
400};
401
402struct bcache_device {
403 struct closure cl;
404
405 struct kobject kobj;
406
407 struct cache_set *c;
408 unsigned id;
409#define BCACHEDEVNAME_SIZE 12
410 char name[BCACHEDEVNAME_SIZE];
411
412 struct gendisk *disk;
413
414 /* If nonzero, we're closing */
415 atomic_t closing;
416
417 /* If nonzero, we're detaching/unregistering from cache set */
418 atomic_t detaching;
419
420 atomic_long_t sectors_dirty;
421 unsigned long sectors_dirty_gc;
422 unsigned long sectors_dirty_last;
423 long sectors_dirty_derivative;
424
425 mempool_t *unaligned_bvec;
426 struct bio_set *bio_split;
427
428 unsigned data_csum:1;
429
430 int (*cache_miss)(struct btree *, struct search *,
431 struct bio *, unsigned);
432 int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
433
434 struct bio_split_pool bio_split_hook;
435};
436
437struct io {
438 /* Used to track sequential IO so it can be skipped */
439 struct hlist_node hash;
440 struct list_head lru;
441
442 unsigned long jiffies;
443 unsigned sequential;
444 sector_t last;
445};
446
447struct cached_dev {
448 struct list_head list;
449 struct bcache_device disk;
450 struct block_device *bdev;
451
452 struct cache_sb sb;
453 struct bio sb_bio;
454 struct bio_vec sb_bv[1];
455 struct closure_with_waitlist sb_write;
456
457 /* Refcount on the cache set. Always nonzero when we're caching. */
458 atomic_t count;
459 struct work_struct detach;
460
461 /*
462 * Device might not be running if it's dirty and the cache set hasn't
463 * showed up yet.
464 */
465 atomic_t running;
466
467 /*
468 * Writes take a shared lock from start to finish; scanning for dirty
469 * data to refill the rb tree requires an exclusive lock.
470 */
471 struct rw_semaphore writeback_lock;
472
473 /*
474 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
475 * data in the cache. Protected by writeback_lock; must have a
476 * shared lock to set and an exclusive lock to clear.
477 */
478 atomic_t has_dirty;
479
480 struct ratelimit writeback_rate;
481 struct delayed_work writeback_rate_update;
482
483 /*
484 * Internal to the writeback code, so read_dirty() can keep track of
485 * where it's at.
486 */
487 sector_t last_read;
488
489 /* Number of writeback bios in flight */
490 atomic_t in_flight;
491 struct closure_with_timer writeback;
492 struct closure_waitlist writeback_wait;
493
494 struct keybuf writeback_keys;
495
496 /* For tracking sequential IO */
497#define RECENT_IO_BITS 7
498#define RECENT_IO (1 << RECENT_IO_BITS)
499 struct io io[RECENT_IO];
500 struct hlist_head io_hash[RECENT_IO + 1];
501 struct list_head io_lru;
502 spinlock_t io_lock;
503
504 struct cache_accounting accounting;
505
506 /* The rest of this all shows up in sysfs */
507 unsigned sequential_cutoff;
508 unsigned readahead;
509
510 unsigned sequential_merge:1;
511 unsigned verify:1;
512
513 unsigned writeback_metadata:1;
514 unsigned writeback_running:1;
515 unsigned char writeback_percent;
516 unsigned writeback_delay;
517
518 int writeback_rate_change;
519 int64_t writeback_rate_derivative;
520 uint64_t writeback_rate_target;
521
522 unsigned writeback_rate_update_seconds;
523 unsigned writeback_rate_d_term;
524 unsigned writeback_rate_p_term_inverse;
525 unsigned writeback_rate_d_smooth;
526};
527
528enum alloc_watermarks {
529 WATERMARK_PRIO,
530 WATERMARK_METADATA,
531 WATERMARK_MOVINGGC,
532 WATERMARK_NONE,
533 WATERMARK_MAX
534};
535
536struct cache {
537 struct cache_set *set;
538 struct cache_sb sb;
539 struct bio sb_bio;
540 struct bio_vec sb_bv[1];
541
542 struct kobject kobj;
543 struct block_device *bdev;
544
545 unsigned watermark[WATERMARK_MAX];
546
547 struct closure alloc;
548 struct workqueue_struct *alloc_workqueue;
549
550 struct closure prio;
551 struct prio_set *disk_buckets;
552
553 /*
554 * When allocating new buckets, prio_write() gets first dibs - since we
555 * may not be able to allocate at all without writing priorities and gens.
556 * prio_buckets[] contains the last buckets we wrote priorities to (so
557 * gc can mark them as metadata), prio_next[] contains the buckets
558 * allocated for the next prio write.
559 */
560 uint64_t *prio_buckets;
561 uint64_t *prio_last_buckets;
562
563 /*
564 * free: Buckets that are ready to be used
565 *
566 * free_inc: Incoming buckets - these are buckets that currently have
567 * cached data in them, and we can't reuse them until after we write
568 * their new gen to disk. After prio_write() finishes writing the new
569 * gens/prios, they'll be moved to the free list (and possibly discarded
570 * in the process)
571 *
572 * unused: GC found nothing pointing into these buckets (possibly
573 * because all the data they contained was overwritten), so we only
574 * need to discard them before they can be moved to the free list.
575 */
576 DECLARE_FIFO(long, free);
577 DECLARE_FIFO(long, free_inc);
578 DECLARE_FIFO(long, unused);
579
580 size_t fifo_last_bucket;
581
582 /* Allocation stuff: */
583 struct bucket *buckets;
584
585 DECLARE_HEAP(struct bucket *, heap);
586
587 /*
588 * max(gen - disk_gen) for all buckets. When it gets too big we have to
589 * call prio_write() to keep gens from wrapping.
590 */
591 uint8_t need_save_prio;
592 unsigned gc_move_threshold;
593
594 /*
595 * If nonzero, we know we aren't going to find any buckets to invalidate
596 * until a gc finishes - otherwise we could pointlessly burn a ton of
597 * cpu
598 */
599 unsigned invalidate_needs_gc:1;
600
601 bool discard; /* Get rid of? */
602
603 /*
604 * We preallocate structs for issuing discards to buckets, and keep them
605 * on this list when they're not in use; do_discard() issues discards
606 * whenever there's work to do and is called by free_some_buckets() and
607 * when a discard finishes.
608 */
609 atomic_t discards_in_flight;
610 struct list_head discards;
611
612 struct journal_device journal;
613
614 /* The rest of this all shows up in sysfs */
615#define IO_ERROR_SHIFT 20
616 atomic_t io_errors;
617 atomic_t io_count;
618
619 atomic_long_t meta_sectors_written;
620 atomic_long_t btree_sectors_written;
621 atomic_long_t sectors_written;
622
623 struct bio_split_pool bio_split_hook;
624};
625
626struct gc_stat {
627 size_t nodes;
628 size_t key_bytes;
629
630 size_t nkeys;
631 uint64_t data; /* sectors */
632 uint64_t dirty; /* sectors */
633 unsigned in_use; /* percent */
634};
635
636/*
637 * Flag bits, for how the cache set is shutting down, and what phase it's at:
638 *
639 * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
640 * all the backing devices first (their cached data gets invalidated, and they
641 * won't automatically reattach).
642 *
643 * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
644 * we'll continue to run normally for a while with CACHE_SET_STOPPING set (i.e.
645 * flushing dirty data).
646 *
647 * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down the
648 * allocation thread.
649 */
650#define CACHE_SET_UNREGISTERING 0
651#define CACHE_SET_STOPPING 1
652#define CACHE_SET_STOPPING_2 2
653
654struct cache_set {
655 struct closure cl;
656
657 struct list_head list;
658 struct kobject kobj;
659 struct kobject internal;
660 struct dentry *debug;
661 struct cache_accounting accounting;
662
663 unsigned long flags;
664
665 struct cache_sb sb;
666
667 struct cache *cache[MAX_CACHES_PER_SET];
668 struct cache *cache_by_alloc[MAX_CACHES_PER_SET];
669 int caches_loaded;
670
671 struct bcache_device **devices;
672 struct list_head cached_devs;
673 uint64_t cached_dev_sectors;
674 struct closure caching;
675
676 struct closure_with_waitlist sb_write;
677
678 mempool_t *search;
679 mempool_t *bio_meta;
680 struct bio_set *bio_split;
681
682 /* For the btree cache */
683 struct shrinker shrink;
684
685 /* For the allocator itself */
686 wait_queue_head_t alloc_wait;
687
688 /* For the btree cache and anything allocation related */
689 struct mutex bucket_lock;
690
691 /* log2(bucket_size), in sectors */
692 unsigned short bucket_bits;
693
694 /* log2(block_size), in sectors */
695 unsigned short block_bits;
696
697 /*
698 * Default number of pages for a new btree node - may be less than a
699 * full bucket
700 */
701 unsigned btree_pages;
702
703 /*
704 * Lists of struct btrees; lru is the list for structs that have memory
705 * allocated for an actual btree node; freed is for structs that do not.
706 *
707 * We never free a struct btree, except on shutdown - we just put it on
708 * the btree_cache_freed list and reuse it later. This simplifies the
709 * code, and it doesn't cost us much memory as the memory usage is
710 * dominated by buffers that hold the actual btree node data and those
711 * can be freed - and the number of struct btrees allocated is
712 * effectively bounded.
713 *
714 * btree_cache_freeable effectively is a small cache - we use it because
715 * high order page allocations can be rather expensive, and it's quite
716 * common to delete and allocate btree nodes in quick succession. It
717 * should never grow past ~2-3 nodes in practice.
718 */
719 struct list_head btree_cache;
720 struct list_head btree_cache_freeable;
721 struct list_head btree_cache_freed;
722
723 /* Number of elements in btree_cache + btree_cache_freeable lists */
724 unsigned bucket_cache_used;
725
726 /*
727 * If we need to allocate memory for a new btree node and that
728 * allocation fails, we can cannibalize another node in the btree cache
729 * to satisfy the allocation. However, only one thread can be doing this
730 * at a time, for obvious reasons - try_harder and try_wait are
731 * basically a lock for this that we can wait on asynchronously. The
732 * btree_root() macro releases the lock when it returns.
733 */
734 struct closure *try_harder;
735 struct closure_waitlist try_wait;
736 uint64_t try_harder_start;
737
738 /*
739 * When we free a btree node, we increment the gen of the bucket the
740 * node is in - but we can't rewrite the prios and gens until we've
741 * finished whatever it is we were doing, otherwise after a crash the
742 * btree node would be freed but for say a split, we might not have the
743 * pointers to the new nodes inserted into the btree yet.
744 *
745 * This is a refcount that blocks prio_write() until the new keys are
746 * written.
747 */
748 atomic_t prio_blocked;
749 struct closure_waitlist bucket_wait;
750
751 /*
752 * For any bio we don't skip we subtract the number of sectors from
753 * rescale; when it hits 0 we rescale all the bucket priorities.
754 */
755 atomic_t rescale;
756 /*
757 * When we invalidate buckets, we use both the priority and the amount
758 * of good data to determine which buckets to reuse first - to weight
759 * those together consistently we keep track of the smallest nonzero
760 * priority of any bucket.
761 */
762 uint16_t min_prio;
763
764 /*
765 * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
766 * to keep gens from wrapping around.
767 */
768 uint8_t need_gc;
769 struct gc_stat gc_stats;
770 size_t nbuckets;
771
772 struct closure_with_waitlist gc;
773 /* Where in the btree gc currently is */
774 struct bkey gc_done;
775
776 /*
777 * The allocation code needs gc_mark in struct bucket to be correct, but
778 * it's not while a gc is in progress. Protected by bucket_lock.
779 */
780 int gc_mark_valid;
781
782 /* Counts how many sectors bio_insert has added to the cache */
783 atomic_t sectors_to_gc;
784
785 struct closure moving_gc;
786 struct closure_waitlist moving_gc_wait;
787 struct keybuf moving_gc_keys;
788 /* Number of moving GC bios in flight */
789 atomic_t in_flight;
790
791 struct btree *root;
792
793#ifdef CONFIG_BCACHE_DEBUG
794 struct btree *verify_data;
795 struct mutex verify_lock;
796#endif
797
798 unsigned nr_uuids;
799 struct uuid_entry *uuids;
800 BKEY_PADDED(uuid_bucket);
801 struct closure_with_waitlist uuid_write;
802
803 /*
804 * A btree node on disk could have too many bsets for an iterator to fit
805 * on the stack - this is a single element mempool for btree_read_work()
806 */
807 struct mutex fill_lock;
808 struct btree_iter *fill_iter;
809
810 /*
811 * btree_sort() is a merge sort and requires temporary space - single
812 * element mempool
813 */
814 struct mutex sort_lock;
815 struct bset *sort;
816
817 /* List of buckets we're currently writing data to */
818 struct list_head data_buckets;
819 spinlock_t data_bucket_lock;
820
821 struct journal journal;
822
823#define CONGESTED_MAX 1024
824 unsigned congested_last_us;
825 atomic_t congested;
826
827 /* The rest of this all shows up in sysfs */
828 unsigned congested_read_threshold_us;
829 unsigned congested_write_threshold_us;
830
831 spinlock_t sort_time_lock;
832 struct time_stats sort_time;
833 struct time_stats btree_gc_time;
834 struct time_stats btree_split_time;
835 spinlock_t btree_read_time_lock;
836 struct time_stats btree_read_time;
837 struct time_stats try_harder_time;
838
839 atomic_long_t cache_read_races;
840 atomic_long_t writeback_keys_done;
841 atomic_long_t writeback_keys_failed;
842 unsigned error_limit;
843 unsigned error_decay;
844 unsigned short journal_delay_ms;
845 unsigned verify:1;
846 unsigned key_merging_disabled:1;
847 unsigned gc_always_rewrite:1;
848 unsigned shrinker_disabled:1;
849 unsigned copy_gc_enabled:1;
850
851#define BUCKET_HASH_BITS 12
852 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
853};
854
855static inline bool key_merging_disabled(struct cache_set *c)
856{
857#ifdef CONFIG_BCACHE_DEBUG
858 return c->key_merging_disabled;
859#else
860 return 0;
861#endif
862}
863
864struct bbio {
865 unsigned submit_time_us;
866 union {
867 struct bkey key;
868 uint64_t _pad[3];
869 /*
870 * We only need pad = 3 here because we only ever carry around a
871 * single pointer - i.e. the pointer we're doing io to/from.
872 */
873 };
874 struct bio bio;
875};
876
877static inline unsigned local_clock_us(void)
878{
879 return local_clock() >> 10;
880}
881
882#define MAX_BSETS 4U
883
884#define BTREE_PRIO USHRT_MAX
885#define INITIAL_PRIO 32768
886
887#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
888#define btree_blocks(b) \
889 ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
890
891#define btree_default_blocks(c) \
892 ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
893
894#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS)
895#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
896#define block_bytes(c) ((c)->sb.block_size << 9)
897
898#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
899#define set_bytes(i) __set_bytes(i, i->keys)
900
901#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
902#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
903
904#define node(i, j) ((struct bkey *) ((i)->d + (j)))
905#define end(i) node(i, (i)->keys)
906
907#define index(i, b) \
908 ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
909 block_bytes(b->c)))
910
911#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
912
913#define prios_per_bucket(c) \
914 ((bucket_bytes(c) - sizeof(struct prio_set)) / \
915 sizeof(struct bucket_disk))
916#define prio_buckets(c) \
917 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
918
919#define JSET_MAGIC 0x245235c1a3625032ULL
920#define PSET_MAGIC 0x6750e15f87337f91ULL
921#define BSET_MAGIC 0x90135c78b99e07f5ULL
922
923#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC)
924#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC)
925#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC)
926
927/* Bkey fields: all units are in sectors */
928
929#define KEY_FIELD(name, field, offset, size) \
930 BITMASK(name, struct bkey, field, offset, size)
931
932#define PTR_FIELD(name, offset, size) \
933 static inline uint64_t name(const struct bkey *k, unsigned i) \
934 { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \
935 \
936 static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
937 { \
938 k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \
939 k->ptr[i] |= v << offset; \
940 }
941
942KEY_FIELD(KEY_PTRS, high, 60, 3)
943KEY_FIELD(HEADER_SIZE, high, 58, 2)
944KEY_FIELD(KEY_CSUM, high, 56, 2)
945KEY_FIELD(KEY_PINNED, high, 55, 1)
946KEY_FIELD(KEY_DIRTY, high, 36, 1)
947
948KEY_FIELD(KEY_SIZE, high, 20, 16)
949KEY_FIELD(KEY_INODE, high, 0, 20)
950
951/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
952
953static inline uint64_t KEY_OFFSET(const struct bkey *k)
954{
955 return k->low;
956}
957
958static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
959{
960 k->low = v;
961}
962
963PTR_FIELD(PTR_DEV, 51, 12)
964PTR_FIELD(PTR_OFFSET, 8, 43)
965PTR_FIELD(PTR_GEN, 0, 8)
966
967#define PTR_CHECK_DEV ((1 << 12) - 1)
968
969#define PTR(gen, offset, dev) \
970 ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
971
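/*
 * Illustrative round trip, not used by any real code: PTR() packs the device,
 * offset and gen into one 64 bit word, and the PTR_* accessors above unpack
 * the same fields. bch_ptr_example() is a hypothetical name for this sketch.
 */
static inline void bch_ptr_example(void)
{
	BKEY_PADDED(key) tmp;

	tmp.key.ptr[0] = PTR(7, 0x1000, 3);	/* gen 7, offset 0x1000, dev 3 */

	BUG_ON(PTR_GEN(&tmp.key, 0)    != 7);
	BUG_ON(PTR_OFFSET(&tmp.key, 0) != 0x1000);
	BUG_ON(PTR_DEV(&tmp.key, 0)    != 3);
}
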
972static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
973{
974 return s >> c->bucket_bits;
975}
976
977static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
978{
979 return ((sector_t) b) << c->bucket_bits;
980}
981
982static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
983{
984 return s & (c->sb.bucket_size - 1);
985}
986
987static inline struct cache *PTR_CACHE(struct cache_set *c,
988 const struct bkey *k,
989 unsigned ptr)
990{
991 return c->cache[PTR_DEV(k, ptr)];
992}
993
994static inline size_t PTR_BUCKET_NR(struct cache_set *c,
995 const struct bkey *k,
996 unsigned ptr)
997{
998 return sector_to_bucket(c, PTR_OFFSET(k, ptr));
999}
1000
1001static inline struct bucket *PTR_BUCKET(struct cache_set *c,
1002 const struct bkey *k,
1003 unsigned ptr)
1004{
1005 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
1006}
1007
1008/* Btree key macros */
1009
1010/*
1011 * The high bit being set is a relic from when we used it to do binary
1012 * searches - it told you where a key started. It's not used anymore,
1013 * and can probably be safely dropped.
1014 */
1015#define KEY(dev, sector, len) (struct bkey) \
1016{ \
1017 .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \
1018 .low = (sector) \
1019}
1020
1021static inline void bkey_init(struct bkey *k)
1022{
1023 *k = KEY(0, 0, 0);
1024}
1025
1026#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
1027#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
1028#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
1029#define ZERO_KEY KEY(0, 0, 0)
1030
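/*
 * Worked example, illustrative only: keys index extents by their *ending*
 * offset, so a 128 sector extent on inode 2 that starts at sector 1024 is
 * KEY(2, 1152, 128); KEY_START() recovers the start (1152 - 128 = 1024).
 * bch_bkey_example() is a hypothetical name and is not used anywhere.
 */
static inline void bch_bkey_example(void)
{
	BKEY_PADDED(key) tmp;

	tmp.key = KEY(2, 1152, 128);

	BUG_ON(KEY_INODE(&tmp.key)  != 2);
	BUG_ON(KEY_OFFSET(&tmp.key) != 1152);
	BUG_ON(KEY_SIZE(&tmp.key)   != 128);
	BUG_ON(KEY_START(&tmp.key)  != 1024);
}
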
1031/*
1032 * This is used for various on disk data structures - cache_sb, prio_set, bset,
1033 * jset: The checksum is _always_ the first 8 bytes of these structs
1034 */
1035#define csum_set(i) \
1036 crc64(((void *) (i)) + sizeof(uint64_t), \
1037 ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
1038
1039/* Error handling macros */
1040
1041#define btree_bug(b, ...) \
1042do { \
1043 if (bch_cache_set_error((b)->c, __VA_ARGS__)) \
1044 dump_stack(); \
1045} while (0)
1046
1047#define cache_bug(c, ...) \
1048do { \
1049 if (bch_cache_set_error(c, __VA_ARGS__)) \
1050 dump_stack(); \
1051} while (0)
1052
1053#define btree_bug_on(cond, b, ...) \
1054do { \
1055 if (cond) \
1056 btree_bug(b, __VA_ARGS__); \
1057} while (0)
1058
1059#define cache_bug_on(cond, c, ...) \
1060do { \
1061 if (cond) \
1062 cache_bug(c, __VA_ARGS__); \
1063} while (0)
1064
1065#define cache_set_err_on(cond, c, ...) \
1066do { \
1067 if (cond) \
1068 bch_cache_set_error(c, __VA_ARGS__); \
1069} while (0)
1070
1071/* Looping macros */
1072
1073#define for_each_cache(ca, cs, iter) \
1074 for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)
1075
1076#define for_each_bucket(b, ca) \
1077 for (b = (ca)->buckets + (ca)->sb.first_bucket; \
1078 b < (ca)->buckets + (ca)->sb.nbuckets; b++)
1079
1080static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1081{
1082 unsigned i;
1083
1084 for (i = 0; i < KEY_PTRS(k); i++)
1085 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1086}
1087
1088/* Blktrace macros */
1089
1090#define blktrace_msg(c, fmt, ...) \
1091do { \
1092 struct request_queue *q = bdev_get_queue(c->bdev); \
1093 if (q) \
1094 blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
1095} while (0)
1096
1097#define blktrace_msg_all(s, fmt, ...) \
1098do { \
1099 struct cache *_c; \
1100 unsigned i; \
1101 for_each_cache(_c, (s), i) \
1102 blktrace_msg(_c, fmt, ##__VA_ARGS__); \
1103} while (0)
1104
1105static inline void cached_dev_put(struct cached_dev *dc)
1106{
1107 if (atomic_dec_and_test(&dc->count))
1108 schedule_work(&dc->detach);
1109}
1110
1111static inline bool cached_dev_get(struct cached_dev *dc)
1112{
1113 if (!atomic_inc_not_zero(&dc->count))
1114 return false;
1115
1116 /* Paired with the mb in cached_dev_attach */
1117 smp_mb__after_atomic_inc();
1118 return true;
1119}
1120
1121/*
1122 * bucket_gc_gen() returns the difference between the bucket's current gen and
1123 * the oldest gen of any pointer into that bucket in the btree (last_gc).
1124 *
1125 * bucket_disk_gen() returns the difference between the current gen and the gen
1126 * on disk; they're both used to make sure gens don't wrap around.
1127 */
1128
1129static inline uint8_t bucket_gc_gen(struct bucket *b)
1130{
1131 return b->gen - b->last_gc;
1132}
1133
1134static inline uint8_t bucket_disk_gen(struct bucket *b)
1135{
1136 return b->gen - b->disk_gen;
1137}
1138
1139#define BUCKET_GC_GEN_MAX 96U
1140#define BUCKET_DISK_GEN_MAX 64U
1141
1142#define kobj_attribute_write(n, fn) \
1143 static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
1144
1145#define kobj_attribute_rw(n, show, store) \
1146 static struct kobj_attribute ksysfs_##n = \
1147 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1148
1149/* Forward declarations */
1150
1151void bch_writeback_queue(struct cached_dev *);
1152void bch_writeback_add(struct cached_dev *, unsigned);
1153
1154void bch_count_io_errors(struct cache *, int, const char *);
1155void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
1156 int, const char *);
1157void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
1158void bch_bbio_free(struct bio *, struct cache_set *);
1159struct bio *bch_bbio_alloc(struct cache_set *);
1160
1161struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *);
1162void bch_generic_make_request(struct bio *, struct bio_split_pool *);
1163void __bch_submit_bbio(struct bio *, struct cache_set *);
1164void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
1165
1166uint8_t bch_inc_gen(struct cache *, struct bucket *);
1167void bch_rescale_priorities(struct cache_set *, int);
1168bool bch_bucket_add_unused(struct cache *, struct bucket *);
1169void bch_allocator_thread(struct closure *);
1170
1171long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
1172void bch_bucket_free(struct cache_set *, struct bkey *);
1173
1174int __bch_bucket_alloc_set(struct cache_set *, unsigned,
1175 struct bkey *, int, struct closure *);
1176int bch_bucket_alloc_set(struct cache_set *, unsigned,
1177 struct bkey *, int, struct closure *);
1178
1179__printf(2, 3)
1180bool bch_cache_set_error(struct cache_set *, const char *, ...);
1181
1182void bch_prio_write(struct cache *);
1183void bch_write_bdev_super(struct cached_dev *, struct closure *);
1184
1185extern struct workqueue_struct *bcache_wq, *bch_gc_wq;
1186extern const char * const bch_cache_modes[];
1187extern struct mutex bch_register_lock;
1188extern struct list_head bch_cache_sets;
1189
1190extern struct kobj_type bch_cached_dev_ktype;
1191extern struct kobj_type bch_flash_dev_ktype;
1192extern struct kobj_type bch_cache_set_ktype;
1193extern struct kobj_type bch_cache_set_internal_ktype;
1194extern struct kobj_type bch_cache_ktype;
1195
1196void bch_cached_dev_release(struct kobject *);
1197void bch_flash_dev_release(struct kobject *);
1198void bch_cache_set_release(struct kobject *);
1199void bch_cache_release(struct kobject *);
1200
1201int bch_uuid_write(struct cache_set *);
1202void bcache_write_super(struct cache_set *);
1203
1204int bch_flash_dev_create(struct cache_set *c, uint64_t size);
1205
1206int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
1207void bch_cached_dev_detach(struct cached_dev *);
1208void bch_cached_dev_run(struct cached_dev *);
1209void bcache_device_stop(struct bcache_device *);
1210
1211void bch_cache_set_unregister(struct cache_set *);
1212void bch_cache_set_stop(struct cache_set *);
1213
1214struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1215void bch_btree_cache_free(struct cache_set *);
1216int bch_btree_cache_alloc(struct cache_set *);
1217void bch_writeback_init_cached_dev(struct cached_dev *);
1218void bch_moving_init_cache_set(struct cache_set *);
1219
1220void bch_cache_allocator_exit(struct cache *ca);
1221int bch_cache_allocator_init(struct cache *ca);
1222
1223void bch_debug_exit(void);
1224int bch_debug_init(struct kobject *);
1225void bch_writeback_exit(void);
1226int bch_writeback_init(void);
1227void bch_request_exit(void);
1228int bch_request_init(void);
1229void bch_btree_exit(void);
1230int bch_btree_init(void);
1231
1232#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
new file mode 100644
index 000000000000..bb0f7ae14b3c
--- /dev/null
+++ b/drivers/md/bcache/bset.c
@@ -0,0 +1,1190 @@
1/*
2 * Code for working with individual keys, and sorted sets of keys within a
3 * btree node
4 *
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "btree.h"
10#include "debug.h"
11
12#include <linux/random.h>
13
14/* Keylists */
15
16void bch_keylist_copy(struct keylist *dest, struct keylist *src)
17{
18 *dest = *src;
19
20 if (src->list == src->d) {
21 size_t n = (uint64_t *) src->top - src->d;
22 dest->top = (struct bkey *) &dest->d[n];
23 dest->list = dest->d;
24 }
25}
26
27int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
28{
29 unsigned oldsize = (uint64_t *) l->top - l->list;
30 unsigned newsize = oldsize + 2 + nptrs;
31 uint64_t *new;
32
33 /* The journalling code doesn't handle the case where the set of keys to insert
34 * is bigger than an empty write: If we just return -ENOMEM here,
35 * bio_insert() and bio_invalidate() will insert the keys created so far
36 * and finish the rest when the keylist is empty.
37 */
38 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
39 return -ENOMEM;
40
41 newsize = roundup_pow_of_two(newsize);
42
43 if (newsize <= KEYLIST_INLINE ||
44 roundup_pow_of_two(oldsize) == newsize)
45 return 0;
46
47 new = krealloc(l->list == l->d ? NULL : l->list,
48 sizeof(uint64_t) * newsize, GFP_NOIO);
49
50 if (!new)
51 return -ENOMEM;
52
53 if (l->list == l->d)
54 memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
55
56 l->list = new;
57 l->top = (struct bkey *) (&l->list[oldsize]);
58
59 return 0;
60}
61
62struct bkey *bch_keylist_pop(struct keylist *l)
63{
64 struct bkey *k = l->bottom;
65
66 if (k == l->top)
67 return NULL;
68
69 while (bkey_next(k) != l->top)
70 k = bkey_next(k);
71
72 return l->top = k;
73}
74
75/* Pointer validation */
76
77bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
78{
79 unsigned i;
80
81 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
82 goto bad;
83
84 if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
85 goto bad;
86
87 if (!KEY_SIZE(k))
88 return true;
89
90 for (i = 0; i < KEY_PTRS(k); i++)
91 if (ptr_available(c, k, i)) {
92 struct cache *ca = PTR_CACHE(c, k, i);
93 size_t bucket = PTR_BUCKET_NR(c, k, i);
94 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
95
96 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
97 bucket < ca->sb.first_bucket ||
98 bucket >= ca->sb.nbuckets)
99 goto bad;
100 }
101
102 return false;
103bad:
104 cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
105 return true;
106}
107
108bool bch_ptr_bad(struct btree *b, const struct bkey *k)
109{
110 struct bucket *g;
111 unsigned i, stale;
112
113 if (!bkey_cmp(k, &ZERO_KEY) ||
114 !KEY_PTRS(k) ||
115 bch_ptr_invalid(b, k))
116 return true;
117
118 if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
119 return true;
120
121 for (i = 0; i < KEY_PTRS(k); i++)
122 if (ptr_available(b->c, k, i)) {
123 g = PTR_BUCKET(b->c, k, i);
124 stale = ptr_stale(b->c, k, i);
125
126 btree_bug_on(stale > 96, b,
127 "key too stale: %i, need_gc %u",
128 stale, b->c->need_gc);
129
130 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
131 b, "stale dirty pointer");
132
133 if (stale)
134 return true;
135
136#ifdef CONFIG_BCACHE_EDEBUG
137 if (!mutex_trylock(&b->c->bucket_lock))
138 continue;
139
140 if (b->level) {
141 if (KEY_DIRTY(k) ||
142 g->prio != BTREE_PRIO ||
143 (b->c->gc_mark_valid &&
144 GC_MARK(g) != GC_MARK_METADATA))
145 goto bug;
146
147 } else {
148 if (g->prio == BTREE_PRIO)
149 goto bug;
150
151 if (KEY_DIRTY(k) &&
152 b->c->gc_mark_valid &&
153 GC_MARK(g) != GC_MARK_DIRTY)
154 goto bug;
155 }
156 mutex_unlock(&b->c->bucket_lock);
157#endif
158 }
159
160 return false;
161#ifdef CONFIG_BCACHE_EDEBUG
162bug:
163 mutex_unlock(&b->c->bucket_lock);
164 btree_bug(b, "inconsistent pointer %s: bucket %li pin %i "
165 "prio %i gen %i last_gc %i mark %llu gc_gen %i", pkey(k),
166 PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
167 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
168 return true;
169#endif
170}
171
172/* Key/pointer manipulation */
173
174void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
175 unsigned i)
176{
177 BUG_ON(i > KEY_PTRS(src));
178
179 /* Only copy the header, key, and one pointer. */
180 memcpy(dest, src, 2 * sizeof(uint64_t));
181 dest->ptr[0] = src->ptr[i];
182 SET_KEY_PTRS(dest, 1);
183 /* We didn't copy the checksum so clear that bit. */
184 SET_KEY_CSUM(dest, 0);
185}
186
187bool __bch_cut_front(const struct bkey *where, struct bkey *k)
188{
189 unsigned i, len = 0;
190
191 if (bkey_cmp(where, &START_KEY(k)) <= 0)
192 return false;
193
194 if (bkey_cmp(where, k) < 0)
195 len = KEY_OFFSET(k) - KEY_OFFSET(where);
196 else
197 bkey_copy_key(k, where);
198
199 for (i = 0; i < KEY_PTRS(k); i++)
200 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
201
202 BUG_ON(len > KEY_SIZE(k));
203 SET_KEY_SIZE(k, len);
204 return true;
205}
206
207bool __bch_cut_back(const struct bkey *where, struct bkey *k)
208{
209 unsigned len = 0;
210
211 if (bkey_cmp(where, k) >= 0)
212 return false;
213
214 BUG_ON(KEY_INODE(where) != KEY_INODE(k));
215
216 if (bkey_cmp(where, &START_KEY(k)) > 0)
217 len = KEY_OFFSET(where) - KEY_START(k);
218
219 bkey_copy_key(k, where);
220
221 BUG_ON(len > KEY_SIZE(k));
222 SET_KEY_SIZE(k, len);
223 return true;
224}
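/*
 * Worked example (illustrative): let k be an extent for inode 1 with
 * KEY_OFFSET == 16 and KEY_SIZE == 8, i.e. covering sectors 8..15, and let
 * where == KEY(1, 12, 0).
 *
 * __bch_cut_front(where, k): len = 16 - 12 = 4, each PTR_OFFSET is advanced
 * by 8 - 4 = 4, and k now covers sectors 12..15.
 *
 * __bch_cut_back(where, k), applied to the original k instead: len = 12 - 8
 * = 4, k's offset becomes 12, and k now covers sectors 8..11.
 */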
225
226static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
227{
228 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
229 ~((uint64_t)1 << 63);
230}
231
232/* Tries to merge l and r: l should be lower than r
233 * Returns true if we were able to merge. If we did merge, l will be the merged
234 * key, r will be untouched.
235 */
236bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
237{
238 unsigned i;
239
240 if (key_merging_disabled(b->c))
241 return false;
242
243 if (KEY_PTRS(l) != KEY_PTRS(r) ||
244 KEY_DIRTY(l) != KEY_DIRTY(r) ||
245 bkey_cmp(l, &START_KEY(r)))
246 return false;
247
248 for (i = 0; i < KEY_PTRS(l); i++)
249 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
250 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
251 return false;
252
253 /* Keys with no pointers aren't restricted to one bucket and could
254 * overflow KEY_SIZE
255 */
256 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
257 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
258 SET_KEY_SIZE(l, USHRT_MAX);
259
260 bch_cut_front(l, r);
261 return false;
262 }
263
264 if (KEY_CSUM(l)) {
265 if (KEY_CSUM(r))
266 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
267 else
268 SET_KEY_CSUM(l, 0);
269 }
270
271 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
272 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
273
274 return true;
275}
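/*
 * Illustrative example: l == KEY(1, 8, 8) (sectors 0..7) and r == KEY(1, 16, 8)
 * (sectors 8..15). Provided both carry the same number of pointers, the same
 * dirty bit, and r's pointers continue exactly where l's leave off in the
 * same buckets, the merge turns l into KEY(1, 16, 16) covering sectors 0..15
 * and the caller simply drops r.
 */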
276
277/* Binary tree stuff for auxiliary search trees */
278
279static unsigned inorder_next(unsigned j, unsigned size)
280{
281 if (j * 2 + 1 < size) {
282 j = j * 2 + 1;
283
284 while (j * 2 < size)
285 j *= 2;
286 } else
287 j >>= ffz(j) + 1;
288
289 return j;
290}
291
292static unsigned inorder_prev(unsigned j, unsigned size)
293{
294 if (j * 2 < size) {
295 j = j * 2;
296
297 while (j * 2 + 1 < size)
298 j = j * 2 + 1;
299 } else
300 j >>= ffs(j);
301
302 return j;
303}
304
305/* I have no idea why this code works... and I'm the one who wrote it
306 *
307 * However, I do know what it does:
308 * Given a binary tree constructed in an array (i.e. how you normally implement
309 * a heap), it converts a node in the tree - referenced by array index - to the
310 * index it would have if you did an inorder traversal.
311 *
312 * Also tested for every j, for sizes up to somewhere around 6 million.
313 *
314 * The binary tree starts at array index 1, not 0
315 * extra is a function of size:
316 * extra = (size - rounddown_pow_of_two(size - 1)) << 1;
317 */
318static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
319{
320 unsigned b = fls(j);
321 unsigned shift = fls(size - 1) - b;
322
323 j ^= 1U << (b - 1);
324 j <<= 1;
325 j |= 1;
326 j <<= shift;
327
328 if (j > extra)
329 j -= (j - extra) >> 1;
330
331 return j;
332}
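/*
 * Worked example: with size == 8 (a full tree occupying array indices 1..7),
 * extra == (8 - 4) << 1 == 8 and the mapping is
 *
 *	j:           1 2 3 4 5 6 7
 *	to_inorder:  4 2 6 1 3 5 7
 *
 * With size == 6 (indices 1..5, so the last level is partially filled),
 * extra == 4 and the mapping is 1->4, 2->2, 3->5, 4->1, 5->3 - still a
 * valid inorder numbering of the 5 node tree.
 */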
333
334static unsigned to_inorder(unsigned j, struct bset_tree *t)
335{
336 return __to_inorder(j, t->size, t->extra);
337}
338
339static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
340{
341 unsigned shift;
342
343 if (j > extra)
344 j += j - extra;
345
346 shift = ffs(j);
347
348 j >>= shift;
349 j |= roundup_pow_of_two(size) >> shift;
350
351 return j;
352}
353
354static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
355{
356 return __inorder_to_tree(j, t->size, t->extra);
357}
358
359#if 0
360void inorder_test(void)
361{
362 unsigned long done = 0;
363 ktime_t start = ktime_get();
364
365 for (unsigned size = 2;
366 size < 65536000;
367 size++) {
368 unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
369 unsigned i = 1, j = rounddown_pow_of_two(size - 1);
370
371 if (!(size % 4096))
372 printk(KERN_NOTICE "loop %u, %llu per us\n", size,
373 done / ktime_us_delta(ktime_get(), start));
374
375 while (1) {
376 if (__inorder_to_tree(i, size, extra) != j)
377 panic("size %10u j %10u i %10u", size, j, i);
378
379 if (__to_inorder(j, size, extra) != i)
380 panic("size %10u j %10u i %10u", size, j, i);
381
382 if (j == rounddown_pow_of_two(size) - 1)
383 break;
384
385 BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
386
387 j = inorder_next(j, size);
388 i++;
389 }
390
391 done += size - 1;
392 }
393}
394#endif
395
396/*
397 * Cacheline/offset <-> bkey pointer arithmetic:
398 *
399 * t->tree is a binary search tree in an array; each node corresponds to a key
400 * in one cacheline in t->set (BSET_CACHELINE bytes).
401 *
402 * This means we don't have to store the full index of the key that a node in
403 * the binary tree points to; to_inorder() gives us the cacheline, and then
404 * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
405 *
406 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
407 * make this work.
408 *
409 * To construct the bfloat for an arbitrary key we need to know what the key
410 * immediately preceding it is: we have to check if the two keys differ in the
411 * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
412 * of the previous key so we can walk backwards to it from t->tree[j]'s key.
413 */
414
415static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
416 unsigned offset)
417{
418 return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
419}
420
421static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
422{
423 return ((void *) k - (void *) t->data) / BSET_CACHELINE;
424}
425
426static unsigned bkey_to_cacheline_offset(struct bkey *k)
427{
428 return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
429}
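/*
 * E.g. with BSET_CACHELINE == 128, a key starting 304 bytes into t->data is
 * in cacheline 2 at an offset of 6 uint64_ts into that cacheline, so
 * bkey_to_cacheline() returns 2 and bkey_to_cacheline_offset() returns 6
 * (bsets start on block boundaries, so t->data is always at least 128 byte
 * aligned), and cacheline_to_bkey(t, 2, 6) gets back to the same key.
 */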
430
431static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
432{
433 return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
434}
435
436static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
437{
438 return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
439}
440
441/*
442 * For the write set - the one we're currently inserting keys into - we don't
443 * maintain a full search tree, we just keep a simple lookup table in t->prev.
444 */
445static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
446{
447 return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
448}
449
450static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
451{
452#ifdef CONFIG_X86_64
453 asm("shrd %[shift],%[high],%[low]"
454 : [low] "+Rm" (low)
455 : [high] "R" (high),
456 [shift] "ci" (shift)
457 : "cc");
458#else
459 low >>= shift;
460 low |= (high << 1) << (63U - shift);
461#endif
462 return low;
463}
464
465static inline unsigned bfloat_mantissa(const struct bkey *k,
466 struct bkey_float *f)
467{
468 const uint64_t *p = &k->low - (f->exponent >> 6);
469 return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
470}
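/*
 * f->exponent is the bit position, within the 128 bit concatenation of
 * k->high and k->low, of the least significant bit that ends up in the
 * mantissa: exponent >> 6 picks the 64 bit word to start from, exponent & 63
 * the shift within it, and shrd128() pulls in the bits that spill over from
 * the more significant word.
 */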
471
472static void make_bfloat(struct bset_tree *t, unsigned j)
473{
474 struct bkey_float *f = &t->tree[j];
475 struct bkey *m = tree_to_bkey(t, j);
476 struct bkey *p = tree_to_prev_bkey(t, j);
477
478 struct bkey *l = is_power_of_2(j)
479 ? t->data->start
480 : tree_to_prev_bkey(t, j >> ffs(j));
481
482 struct bkey *r = is_power_of_2(j + 1)
483 ? node(t->data, t->data->keys - bkey_u64s(&t->end))
484 : tree_to_bkey(t, j >> (ffz(j) + 1));
485
486 BUG_ON(m < l || m > r);
487 BUG_ON(bkey_next(p) != m);
488
489 if (KEY_INODE(l) != KEY_INODE(r))
490 f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
491 else
492 f->exponent = fls64(r->low ^ l->low);
493
494 f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
495
496 /*
497 * Setting f->exponent = 127 flags this node as failed, and causes the
498 * lookup code to fall back to comparing against the original key.
499 */
500
501 if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
502 f->mantissa = bfloat_mantissa(m, f) - 1;
503 else
504 f->exponent = 127;
505}
506
507static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
508{
509 if (t != b->sets) {
510 unsigned j = roundup(t[-1].size,
511 64 / sizeof(struct bkey_float));
512
513 t->tree = t[-1].tree + j;
514 t->prev = t[-1].prev + j;
515 }
516
517 while (t < b->sets + MAX_BSETS)
518 t++->size = 0;
519}
520
521static void bset_build_unwritten_tree(struct btree *b)
522{
523 struct bset_tree *t = b->sets + b->nsets;
524
525 bset_alloc_tree(b, t);
526
527 if (t->tree != b->sets->tree + bset_tree_space(b)) {
528 t->prev[0] = bkey_to_cacheline_offset(t->data->start);
529 t->size = 1;
530 }
531}
532
533static void bset_build_written_tree(struct btree *b)
534{
535 struct bset_tree *t = b->sets + b->nsets;
536 struct bkey *k = t->data->start;
537 unsigned j, cacheline = 1;
538
539 bset_alloc_tree(b, t);
540
541 t->size = min_t(unsigned,
542 bkey_to_cacheline(t, end(t->data)),
543 b->sets->tree + bset_tree_space(b) - t->tree);
544
545 if (t->size < 2) {
546 t->size = 0;
547 return;
548 }
549
550 t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
551
552 /* First we figure out where the first key in each cacheline is */
553 for (j = inorder_next(0, t->size);
554 j;
555 j = inorder_next(j, t->size)) {
556 while (bkey_to_cacheline(t, k) != cacheline)
557 k = bkey_next(k);
558
559 t->prev[j] = bkey_u64s(k);
560 k = bkey_next(k);
561 cacheline++;
562 t->tree[j].m = bkey_to_cacheline_offset(k);
563 }
564
565 while (bkey_next(k) != end(t->data))
566 k = bkey_next(k);
567
568 t->end = *k;
569
570 /* Then we build the tree */
571 for (j = inorder_next(0, t->size);
572 j;
573 j = inorder_next(j, t->size))
574 make_bfloat(t, j);
575}
576
577void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
578{
579 struct bset_tree *t;
580 unsigned inorder, j = 1;
581
582 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
583 if (k < end(t->data))
584 goto found_set;
585
586 BUG();
587found_set:
588 if (!t->size || !bset_written(b, t))
589 return;
590
591 inorder = bkey_to_cacheline(t, k);
592
593 if (k == t->data->start)
594 goto fix_left;
595
596 if (bkey_next(k) == end(t->data)) {
597 t->end = *k;
598 goto fix_right;
599 }
600
601 j = inorder_to_tree(inorder, t);
602
603 if (j &&
604 j < t->size &&
605 k == tree_to_bkey(t, j))
606fix_left: do {
607 make_bfloat(t, j);
608 j = j * 2;
609 } while (j < t->size);
610
611 j = inorder_to_tree(inorder + 1, t);
612
613 if (j &&
614 j < t->size &&
615 k == tree_to_prev_bkey(t, j))
616fix_right: do {
617 make_bfloat(t, j);
618 j = j * 2 + 1;
619 } while (j < t->size);
620}
621
622void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
623{
624 struct bset_tree *t = &b->sets[b->nsets];
625 unsigned shift = bkey_u64s(k);
626 unsigned j = bkey_to_cacheline(t, k);
627
628 /* We're getting called from btree_split() or btree_gc, just bail out */
629 if (!t->size)
630 return;
631
632 /* k is the key we just inserted; we need to find the entry in the
633 * lookup table for the first key that is strictly greater than k:
634 * it's either k's cacheline or the next one
635 */
636 if (j < t->size &&
637 table_to_bkey(t, j) <= k)
638 j++;
639
640 /* Adjust all the lookup table entries, and find a new key for any that
641 * have gotten too big
642 */
643 for (; j < t->size; j++) {
644 t->prev[j] += shift;
645
646 if (t->prev[j] > 7) {
647 k = table_to_bkey(t, j - 1);
648
649 while (k < cacheline_to_bkey(t, j, 0))
650 k = bkey_next(k);
651
652 t->prev[j] = bkey_to_cacheline_offset(k);
653 }
654 }
655
656 if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
657 return;
658
659 /* Possibly add a new entry to the end of the lookup table */
660
661 for (k = table_to_bkey(t, t->size - 1);
662 k != end(t->data);
663 k = bkey_next(k))
664 if (t->size == bkey_to_cacheline(t, k)) {
665 t->prev[t->size] = bkey_to_cacheline_offset(k);
666 t->size++;
667 }
668}
669
670void bch_bset_init_next(struct btree *b)
671{
672 struct bset *i = write_block(b);
673
674 if (i != b->sets[0].data) {
675 b->sets[++b->nsets].data = i;
676 i->seq = b->sets[0].data->seq;
677 } else
678 get_random_bytes(&i->seq, sizeof(uint64_t));
679
680 i->magic = bset_magic(b->c);
681 i->version = 0;
682 i->keys = 0;
683
684 bset_build_unwritten_tree(b);
685}
686
687struct bset_search_iter {
688 struct bkey *l, *r;
689};
690
691static struct bset_search_iter bset_search_write_set(struct btree *b,
692 struct bset_tree *t,
693 const struct bkey *search)
694{
695 unsigned li = 0, ri = t->size;
696
697 BUG_ON(!b->nsets &&
698 t->size < bkey_to_cacheline(t, end(t->data)));
699
700 while (li + 1 != ri) {
701 unsigned m = (li + ri) >> 1;
702
703 if (bkey_cmp(table_to_bkey(t, m), search) > 0)
704 ri = m;
705 else
706 li = m;
707 }
708
709 return (struct bset_search_iter) {
710 table_to_bkey(t, li),
711 ri < t->size ? table_to_bkey(t, ri) : end(t->data)
712 };
713}
714
715static struct bset_search_iter bset_search_tree(struct btree *b,
716 struct bset_tree *t,
717 const struct bkey *search)
718{
719 struct bkey *l, *r;
720 struct bkey_float *f;
721 unsigned inorder, j, n = 1;
722
723 do {
724 unsigned p = n << 4;
725 p &= ((int) (p - t->size)) >> 31;
726
727 prefetch(&t->tree[p]);
728
729 j = n;
730 f = &t->tree[j];
731
732 /*
733 * n = (f->mantissa > bfloat_mantissa())
734 * ? j * 2
735 * : j * 2 + 1;
736 *
737 * We need to subtract 1 from f->mantissa for the sign bit trick
738 * to work - that's done in make_bfloat()
739 */
740 if (likely(f->exponent != 127))
741 n = j * 2 + (((unsigned)
742 (f->mantissa -
743 bfloat_mantissa(search, f))) >> 31);
744 else
745 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
746 ? j * 2
747 : j * 2 + 1;
748 } while (n < t->size);
749
750 inorder = to_inorder(j, t);
751
752 /*
753 * n would have been the node we recursed to - the low bit tells us if
754 * we recursed left or recursed right.
755 */
756 if (n & 1) {
757 l = cacheline_to_bkey(t, inorder, f->m);
758
759 if (++inorder != t->size) {
760 f = &t->tree[inorder_next(j, t->size)];
761 r = cacheline_to_bkey(t, inorder, f->m);
762 } else
763 r = end(t->data);
764 } else {
765 r = cacheline_to_bkey(t, inorder, f->m);
766
767 if (--inorder) {
768 f = &t->tree[inorder_prev(j, t->size)];
769 l = cacheline_to_bkey(t, inorder, f->m);
770 } else
771 l = t->data->start;
772 }
773
774 return (struct bset_search_iter) {l, r};
775}
776
777struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
778 const struct bkey *search)
779{
780 struct bset_search_iter i;
781
782 /*
783 * First we search for a cacheline, and then we do a linear search
784 * within that cacheline.
785 *
786 * To search for the cacheline, there are three different possibilities:
787 * * The set is too small to have a search tree, so we just do a linear
788 * search over the whole set.
789 * * The set is the one we're currently inserting into; keeping a full
790 * auxiliary search tree up to date would be too expensive, so we
791 * use a much simpler lookup table to do a binary search -
792 * bset_search_write_set().
793 * * Or we use the auxiliary search tree we constructed earlier -
794 * bset_search_tree()
795 */
796
797 if (unlikely(!t->size)) {
798 i.l = t->data->start;
799 i.r = end(t->data);
800 } else if (bset_written(b, t)) {
801 /*
802 * Each node in the auxiliary search tree covers a certain range
803 * of bits, and keys above and below the set it covers might
804 * differ outside those bits - so we have to special case the
805 * start and end - handle that here:
806 */
807
808 if (unlikely(bkey_cmp(search, &t->end) >= 0))
809 return end(t->data);
810
811 if (unlikely(bkey_cmp(search, t->data->start) < 0))
812 return t->data->start;
813
814 i = bset_search_tree(b, t, search);
815 } else
816 i = bset_search_write_set(b, t, search);
817
818#ifdef CONFIG_BCACHE_EDEBUG
819 BUG_ON(bset_written(b, t) &&
820 i.l != t->data->start &&
821 bkey_cmp(tree_to_prev_bkey(t,
822 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
823 search) > 0);
824
825 BUG_ON(i.r != end(t->data) &&
826 bkey_cmp(i.r, search) <= 0);
827#endif
828
829 while (likely(i.l != i.r) &&
830 bkey_cmp(i.l, search) <= 0)
831 i.l = bkey_next(i.l);
832
833 return i.l;
834}
835
836/* Btree iterator */
837
838static inline bool btree_iter_cmp(struct btree_iter_set l,
839 struct btree_iter_set r)
840{
841 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
842
843 return c ? c > 0 : l.k < r.k;
844}
845
846static inline bool btree_iter_end(struct btree_iter *iter)
847{
848 return !iter->used;
849}
850
851void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
852 struct bkey *end)
853{
854 if (k != end)
855 BUG_ON(!heap_add(iter,
856 ((struct btree_iter_set) { k, end }),
857 btree_iter_cmp));
858}
859
860struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
861 struct bkey *search, struct bset_tree *start)
862{
863 struct bkey *ret = NULL;
864 iter->size = ARRAY_SIZE(iter->data);
865 iter->used = 0;
866
867 for (; start <= &b->sets[b->nsets]; start++) {
868 ret = bch_bset_search(b, start, search);
869 bch_btree_iter_push(iter, ret, end(start->data));
870 }
871
872 return ret;
873}
874
875struct bkey *bch_btree_iter_next(struct btree_iter *iter)
876{
877 struct btree_iter_set unused;
878 struct bkey *ret = NULL;
879
880 if (!btree_iter_end(iter)) {
881 ret = iter->data->k;
882 iter->data->k = bkey_next(iter->data->k);
883
884 if (iter->data->k > iter->data->end) {
885 __WARN();
886 iter->data->k = iter->data->end;
887 }
888
889 if (iter->data->k == iter->data->end)
890 heap_pop(iter, unused, btree_iter_cmp);
891 else
892 heap_sift(iter, 0, btree_iter_cmp);
893 }
894
895 return ret;
896}
897
898struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
899 struct btree *b, ptr_filter_fn fn)
900{
901 struct bkey *ret;
902
903 do {
904 ret = bch_btree_iter_next(iter);
905 } while (ret && fn(b, ret));
906
907 return ret;
908}
909
910struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
911{
912 struct btree_iter iter;
913
914 bch_btree_iter_init(b, &iter, search);
915 return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
916}
917
918/* Mergesort */
919
920static void btree_sort_fixup(struct btree_iter *iter)
921{
922 while (iter->used > 1) {
923 struct btree_iter_set *top = iter->data, *i = top + 1;
924 struct bkey *k;
925
926 if (iter->used > 2 &&
927 btree_iter_cmp(i[0], i[1]))
928 i++;
929
930 for (k = i->k;
931 k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
932 k = bkey_next(k))
933 if (top->k > i->k)
934 __bch_cut_front(top->k, k);
935 else if (KEY_SIZE(k))
936 bch_cut_back(&START_KEY(k), top->k);
937
938 if (top->k < i->k || k == i->k)
939 break;
940
941 heap_sift(iter, i - top, btree_iter_cmp);
942 }
943}
944
945static void btree_mergesort(struct btree *b, struct bset *out,
946 struct btree_iter *iter,
947 bool fixup, bool remove_stale)
948{
949 struct bkey *k, *last = NULL;
950 bool (*bad)(struct btree *, const struct bkey *) = remove_stale
951 ? bch_ptr_bad
952 : bch_ptr_invalid;
953
954 while (!btree_iter_end(iter)) {
955 if (fixup && !b->level)
956 btree_sort_fixup(iter);
957
958 k = bch_btree_iter_next(iter);
959 if (bad(b, k))
960 continue;
961
962 if (!last) {
963 last = out->start;
964 bkey_copy(last, k);
965 } else if (b->level ||
966 !bch_bkey_try_merge(b, last, k)) {
967 last = bkey_next(last);
968 bkey_copy(last, k);
969 }
970 }
971
972 out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
973
974 pr_debug("sorted %i keys", out->keys);
975 bch_check_key_order(b, out);
976}
977
978static void __btree_sort(struct btree *b, struct btree_iter *iter,
979 unsigned start, unsigned order, bool fixup)
980{
981 uint64_t start_time;
982 bool remove_stale = !b->written;
983 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
984 order);
985 if (!out) {
986 mutex_lock(&b->c->sort_lock);
987 out = b->c->sort;
988 order = ilog2(bucket_pages(b->c));
989 }
990
991 start_time = local_clock();
992
993 btree_mergesort(b, out, iter, fixup, remove_stale);
994 b->nsets = start;
995
996 if (!fixup && !start && b->written)
997 bch_btree_verify(b, out);
998
999 if (!start && order == b->page_order) {
1000 /*
1001 * Our temporary buffer is the same size as the btree node's
1002 * buffer, we can just swap buffers instead of doing a big
1003 * memcpy()
1004 */
1005
1006 out->magic = bset_magic(b->c);
1007 out->seq = b->sets[0].data->seq;
1008 out->version = b->sets[0].data->version;
1009 swap(out, b->sets[0].data);
1010
1011 if (b->c->sort == b->sets[0].data)
1012 b->c->sort = out;
1013 } else {
1014 b->sets[start].data->keys = out->keys;
1015 memcpy(b->sets[start].data->start, out->start,
1016 (void *) end(out) - (void *) out->start);
1017 }
1018
1019 if (out == b->c->sort)
1020 mutex_unlock(&b->c->sort_lock);
1021 else
1022 free_pages((unsigned long) out, order);
1023
1024 if (b->written)
1025 bset_build_written_tree(b);
1026
1027 if (!start) {
1028 spin_lock(&b->c->sort_time_lock);
1029 time_stats_update(&b->c->sort_time, start_time);
1030 spin_unlock(&b->c->sort_time_lock);
1031 }
1032}
1033
1034void bch_btree_sort_partial(struct btree *b, unsigned start)
1035{
1036 size_t oldsize = 0, order = b->page_order, keys = 0;
1037 struct btree_iter iter;
1038 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
1039
1040 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1041 (b->sets[b->nsets].size || b->nsets));
1042
1043 if (b->written)
1044 oldsize = bch_count_data(b);
1045
1046 if (start) {
1047 unsigned i;
1048
1049 for (i = start; i <= b->nsets; i++)
1050 keys += b->sets[i].data->keys;
1051
1052 order = roundup_pow_of_two(__set_bytes(b->sets->data, keys)) / PAGE_SIZE;
1053 if (order)
1054 order = ilog2(order);
1055 }
1056
1057 __btree_sort(b, &iter, start, order, false);
1058
1059 EBUG_ON(b->written && bch_count_data(b) != oldsize);
1060}
1061
1062void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
1063{
1064 BUG_ON(!b->written);
1065 __btree_sort(b, iter, 0, b->page_order, true);
1066}
1067
1068void bch_btree_sort_into(struct btree *b, struct btree *new)
1069{
1070 uint64_t start_time = local_clock();
1071
1072 struct btree_iter iter;
1073 bch_btree_iter_init(b, &iter, NULL);
1074
1075 btree_mergesort(b, new->sets->data, &iter, false, true);
1076
1077 spin_lock(&b->c->sort_time_lock);
1078 time_stats_update(&b->c->sort_time, start_time);
1079 spin_unlock(&b->c->sort_time_lock);
1080
1081 bkey_copy_key(&new->key, &b->key);
1082 new->sets->size = 0;
1083}
1084
1085void bch_btree_sort_lazy(struct btree *b)
1086{
1087 if (b->nsets) {
1088 unsigned i, j, keys = 0, total;
1089
1090 for (i = 0; i <= b->nsets; i++)
1091 keys += b->sets[i].data->keys;
1092
1093 total = keys;
1094
1095 for (j = 0; j < b->nsets; j++) {
1096 if (keys * 2 < total ||
1097 keys < 1000) {
1098 bch_btree_sort_partial(b, j);
1099 return;
1100 }
1101
1102 keys -= b->sets[j].data->keys;
1103 }
1104
1105 /* Must sort if b->nsets == 3 or we'll overflow */
1106 if (b->nsets >= (MAX_BSETS - 1) - b->level) {
1107 bch_btree_sort(b);
1108 return;
1109 }
1110 }
1111
1112 bset_build_written_tree(b);
1113}
1114
1115/* Sysfs stuff */
1116
1117struct bset_stats {
1118 size_t nodes;
1119 size_t sets_written, sets_unwritten;
1120 size_t bytes_written, bytes_unwritten;
1121 size_t floats, failed;
1122};
1123
1124static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
1125 struct bset_stats *stats)
1126{
1127 struct bkey *k;
1128 unsigned i;
1129
1130 stats->nodes++;
1131
1132 for (i = 0; i <= b->nsets; i++) {
1133 struct bset_tree *t = &b->sets[i];
1134 size_t bytes = t->data->keys * sizeof(uint64_t);
1135 size_t j;
1136
1137 if (bset_written(b, t)) {
1138 stats->sets_written++;
1139 stats->bytes_written += bytes;
1140
1141 stats->floats += t->size - 1;
1142
1143 for (j = 1; j < t->size; j++)
1144 if (t->tree[j].exponent == 127)
1145 stats->failed++;
1146 } else {
1147 stats->sets_unwritten++;
1148 stats->bytes_unwritten += bytes;
1149 }
1150 }
1151
1152 if (b->level) {
1153 struct btree_iter iter;
1154
1155 for_each_key_filter(b, k, &iter, bch_ptr_bad) {
1156 int ret = btree(bset_stats, k, b, op, stats);
1157 if (ret)
1158 return ret;
1159 }
1160 }
1161
1162 return 0;
1163}
1164
1165int bch_bset_print_stats(struct cache_set *c, char *buf)
1166{
1167 struct btree_op op;
1168 struct bset_stats t;
1169 int ret;
1170
1171 bch_btree_op_init_stack(&op);
1172 memset(&t, 0, sizeof(struct bset_stats));
1173
1174 ret = btree_root(bset_stats, c, &op, &t);
1175 if (ret)
1176 return ret;
1177
1178 return snprintf(buf, PAGE_SIZE,
1179 "btree nodes: %zu\n"
1180 "written sets: %zu\n"
1181 "unwritten sets: %zu\n"
1182 "written key bytes: %zu\n"
1183 "unwritten key bytes: %zu\n"
1184 "floats: %zu\n"
1185 "failed: %zu\n",
1186 t.nodes,
1187 t.sets_written, t.sets_unwritten,
1188 t.bytes_written, t.bytes_unwritten,
1189 t.floats, t.failed);
1190}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
new file mode 100644
index 000000000000..57a9cff41546
--- /dev/null
+++ b/drivers/md/bcache/bset.h
@@ -0,0 +1,379 @@
1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H
3
4/*
5 * BKEYS:
6 *
7 * A bkey contains a key, a size field, a variable number of pointers, and some
8 * ancillary flag bits.
9 *
10 * We use two different functions for validating bkeys, bch_ptr_invalid and
11 * bch_ptr_bad().
12 *
13 * bch_ptr_invalid() primarily filters out keys and pointers that would be
14 * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
15 * pointer that occur in normal practice but don't point to real data.
16 *
17 * The one exception to the rule that ptr_invalid() filters out invalid keys is
18 * that it also filters out keys of size 0 - these are keys that have been
19 * completely overwritten. It'd be safe to delete these in memory while leaving
20 * them on disk, just unnecessary work - so we filter them out when resorting
21 * instead.
22 *
23 * We can't filter out stale keys when we're resorting, because garbage
24 * collection needs to find them to ensure bucket gens don't wrap around -
25 * unless we're rewriting the btree node those stale keys still exist on disk.
26 *
27 * We also implement functions here for removing some number of sectors from the
28 * front or the back of a bkey - this is mainly used for fixing overlapping
29 * extents, by removing the overlapping sectors from the older key.
30 *
31 * BSETS:
32 *
33 * A bset is an array of bkeys laid out contiguously in memory in sorted order,
34 * along with a header. A btree node is made up of a number of these, written at
35 * different times.
36 *
37 * There could be many of them on disk, but we never allow there to be more than
38 * 4 in memory - we lazily resort as needed.
39 *
40 * We implement code here for creating and maintaining auxiliary search trees
41 * (described below) for searching an individual bset, and on top of that we
42 * implement a btree iterator.
43 *
44 * BTREE ITERATOR:
45 *
46 * Most of the code in bcache doesn't care about an individual bset - it needs
47 * to search entire btree nodes and iterate over them in sorted order.
48 *
49 * The btree iterator code serves both functions; it iterates through the keys
50 * in a btree node in sorted order, starting from either keys after a specific
51 * point (if you pass it a search key) or the start of the btree node.
52 *
53 * AUXILIARY SEARCH TREES:
54 *
55 * Since keys are variable length, we can't use a binary search on a bset - we
56 * wouldn't be able to find the start of the next key. But binary searches are
57 * slow anyways, due to terrible cache behaviour; bcache originally used binary
58 * searches and that code topped out at under 50k lookups/second.
59 *
60 * So we need to construct some sort of lookup table. Since we only insert keys
61 * into the last (unwritten) set, most of the keys within a given btree node are
62 * usually in sets that are mostly constant. We use two different types of
63 * lookup tables to take advantage of this.
64 *
65 * Both lookup tables share in common that they don't index every key in the
66 * set; they index one key every BSET_CACHELINE bytes, and then a linear search
67 * is used for the rest.
68 *
69 * For sets that have been written to disk and are no longer being inserted
70 * into, we construct a binary search tree in an array - traversing a binary
71 * search tree in an array gives excellent locality of reference and is very
72 * fast, since both children of any node are adjacent to each other in memory
73 * (and their grandchildren, and great grandchildren...) - this means
74 * prefetching can be used to great effect.
75 *
76 * It's quite useful performance wise to keep these nodes small - not just
77 * because they're more likely to be in L2, but also because we can prefetch
78 * more nodes on a single cacheline and thus prefetch more iterations in advance
79 * when traversing this tree.
80 *
81 * Nodes in the auxiliary search tree must contain both a key to compare against
82 * (we don't want to fetch the key from the set, that would defeat the purpose),
83 * and a pointer to the key. We use a few tricks to compress both of these.
84 *
85 * To compress the pointer, we take advantage of the fact that one node in the
86 * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
87 * a function (to_inorder()) that takes the index of a node in a binary tree and
88 * returns what its index would be in an inorder traversal, so we only have to
89 * store the low bits of the offset.
90 *
91 * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
92 * compress that, we take advantage of the fact that when we're traversing the
93 * search tree at every iteration we know that both our search key and the key
94 * we're looking for lie within some range - bounded by our previous
95 * comparisons. (We special case the start of a search so that this is true even
96 * at the root of the tree).
97 *
98 * So if we know the key we're looking for is between a and b, and a and b don't
99 * differ higher than bit 50, we don't need to check anything higher than bit
100 * 50.
101 *
102 * We don't usually need the rest of the bits, either; we only need enough bits
103 * to partition the key range we're currently checking. Consider key n - the
104 * key our auxiliary search tree node corresponds to, and key p, the key
105 * immediately preceding n. The lowest bit we need to store in the auxiliary
106 * search tree is the highest bit that differs between n and p.
107 *
108 * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
109 * comparison. But we'd really like our nodes in the auxiliary search tree to be
110 * of fixed size.
111 *
112 * The solution is to make them fixed size, and when we're constructing a node
113 * check if p and n differed in the bits we needed them to. If they don't we
114 * flag that node, and when doing lookups we fallback to comparing against the
115 * real key. As long as this doesn't happen too often (and it seems to reliably
116 * happen a bit less than 1% of the time), we win - even on failures, that key
117 * is then more likely to be in cache than if we were doing binary searches all
118 * the way, since we're touching so much less memory.
119 *
120 * The keys in the auxiliary search tree are stored in (software) floating
121 * point, with an exponent and a mantissa. The exponent needs to be big enough
122 * to address all the bits in the original key, but the number of bits in the
123 * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
124 *
125 * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
126 * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
127 * We need one node per 128 bytes in the btree node, which means the auxiliary
128 * search trees take up 3% as much memory as the btree itself.
129 *
130 * Constructing these auxiliary search trees is moderately expensive, and we
131 * don't want to be constantly rebuilding the search tree for the last set
132 * whenever we insert another key into it. For the unwritten set, we use a much
133 * simpler lookup table - it's just a flat array, so index i in the lookup table
134 * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
135 * within each byte range works the same as with the auxiliary search trees.
136 *
137 * These are much easier to keep up to date when we insert a key - we do it
138 * somewhat lazily; when we shift a key up we usually just increment the pointer
139 * to it, only when it would overflow do we go to the trouble of finding the
140 * first key in that range of bytes again.
141 */
142
143/* Btree key comparison/iteration */
144
145struct btree_iter {
146 size_t size, used;
147 struct btree_iter_set {
148 struct bkey *k, *end;
149 } data[MAX_BSETS];
150};
151
152struct bset_tree {
153 /*
154 * We construct a binary tree in an array as if the array
155 * started at 1, so that things line up on the same cachelines
156 * better: see comments in bset.c at cacheline_to_bkey() for
157 * details
158 */
159
160 /* size of the binary tree and prev array */
161 unsigned size;
162
163 /* function of size - precalculated for to_inorder() */
164 unsigned extra;
165
166 /* copy of the last key in the set */
167 struct bkey end;
168 struct bkey_float *tree;
169
170 /*
171 * The nodes in the bset tree point to specific keys - this
172 * array holds the sizes of the previous key.
173 *
174 * Conceptually it's a member of struct bkey_float, but we want
175 * to keep bkey_float to 4 bytes and prev isn't used in the fast
176 * path.
177 */
178 uint8_t *prev;
179
180 /* The actual btree node, with pointers to each sorted set */
181 struct bset *data;
182};
183
184static __always_inline int64_t bkey_cmp(const struct bkey *l,
185 const struct bkey *r)
186{
187 return unlikely(KEY_INODE(l) != KEY_INODE(r))
188 ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
189 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
190}
191
192static inline size_t bkey_u64s(const struct bkey *k)
193{
194 BUG_ON(KEY_CSUM(k) > 1);
195 return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
196}
197
198static inline size_t bkey_bytes(const struct bkey *k)
199{
200 return bkey_u64s(k) * sizeof(uint64_t);
201}
202
203static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
204{
205 memcpy(dest, src, bkey_bytes(src));
206}
207
208static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
209{
210 if (!src)
211 src = &KEY(0, 0, 0);
212
213 SET_KEY_INODE(dest, KEY_INODE(src));
214 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
215}
216
217static inline struct bkey *bkey_next(const struct bkey *k)
218{
219 uint64_t *d = (void *) k;
220 return (struct bkey *) (d + bkey_u64s(k));
221}
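/*
 * E.g. a key with one pointer and no appended checksum occupies 2 + 1 = 3
 * uint64_ts (24 bytes), so bkey_next() steps 24 bytes forward to the next
 * key in the set.
 */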
222
223/* Keylists */
224
225struct keylist {
226 struct bkey *top;
227 union {
228 uint64_t *list;
229 struct bkey *bottom;
230 };
231
232 /* Enough room for btree_split's keys without realloc */
233#define KEYLIST_INLINE 16
234 uint64_t d[KEYLIST_INLINE];
235};
236
237static inline void bch_keylist_init(struct keylist *l)
238{
239 l->top = (void *) (l->list = l->d);
240}
241
242static inline void bch_keylist_push(struct keylist *l)
243{
244 l->top = bkey_next(l->top);
245}
246
247static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
248{
249 bkey_copy(l->top, k);
250 bch_keylist_push(l);
251}
252
253static inline bool bch_keylist_empty(struct keylist *l)
254{
255 return l->top == (void *) l->list;
256}
257
258static inline void bch_keylist_free(struct keylist *l)
259{
260 if (l->list != l->d)
261 kfree(l->list);
262}
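/*
 * Rough usage sketch (illustrative only):
 *
 *	struct keylist l;
 *	struct bkey *k;
 *
 *	bch_keylist_init(&l);
 *	... bch_keylist_add(&l, k) for each key to queue up ...
 *	while ((k = bch_keylist_pop(&l)))
 *		... keys come back last to first ...
 *	bch_keylist_free(&l);
 */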
263
264void bch_keylist_copy(struct keylist *, struct keylist *);
265struct bkey *bch_keylist_pop(struct keylist *);
266int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
267
268void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
269 unsigned);
270bool __bch_cut_front(const struct bkey *, struct bkey *);
271bool __bch_cut_back(const struct bkey *, struct bkey *);
272
273static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
274{
275 BUG_ON(bkey_cmp(where, k) > 0);
276 return __bch_cut_front(where, k);
277}
278
279static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
280{
281 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
282 return __bch_cut_back(where, k);
283}
284
285const char *bch_ptr_status(struct cache_set *, const struct bkey *);
286bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *);
287bool bch_ptr_bad(struct btree *, const struct bkey *);
288
289static inline uint8_t gen_after(uint8_t a, uint8_t b)
290{
291 uint8_t r = a - b;
292 return r > 128U ? 0 : r;
293}
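/*
 * The gens are 8 bit and wrap, so "after" is defined modulo 256 with a 128
 * wide window: e.g. gen_after(2, 250) == 8, while gen_after(250, 2) == 0
 * because the difference (248) is treated as b being ahead of a.
 */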
294
295static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
296 unsigned i)
297{
298 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
299}
300
301static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
302 unsigned i)
303{
304 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
305}
306
307
308typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
309
310struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
311struct bkey *bch_btree_iter_next(struct btree_iter *);
312struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
313 struct btree *, ptr_filter_fn);
314
315void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
316struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
317 struct bkey *, struct bset_tree *);
318
319/* 32 bits total: */
320#define BKEY_MID_BITS 3
321#define BKEY_EXPONENT_BITS 7
322#define BKEY_MANTISSA_BITS 22
323#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
324
325struct bkey_float {
326 unsigned exponent:BKEY_EXPONENT_BITS;
327 unsigned m:BKEY_MID_BITS;
328 unsigned mantissa:BKEY_MANTISSA_BITS;
329} __packed;
330
331/*
332 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
333 * it used to be 64, but I realized the lookup code would touch slightly less
334 * memory if it was 128.
335 *
336 * It defines the number of bytes (in struct bset) per struct bkey_float in
337 * the auxiliary search tree - when we're done searching the bset_float tree we
338 * have this many bytes left that we do a linear search over.
339 *
340 * Since (after level 5) every level of the bset_tree is on a new cacheline,
341 * we're touching one fewer cacheline in the bset tree in exchange for one more
342 * cacheline in the linear search - but the linear search might stop before it
343 * gets to the second cacheline.
344 */
345
346#define BSET_CACHELINE 128
347#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
348
349#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float))
350#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t))
351
352void bch_bset_init_next(struct btree *);
353
354void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
355void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
356
357struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
358 const struct bkey *);
359
360static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
361 const struct bkey *search)
362{
363 return search ? __bch_bset_search(b, t, search) : t->data->start;
364}
365
366bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
367void bch_btree_sort_lazy(struct btree *);
368void bch_btree_sort_into(struct btree *, struct btree *);
369void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
370void bch_btree_sort_partial(struct btree *, unsigned);
371
372static inline void bch_btree_sort(struct btree *b)
373{
374 bch_btree_sort_partial(b, 0);
375}
376
377int bch_bset_print_stats(struct cache_set *, char *);
378
379#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644
index 000000000000..e7bc917ef0d7
--- /dev/null
+++ b/drivers/md/bcache/btree.c
@@ -0,0 +1,2503 @@
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "request.h"
27
28#include <linux/slab.h>
29#include <linux/bitops.h>
30#include <linux/hash.h>
31#include <linux/random.h>
32#include <linux/rcupdate.h>
33#include <trace/events/bcache.h>
34
35/*
36 * Todo:
37 * register_bcache: Return errors out to userspace correctly
38 *
39 * Writeback: don't undirty key until after a cache flush
40 *
41 * Create an iterator for key pointers
42 *
43 * On btree write error, mark bucket such that it won't be freed from the cache
44 *
45 * Journalling:
46 * Check for bad keys in replay
47 * Propagate barriers
48 * Refcount journal entries in journal_replay
49 *
50 * Garbage collection:
51 * Finish incremental gc
52 * Gc should free old UUIDs, data for invalid UUIDs
53 *
54 * Provide a way to list backing device UUIDs we have data cached for, and
55 * probably how long it's been since we've seen them, and a way to invalidate
56 * dirty data for devices that will never be attached again
57 *
58 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
59 * that based on that and how much dirty data we have we can keep writeback
60 * from being starved
61 *
62 * Add a tracepoint or somesuch to watch for writeback starvation
63 *
64 * When btree depth > 1 and splitting an interior node, we have to make sure
65 * alloc_bucket() cannot fail. This should be true but is not completely
66 * obvious.
67 *
68 * Make sure all allocations get charged to the root cgroup
69 *
70 * Plugging?
71 *
72 * If data write is less than hard sector size of ssd, round up offset in open
73 * bucket to the next whole sector
74 *
75 * Also lookup by cgroup in get_open_bucket()
76 *
77 * Superblock needs to be fleshed out for multiple cache devices
78 *
79 * Add a sysfs tunable for the number of writeback IOs in flight
80 *
81 * Add a sysfs tunable for the number of open data buckets
82 *
83 * IO tracking: Can we track when one process is doing io on behalf of another?
84 * IO tracking: Don't use just an average, weigh more recent stuff higher
85 *
86 * Test module load/unload
87 */
88
89static const char * const op_types[] = {
90 "insert", "replace"
91};
92
93static const char *op_type(struct btree_op *op)
94{
95 return op_types[op->type];
96}
97
98#define MAX_NEED_GC 64
99#define MAX_SAVE_PRIO 72
100
101#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
102
103#define PTR_HASH(c, k) \
104 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
105
106struct workqueue_struct *bch_gc_wq;
107static struct workqueue_struct *btree_io_wq;
108
109void bch_btree_op_init_stack(struct btree_op *op)
110{
111 memset(op, 0, sizeof(struct btree_op));
112 closure_init_stack(&op->cl);
113 op->lock = -1;
114 bch_keylist_init(&op->keys);
115}
116
117/* Btree key manipulation */
118
119static void bkey_put(struct cache_set *c, struct bkey *k, int level)
120{
121 if ((level && KEY_OFFSET(k)) || !level)
122 __bkey_put(c, k);
123}
124
125/* Btree IO */
126
127static uint64_t btree_csum_set(struct btree *b, struct bset *i)
128{
129 uint64_t crc = b->key.ptr[0];
130 void *data = (void *) i + 8, *end = end(i);
131
132 crc = crc64_update(crc, data, end - data);
133 return crc ^ 0xffffffffffffffff;
134}
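/*
 * Same layout as csum_set() in bcache.h, but seeded with the node's first
 * pointer so the checksum is tied to the node's location on disk -
 * presumably so that a bset read back from the wrong bucket fails to verify.
 */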
135
136static void btree_bio_endio(struct bio *bio, int error)
137{
138 struct closure *cl = bio->bi_private;
139 struct btree *b = container_of(cl, struct btree, io.cl);
140
141 if (error)
142 set_btree_node_io_error(b);
143
144 bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
145 ? "writing btree" : "reading btree");
146 closure_put(cl);
147}
148
149static void btree_bio_init(struct btree *b)
150{
151 BUG_ON(b->bio);
152 b->bio = bch_bbio_alloc(b->c);
153
154 b->bio->bi_end_io = btree_bio_endio;
155 b->bio->bi_private = &b->io.cl;
156}
157
158void bch_btree_read_done(struct closure *cl)
159{
160 struct btree *b = container_of(cl, struct btree, io.cl);
161 struct bset *i = b->sets[0].data;
162 struct btree_iter *iter = b->c->fill_iter;
163 const char *err = "bad btree header";
164 BUG_ON(b->nsets || b->written);
165
166 bch_bbio_free(b->bio, b->c);
167 b->bio = NULL;
168
169 mutex_lock(&b->c->fill_lock);
170 iter->used = 0;
171
172 if (btree_node_io_error(b) ||
173 !i->seq)
174 goto err;
175
176 for (;
177 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
178 i = write_block(b)) {
179 err = "unsupported bset version";
180 if (i->version > BCACHE_BSET_VERSION)
181 goto err;
182
183 err = "bad btree header";
184 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
185 goto err;
186
187 err = "bad magic";
188 if (i->magic != bset_magic(b->c))
189 goto err;
190
191 err = "bad checksum";
192 switch (i->version) {
193 case 0:
194 if (i->csum != csum_set(i))
195 goto err;
196 break;
197 case BCACHE_BSET_VERSION:
198 if (i->csum != btree_csum_set(b, i))
199 goto err;
200 break;
201 }
202
203 err = "empty set";
204 if (i != b->sets[0].data && !i->keys)
205 goto err;
206
207 bch_btree_iter_push(iter, i->start, end(i));
208
209 b->written += set_blocks(i, b->c);
210 }
211
212 err = "corrupted btree";
213 for (i = write_block(b);
214 index(i, b) < btree_blocks(b);
215 i = ((void *) i) + block_bytes(b->c))
216 if (i->seq == b->sets[0].data->seq)
217 goto err;
218
219 bch_btree_sort_and_fix_extents(b, iter);
220
221 i = b->sets[0].data;
222 err = "short btree key";
223 if (b->sets[0].size &&
224 bkey_cmp(&b->key, &b->sets[0].end) < 0)
225 goto err;
226
227 if (b->written < btree_blocks(b))
228 bch_bset_init_next(b);
229out:
230
231 mutex_unlock(&b->c->fill_lock);
232
233 spin_lock(&b->c->btree_read_time_lock);
234 time_stats_update(&b->c->btree_read_time, b->io_start_time);
235 spin_unlock(&b->c->btree_read_time_lock);
236
237 smp_wmb(); /* read_done is our write lock */
238 set_btree_node_read_done(b);
239
240 closure_return(cl);
241err:
242 set_btree_node_io_error(b);
243 bch_cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
244 err, PTR_BUCKET_NR(b->c, &b->key, 0),
245 index(i, b), i->keys);
246 goto out;
247}
248
249void bch_btree_read(struct btree *b)
250{
251 BUG_ON(b->nsets || b->written);
252
253 if (!closure_trylock(&b->io.cl, &b->c->cl))
254 BUG();
255
256 b->io_start_time = local_clock();
257
258 btree_bio_init(b);
259 b->bio->bi_rw = REQ_META|READ_SYNC;
260 b->bio->bi_size = KEY_SIZE(&b->key) << 9;
261
262 bio_map(b->bio, b->sets[0].data);
263
264 pr_debug("%s", pbtree(b));
265 trace_bcache_btree_read(b->bio);
266 bch_submit_bbio(b->bio, b->c, &b->key, 0);
267
268 continue_at(&b->io.cl, bch_btree_read_done, system_wq);
269}
270
271static void btree_complete_write(struct btree *b, struct btree_write *w)
272{
273 if (w->prio_blocked &&
274 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
275 wake_up(&b->c->alloc_wait);
276
277 if (w->journal) {
278 atomic_dec_bug(w->journal);
279 __closure_wake_up(&b->c->journal.wait);
280 }
281
282 if (w->owner)
283 closure_put(w->owner);
284
285 w->prio_blocked = 0;
286 w->journal = NULL;
287 w->owner = NULL;
288}
289
290static void __btree_write_done(struct closure *cl)
291{
292 struct btree *b = container_of(cl, struct btree, io.cl);
293 struct btree_write *w = btree_prev_write(b);
294
295 bch_bbio_free(b->bio, b->c);
296 b->bio = NULL;
297 btree_complete_write(b, w);
298
299 if (btree_node_dirty(b))
300 queue_delayed_work(btree_io_wq, &b->work,
301 msecs_to_jiffies(30000));
302
303 closure_return(cl);
304}
305
306static void btree_write_done(struct closure *cl)
307{
308 struct btree *b = container_of(cl, struct btree, io.cl);
309 struct bio_vec *bv;
310 int n;
311
312 __bio_for_each_segment(bv, b->bio, n, 0)
313 __free_page(bv->bv_page);
314
315 __btree_write_done(cl);
316}
317
318static void do_btree_write(struct btree *b)
319{
320 struct closure *cl = &b->io.cl;
321 struct bset *i = b->sets[b->nsets].data;
322 BKEY_PADDED(key) k;
323
324 i->version = BCACHE_BSET_VERSION;
325 i->csum = btree_csum_set(b, i);
326
327 btree_bio_init(b);
328 b->bio->bi_rw = REQ_META|WRITE_SYNC;
329 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
330 bio_map(b->bio, i);
331
332 bkey_copy(&k.key, &b->key);
333 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
334
335 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
336 int j;
337 struct bio_vec *bv;
338 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
339
340 bio_for_each_segment(bv, b->bio, j)
341 memcpy(page_address(bv->bv_page),
342 base + j * PAGE_SIZE, PAGE_SIZE);
343
344 trace_bcache_btree_write(b->bio);
345 bch_submit_bbio(b->bio, b->c, &k.key, 0);
346
347 continue_at(cl, btree_write_done, NULL);
348 } else {
349 b->bio->bi_vcnt = 0;
350 bio_map(b->bio, i);
351
352 trace_bcache_btree_write(b->bio);
353 bch_submit_bbio(b->bio, b->c, &k.key, 0);
354
355 closure_sync(cl);
356 __btree_write_done(cl);
357 }
358}
359
360static void __btree_write(struct btree *b)
361{
362 struct bset *i = b->sets[b->nsets].data;
363
364 BUG_ON(current->bio_list);
365
366 closure_lock(&b->io, &b->c->cl);
367 cancel_delayed_work(&b->work);
368
369 clear_bit(BTREE_NODE_dirty, &b->flags);
370 change_bit(BTREE_NODE_write_idx, &b->flags);
371
372 bch_check_key_order(b, i);
373 BUG_ON(b->written && !i->keys);
374
375 do_btree_write(b);
376
377 pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
378
379 b->written += set_blocks(i, b->c);
380 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
381 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
382
383 bch_btree_sort_lazy(b);
384
385 if (b->written < btree_blocks(b))
386 bch_bset_init_next(b);
387}
388
389static void btree_write_work(struct work_struct *w)
390{
391 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
392
393 down_write(&b->lock);
394
395 if (btree_node_dirty(b))
396 __btree_write(b);
397 up_write(&b->lock);
398}
399
400void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
401{
402 struct bset *i = b->sets[b->nsets].data;
403 struct btree_write *w = btree_current_write(b);
404
405 BUG_ON(b->written &&
406 (b->written >= btree_blocks(b) ||
407 i->seq != b->sets[0].data->seq ||
408 !i->keys));
409
410 if (!btree_node_dirty(b)) {
411 set_btree_node_dirty(b);
412 queue_delayed_work(btree_io_wq, &b->work,
413 msecs_to_jiffies(30000));
414 }
415
416 w->prio_blocked += b->prio_blocked;
417 b->prio_blocked = 0;
418
419 if (op && op->journal && !b->level) {
420 if (w->journal &&
421 journal_pin_cmp(b->c, w, op)) {
422 atomic_dec_bug(w->journal);
423 w->journal = NULL;
424 }
425
426 if (!w->journal) {
427 w->journal = op->journal;
428 atomic_inc(w->journal);
429 }
430 }
431
432 if (current->bio_list)
433 return;
434
435 /* Force write if set is too big */
436 if (now ||
437 b->level ||
438 set_bytes(i) > PAGE_SIZE - 48) {
439 if (op && now) {
440 /* Must wait on multiple writes */
441 BUG_ON(w->owner);
442 w->owner = &op->cl;
443 closure_get(&op->cl);
444 }
445
446 __btree_write(b);
447 }
448 BUG_ON(!b->written);
449}
450
451/*
452 * Btree in memory cache - allocation/freeing
453 * mca -> memory cache
454 */
455
456static void mca_reinit(struct btree *b)
457{
458 unsigned i;
459
460 b->flags = 0;
461 b->written = 0;
462 b->nsets = 0;
463
464 for (i = 0; i < MAX_BSETS; i++)
465 b->sets[i].size = 0;
466 /*
467 * Second loop starts at 1 because b->sets[0]->data is the memory we
468 * allocated
469 */
470 for (i = 1; i < MAX_BSETS; i++)
471 b->sets[i].data = NULL;
472}
473
474#define mca_reserve(c) (((c->root && c->root->level) \
475 ? c->root->level : 1) * 8 + 16)
476#define mca_can_free(c) \
477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
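
/*
 * Illustrative arithmetic for the reserve above: with the root at level 2,
 * mca_reserve() works out to 2 * 8 + 16 = 32 nodes; with a leaf-only tree
 * it bottoms out at 1 * 8 + 16 = 24.  mca_can_free() then reports only the
 * cached nodes beyond that reserve as freeable.
 */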
478
479static void mca_data_free(struct btree *b)
480{
481 struct bset_tree *t = b->sets;
482 BUG_ON(!closure_is_unlocked(&b->io.cl));
483
484 if (bset_prev_bytes(b) < PAGE_SIZE)
485 kfree(t->prev);
486 else
487 free_pages((unsigned long) t->prev,
488 get_order(bset_prev_bytes(b)));
489
490 if (bset_tree_bytes(b) < PAGE_SIZE)
491 kfree(t->tree);
492 else
493 free_pages((unsigned long) t->tree,
494 get_order(bset_tree_bytes(b)));
495
496 free_pages((unsigned long) t->data, b->page_order);
497
498 t->prev = NULL;
499 t->tree = NULL;
500 t->data = NULL;
501 list_move(&b->list, &b->c->btree_cache_freed);
502 b->c->bucket_cache_used--;
503}
504
505static void mca_bucket_free(struct btree *b)
506{
507 BUG_ON(btree_node_dirty(b));
508
509 b->key.ptr[0] = 0;
510 hlist_del_init_rcu(&b->hash);
511 list_move(&b->list, &b->c->btree_cache_freeable);
512}
513
514static unsigned btree_order(struct bkey *k)
515{
516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
517}
518
519static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
520{
521 struct bset_tree *t = b->sets;
522 BUG_ON(t->data);
523
524 b->page_order = max_t(unsigned,
525 ilog2(b->c->btree_pages),
526 btree_order(k));
527
528 t->data = (void *) __get_free_pages(gfp, b->page_order);
529 if (!t->data)
530 goto err;
531
532 t->tree = bset_tree_bytes(b) < PAGE_SIZE
533 ? kmalloc(bset_tree_bytes(b), gfp)
534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
535 if (!t->tree)
536 goto err;
537
538 t->prev = bset_prev_bytes(b) < PAGE_SIZE
539 ? kmalloc(bset_prev_bytes(b), gfp)
540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
541 if (!t->prev)
542 goto err;
543
544 list_move(&b->list, &b->c->btree_cache);
545 b->c->bucket_cache_used++;
546 return;
547err:
548 mca_data_free(b);
549}
550
551static struct btree *mca_bucket_alloc(struct cache_set *c,
552 struct bkey *k, gfp_t gfp)
553{
554 struct btree *b = kzalloc(sizeof(struct btree), gfp);
555 if (!b)
556 return NULL;
557
558 init_rwsem(&b->lock);
559 lockdep_set_novalidate_class(&b->lock);
560 INIT_LIST_HEAD(&b->list);
561 INIT_DELAYED_WORK(&b->work, btree_write_work);
562 b->c = c;
563 closure_init_unlocked(&b->io);
564
565 mca_data_alloc(b, k, gfp);
566 return b;
567}
568
569static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
570{
571 lockdep_assert_held(&b->c->bucket_lock);
572
573 if (!down_write_trylock(&b->lock))
574 return -ENOMEM;
575
576 if (b->page_order < min_order) {
577 rw_unlock(true, b);
578 return -ENOMEM;
579 }
580
581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
582
583 if (cl && btree_node_dirty(b))
584 bch_btree_write(b, true, NULL);
585
586 if (cl)
587 closure_wait_event_async(&b->io.wait, cl,
588 atomic_read(&b->io.cl.remaining) == -1);
589
590 if (btree_node_dirty(b) ||
591 !closure_is_unlocked(&b->io.cl) ||
592 work_pending(&b->work.work)) {
593 rw_unlock(true, b);
594 return -EAGAIN;
595 }
596
597 return 0;
598}
599
600static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
601{
602 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
603 struct btree *b, *t;
604 unsigned long i, nr = sc->nr_to_scan;
605
606 if (c->shrinker_disabled)
607 return 0;
608
609 if (c->try_harder)
610 return 0;
611
612 /*
613 * If nr == 0, we're supposed to return the number of items we have
614 * cached. Not allowed to return -1.
615 */
616 if (!nr)
617 return mca_can_free(c) * c->btree_pages;
618
619 /* Return -1 if we can't do anything right now */
620 if (sc->gfp_mask & __GFP_WAIT)
621 mutex_lock(&c->bucket_lock);
622 else if (!mutex_trylock(&c->bucket_lock))
623 return -1;
624
625 nr /= c->btree_pages;
626 nr = min_t(unsigned long, nr, mca_can_free(c));
627
628 i = 0;
629 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
630 if (!nr)
631 break;
632
633 if (++i > 3 &&
634 !mca_reap(b, NULL, 0)) {
635 mca_data_free(b);
636 rw_unlock(true, b);
637 --nr;
638 }
639 }
640
641 /*
642 * Can happen right when we first start up, before we've read in any
643 * btree nodes
644 */
645 if (list_empty(&c->btree_cache))
646 goto out;
647
648 for (i = 0; nr && i < c->bucket_cache_used; i++) {
649 b = list_first_entry(&c->btree_cache, struct btree, list);
650 list_rotate_left(&c->btree_cache);
651
652 if (!b->accessed &&
653 !mca_reap(b, NULL, 0)) {
654 mca_bucket_free(b);
655 mca_data_free(b);
656 rw_unlock(true, b);
657 --nr;
658 } else
659 b->accessed = 0;
660 }
661out:
662 nr = mca_can_free(c) * c->btree_pages;
663 mutex_unlock(&c->bucket_lock);
664 return nr;
665}
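
/*
 * Illustrative arithmetic: the shrinker speaks in pages while the cache is
 * managed in whole nodes.  If sc->nr_to_scan is 128 and c->btree_pages is
 * 8, the loops above try to reap up to 128 / 8 = 16 nodes (further capped
 * by mca_can_free()), and the return value converts back the other way,
 * reporting mca_can_free(c) * 8 pages as still freeable.
 */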
666
667void bch_btree_cache_free(struct cache_set *c)
668{
669 struct btree *b;
670 struct closure cl;
671 closure_init_stack(&cl);
672
673 if (c->shrink.list.next)
674 unregister_shrinker(&c->shrink);
675
676 mutex_lock(&c->bucket_lock);
677
678#ifdef CONFIG_BCACHE_DEBUG
679 if (c->verify_data)
680 list_move(&c->verify_data->list, &c->btree_cache);
681#endif
682
683 list_splice(&c->btree_cache_freeable,
684 &c->btree_cache);
685
686 while (!list_empty(&c->btree_cache)) {
687 b = list_first_entry(&c->btree_cache, struct btree, list);
688
689 if (btree_node_dirty(b))
690 btree_complete_write(b, btree_current_write(b));
691 clear_bit(BTREE_NODE_dirty, &b->flags);
692
693 mca_data_free(b);
694 }
695
696 while (!list_empty(&c->btree_cache_freed)) {
697 b = list_first_entry(&c->btree_cache_freed,
698 struct btree, list);
699 list_del(&b->list);
700 cancel_delayed_work_sync(&b->work);
701 kfree(b);
702 }
703
704 mutex_unlock(&c->bucket_lock);
705}
706
707int bch_btree_cache_alloc(struct cache_set *c)
708{
709 unsigned i;
710
711 /* XXX: doesn't check for errors */
712
713 closure_init_unlocked(&c->gc);
714
715 for (i = 0; i < mca_reserve(c); i++)
716 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
717
718 list_splice_init(&c->btree_cache,
719 &c->btree_cache_freeable);
720
721#ifdef CONFIG_BCACHE_DEBUG
722 mutex_init(&c->verify_lock);
723
724 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
725
726 if (c->verify_data &&
727 c->verify_data->sets[0].data)
728 list_del_init(&c->verify_data->list);
729 else
730 c->verify_data = NULL;
731#endif
732
733 c->shrink.shrink = bch_mca_shrink;
734 c->shrink.seeks = 4;
735 c->shrink.batch = c->btree_pages * 2;
736 register_shrinker(&c->shrink);
737
738 return 0;
739}
740
741/* Btree in memory cache - hash table */
742
743static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
744{
745 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
746}
747
748static struct btree *mca_find(struct cache_set *c, struct bkey *k)
749{
750 struct btree *b;
751
752 rcu_read_lock();
753 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
754 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
755 goto out;
756 b = NULL;
757out:
758 rcu_read_unlock();
759 return b;
760}
761
762static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
763 int level, struct closure *cl)
764{
765 int ret = -ENOMEM;
766 struct btree *i;
767
768 if (!cl)
769 return ERR_PTR(-ENOMEM);
770
771 /*
772 * Trying to free up some memory - i.e. reuse some btree nodes - may
773 * require initiating IO to flush the dirty part of the node. If we're
774 * running under generic_make_request(), that IO will never finish and
775 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
776 * punt to workqueue and retry.
777 */
778 if (current->bio_list)
779 return ERR_PTR(-EAGAIN);
780
781 if (c->try_harder && c->try_harder != cl) {
782 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
783 return ERR_PTR(-EAGAIN);
784 }
785
786 /* XXX: tracepoint */
787 c->try_harder = cl;
788 c->try_harder_start = local_clock();
789retry:
790 list_for_each_entry_reverse(i, &c->btree_cache, list) {
791 int r = mca_reap(i, cl, btree_order(k));
792 if (!r)
793 return i;
794 if (r != -ENOMEM)
795 ret = r;
796 }
797
798 if (ret == -EAGAIN &&
799 closure_blocking(cl)) {
800 mutex_unlock(&c->bucket_lock);
801 closure_sync(cl);
802 mutex_lock(&c->bucket_lock);
803 goto retry;
804 }
805
806 return ERR_PTR(ret);
807}
808
809/*
810 * We can only have one thread cannibalizing other cached btree nodes at a time,
811 * or we'll deadlock. We use an open coded mutex to ensure that, which
812 * mca_cannibalize() will take. This means every time we unlock the root of
813 * the btree, we need to release this lock if we have it held.
814 */
815void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
816{
817 if (c->try_harder == cl) {
818 time_stats_update(&c->try_harder_time, c->try_harder_start);
819 c->try_harder = NULL;
820 __closure_wake_up(&c->try_wait);
821 }
822}
823
824static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
825 int level, struct closure *cl)
826{
827 struct btree *b;
828
829 lockdep_assert_held(&c->bucket_lock);
830
831 if (mca_find(c, k))
832 return NULL;
833
834 /* btree_free() doesn't free memory; it sticks the node on the end of
835 * the list. Check if there are any freed nodes there:
836 */
837 list_for_each_entry(b, &c->btree_cache_freeable, list)
838 if (!mca_reap(b, NULL, btree_order(k)))
839 goto out;
840
841 /* We never free struct btree itself, just the memory that holds the on
842 * disk node. Check the freed list before allocating a new one:
843 */
844 list_for_each_entry(b, &c->btree_cache_freed, list)
845 if (!mca_reap(b, NULL, 0)) {
846 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
847 if (!b->sets[0].data)
848 goto err;
849 else
850 goto out;
851 }
852
853 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
854 if (!b)
855 goto err;
856
857 BUG_ON(!down_write_trylock(&b->lock));
858 if (!b->sets->data)
859 goto err;
860out:
861 BUG_ON(!closure_is_unlocked(&b->io.cl));
862
863 bkey_copy(&b->key, k);
864 list_move(&b->list, &c->btree_cache);
865 hlist_del_init_rcu(&b->hash);
866 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
867
868 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
869 b->level = level;
870
871 mca_reinit(b);
872
873 return b;
874err:
875 if (b)
876 rw_unlock(true, b);
877
878 b = mca_cannibalize(c, k, level, cl);
879 if (!IS_ERR(b))
880 goto out;
881
882 return b;
883}
884
885/**
886 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
887 * in from disk if necessary.
888 *
889 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
890 * if that closure is in non-blocking mode, it will return -EAGAIN.
891 *
892 * The btree node will have either a read or a write lock held, depending on
893 * level and op->lock.
894 */
895struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
896 int level, struct btree_op *op)
897{
898 int i = 0;
899 bool write = level <= op->lock;
900 struct btree *b;
901
902 BUG_ON(level < 0);
903retry:
904 b = mca_find(c, k);
905
906 if (!b) {
907 mutex_lock(&c->bucket_lock);
908 b = mca_alloc(c, k, level, &op->cl);
909 mutex_unlock(&c->bucket_lock);
910
911 if (!b)
912 goto retry;
913 if (IS_ERR(b))
914 return b;
915
916 bch_btree_read(b);
917
918 if (!write)
919 downgrade_write(&b->lock);
920 } else {
921 rw_lock(write, b, level);
922 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
923 rw_unlock(write, b);
924 goto retry;
925 }
926 BUG_ON(b->level != level);
927 }
928
929 b->accessed = 1;
930
931 for (; i <= b->nsets && b->sets[i].size; i++) {
932 prefetch(b->sets[i].tree);
933 prefetch(b->sets[i].data);
934 }
935
936 for (; i <= b->nsets; i++)
937 prefetch(b->sets[i].data);
938
939 if (!closure_wait_event(&b->io.wait, &op->cl,
940 btree_node_read_done(b))) {
941 rw_unlock(write, b);
942 b = ERR_PTR(-EAGAIN);
943 } else if (btree_node_io_error(b)) {
944 rw_unlock(write, b);
945 b = ERR_PTR(-EIO);
946 } else
947 BUG_ON(!b->written);
948
949 return b;
950}
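
/*
 * Hypothetical caller sketch, purely illustrative (real callers go through
 * the btree() macro in btree.h): look up a child node, check for error
 * pointers, and drop the lock bch_btree_node_get() returned it with.
 * example_visit_child() is made up for this sketch.
 */
static int example_visit_child(struct cache_set *c, struct bkey *k,
			       int level, struct btree_op *op)
{
	struct btree *b = bch_btree_node_get(c, k, level, op);

	if (IS_ERR(b))
		return PTR_ERR(b);	/* -EAGAIN, -EIO, ... */

	/* ... use b here with a read or write lock held ... */

	rw_unlock(level <= op->lock, b);
	return 0;
}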
951
952static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
953{
954 struct btree *b;
955
956 mutex_lock(&c->bucket_lock);
957 b = mca_alloc(c, k, level, NULL);
958 mutex_unlock(&c->bucket_lock);
959
960 if (!IS_ERR_OR_NULL(b)) {
961 bch_btree_read(b);
962 rw_unlock(true, b);
963 }
964}
965
966/* Btree alloc */
967
968static void btree_node_free(struct btree *b, struct btree_op *op)
969{
970 unsigned i;
971
972 /*
973 * The BUG_ON() in btree_node_get() implies that we must have a write
974 * lock on the parent to free or even invalidate a node
975 */
976 BUG_ON(op->lock <= b->level);
977 BUG_ON(b == b->c->root);
978 pr_debug("bucket %s", pbtree(b));
979
980 if (btree_node_dirty(b))
981 btree_complete_write(b, btree_current_write(b));
982 clear_bit(BTREE_NODE_dirty, &b->flags);
983
984 if (b->prio_blocked &&
985 !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
986 closure_wake_up(&b->c->bucket_wait);
987
988 b->prio_blocked = 0;
989
990 cancel_delayed_work(&b->work);
991
992 mutex_lock(&b->c->bucket_lock);
993
994 for (i = 0; i < KEY_PTRS(&b->key); i++) {
995 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
996
997 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
998 PTR_BUCKET(b->c, &b->key, i));
999 }
1000
1001 bch_bucket_free(b->c, &b->key);
1002 mca_bucket_free(b);
1003 mutex_unlock(&b->c->bucket_lock);
1004}
1005
1006struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
1007 struct closure *cl)
1008{
1009 BKEY_PADDED(key) k;
1010 struct btree *b = ERR_PTR(-EAGAIN);
1011
1012 mutex_lock(&c->bucket_lock);
1013retry:
1014 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
1015 goto err;
1016
1017 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1018
1019 b = mca_alloc(c, &k.key, level, cl);
1020 if (IS_ERR(b))
1021 goto err_free;
1022
1023 if (!b) {
1024 cache_bug(c, "Tried to allocate bucket"
1025 " that was in btree cache");
1026 __bkey_put(c, &k.key);
1027 goto retry;
1028 }
1029
1030 set_btree_node_read_done(b);
1031 b->accessed = 1;
1032 bch_bset_init_next(b);
1033
1034 mutex_unlock(&c->bucket_lock);
1035 return b;
1036err_free:
1037 bch_bucket_free(c, &k.key);
1038 __bkey_put(c, &k.key);
1039err:
1040 mutex_unlock(&c->bucket_lock);
1041 return b;
1042}
1043
1044static struct btree *btree_node_alloc_replacement(struct btree *b,
1045 struct closure *cl)
1046{
1047 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
1048 if (!IS_ERR_OR_NULL(n))
1049 bch_btree_sort_into(b, n);
1050
1051 return n;
1052}
1053
1054/* Garbage collection */
1055
1056uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1057{
1058 uint8_t stale = 0;
1059 unsigned i;
1060 struct bucket *g;
1061
1062 /*
1063 * ptr_invalid() can't return true for the keys that mark btree nodes as
1064 * freed, but since ptr_bad() returns true we'll never actually use them
1065 * for anything and thus we don't want to mark their pointers here
1066 */
1067 if (!bkey_cmp(k, &ZERO_KEY))
1068 return stale;
1069
1070 for (i = 0; i < KEY_PTRS(k); i++) {
1071 if (!ptr_available(c, k, i))
1072 continue;
1073
1074 g = PTR_BUCKET(c, k, i);
1075
1076 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1077 g->gc_gen = PTR_GEN(k, i);
1078
1079 if (ptr_stale(c, k, i)) {
1080 stale = max(stale, ptr_stale(c, k, i));
1081 continue;
1082 }
1083
1084 cache_bug_on(GC_MARK(g) &&
1085 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1086 c, "inconsistent ptrs: mark = %llu, level = %i",
1087 GC_MARK(g), level);
1088
1089 if (level)
1090 SET_GC_MARK(g, GC_MARK_METADATA);
1091 else if (KEY_DIRTY(k))
1092 SET_GC_MARK(g, GC_MARK_DIRTY);
1093
1094 /* guard against overflow */
1095 SET_GC_SECTORS_USED(g, min_t(unsigned,
1096 GC_SECTORS_USED(g) + KEY_SIZE(k),
1097 (1 << 14) - 1));
1098
1099 BUG_ON(!GC_SECTORS_USED(g));
1100 }
1101
1102 return stale;
1103}
1104
1105#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1106
1107static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1108 struct gc_stat *gc)
1109{
1110 uint8_t stale = 0;
1111 unsigned last_dev = -1;
1112 struct bcache_device *d = NULL;
1113 struct bkey *k;
1114 struct btree_iter iter;
1115 struct bset_tree *t;
1116
1117 gc->nodes++;
1118
1119 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1120 if (last_dev != KEY_INODE(k)) {
1121 last_dev = KEY_INODE(k);
1122
1123 d = KEY_INODE(k) < b->c->nr_uuids
1124 ? b->c->devices[last_dev]
1125 : NULL;
1126 }
1127
1128 stale = max(stale, btree_mark_key(b, k));
1129
1130 if (bch_ptr_bad(b, k))
1131 continue;
1132
1133 *keys += bkey_u64s(k);
1134
1135 gc->key_bytes += bkey_u64s(k);
1136 gc->nkeys++;
1137
1138 gc->data += KEY_SIZE(k);
1139 if (KEY_DIRTY(k)) {
1140 gc->dirty += KEY_SIZE(k);
1141 if (d)
1142 d->sectors_dirty_gc += KEY_SIZE(k);
1143 }
1144 }
1145
1146 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1147 btree_bug_on(t->size &&
1148 bset_written(b, t) &&
1149 bkey_cmp(&b->key, &t->end) < 0,
1150 b, "found short btree key in gc");
1151
1152 return stale;
1153}
1154
1155static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1156 struct btree_op *op)
1157{
1158 /*
1159 * We block priorities from being written for the duration of garbage
1160 * collection, so we can't sleep in btree_alloc() ->
1161 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1162 * our closure.
1163 */
1164 struct btree *n = btree_node_alloc_replacement(b, NULL);
1165
1166 if (!IS_ERR_OR_NULL(n)) {
1167 swap(b, n);
1168
1169 memcpy(k->ptr, b->key.ptr,
1170 sizeof(uint64_t) * KEY_PTRS(&b->key));
1171
1172 __bkey_put(b->c, &b->key);
1173 atomic_inc(&b->c->prio_blocked);
1174 b->prio_blocked++;
1175
1176 btree_node_free(n, op);
1177 up_write(&n->lock);
1178 }
1179
1180 return b;
1181}
1182
1183/*
1184 * Leaving this at 2 until we've got incremental garbage collection done; it
1185 * could be higher (and has been tested with 4) except that garbage collection
1186 * could take much longer, adversely affecting latency.
1187 */
1188#define GC_MERGE_NODES 2U
1189
1190struct gc_merge_info {
1191 struct btree *b;
1192 struct bkey *k;
1193 unsigned keys;
1194};
1195
1196static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1197 struct gc_stat *gc, struct gc_merge_info *r)
1198{
1199 unsigned nodes = 0, keys = 0, blocks;
1200 int i;
1201
1202 while (nodes < GC_MERGE_NODES && r[nodes].b)
1203 keys += r[nodes++].keys;
1204
1205 blocks = btree_default_blocks(b->c) * 2 / 3;
1206
1207 if (nodes < 2 ||
1208 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1209 return;
1210
1211 for (i = nodes - 1; i >= 0; --i) {
1212 if (r[i].b->written)
1213 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1214
1215 if (r[i].b->written)
1216 return;
1217 }
1218
1219 for (i = nodes - 1; i > 0; --i) {
1220 struct bset *n1 = r[i].b->sets->data;
1221 struct bset *n2 = r[i - 1].b->sets->data;
1222 struct bkey *k, *last = NULL;
1223
1224 keys = 0;
1225
1226 if (i == 1) {
1227 /*
1228 * Last node we're not getting rid of - we're getting
1229 * rid of the node at r[0]. Have to try and fit all of
1230 * the remaining keys into this node; we can't ensure
1231 * they will always fit due to rounding and variable
1232 * length keys (in practice, though, they should
1233 * always fit)
1234 */
1235 if (__set_blocks(n1, n1->keys + r->keys,
1236 b->c) > btree_blocks(r[i].b))
1237 return;
1238
1239 keys = n2->keys;
1240 last = &r->b->key;
1241 } else
1242 for (k = n2->start;
1243 k < end(n2);
1244 k = bkey_next(k)) {
1245 if (__set_blocks(n1, n1->keys + keys +
1246 bkey_u64s(k), b->c) > blocks)
1247 break;
1248
1249 last = k;
1250 keys += bkey_u64s(k);
1251 }
1252
1253 BUG_ON(__set_blocks(n1, n1->keys + keys,
1254 b->c) > btree_blocks(r[i].b));
1255
1256 if (last) {
1257 bkey_copy_key(&r[i].b->key, last);
1258 bkey_copy_key(r[i].k, last);
1259 }
1260
1261 memcpy(end(n1),
1262 n2->start,
1263 (void *) node(n2, keys) - (void *) n2->start);
1264
1265 n1->keys += keys;
1266
1267 memmove(n2->start,
1268 node(n2, keys),
1269 (void *) end(n2) - (void *) node(n2, keys));
1270
1271 n2->keys -= keys;
1272
1273 r[i].keys = n1->keys;
1274 r[i - 1].keys = n2->keys;
1275 }
1276
1277 btree_node_free(r->b, op);
1278 up_write(&r->b->lock);
1279
1280 pr_debug("coalesced %u nodes", nodes);
1281
1282 gc->nodes--;
1283 nodes--;
1284
1285 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
1286 memset(&r[nodes], 0, sizeof(struct gc_merge_info));
1287}
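
/*
 * Illustrative sizing note: with GC_MERGE_NODES == 2 the check above only
 * lets a pair of nodes coalesce when their combined keys fit in
 * blocks * (nodes - 1), i.e. roughly two thirds of a single node's default
 * size - presumably so the surviving node still has room to grow.
 */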
1288
1289static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1290 struct closure *writes, struct gc_stat *gc)
1291{
1292 void write(struct btree *r)
1293 {
1294 if (!r->written)
1295 bch_btree_write(r, true, op);
1296 else if (btree_node_dirty(r)) {
1297 BUG_ON(btree_current_write(r)->owner);
1298 btree_current_write(r)->owner = writes;
1299 closure_get(writes);
1300
1301 bch_btree_write(r, true, NULL);
1302 }
1303
1304 up_write(&r->lock);
1305 }
1306
1307 int ret = 0, stale;
1308 unsigned i;
1309 struct gc_merge_info r[GC_MERGE_NODES];
1310
1311 memset(r, 0, sizeof(r));
1312
1313 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
1314 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
1315
1316 if (IS_ERR(r->b)) {
1317 ret = PTR_ERR(r->b);
1318 break;
1319 }
1320
1321 r->keys = 0;
1322 stale = btree_gc_mark_node(r->b, &r->keys, gc);
1323
1324 if (!b->written &&
1325 (r->b->level || stale > 10 ||
1326 b->c->gc_always_rewrite))
1327 r->b = btree_gc_alloc(r->b, r->k, op);
1328
1329 if (r->b->level)
1330 ret = btree_gc_recurse(r->b, op, writes, gc);
1331
1332 if (ret) {
1333 write(r->b);
1334 break;
1335 }
1336
1337 bkey_copy_key(&b->c->gc_done, r->k);
1338
1339 if (!b->written)
1340 btree_gc_coalesce(b, op, gc, r);
1341
1342 if (r[GC_MERGE_NODES - 1].b)
1343 write(r[GC_MERGE_NODES - 1].b);
1344
1345 memmove(&r[1], &r[0],
1346 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
1347
1348 /* When we've got incremental GC working, we'll want to do
1349 * if (should_resched())
1350 * return -EAGAIN;
1351 */
1352 cond_resched();
1353#if 0
1354 if (need_resched()) {
1355 ret = -EAGAIN;
1356 break;
1357 }
1358#endif
1359 }
1360
1361 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
1362 write(r[i].b);
1363
1364 /* Might have freed some children, must remove their keys */
1365 if (!b->written)
1366 bch_btree_sort(b);
1367
1368 return ret;
1369}
1370
1371static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1372 struct closure *writes, struct gc_stat *gc)
1373{
1374 struct btree *n = NULL;
1375 unsigned keys = 0;
1376 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
1377
1378 if (b->level || stale > 10)
1379 n = btree_node_alloc_replacement(b, NULL);
1380
1381 if (!IS_ERR_OR_NULL(n))
1382 swap(b, n);
1383
1384 if (b->level)
1385 ret = btree_gc_recurse(b, op, writes, gc);
1386
1387 if (!b->written || btree_node_dirty(b)) {
1388 atomic_inc(&b->c->prio_blocked);
1389 b->prio_blocked++;
1390 bch_btree_write(b, true, n ? op : NULL);
1391 }
1392
1393 if (!IS_ERR_OR_NULL(n)) {
1394 closure_sync(&op->cl);
1395 bch_btree_set_root(b);
1396 btree_node_free(n, op);
1397 rw_unlock(true, b);
1398 }
1399
1400 return ret;
1401}
1402
1403static void btree_gc_start(struct cache_set *c)
1404{
1405 struct cache *ca;
1406 struct bucket *b;
1407 struct bcache_device **d;
1408 unsigned i;
1409
1410 if (!c->gc_mark_valid)
1411 return;
1412
1413 mutex_lock(&c->bucket_lock);
1414
1415 c->gc_mark_valid = 0;
1416 c->gc_done = ZERO_KEY;
1417
1418 for_each_cache(ca, c, i)
1419 for_each_bucket(b, ca) {
1420 b->gc_gen = b->gen;
1421 if (!atomic_read(&b->pin))
1422 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1423 }
1424
1425 for (d = c->devices;
1426 d < c->devices + c->nr_uuids;
1427 d++)
1428 if (*d)
1429 (*d)->sectors_dirty_gc = 0;
1430
1431 mutex_unlock(&c->bucket_lock);
1432}
1433
1434size_t bch_btree_gc_finish(struct cache_set *c)
1435{
1436 size_t available = 0;
1437 struct bucket *b;
1438 struct cache *ca;
1439 struct bcache_device **d;
1440 unsigned i;
1441
1442 mutex_lock(&c->bucket_lock);
1443
1444 set_gc_sectors(c);
1445 c->gc_mark_valid = 1;
1446 c->need_gc = 0;
1447
1448 if (c->root)
1449 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1450 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1451 GC_MARK_METADATA);
1452
1453 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1454 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1455 GC_MARK_METADATA);
1456
1457 for_each_cache(ca, c, i) {
1458 uint64_t *i;
1459
1460 ca->invalidate_needs_gc = 0;
1461
1462 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1463 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1464
1465 for (i = ca->prio_buckets;
1466 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1467 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1468
1469 for_each_bucket(b, ca) {
1470 b->last_gc = b->gc_gen;
1471 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1472
1473 if (!atomic_read(&b->pin) &&
1474 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1475 available++;
1476 if (!GC_SECTORS_USED(b))
1477 bch_bucket_add_unused(ca, b);
1478 }
1479 }
1480 }
1481
1482 for (d = c->devices;
1483 d < c->devices + c->nr_uuids;
1484 d++)
1485 if (*d) {
1486 unsigned long last =
1487 atomic_long_read(&((*d)->sectors_dirty));
1488 long difference = (*d)->sectors_dirty_gc - last;
1489
1490 pr_debug("sectors dirty off by %li", difference);
1491
1492 (*d)->sectors_dirty_last += difference;
1493
1494 atomic_long_set(&((*d)->sectors_dirty),
1495 (*d)->sectors_dirty_gc);
1496 }
1497
1498 mutex_unlock(&c->bucket_lock);
1499 return available;
1500}
1501
1502static void bch_btree_gc(struct closure *cl)
1503{
1504 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1505 int ret;
1506 unsigned long available;
1507 struct gc_stat stats;
1508 struct closure writes;
1509 struct btree_op op;
1510
1511 uint64_t start_time = local_clock();
1512 trace_bcache_gc_start(c->sb.set_uuid);
1513 blktrace_msg_all(c, "Starting gc");
1514
1515 memset(&stats, 0, sizeof(struct gc_stat));
1516 closure_init_stack(&writes);
1517 bch_btree_op_init_stack(&op);
1518 op.lock = SHRT_MAX;
1519
1520 btree_gc_start(c);
1521
1522 ret = btree_root(gc_root, c, &op, &writes, &stats);
1523 closure_sync(&op.cl);
1524 closure_sync(&writes);
1525
1526 if (ret) {
1527 blktrace_msg_all(c, "Stopped gc");
1528 pr_warn("gc failed!");
1529
1530 continue_at(cl, bch_btree_gc, bch_gc_wq);
1531 }
1532
1533 /* Possibly wait for new UUIDs or whatever to hit disk */
1534 bch_journal_meta(c, &op.cl);
1535 closure_sync(&op.cl);
1536
1537 available = bch_btree_gc_finish(c);
1538
1539 time_stats_update(&c->btree_gc_time, start_time);
1540
1541 stats.key_bytes *= sizeof(uint64_t);
1542 stats.dirty <<= 9;
1543 stats.data <<= 9;
1544 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1545 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1546 blktrace_msg_all(c, "Finished gc");
1547
1548 trace_bcache_gc_end(c->sb.set_uuid);
1549 wake_up(&c->alloc_wait);
1550 closure_wake_up(&c->bucket_wait);
1551
1552 continue_at(cl, bch_moving_gc, bch_gc_wq);
1553}
1554
1555void bch_queue_gc(struct cache_set *c)
1556{
1557 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
1558}
1559
1560/* Initial partial gc */
1561
1562static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1563 unsigned long **seen)
1564{
1565 int ret;
1566 unsigned i;
1567 struct bkey *k;
1568 struct bucket *g;
1569 struct btree_iter iter;
1570
1571 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1572 for (i = 0; i < KEY_PTRS(k); i++) {
1573 if (!ptr_available(b->c, k, i))
1574 continue;
1575
1576 g = PTR_BUCKET(b->c, k, i);
1577
1578 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1579 seen[PTR_DEV(k, i)]) ||
1580 !ptr_stale(b->c, k, i)) {
1581 g->gen = PTR_GEN(k, i);
1582
1583 if (b->level)
1584 g->prio = BTREE_PRIO;
1585 else if (g->prio == BTREE_PRIO)
1586 g->prio = INITIAL_PRIO;
1587 }
1588 }
1589
1590 btree_mark_key(b, k);
1591 }
1592
1593 if (b->level) {
1594 k = bch_next_recurse_key(b, &ZERO_KEY);
1595
1596 while (k) {
1597 struct bkey *p = bch_next_recurse_key(b, k);
1598 if (p)
1599 btree_node_prefetch(b->c, p, b->level - 1);
1600
1601 ret = btree(check_recurse, k, b, op, seen);
1602 if (ret)
1603 return ret;
1604
1605 k = p;
1606 }
1607 }
1608
1609 return 0;
1610}
1611
1612int bch_btree_check(struct cache_set *c, struct btree_op *op)
1613{
1614 int ret = -ENOMEM;
1615 unsigned i;
1616 unsigned long *seen[MAX_CACHES_PER_SET];
1617
1618 memset(seen, 0, sizeof(seen));
1619
1620 for (i = 0; c->cache[i]; i++) {
1621 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1622 seen[i] = kmalloc(n, GFP_KERNEL);
1623 if (!seen[i])
1624 goto err;
1625
1626 /* Disables the seen array until prio_read() uses it too */
1627 memset(seen[i], 0xFF, n);
1628 }
1629
1630 ret = btree_root(check_recurse, c, op, seen);
1631err:
1632 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1633 kfree(seen[i]);
1634 return ret;
1635}
1636
1637/* Btree insertion */
1638
1639static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1640{
1641 struct bset *i = b->sets[b->nsets].data;
1642
1643 memmove((uint64_t *) where + bkey_u64s(insert),
1644 where,
1645 (void *) end(i) - (void *) where);
1646
1647 i->keys += bkey_u64s(insert);
1648 bkey_copy(where, insert);
1649 bch_bset_fix_lookup_table(b, where);
1650}
1651
1652static bool fix_overlapping_extents(struct btree *b,
1653 struct bkey *insert,
1654 struct btree_iter *iter,
1655 struct btree_op *op)
1656{
1657 void subtract_dirty(struct bkey *k, int sectors)
1658 {
1659 struct bcache_device *d = b->c->devices[KEY_INODE(k)];
1660
1661 if (KEY_DIRTY(k) && d)
1662 atomic_long_sub(sectors, &d->sectors_dirty);
1663 }
1664
1665 unsigned old_size, sectors_found = 0;
1666
1667 while (1) {
1668 struct bkey *k = bch_btree_iter_next(iter);
1669 if (!k ||
1670 bkey_cmp(&START_KEY(k), insert) >= 0)
1671 break;
1672
1673 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1674 continue;
1675
1676 old_size = KEY_SIZE(k);
1677
1678 /*
1679 * We might overlap with 0 size extents; we can't skip these
1680 * because if they're in the set we're inserting to we have to
1681 * adjust them so they don't overlap with the key we're
1682 * inserting. But we don't want to check them for BTREE_REPLACE
1683 * operations.
1684 */
1685
1686 if (op->type == BTREE_REPLACE &&
1687 KEY_SIZE(k)) {
1688 /*
1689 * k might have been split since we inserted/found the
1690 * key we're replacing
1691 */
1692 unsigned i;
1693 uint64_t offset = KEY_START(k) -
1694 KEY_START(&op->replace);
1695
1696 /* But it must be a subset of the replace key */
1697 if (KEY_START(k) < KEY_START(&op->replace) ||
1698 KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
1699 goto check_failed;
1700
1701 /* We didn't find a key that we were supposed to */
1702 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1703 goto check_failed;
1704
1705 if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
1706 goto check_failed;
1707
1708 /* skip past gen */
1709 offset <<= 8;
1710
1711 BUG_ON(!KEY_PTRS(&op->replace));
1712
1713 for (i = 0; i < KEY_PTRS(&op->replace); i++)
1714 if (k->ptr[i] != op->replace.ptr[i] + offset)
1715 goto check_failed;
1716
1717 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1718 }
1719
1720 if (bkey_cmp(insert, k) < 0 &&
1721 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1722 /*
1723 * We overlapped in the middle of an existing key: that
1724 * means we have to split the old key. But we have to do
1725 * slightly different things depending on whether the
1726 * old key has been written out yet.
1727 */
1728
1729 struct bkey *top;
1730
1731 subtract_dirty(k, KEY_SIZE(insert));
1732
1733 if (bkey_written(b, k)) {
1734 /*
1735 * We insert a new key to cover the top of the
1736 * old key, and the old key is modified in place
1737 * to represent the bottom split.
1738 *
1739 * It's completely arbitrary whether the new key
1740 * is the top or the bottom, but it has to match
1741 * up with what btree_sort_fixup() does - it
1742 * doesn't check for this kind of overlap, it
1743 * depends on us inserting a new key for the top
1744 * here.
1745 */
1746 top = bch_bset_search(b, &b->sets[b->nsets],
1747 insert);
1748 shift_keys(b, top, k);
1749 } else {
1750 BKEY_PADDED(key) temp;
1751 bkey_copy(&temp.key, k);
1752 shift_keys(b, k, &temp.key);
1753 top = bkey_next(k);
1754 }
1755
1756 bch_cut_front(insert, top);
1757 bch_cut_back(&START_KEY(insert), k);
1758 bch_bset_fix_invalidated_key(b, k);
1759 return false;
1760 }
1761
1762 if (bkey_cmp(insert, k) < 0) {
1763 bch_cut_front(insert, k);
1764 } else {
1765 if (bkey_written(b, k) &&
1766 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1767 /*
1768 * Completely overwrote, so we don't have to
1769 * invalidate the binary search tree
1770 */
1771 bch_cut_front(k, k);
1772 } else {
1773 __bch_cut_back(&START_KEY(insert), k);
1774 bch_bset_fix_invalidated_key(b, k);
1775 }
1776 }
1777
1778 subtract_dirty(k, old_size - KEY_SIZE(k));
1779 }
1780
1781check_failed:
1782 if (op->type == BTREE_REPLACE) {
1783 if (!sectors_found) {
1784 op->insert_collision = true;
1785 return true;
1786 } else if (sectors_found < KEY_SIZE(insert)) {
1787 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1788 (KEY_SIZE(insert) - sectors_found));
1789 SET_KEY_SIZE(insert, sectors_found);
1790 }
1791 }
1792
1793 return false;
1794}
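
/*
 * Worked example for the middle-overlap case above (sector numbers are
 * illustrative): if an existing extent covers sectors 10..50 and the key
 * being inserted covers 20..40, the old key is split - it is cut back so
 * its bottom half covers 10..20 and a new "top" key covering 40..50 is
 * added - and only then does the caller insert the new 20..40 key.
 */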
1795
1796static bool btree_insert_key(struct btree *b, struct btree_op *op,
1797 struct bkey *k)
1798{
1799 struct bset *i = b->sets[b->nsets].data;
1800 struct bkey *m, *prev;
1801 const char *status = "insert";
1802
1803 BUG_ON(bkey_cmp(k, &b->key) > 0);
1804 BUG_ON(b->level && !KEY_PTRS(k));
1805 BUG_ON(!b->level && !KEY_OFFSET(k));
1806
1807 if (!b->level) {
1808 struct btree_iter iter;
1809 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1810
1811 /*
1812 * bset_search() returns the first key that is strictly greater
1813 * than the search key - but for back merging, we want to find
1814 * the first key that is greater than or equal to KEY_START(k) -
1815 * unless KEY_START(k) is 0.
1816 */
1817 if (KEY_OFFSET(&search))
1818 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1819
1820 prev = NULL;
1821 m = bch_btree_iter_init(b, &iter, &search);
1822
1823 if (fix_overlapping_extents(b, k, &iter, op))
1824 return false;
1825
1826 while (m != end(i) &&
1827 bkey_cmp(k, &START_KEY(m)) > 0)
1828 prev = m, m = bkey_next(m);
1829
1830 if (key_merging_disabled(b->c))
1831 goto insert;
1832
1833 /* prev is in the tree, if we merge we're done */
1834 status = "back merging";
1835 if (prev &&
1836 bch_bkey_try_merge(b, prev, k))
1837 goto merged;
1838
1839 status = "overwrote front";
1840 if (m != end(i) &&
1841 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1842 goto copy;
1843
1844 status = "front merge";
1845 if (m != end(i) &&
1846 bch_bkey_try_merge(b, k, m))
1847 goto copy;
1848 } else
1849 m = bch_bset_search(b, &b->sets[b->nsets], k);
1850
1851insert: shift_keys(b, m, k);
1852copy: bkey_copy(m, k);
1853merged:
1854 bch_check_keys(b, "%s for %s at %s: %s", status,
1855 op_type(op), pbtree(b), pkey(k));
1856 bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
1857 op_type(op), pbtree(b), pkey(k));
1858
1859 if (b->level && !KEY_OFFSET(k))
1860 b->prio_blocked++;
1861
1862 pr_debug("%s for %s at %s: %s", status,
1863 op_type(op), pbtree(b), pkey(k));
1864
1865 return true;
1866}
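
/*
 * Worked example for the search key set up above (offsets illustrative):
 * when the key being inserted starts at sector 100, the search offset is
 * dropped to 99 so that an existing extent ending exactly at 100 is still
 * returned by the lookup - that extent is precisely the back merge
 * candidate tracked in prev.
 */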
1867
1868bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1869{
1870 bool ret = false;
1871 struct bkey *k;
1872 unsigned oldsize = bch_count_data(b);
1873
1874 while ((k = bch_keylist_pop(&op->keys))) {
1875 bkey_put(b->c, k, b->level);
1876 ret |= btree_insert_key(b, op, k);
1877 }
1878
1879 BUG_ON(bch_count_data(b) < oldsize);
1880 return ret;
1881}
1882
1883bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1884 struct bio *bio)
1885{
1886 bool ret = false;
1887 uint64_t btree_ptr = b->key.ptr[0];
1888 unsigned long seq = b->seq;
1889 BKEY_PADDED(k) tmp;
1890
1891 rw_unlock(false, b);
1892 rw_lock(true, b, b->level);
1893
1894 if (b->key.ptr[0] != btree_ptr ||
1895 b->seq != seq + 1 ||
1896 should_split(b))
1897 goto out;
1898
1899 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
1900
1901 SET_KEY_PTRS(&op->replace, 1);
1902 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
1903
1904 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
1905
1906 bkey_copy(&tmp.k, &op->replace);
1907
1908 BUG_ON(op->type != BTREE_INSERT);
1909 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1910 bch_btree_write(b, false, NULL);
1911 ret = true;
1912out:
1913 downgrade_write(&b->lock);
1914 return ret;
1915}
1916
1917static int btree_split(struct btree *b, struct btree_op *op)
1918{
1919 bool split, root = b == b->c->root;
1920 struct btree *n1, *n2 = NULL, *n3 = NULL;
1921 uint64_t start_time = local_clock();
1922
1923 if (b->level)
1924 set_closure_blocking(&op->cl);
1925
1926 n1 = btree_node_alloc_replacement(b, &op->cl);
1927 if (IS_ERR(n1))
1928 goto err;
1929
1930 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1931
1932 pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
1933 pbtree(b), n1->sets[0].data->keys);
1934
1935 if (split) {
1936 unsigned keys = 0;
1937
1938 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1939 if (IS_ERR(n2))
1940 goto err_free1;
1941
1942 if (root) {
1943 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
1944 if (IS_ERR(n3))
1945 goto err_free2;
1946 }
1947
1948 bch_btree_insert_keys(n1, op);
1949
1950 /* Has to be a linear search because we don't have an auxiliary
1951 * search tree yet
1952 */
1953
1954 while (keys < (n1->sets[0].data->keys * 3) / 5)
1955 keys += bkey_u64s(node(n1->sets[0].data, keys));
1956
1957 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
1958 keys += bkey_u64s(node(n1->sets[0].data, keys));
1959
1960 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
1961 n1->sets[0].data->keys = keys;
1962
1963 memcpy(n2->sets[0].data->start,
1964 end(n1->sets[0].data),
1965 n2->sets[0].data->keys * sizeof(uint64_t));
1966
1967 bkey_copy_key(&n2->key, &b->key);
1968
1969 bch_keylist_add(&op->keys, &n2->key);
1970 bch_btree_write(n2, true, op);
1971 rw_unlock(true, n2);
1972 } else
1973 bch_btree_insert_keys(n1, op);
1974
1975 bch_keylist_add(&op->keys, &n1->key);
1976 bch_btree_write(n1, true, op);
1977
1978 if (n3) {
1979 bkey_copy_key(&n3->key, &MAX_KEY);
1980 bch_btree_insert_keys(n3, op);
1981 bch_btree_write(n3, true, op);
1982
1983 closure_sync(&op->cl);
1984 bch_btree_set_root(n3);
1985 rw_unlock(true, n3);
1986 } else if (root) {
1987 op->keys.top = op->keys.bottom;
1988 closure_sync(&op->cl);
1989 bch_btree_set_root(n1);
1990 } else {
1991 unsigned i;
1992
1993 bkey_copy(op->keys.top, &b->key);
1994 bkey_copy_key(op->keys.top, &ZERO_KEY);
1995
1996 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1997 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
1998
1999 SET_PTR_GEN(op->keys.top, i, g);
2000 }
2001
2002 bch_keylist_push(&op->keys);
2003 closure_sync(&op->cl);
2004 atomic_inc(&b->c->prio_blocked);
2005 }
2006
2007 rw_unlock(true, n1);
2008 btree_node_free(b, op);
2009
2010 time_stats_update(&b->c->btree_split_time, start_time);
2011
2012 return 0;
2013err_free2:
2014 __bkey_put(n2->c, &n2->key);
2015 btree_node_free(n2, op);
2016 rw_unlock(true, n2);
2017err_free1:
2018 __bkey_put(n1->c, &n1->key);
2019 btree_node_free(n1, op);
2020 rw_unlock(true, n1);
2021err:
2022 if (n3 == ERR_PTR(-EAGAIN) ||
2023 n2 == ERR_PTR(-EAGAIN) ||
2024 n1 == ERR_PTR(-EAGAIN))
2025 return -EAGAIN;
2026
2027 pr_warn("couldn't split");
2028 return -ENOMEM;
2029}
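
/*
 * Illustrative sizing: a node is split once the keys to be written would
 * fill more than 4/5 of its blocks; on a split, roughly the first 3/5 of
 * the keys (counted in u64s) stay in n1 and the rest move to n2, so both
 * halves come out with room to grow.
 */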
2030
2031static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2032 struct keylist *stack_keys)
2033{
2034 if (b->level) {
2035 int ret;
2036 struct bkey *insert = op->keys.bottom;
2037 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
2038
2039 if (!k) {
2040 btree_bug(b, "no key to recurse on at level %i/%i",
2041 b->level, b->c->root->level);
2042
2043 op->keys.top = op->keys.bottom;
2044 return -EIO;
2045 }
2046
2047 if (bkey_cmp(insert, k) > 0) {
2048 unsigned i;
2049
2050 if (op->type == BTREE_REPLACE) {
2051 __bkey_put(b->c, insert);
2052 op->keys.top = op->keys.bottom;
2053 op->insert_collision = true;
2054 return 0;
2055 }
2056
2057 for (i = 0; i < KEY_PTRS(insert); i++)
2058 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
2059
2060 bkey_copy(stack_keys->top, insert);
2061
2062 bch_cut_back(k, insert);
2063 bch_cut_front(k, stack_keys->top);
2064
2065 bch_keylist_push(stack_keys);
2066 }
2067
2068 ret = btree(insert_recurse, k, b, op, stack_keys);
2069 if (ret)
2070 return ret;
2071 }
2072
2073 if (!bch_keylist_empty(&op->keys)) {
2074 if (should_split(b)) {
2075 if (op->lock <= b->c->root->level) {
2076 BUG_ON(b->level);
2077 op->lock = b->c->root->level + 1;
2078 return -EINTR;
2079 }
2080 return btree_split(b, op);
2081 }
2082
2083 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2084
2085 if (bch_btree_insert_keys(b, op))
2086 bch_btree_write(b, false, op);
2087 }
2088
2089 return 0;
2090}
2091
2092int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2093{
2094 int ret = 0;
2095 struct keylist stack_keys;
2096
2097 /*
2098 * Don't want to block with the btree locked unless we have to,
2099 * otherwise we get deadlocks with try_harder and between split/gc
2100 */
2101 clear_closure_blocking(&op->cl);
2102
2103 BUG_ON(bch_keylist_empty(&op->keys));
2104 bch_keylist_copy(&stack_keys, &op->keys);
2105 bch_keylist_init(&op->keys);
2106
2107 while (!bch_keylist_empty(&stack_keys) ||
2108 !bch_keylist_empty(&op->keys)) {
2109 if (bch_keylist_empty(&op->keys)) {
2110 bch_keylist_add(&op->keys,
2111 bch_keylist_pop(&stack_keys));
2112 op->lock = 0;
2113 }
2114
2115 ret = btree_root(insert_recurse, c, op, &stack_keys);
2116
2117 if (ret == -EAGAIN) {
2118 ret = 0;
2119 closure_sync(&op->cl);
2120 } else if (ret) {
2121 struct bkey *k;
2122
2123 pr_err("error %i trying to insert key for %s",
2124 ret, op_type(op));
2125
2126 while ((k = bch_keylist_pop(&stack_keys) ?:
2127 bch_keylist_pop(&op->keys)))
2128 bkey_put(c, k, 0);
2129 }
2130 }
2131
2132 bch_keylist_free(&stack_keys);
2133
2134 if (op->journal)
2135 atomic_dec_bug(op->journal);
2136 op->journal = NULL;
2137 return ret;
2138}
2139
2140void bch_btree_set_root(struct btree *b)
2141{
2142 unsigned i;
2143
2144 BUG_ON(!b->written);
2145
2146 for (i = 0; i < KEY_PTRS(&b->key); i++)
2147 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2148
2149 mutex_lock(&b->c->bucket_lock);
2150 list_del_init(&b->list);
2151 mutex_unlock(&b->c->bucket_lock);
2152
2153 b->c->root = b;
2154 __bkey_put(b->c, &b->key);
2155
2156 bch_journal_meta(b->c, NULL);
2157 pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
2158}
2159
2160/* Cache lookup */
2161
2162static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
2163 struct bkey *k)
2164{
2165 struct search *s = container_of(op, struct search, op);
2166 struct bio *bio = &s->bio.bio;
2167 int ret = 0;
2168
2169 while (!ret &&
2170 !op->lookup_done) {
2171 unsigned sectors = INT_MAX;
2172
2173 if (KEY_INODE(k) == op->inode) {
2174 if (KEY_START(k) <= bio->bi_sector)
2175 break;
2176
2177 sectors = min_t(uint64_t, sectors,
2178 KEY_START(k) - bio->bi_sector);
2179 }
2180
2181 ret = s->d->cache_miss(b, s, bio, sectors);
2182 }
2183
2184 return ret;
2185}
2186
2187/*
2188 * Read from a single key, handling the initial cache miss if the key starts in
2189 * the middle of the bio
2190 */
2191static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2192 struct bkey *k)
2193{
2194 struct search *s = container_of(op, struct search, op);
2195 struct bio *bio = &s->bio.bio;
2196 unsigned ptr;
2197 struct bio *n;
2198
2199 int ret = submit_partial_cache_miss(b, op, k);
2200 if (ret || op->lookup_done)
2201 return ret;
2202
2203 /* XXX: figure out best pointer - for multiple cache devices */
2204 ptr = 0;
2205
2206 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2207
2208 while (!op->lookup_done &&
2209 KEY_INODE(k) == op->inode &&
2210 bio->bi_sector < KEY_OFFSET(k)) {
2211 struct bkey *bio_key;
2212 sector_t sector = PTR_OFFSET(k, ptr) +
2213 (bio->bi_sector - KEY_START(k));
2214 unsigned sectors = min_t(uint64_t, INT_MAX,
2215 KEY_OFFSET(k) - bio->bi_sector);
2216
2217 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2218 if (!n)
2219 return -EAGAIN;
2220
2221 if (n == bio)
2222 op->lookup_done = true;
2223
2224 bio_key = &container_of(n, struct bbio, bio)->key;
2225
2226 /*
2227 * The bucket we're reading from might be reused while our bio
2228 * is in flight, and we could then end up reading the wrong
2229 * data.
2230 *
2231 * We guard against this by checking (in cache_read_endio()) if
2232 * the pointer is stale again; if so, we treat it as an error
2233 * and reread from the backing device (but we don't pass that
2234 * error up anywhere).
2235 */
2236
2237 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2238 SET_PTR_OFFSET(bio_key, 0, sector);
2239
2240 n->bi_end_io = bch_cache_read_endio;
2241 n->bi_private = &s->cl;
2242
2243 trace_bcache_cache_hit(n);
2244 __bch_submit_bbio(n, b->c);
2245 }
2246
2247 return 0;
2248}
2249
2250int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2251{
2252 struct search *s = container_of(op, struct search, op);
2253 struct bio *bio = &s->bio.bio;
2254
2255 int ret = 0;
2256 struct bkey *k;
2257 struct btree_iter iter;
2258 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2259
2260 pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
2261 (uint64_t) bio->bi_sector);
2262
2263 do {
2264 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2265 if (!k) {
2266 /*
2267 * b->key would be exactly what we want, except that
2268 * pointers to btree nodes have nonzero size - we
2269 * wouldn't go far enough
2270 */
2271
2272 ret = submit_partial_cache_miss(b, op,
2273 &KEY(KEY_INODE(&b->key),
2274 KEY_OFFSET(&b->key), 0));
2275 break;
2276 }
2277
2278 ret = b->level
2279 ? btree(search_recurse, k, b, op)
2280 : submit_partial_cache_hit(b, op, k);
2281 } while (!ret &&
2282 !op->lookup_done);
2283
2284 return ret;
2285}
2286
2287/* Keybuf code */
2288
2289static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2290{
2291 /* Overlapping keys compare equal */
2292 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2293 return -1;
2294 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2295 return 1;
2296 return 0;
2297}
2298
2299static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2300 struct keybuf_key *r)
2301{
2302 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2303}
2304
2305static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2306 struct keybuf *buf, struct bkey *end)
2307{
2308 struct btree_iter iter;
2309 bch_btree_iter_init(b, &iter, &buf->last_scanned);
2310
2311 while (!array_freelist_empty(&buf->freelist)) {
2312 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2313 bch_ptr_bad);
2314
2315 if (!b->level) {
2316 if (!k) {
2317 buf->last_scanned = b->key;
2318 break;
2319 }
2320
2321 buf->last_scanned = *k;
2322 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2323 break;
2324
2325 if (buf->key_predicate(buf, k)) {
2326 struct keybuf_key *w;
2327
2328 pr_debug("%s", pkey(k));
2329
2330 spin_lock(&buf->lock);
2331
2332 w = array_alloc(&buf->freelist);
2333
2334 w->private = NULL;
2335 bkey_copy(&w->key, k);
2336
2337 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2338 array_free(&buf->freelist, w);
2339
2340 spin_unlock(&buf->lock);
2341 }
2342 } else {
2343 if (!k)
2344 break;
2345
2346 btree(refill_keybuf, k, b, op, buf, end);
2347 /*
2348 * Might get an error here, but can't really do anything
2349 * and it'll get logged elsewhere. Just read what we
2350 * can.
2351 */
2352
2353 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2354 break;
2355
2356 cond_resched();
2357 }
2358 }
2359
2360 return 0;
2361}
2362
2363void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2364 struct bkey *end)
2365{
2366 struct bkey start = buf->last_scanned;
2367 struct btree_op op;
2368 bch_btree_op_init_stack(&op);
2369
2370 cond_resched();
2371
2372 btree_root(refill_keybuf, c, &op, buf, end);
2373 closure_sync(&op.cl);
2374
2375 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
2376 RB_EMPTY_ROOT(&buf->keys) ? "no" :
2377 array_freelist_empty(&buf->freelist) ? "some" : "a few",
2378 KEY_INODE(&start), KEY_OFFSET(&start),
2379 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2380
2381 spin_lock(&buf->lock);
2382
2383 if (!RB_EMPTY_ROOT(&buf->keys)) {
2384 struct keybuf_key *w;
2385 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2386 buf->start = START_KEY(&w->key);
2387
2388 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2389 buf->end = w->key;
2390 } else {
2391 buf->start = MAX_KEY;
2392 buf->end = MAX_KEY;
2393 }
2394
2395 spin_unlock(&buf->lock);
2396}
2397
2398static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2399{
2400 rb_erase(&w->node, &buf->keys);
2401 array_free(&buf->freelist, w);
2402}
2403
2404void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2405{
2406 spin_lock(&buf->lock);
2407 __bch_keybuf_del(buf, w);
2408 spin_unlock(&buf->lock);
2409}
2410
2411bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2412 struct bkey *end)
2413{
2414 bool ret = false;
2415 struct keybuf_key *p, *w, s;
2416 s.key = *start;
2417
2418 if (bkey_cmp(end, &buf->start) <= 0 ||
2419 bkey_cmp(start, &buf->end) >= 0)
2420 return false;
2421
2422 spin_lock(&buf->lock);
2423 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2424
2425 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2426 p = w;
2427 w = RB_NEXT(w, node);
2428
2429 if (p->private)
2430 ret = true;
2431 else
2432 __bch_keybuf_del(buf, p);
2433 }
2434
2435 spin_unlock(&buf->lock);
2436 return ret;
2437}
2438
2439struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2440{
2441 struct keybuf_key *w;
2442 spin_lock(&buf->lock);
2443
2444 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2445
2446 while (w && w->private)
2447 w = RB_NEXT(w, node);
2448
2449 if (w)
2450 w->private = ERR_PTR(-EINTR);
2451
2452 spin_unlock(&buf->lock);
2453 return w;
2454}
2455
2456struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2457 struct keybuf *buf,
2458 struct bkey *end)
2459{
2460 struct keybuf_key *ret;
2461
2462 while (1) {
2463 ret = bch_keybuf_next(buf);
2464 if (ret)
2465 break;
2466
2467 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2468 pr_debug("scan finished");
2469 break;
2470 }
2471
2472 bch_refill_keybuf(c, buf, end);
2473 }
2474
2475 return ret;
2476}
2477
2478void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
2479{
2480 buf->key_predicate = fn;
2481 buf->last_scanned = MAX_KEY;
2482 buf->keys = RB_ROOT;
2483
2484 spin_lock_init(&buf->lock);
2485 array_allocator_init(&buf->freelist);
2486}
2487
2488void bch_btree_exit(void)
2489{
2490 if (btree_io_wq)
2491 destroy_workqueue(btree_io_wq);
2492 if (bch_gc_wq)
2493 destroy_workqueue(bch_gc_wq);
2494}
2495
2496int __init bch_btree_init(void)
2497{
2498 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
2499 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
2500 return -ENOMEM;
2501
2502 return 0;
2503}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
new file mode 100644
index 000000000000..af4a7092a28c
--- /dev/null
+++ b/drivers/md/bcache/btree.h
@@ -0,0 +1,405 @@
1#ifndef _BCACHE_BTREE_H
2#define _BCACHE_BTREE_H
3
4/*
5 * THE BTREE:
6 *
7 * At a high level, bcache's btree is a relatively standard b+ tree. All keys and
8 * pointers are in the leaves; interior nodes only have pointers to the child
9 * nodes.
10 *
11 * In the interior nodes, a struct bkey always points to a child btree node, and
12 * the key is the highest key in the child node - except that the highest key in
13 * an interior node is always MAX_KEY. The size field refers to the size on disk
14 * of the child node - this would allow us to have variable sized btree nodes
15 * (handy for keeping the depth of the btree 1 by expanding just the root).
16 *
17 * Btree nodes are themselves log structured, but this is hidden fairly
18 * thoroughly. Btree nodes on disk will in practice have extents that overlap
19 * (because they were written at different times), but in memory we never have
20 * overlapping extents - when we read in a btree node from disk, the first thing
21 * we do is resort all the sets of keys with a mergesort, and in the same pass
22 * we check for overlapping extents and adjust them appropriately.
23 *
24 * struct btree_op is a central interface to the btree code. It's used for
25 * specifying read vs. write locking, and the embedded closure is used for
26 * waiting on IO or reserving memory.
27 *
28 * BTREE CACHE:
29 *
30 * Btree nodes are cached in memory; traversing the btree might require reading
31 * in btree nodes which is handled mostly transparently.
32 *
33 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
34 * disk if necessary. This function is almost never called directly though - the
35 * btree() macro is used to get a btree node, call some function on it, and
36 * unlock the node after the function returns.
37 *
38 * The root is special cased - it's taken out of the cache's lru (thus pinning
39 * it in memory), so we can find the root of the btree by just dereferencing a
40 * pointer instead of looking it up in the cache. This makes locking a bit
41 * tricky, since the root pointer is protected by the lock in the btree node it
42 * points to - the btree_root() macro handles this.
43 *
44 * In various places we must be able to allocate memory for multiple btree nodes
45 * in order to make forward progress. To do this we use the btree cache itself
46 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
47 * cache we can reuse. We can't allow more than one thread to be doing this at a
48 * time, so there's a lock, implemented by a pointer to the btree_op closure -
49 * this allows the btree_root() macro to implicitly release this lock.
50 *
51 * BTREE IO:
52 *
53 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
54 * this.
55 *
56 * For writing, we have two btree_write structs embedded in struct btree - one
57 * write in flight, and one being set up, and we toggle between them.
58 *
59 * Writing is done with a single function - bch_btree_write() really serves two
60 * different purposes and should be broken up into two different functions. When
61 * passing now = false, it merely indicates that the node is now dirty - calling
62 * it ensures that the dirty keys will be written at some point in the future.
63 *
64 * When passing now = true, bch_btree_write() causes a write to happen
65 * "immediately" (if there was already a write in flight, it'll cause the write
66 * to happen as soon as the previous write completes). It returns immediately
67 * though - but it takes a refcount on the closure in struct btree_op you passed
68 * to it, so a closure_sync() later can be used to wait for the write to
69 * complete.
70 *
71 * This is handy because btree_split() and garbage collection can issue writes
72 * in parallel, reducing the amount of time they have to hold write locks.
73 *
74 * LOCKING:
75 *
76 * When traversing the btree, we may need write locks starting at some level -
77 * inserting a key into the btree will typically only require a write lock on
78 * the leaf node.
79 *
80 * This is specified with the lock field in struct btree_op; lock = 0 means we
81 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
82 * checks this field and returns the node with the appropriate lock held.
83 *
84 * If, after traversing the btree, the insertion code discovers it has to split
85 * then it must restart from the root and take new locks - to do this it changes
86 * the lock field and returns -EINTR, which causes the btree_root() macro to
87 * loop.
88 *
89 * Handling cache misses requires a different mechanism for upgrading to a write
90 * lock. We do cache lookups with only a read lock held, but if we get a cache
91 * miss and we wish to insert this data into the cache, we have to insert a
92 * placeholder key to detect races - otherwise, we could race with a write and
93 * overwrite the data that was just written to the cache with stale data from
94 * the backing device.
95 *
96 * For this we use a sequence number that write locks and unlocks increment - to
97 * insert the check key it unlocks the btree node and then takes a write lock,
98 * and fails if the sequence number doesn't match.
99 */
100
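/*
 * Usage sketch for the two bch_btree_write() modes described above (the
 * caller and its locking context are hypothetical; see the prototype
 * further down in this header):
 *
 *	bch_btree_write(b, false, op);	mark the node dirty; the keys will
 *					be written out at some later point
 *
 *	bch_btree_write(b, true, op);	start a write now; op->cl gains a
 *	closure_sync(&op->cl);		refcount, so this waits for it
 */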
101#include "bset.h"
102#include "debug.h"
103
104struct btree_write {
105 struct closure *owner;
106 atomic_t *journal;
107
108 /* If btree_split() frees a btree node, it writes a new pointer to that
109 * btree node indicating it was freed; it takes a refcount on
110 * c->prio_blocked because we can't write the gens until the new
111 * pointer is on disk. This allows btree_write_endio() to release the
112 * refcount that btree_split() took.
113 */
114 int prio_blocked;
115};
116
117struct btree {
118 /* Hottest entries first */
119 struct hlist_node hash;
120
121 /* Key/pointer for this btree node */
122 BKEY_PADDED(key);
123
124 /* Single bit - set when accessed, cleared by shrinker */
125 unsigned long accessed;
126 unsigned long seq;
127 struct rw_semaphore lock;
128 struct cache_set *c;
129
130 unsigned long flags;
131 uint16_t written; /* would be nice to kill */
132 uint8_t level;
133 uint8_t nsets;
134 uint8_t page_order;
135
136 /*
137 * Set of sorted keys - the real btree node - plus a binary search tree
138 *
139 * sets[0] is special; sets[0]->tree, sets[0]->prev and sets[0]->data point
140 * to the memory we have allocated for this btree node. Additionally,
141 * sets[0]->data points to the entire btree node as it exists on disk.
142 */
143 struct bset_tree sets[MAX_BSETS];
144
145 /* Used to refcount bio splits, also protects b->bio */
146 struct closure_with_waitlist io;
147
148 /* Gets transferred to w->prio_blocked - see the comment there */
149 int prio_blocked;
150
151 struct list_head list;
152 struct delayed_work work;
153
154 uint64_t io_start_time;
155 struct btree_write writes[2];
156 struct bio *bio;
157};
158
159#define BTREE_FLAG(flag) \
160static inline bool btree_node_ ## flag(struct btree *b) \
161{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
162 \
163static inline void set_btree_node_ ## flag(struct btree *b) \
164{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
165
166enum btree_flags {
167 BTREE_NODE_read_done,
168 BTREE_NODE_io_error,
169 BTREE_NODE_dirty,
170 BTREE_NODE_write_idx,
171};
172
173BTREE_FLAG(read_done);
174BTREE_FLAG(io_error);
175BTREE_FLAG(dirty);
176BTREE_FLAG(write_idx);
177
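/*
 * For reference, a sketch of what BTREE_FLAG(dirty) above expands to:
 *
 *	static inline bool btree_node_dirty(struct btree *b)
 *	{ return test_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *	static inline void set_btree_node_dirty(struct btree *b)
 *	{ set_bit(BTREE_NODE_dirty, &b->flags); }
 */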
178static inline struct btree_write *btree_current_write(struct btree *b)
179{
180 return b->writes + btree_node_write_idx(b);
181}
182
183static inline struct btree_write *btree_prev_write(struct btree *b)
184{
185 return b->writes + (btree_node_write_idx(b) ^ 1);
186}
187
188static inline unsigned bset_offset(struct btree *b, struct bset *i)
189{
190 return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
191}
192
193static inline struct bset *write_block(struct btree *b)
194{
195 return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
196}
197
198static inline bool bset_written(struct btree *b, struct bset_tree *t)
199{
200 return t->data < write_block(b);
201}
202
203static inline bool bkey_written(struct btree *b, struct bkey *k)
204{
205 return k < write_block(b)->start;
206}
207
208static inline void set_gc_sectors(struct cache_set *c)
209{
210 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
211}
212
213static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
214{
215 return __bch_ptr_invalid(b->c, b->level, k);
216}
217
218static inline struct bkey *bch_btree_iter_init(struct btree *b,
219 struct btree_iter *iter,
220 struct bkey *search)
221{
222 return __bch_btree_iter_init(b, iter, search, b->sets);
223}
224
225/* Looping macros */
226
227#define for_each_cached_btree(b, c, iter) \
228 for (iter = 0; \
229 iter < ARRAY_SIZE((c)->bucket_hash); \
230 iter++) \
231 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
232
233#define for_each_key_filter(b, k, iter, filter) \
234 for (bch_btree_iter_init((b), (iter), NULL); \
235 ((k) = bch_btree_iter_next_filter((iter), b, filter));)
236
237#define for_each_key(b, k, iter) \
238 for (bch_btree_iter_init((b), (iter), NULL); \
239 ((k) = bch_btree_iter_next(iter));)
240
241/* Recursing down the btree */
242
243struct btree_op {
244 struct closure cl;
245 struct cache_set *c;
246
247 /* Journal entry we have a refcount on */
248 atomic_t *journal;
249
250 /* Bio to be inserted into the cache */
251 struct bio *cache_bio;
252
253 unsigned inode;
254
255 uint16_t write_prio;
256
257 /* Btree level at which we start taking write locks */
258 short lock;
259
260 /* Btree insertion type */
261 enum {
262 BTREE_INSERT,
263 BTREE_REPLACE
264 } type:8;
265
266 unsigned csum:1;
267 unsigned skip:1;
268 unsigned flush_journal:1;
269
270 unsigned insert_data_done:1;
271 unsigned lookup_done:1;
272 unsigned insert_collision:1;
273
274 /* Anything after this point won't get zeroed in do_bio_hook() */
275
276 /* Keys to be inserted */
277 struct keylist keys;
278 BKEY_PADDED(replace);
279};
280
281void bch_btree_op_init_stack(struct btree_op *);
282
283static inline void rw_lock(bool w, struct btree *b, int level)
284{
285 w ? down_write_nested(&b->lock, level + 1)
286 : down_read_nested(&b->lock, level + 1);
287 if (w)
288 b->seq++;
289}
290
291static inline void rw_unlock(bool w, struct btree *b)
292{
293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i;
295
296 if (w &&
297 b->key.ptr[0] &&
298 btree_node_read_done(b))
299 for (i = 0; i <= b->nsets; i++)
300 bch_check_key_order(b, b->sets[i].data);
301#endif
302
303 if (w)
304 b->seq++;
305 (w ? up_write : up_read)(&b->lock);
306}
307
308#define insert_lock(s, b) ((b)->level <= (s)->lock)
309
310/*
311 * These macros are for recursing down the btree - they handle the details of
312 * locking and looking up nodes in the cache for you. They're best treated as
313 * mere syntax when reading code that uses them.
314 *
315 * op->lock determines whether we take a read or a write lock at a given depth.
316 * If you've got a read lock and find that you need a write lock (i.e. you're
317 * going to have to split), set op->lock and return -EINTR; btree_root() will
318 * call you again and you'll have the correct lock.
319 */
320
321/**
322 * btree - recurse down the btree on a specified key
323 * @fn: function to call, which will be passed the child node
324 * @key: key to recurse on
325 * @b: parent btree node
326 * @op: pointer to struct btree_op
327 */
328#define btree(fn, key, b, op, ...) \
329({ \
330 int _r, l = (b)->level - 1; \
331 bool _w = l <= (op)->lock; \
332 struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \
333 if (!IS_ERR(_b)) { \
334 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
335 rw_unlock(_w, _b); \
336 } else \
337 _r = PTR_ERR(_b); \
338 _r; \
339})
340
341/**
342 * btree_root - call a function on the root of the btree
343 * @fn: function to call, which will be passed the child node
344 * @c: cache set
345 * @op: pointer to struct btree_op
346 */
347#define btree_root(fn, c, op, ...) \
348({ \
349 int _r = -EINTR; \
350 do { \
351 struct btree *_b = (c)->root; \
352 bool _w = insert_lock(op, _b); \
353 rw_lock(_w, _b, _b->level); \
354 if (_b == (c)->root && \
355 _w == insert_lock(op, _b)) \
356 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
357 rw_unlock(_w, _b); \
358 bch_cannibalize_unlock(c, &(op)->cl); \
359 } while (_r == -EINTR); \
360 \
361 _r; \
362})
363
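/*
 * Illustrative traversal using the macros above. The function itself is
 * hypothetical - only the calling convention is real: traversal functions
 * are named bch_btree_<fn> so that btree() and btree_root() can paste the
 * name, and returning -EINTR with a raised op->lock makes btree_root()
 * retry the whole traversal with write locks taken high enough up.
 *
 *	static int bch_btree_example(struct btree *b, struct btree_op *op,
 *				     struct bkey *search)
 *	{
 *		if (!b->level)
 *			return 0;	operate on the leaf here
 *
 *		if (need_write_locks_here && b->level > op->lock) {
 *			op->lock = b->level;
 *			return -EINTR;	btree_root() loops and calls us again
 *		}
 *
 *		return btree(example, search, b, op, search);
 *	}
 *
 * invoked from the top level as btree_root(example, c, op, search).
 */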
364static inline bool should_split(struct btree *b)
365{
366 struct bset *i = write_block(b);
367 return b->written >= btree_blocks(b) ||
368 (i->seq == b->sets[0].data->seq &&
369 b->written + __set_blocks(i, i->keys + 15, b->c)
370 > btree_blocks(b));
371}
372
373void bch_btree_read_done(struct closure *);
374void bch_btree_read(struct btree *);
375void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
376
377void bch_cannibalize_unlock(struct cache_set *, struct closure *);
378void bch_btree_set_root(struct btree *);
379struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
380struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
381 int, struct btree_op *);
382
383bool bch_btree_insert_keys(struct btree *, struct btree_op *);
384bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
385 struct bio *);
386int bch_btree_insert(struct btree_op *, struct cache_set *);
387
388int bch_btree_search_recurse(struct btree *, struct btree_op *);
389
390void bch_queue_gc(struct cache_set *);
391size_t bch_btree_gc_finish(struct cache_set *);
392void bch_moving_gc(struct closure *);
393int bch_btree_check(struct cache_set *, struct btree_op *);
394uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
395
396void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
397void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
398bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
399 struct bkey *);
400void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
401struct keybuf_key *bch_keybuf_next(struct keybuf *);
402struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
403 struct keybuf *, struct bkey *);
404
405#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
new file mode 100644
index 000000000000..d6fbec0f8484
--- /dev/null
+++ b/drivers/md/bcache/closure.c
@@ -0,0 +1,348 @@
1/*
2 * Asynchronous refcounty things
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include <linux/debugfs.h>
9#include <linux/module.h>
10#include <linux/seq_file.h>
11
12#include "closure.h"
13
14void closure_queue(struct closure *cl)
15{
16 struct workqueue_struct *wq = cl->wq;
17 if (wq) {
18 INIT_WORK(&cl->work, cl->work.func);
19 BUG_ON(!queue_work(wq, &cl->work));
20 } else
21 cl->fn(cl);
22}
23EXPORT_SYMBOL_GPL(closure_queue);
24
25#define CL_FIELD(type, field) \
26 case TYPE_ ## type: \
27 return &container_of(cl, struct type, cl)->field
28
29static struct closure_waitlist *closure_waitlist(struct closure *cl)
30{
31 switch (cl->type) {
32 CL_FIELD(closure_with_waitlist, wait);
33 CL_FIELD(closure_with_waitlist_and_timer, wait);
34 default:
35 return NULL;
36 }
37}
38
39static struct timer_list *closure_timer(struct closure *cl)
40{
41 switch (cl->type) {
42 CL_FIELD(closure_with_timer, timer);
43 CL_FIELD(closure_with_waitlist_and_timer, timer);
44 default:
45 return NULL;
46 }
47}
48
49static inline void closure_put_after_sub(struct closure *cl, int flags)
50{
51 int r = flags & CLOSURE_REMAINING_MASK;
52
53 BUG_ON(flags & CLOSURE_GUARD_MASK);
54 BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));
55
56 /* Must deliver precisely one wakeup */
57 if (r == 1 && (flags & CLOSURE_SLEEPING))
58 wake_up_process(cl->task);
59
60 if (!r) {
61 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
62 /* CLOSURE_BLOCKING might be set - clear it */
63 atomic_set(&cl->remaining,
64 CLOSURE_REMAINING_INITIALIZER);
65 closure_queue(cl);
66 } else {
67 struct closure *parent = cl->parent;
68 struct closure_waitlist *wait = closure_waitlist(cl);
69
70 closure_debug_destroy(cl);
71
72 atomic_set(&cl->remaining, -1);
73
74 if (wait)
75 closure_wake_up(wait);
76
77 if (cl->fn)
78 cl->fn(cl);
79
80 if (parent)
81 closure_put(parent);
82 }
83 }
84}
85
86/* For clearing flags with the same atomic op as a put */
87void closure_sub(struct closure *cl, int v)
88{
89 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
90}
91EXPORT_SYMBOL_GPL(closure_sub);
92
93void closure_put(struct closure *cl)
94{
95 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
96}
97EXPORT_SYMBOL_GPL(closure_put);
98
99static void set_waiting(struct closure *cl, unsigned long f)
100{
101#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
102 cl->waiting_on = f;
103#endif
104}
105
106void __closure_wake_up(struct closure_waitlist *wait_list)
107{
108 struct llist_node *list;
109 struct closure *cl;
110 struct llist_node *reverse = NULL;
111
112 list = llist_del_all(&wait_list->list);
113
114 /* We first reverse the list to preserve FIFO ordering and fairness */
115
116 while (list) {
117 struct llist_node *t = list;
118 list = llist_next(list);
119
120 t->next = reverse;
121 reverse = t;
122 }
123
124 /* Then do the wakeups */
125
126 while (reverse) {
127 cl = container_of(reverse, struct closure, list);
128 reverse = llist_next(reverse);
129
130 set_waiting(cl, 0);
131 closure_sub(cl, CLOSURE_WAITING + 1);
132 }
133}
134EXPORT_SYMBOL_GPL(__closure_wake_up);
135
136bool closure_wait(struct closure_waitlist *list, struct closure *cl)
137{
138 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
139 return false;
140
141 set_waiting(cl, _RET_IP_);
142 atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
143 llist_add(&cl->list, &list->list);
144
145 return true;
146}
147EXPORT_SYMBOL_GPL(closure_wait);
148
149/**
150 * closure_sync() - sleep until a closure has nothing left to wait on
151 *
152 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
153 * the last refcount.
154 */
155void closure_sync(struct closure *cl)
156{
157 while (1) {
158 __closure_start_sleep(cl);
159 closure_set_ret_ip(cl);
160
161 if ((atomic_read(&cl->remaining) &
162 CLOSURE_REMAINING_MASK) == 1)
163 break;
164
165 schedule();
166 }
167
168 __closure_end_sleep(cl);
169}
170EXPORT_SYMBOL_GPL(closure_sync);
171
172/**
173 * closure_trylock() - try to acquire the closure, without waiting
174 * @cl: closure to lock
175 *
176 * Returns true if the closure was successfully locked.
177 */
178bool closure_trylock(struct closure *cl, struct closure *parent)
179{
180 if (atomic_cmpxchg(&cl->remaining, -1,
181 CLOSURE_REMAINING_INITIALIZER) != -1)
182 return false;
183
184 closure_set_ret_ip(cl);
185
186 smp_mb();
187 cl->parent = parent;
188 if (parent)
189 closure_get(parent);
190
191 closure_debug_create(cl);
192 return true;
193}
194EXPORT_SYMBOL_GPL(closure_trylock);
195
196void __closure_lock(struct closure *cl, struct closure *parent,
197 struct closure_waitlist *wait_list)
198{
199 struct closure wait;
200 closure_init_stack(&wait);
201
202 while (1) {
203 if (closure_trylock(cl, parent))
204 return;
205
206 closure_wait_event_sync(wait_list, &wait,
207 atomic_read(&cl->remaining) == -1);
208 }
209}
210EXPORT_SYMBOL_GPL(__closure_lock);
211
212static void closure_delay_timer_fn(unsigned long data)
213{
214 struct closure *cl = (struct closure *) data;
215 closure_sub(cl, CLOSURE_TIMER + 1);
216}
217
218void do_closure_timer_init(struct closure *cl)
219{
220 struct timer_list *timer = closure_timer(cl);
221
222 init_timer(timer);
223 timer->data = (unsigned long) cl;
224 timer->function = closure_delay_timer_fn;
225}
226EXPORT_SYMBOL_GPL(do_closure_timer_init);
227
228bool __closure_delay(struct closure *cl, unsigned long delay,
229 struct timer_list *timer)
230{
231 if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
232 return false;
233
234 BUG_ON(timer_pending(timer));
235
236 timer->expires = jiffies + delay;
237
238 atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
239 add_timer(timer);
240 return true;
241}
242EXPORT_SYMBOL_GPL(__closure_delay);
243
244void __closure_flush(struct closure *cl, struct timer_list *timer)
245{
246 if (del_timer(timer))
247 closure_sub(cl, CLOSURE_TIMER + 1);
248}
249EXPORT_SYMBOL_GPL(__closure_flush);
250
251void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
252{
253 if (del_timer_sync(timer))
254 closure_sub(cl, CLOSURE_TIMER + 1);
255}
256EXPORT_SYMBOL_GPL(__closure_flush_sync);
257
258#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
259
260static LIST_HEAD(closure_list);
261static DEFINE_SPINLOCK(closure_list_lock);
262
263void closure_debug_create(struct closure *cl)
264{
265 unsigned long flags;
266
267 BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
268 cl->magic = CLOSURE_MAGIC_ALIVE;
269
270 spin_lock_irqsave(&closure_list_lock, flags);
271 list_add(&cl->all, &closure_list);
272 spin_unlock_irqrestore(&closure_list_lock, flags);
273}
274EXPORT_SYMBOL_GPL(closure_debug_create);
275
276void closure_debug_destroy(struct closure *cl)
277{
278 unsigned long flags;
279
280 BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
281 cl->magic = CLOSURE_MAGIC_DEAD;
282
283 spin_lock_irqsave(&closure_list_lock, flags);
284 list_del(&cl->all);
285 spin_unlock_irqrestore(&closure_list_lock, flags);
286}
287EXPORT_SYMBOL_GPL(closure_debug_destroy);
288
289static struct dentry *debug;
290
291#define work_data_bits(work) ((unsigned long *)(&(work)->data))
292
293static int debug_seq_show(struct seq_file *f, void *data)
294{
295 struct closure *cl;
296 spin_lock_irq(&closure_list_lock);
297
298 list_for_each_entry(cl, &closure_list, all) {
299 int r = atomic_read(&cl->remaining);
300
301 seq_printf(f, "%p: %pF -> %pf p %p r %i ",
302 cl, (void *) cl->ip, cl->fn, cl->parent,
303 r & CLOSURE_REMAINING_MASK);
304
305 seq_printf(f, "%s%s%s%s%s%s\n",
306 test_bit(WORK_STRUCT_PENDING,
307 work_data_bits(&cl->work)) ? "Q" : "",
308 r & CLOSURE_RUNNING ? "R" : "",
309 r & CLOSURE_BLOCKING ? "B" : "",
310 r & CLOSURE_STACK ? "S" : "",
311 r & CLOSURE_SLEEPING ? "Sl" : "",
312 r & CLOSURE_TIMER ? "T" : "");
313
314 if (r & CLOSURE_WAITING)
315 seq_printf(f, " W %pF\n",
316 (void *) cl->waiting_on);
317
318 seq_printf(f, "\n");
319 }
320
321 spin_unlock_irq(&closure_list_lock);
322 return 0;
323}
324
325static int debug_seq_open(struct inode *inode, struct file *file)
326{
327 return single_open(file, debug_seq_show, NULL);
328}
329
330static const struct file_operations debug_ops = {
331 .owner = THIS_MODULE,
332 .open = debug_seq_open,
333 .read = seq_read,
334 .release = single_release
335};
336
337int __init closure_debug_init(void)
338{
339 debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
340 return 0;
341}
342
343module_init(closure_debug_init);
344
345#endif
346
347MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
348MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
new file mode 100644
index 000000000000..3f31d599ea56
--- /dev/null
+++ b/drivers/md/bcache/closure.h
@@ -0,0 +1,670 @@
1#ifndef _LINUX_CLOSURE_H
2#define _LINUX_CLOSURE_H
3
4#include <linux/llist.h>
5#include <linux/sched.h>
6#include <linux/workqueue.h>
7
8/*
9 * Closure is perhaps the most overused and abused term in computer science, but
10 * since I've been unable to come up with anything better you're stuck with it
11 * again.
12 *
13 * What are closures?
14 *
15 * They embed a refcount. The basic idea is they count "things that are in
16 * progress" - in flight bios, some other thread that's doing something else -
17 * anything you might want to wait on.
18 *
19 * The refcount may be manipulated with closure_get() and closure_put().
20 * closure_put() is where many of the interesting things happen, when it causes
21 * the refcount to go to 0.
22 *
23 * Closures can be used to wait on things both synchronously and asynchronously,
24 * and synchronous and asynchronous use can be mixed without restriction. To
25 * wait synchronously, use closure_sync() - you will sleep until your closure's
26 * refcount hits 1.
27 *
28 * To wait asynchronously, use
29 * continue_at(cl, next_function, workqueue);
30 *
31 * passing it, as you might expect, the function to run when nothing is pending
32 * and the workqueue to run that function out of.
33 *
34 * continue_at() also, critically, is a macro that returns from the calling function.
35 * There's good reason for this.
36 *
37 * To use closures asynchronously safely, they must always have a refcount while
38 * they are running, owned by the thread that is running them. Otherwise, suppose
39 * you submit some bios and wish to have a function run when they all complete:
40 *
41 * foo_endio(struct bio *bio, int error)
42 * {
43 * closure_put(cl);
44 * }
45 *
46 * closure_init(cl);
47 *
48 * do_stuff();
49 * closure_get(cl);
50 * bio1->bi_endio = foo_endio;
51 * bio_submit(bio1);
52 *
53 * do_more_stuff();
54 * closure_get(cl);
55 * bio2->bi_endio = foo_endio;
56 * bio_submit(bio2);
57 *
58 * continue_at(cl, complete_some_read, system_wq);
59 *
60 * If the closure's refcount started at 0, complete_some_read() could run before the
61 * second bio was submitted - which is almost always not what you want! More
62 * importantly, it wouldn't be possible to say whether the original thread or
63 * complete_some_read()'s thread owned the closure - and whatever state it was
64 * associated with!
65 *
66 * So, closure_init() initializes a closure's refcount to 1 - and when a
67 * closure_fn is run, the refcount will be reset to 1 first.
68 *
69 * Then, the rule is - if you got the refcount with closure_get(), release it
70 * with closure_put() (i.e., in a bio->bi_endio function). If you have a refcount
71 * on a closure because you called closure_init() or you were run out of a
72 * closure - _always_ use continue_at(). Doing so consistently will help
73 * eliminate an entire class of particularly pernicious races.
74 *
75 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
76 *
77 * struct closure_waitlist list;
78 * closure_wait_event(list, cl, condition);
79 * closure_wake_up(wait_list);
80 *
81 * These work analogously to wait_event() and wake_up() - except that instead of
82 * operating on the current thread (for wait_event()) and lists of threads, they
83 * operate on an explicit closure and lists of closures.
84 *
85 * Because it's a closure we can now wait either synchronously or
86 * asynchronously. closure_wait_event() returns the current value of the
87 * condition, and if it returned false continue_at() or closure_sync() can be
88 * used to wait for it to become true.
89 *
90 * It's useful for waiting on things when you can't sleep in the context in
92 * which you must check the condition (perhaps with a spinlock held, or you might be
92 * beneath generic_make_request() - in which case you can't sleep on IO).
93 *
94 * closure_wait_event() will wait either synchronously or asynchronously,
95 * depending on whether the closure is in blocking mode or not. You can pick a
96 * mode explicitly with closure_wait_event_sync() and
97 * closure_wait_event_async(), which do just what you might expect.
98 *
99 * Lastly, you might have a wait list dedicated to a specific event, and have no
100 * need for specifying the condition - you just want to wait until someone runs
101 * closure_wake_up() on the appropriate wait list. In that case, just use
102 * closure_wait(). It will return either true or false, depending on whether the
103 * closure was already on a wait list or not - a closure can only be on one wait
104 * list at a time.
105 *
106 * Parents:
107 *
108 * closure_init() takes two arguments - it takes the closure to initialize, and
109 * a (possibly null) parent.
110 *
111 * If parent is non null, the new closure will hold a refcount on the parent for its lifetime;
112 * a closure is considered to be "finished" when its refcount hits 0 and the
113 * function to run is null. Hence
114 *
115 * continue_at(cl, NULL, NULL);
116 *
117 * returns up the (spaghetti) stack of closures, precisely like normal return
118 * returns up the C stack. continue_at() with non null fn is better thought of
119 * as doing a tail call.
120 *
121 * All this implies that a closure should typically be embedded in a particular
122 * struct (which its refcount will normally control the lifetime of), and that
123 * struct can very much be thought of as a stack frame.
124 *
125 * Locking:
126 *
127 * Closures are based on work items but they can be thought of as more like
128 * threads - in that like threads and unlike work items they have a well
129 * defined lifetime; they are created (with closure_init()) and eventually
130 * complete after a continue_at(cl, NULL, NULL).
131 *
132 * Suppose you've got some larger structure with a closure embedded in it that's
133 * used for periodically doing garbage collection. You only want one garbage
134 * collection happening at a time, so the natural thing to do is protect it with
135 * a lock. However, it's difficult to use a lock protecting a closure correctly
136 * because the unlock should come after the last continue_at() (additionally, if
137 * you're using the closure asynchronously a mutex won't work since a mutex has
138 * to be unlocked by the same process that locked it).
139 *
140 * So to make it less error prone and more efficient, we also have the ability
141 * to use closures as locks:
142 *
143 * closure_init_unlocked();
144 * closure_trylock();
145 *
146 * That's all we need for trylock() - the last closure_put() implicitly unlocks
147 * it for you. But for closure_lock(), we also need a wait list:
148 *
149 * struct closure_with_waitlist frobnicator_cl;
150 *
151 * closure_init_unlocked(&frobnicator_cl);
152 * closure_lock(&frobnicator_cl);
153 *
154 * A closure_with_waitlist embeds a closure and a wait list - much like struct
155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you.
158 *
159 * We've got closures that embed timers, too. They're called, appropriately
160 * enough:
161 * struct closure_with_timer;
162 *
163 * This gives you access to closure_delay(). It takes a refcount for a specified
164 * number of jiffies - you could then call closure_sync() (for a slightly
165 * convoluted version of msleep()) or continue_at() - which gives you the same
166 * effect as using a delayed work item, except you can reuse the work_struct
167 * already embedded in struct closure.
168 *
169 * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
170 * probably expect, if you happen to need the features of both. (You don't
171 * really want to know how all this is implemented, but if I've done my job
172 * right you shouldn't have to care).
173 */
174
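/*
 * A short sketch of the parent mechanism described above (the struct names
 * are hypothetical): the child takes a refcount on its parent at init time,
 * and that refcount is dropped when the child finishes.
 *
 *	closure_init(&child->cl, &parent->cl);
 *	... kick off the child's asynchronous work ...
 *
 * and the child's final closure_fn ends with
 *
 *	closure_return(cl);
 *
 * which lets the refcount hit 0 with fn == NULL, so the child is finished
 * and the reference on &parent->cl is released.
 */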
175struct closure;
176typedef void (closure_fn) (struct closure *);
177
178struct closure_waitlist {
179 struct llist_head list;
180};
181
182enum closure_type {
183 TYPE_closure = 0,
184 TYPE_closure_with_waitlist = 1,
185 TYPE_closure_with_timer = 2,
186 TYPE_closure_with_waitlist_and_timer = 3,
187 MAX_CLOSURE_TYPE = 3,
188};
189
190enum closure_state {
191 /*
192 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
193 * waiting asynchronously
194 *
195 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
196 * the thread that owns the closure, and cleared by the thread that's
197 * waking up the closure.
198 *
199 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
200 * - indicates that cl->task is valid and closure_put() may wake it up.
201 * Only set or cleared by the thread that owns the closure.
202 *
203 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
204 * has an outstanding timer. Must be set by the thread that owns the
205 * closure, and cleared by the timer function when the timer goes off.
206 *
207 * The rest are for debugging and don't affect behaviour:
208 *
209 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
210 * closure_init() and when closure_put() runs the next function), and
211 * must be cleared before remaining hits 0. Primarily to help guard
212 * against incorrect usage and accidentally transferring references.
213 * continue_at() and closure_return() clear it for you, if you're doing
214 * something unusual you can use closure_set_dead() which also helps
215 * annotate where references are being transferred.
216 *
217 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
218 * closure with this flag set
219 */
220
221 CLOSURE_BITS_START = (1 << 19),
222 CLOSURE_DESTRUCTOR = (1 << 19),
223 CLOSURE_BLOCKING = (1 << 21),
224 CLOSURE_WAITING = (1 << 23),
225 CLOSURE_SLEEPING = (1 << 25),
226 CLOSURE_TIMER = (1 << 27),
227 CLOSURE_RUNNING = (1 << 29),
228 CLOSURE_STACK = (1 << 31),
229};
230
231#define CLOSURE_GUARD_MASK \
232 ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \
233 CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1)
234
235#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
236#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
237
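/*
 * A closure's ->remaining packs the refcount and the state flags above into
 * one atomic_t: the low bits are the count, the high bits are the flags. A
 * sketch of pulling them apart (as the debugfs code in closure.c does):
 *
 *	int r = atomic_read(&cl->remaining);
 *	int refs = r & CLOSURE_REMAINING_MASK;
 *	bool sleeping = r & CLOSURE_SLEEPING;
 */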
238struct closure {
239 union {
240 struct {
241 struct workqueue_struct *wq;
242 struct task_struct *task;
243 struct llist_node list;
244 closure_fn *fn;
245 };
246 struct work_struct work;
247 };
248
249 struct closure *parent;
250
251 atomic_t remaining;
252
253 enum closure_type type;
254
255#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
256#define CLOSURE_MAGIC_DEAD 0xc054dead
257#define CLOSURE_MAGIC_ALIVE 0xc054a11e
258
259 unsigned magic;
260 struct list_head all;
261 unsigned long ip;
262 unsigned long waiting_on;
263#endif
264};
265
266struct closure_with_waitlist {
267 struct closure cl;
268 struct closure_waitlist wait;
269};
270
271struct closure_with_timer {
272 struct closure cl;
273 struct timer_list timer;
274};
275
276struct closure_with_waitlist_and_timer {
277 struct closure cl;
278 struct closure_waitlist wait;
279 struct timer_list timer;
280};
281
282extern unsigned invalid_closure_type(void);
283
284#define __CLOSURE_TYPE(cl, _t) \
285 __builtin_types_compatible_p(typeof(cl), struct _t) \
286 ? TYPE_ ## _t : \
287
288#define __closure_type(cl) \
289( \
290 __CLOSURE_TYPE(cl, closure) \
291 __CLOSURE_TYPE(cl, closure_with_waitlist) \
292 __CLOSURE_TYPE(cl, closure_with_timer) \
293 __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \
294 invalid_closure_type() \
295)
296
297void closure_sub(struct closure *cl, int v);
298void closure_put(struct closure *cl);
299void closure_queue(struct closure *cl);
300void __closure_wake_up(struct closure_waitlist *list);
301bool closure_wait(struct closure_waitlist *list, struct closure *cl);
302void closure_sync(struct closure *cl);
303
304bool closure_trylock(struct closure *cl, struct closure *parent);
305void __closure_lock(struct closure *cl, struct closure *parent,
306 struct closure_waitlist *wait_list);
307
308void do_closure_timer_init(struct closure *cl);
309bool __closure_delay(struct closure *cl, unsigned long delay,
310 struct timer_list *timer);
311void __closure_flush(struct closure *cl, struct timer_list *timer);
312void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
313
314#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
315
316void closure_debug_create(struct closure *cl);
317void closure_debug_destroy(struct closure *cl);
318
319#else
320
321static inline void closure_debug_create(struct closure *cl) {}
322static inline void closure_debug_destroy(struct closure *cl) {}
323
324#endif
325
326static inline void closure_set_ip(struct closure *cl)
327{
328#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
329 cl->ip = _THIS_IP_;
330#endif
331}
332
333static inline void closure_set_ret_ip(struct closure *cl)
334{
335#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
336 cl->ip = _RET_IP_;
337#endif
338}
339
340static inline void closure_get(struct closure *cl)
341{
342#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
343 BUG_ON((atomic_inc_return(&cl->remaining) &
344 CLOSURE_REMAINING_MASK) <= 1);
345#else
346 atomic_inc(&cl->remaining);
347#endif
348}
349
350static inline void closure_set_stopped(struct closure *cl)
351{
352 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
353}
354
355static inline bool closure_is_stopped(struct closure *cl)
356{
357 return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
358}
359
360static inline bool closure_is_unlocked(struct closure *cl)
361{
362 return atomic_read(&cl->remaining) == -1;
363}
364
365static inline void do_closure_init(struct closure *cl, struct closure *parent,
366 bool running)
367{
368 switch (cl->type) {
369 case TYPE_closure_with_timer:
370 case TYPE_closure_with_waitlist_and_timer:
371 do_closure_timer_init(cl);
372 default:
373 break;
374 }
375
376 cl->parent = parent;
377 if (parent)
378 closure_get(parent);
379
380 if (running) {
381 closure_debug_create(cl);
382 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
383 } else
384 atomic_set(&cl->remaining, -1);
385
386 closure_set_ip(cl);
387}
388
389/*
390 * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
391 * the result of __closure_type() is thrown away, it's used merely for type
392 * checking.
393 */
394#define __to_internal_closure(cl) \
395({ \
396 BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \
397 (struct closure *) cl; \
398})
399
400#define closure_init_type(cl, parent, running) \
401do { \
402 struct closure *_cl = __to_internal_closure(cl); \
403 _cl->type = __closure_type(*(cl)); \
404 do_closure_init(_cl, parent, running); \
405} while (0)
406
407/**
408 * __closure_init() - Initialize a closure, skipping the memset()
409 *
410 * May be used instead of closure_init() when memory has already been zeroed.
411 */
412#define __closure_init(cl, parent) \
413 closure_init_type(cl, parent, true)
414
415/**
416 * closure_init() - Initialize a closure, setting the refcount to 1
417 * @cl: closure to initialize
418 * @parent: parent of the new closure. cl will take a refcount on it for its
419 * lifetime; may be NULL.
420 */
421#define closure_init(cl, parent) \
422do { \
423 memset((cl), 0, sizeof(*(cl))); \
424 __closure_init(cl, parent); \
425} while (0)
426
427static inline void closure_init_stack(struct closure *cl)
428{
429 memset(cl, 0, sizeof(struct closure));
430 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
431 CLOSURE_BLOCKING|CLOSURE_STACK);
432}
433
434/**
435 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
436 * @cl: closure to initialize
437 *
438 * For when the closure will be used as a lock. The closure may not be used
439 * until after a closure_lock() or closure_trylock().
440 */
441#define closure_init_unlocked(cl) \
442do { \
443 memset((cl), 0, sizeof(*(cl))); \
444 closure_init_type(cl, NULL, false); \
445} while (0)
446
447/**
448 * closure_lock() - lock and initialize a closure.
449 * @cl: the closure to lock
450 * @parent: the new parent for this closure
451 *
452 * The closure must be of one of the types that has a waitlist (otherwise we
453 * wouldn't be able to sleep on contention).
454 *
455 * @parent has exactly the same meaning as in closure_init(); if non null, the
456 * closure will take a reference on @parent which will be released when it is
457 * unlocked.
458 */
459#define closure_lock(cl, parent) \
460 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
461
462/**
463 * closure_delay() - delay some number of jiffies
464 * @cl: the closure that will sleep
465 * @delay: the delay in jiffies
466 *
467 * Takes a refcount on @cl which will be released after @delay jiffies; this may
468 * be used to have a function run after a delay with continue_at(), or
469 * closure_sync() may be used for a convoluted version of msleep().
470 */
471#define closure_delay(cl, delay) \
472 __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
473
474#define closure_flush(cl) \
475 __closure_flush(__to_internal_closure(cl), &(cl)->timer)
476
477#define closure_flush_sync(cl) \
478 __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
479
480static inline void __closure_end_sleep(struct closure *cl)
481{
482 __set_current_state(TASK_RUNNING);
483
484 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
485 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
486}
487
488static inline void __closure_start_sleep(struct closure *cl)
489{
490 closure_set_ip(cl);
491 cl->task = current;
492 set_current_state(TASK_UNINTERRUPTIBLE);
493
494 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
495 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
496}
497
498/**
499 * closure_blocking() - returns true if the closure is in blocking mode.
500 *
501 * If a closure is in blocking mode, closure_wait_event() will sleep until the
502 * condition is true instead of waiting asynchronously.
503 */
504static inline bool closure_blocking(struct closure *cl)
505{
506 return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
507}
508
509/**
510 * set_closure_blocking() - put a closure in blocking mode.
511 *
512 * If a closure is in blocking mode, closure_wait_event() will sleep until the
513 * condition is true instead of waiting asynchronously.
514 *
515 * Not thread safe - can only be called by the thread running the closure.
516 */
517static inline void set_closure_blocking(struct closure *cl)
518{
519 if (!closure_blocking(cl))
520 atomic_add(CLOSURE_BLOCKING, &cl->remaining);
521}
522
523/*
524 * Not thread safe - can only be called by the thread running the closure.
525 */
526static inline void clear_closure_blocking(struct closure *cl)
527{
528 if (closure_blocking(cl))
529 atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
530}
531
532/**
533 * closure_wake_up() - wake up all closures on a wait list.
534 */
535static inline void closure_wake_up(struct closure_waitlist *list)
536{
537 smp_mb();
538 __closure_wake_up(list);
539}
540
541/*
542 * Wait on an event, synchronously or asynchronously - analogous to wait_event()
543 * but for closures.
544 *
545 * The loop is oddly structured so as to avoid a race; we must check the
546 * condition again after we've added ourselves to the waitlist. We know if we were
547 * already on the waitlist because closure_wait() returns false; thus, we only
548 * schedule or break if closure_wait() returns false. If it returns true, we
549 * just loop again - rechecking the condition.
550 *
551 * The __closure_wake_up() is necessary because we may race with the event
552 * becoming true; i.e. we see event false -> wait -> recheck condition, but the
553 * thread that made the event true may have called closure_wake_up() before we
554 * added ourselves to the wait list.
555 *
556 * We have to call closure_sync() at the end instead of just
557 * __closure_end_sleep() because a different thread might've called
558 * closure_wake_up() before us and gotten preempted before they dropped the
559 * refcount on our closure. If this was a stack allocated closure, that would be
560 * bad.
561 */
562#define __closure_wait_event(list, cl, condition, _block) \
563({ \
564 bool block = _block; \
565 typeof(condition) ret; \
566 \
567 while (1) { \
568 ret = (condition); \
569 if (ret) { \
570 __closure_wake_up(list); \
571 if (block) \
572 closure_sync(cl); \
573 \
574 break; \
575 } \
576 \
577 if (block) \
578 __closure_start_sleep(cl); \
579 \
580 if (!closure_wait(list, cl)) { \
581 if (!block) \
582 break; \
583 \
584 schedule(); \
585 } \
586 } \
587 \
588 ret; \
589})
590
591/**
592 * closure_wait_event() - wait on a condition, synchronously or asynchronously.
593 * @list: the wait list to wait on
594 * @cl: the closure that is doing the waiting
595 * @condition: a C expression for the event to wait for
596 *
597 * If the closure is in blocking mode, sleeps until the @condition evaluates to
598 * true - exactly like wait_event().
599 *
600 * If the closure is not in blocking mode, waits asynchronously; if the
601 * condition is currently false the @cl is put onto @list and returns. @list
602 * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
603 * wait for another thread to wake up @list, which drops the refcount on @cl.
604 *
605 * Returns the value of @condition; @cl will be on @list iff @condition was
606 * false.
607 *
608 * closure_wake_up(@list) must be called after changing any variable that could
609 * cause @condition to become true.
610 */
611#define closure_wait_event(list, cl, condition) \
612 __closure_wait_event(list, cl, condition, closure_blocking(cl))
613
614#define closure_wait_event_async(list, cl, condition) \
615 __closure_wait_event(list, cl, condition, false)
616
617#define closure_wait_event_sync(list, cl, condition) \
618 __closure_wait_event(list, cl, condition, true)
619
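/*
 * Usage sketch (the waitlist, counter and callback are hypothetical): wait
 * for in flight writes to drain, synchronously or asynchronously depending
 * on whether cl is in blocking mode:
 *
 *	if (!closure_wait_event(&d->write_wait, cl,
 *				atomic_read(&d->in_flight) == 0))
 *		continue_at(cl, writes_drained, system_wq);
 *
 * A false return means cl is now on d->write_wait; when another thread does
 * closure_wake_up(&d->write_wait) the refcount is dropped and
 * writes_drained() gets to run. In blocking mode the macro just sleeps until
 * the condition becomes true.
 */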
620static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
621 struct workqueue_struct *wq)
622{
623 BUG_ON(object_is_on_stack(cl));
624 closure_set_ip(cl);
625 cl->fn = fn;
626 cl->wq = wq;
627	/* this barrier pairs with the atomic_dec() in closure_put() */
628 smp_mb__before_atomic_dec();
629}
630
631#define continue_at(_cl, _fn, _wq) \
632do { \
633 set_closure_fn(_cl, _fn, _wq); \
634 closure_sub(_cl, CLOSURE_RUNNING + 1); \
635 return; \
636} while (0)
637
638#define closure_return(_cl) continue_at((_cl), NULL, NULL)
639
640#define continue_at_nobarrier(_cl, _fn, _wq) \
641do { \
642 set_closure_fn(_cl, _fn, _wq); \
643	closure_queue(_cl);						\
644 return; \
645} while (0)
646
647#define closure_return_with_destructor(_cl, _destructor) \
648do { \
649 set_closure_fn(_cl, _destructor, NULL); \
650 closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
651 return; \
652} while (0)
653
654static inline void closure_call(struct closure *cl, closure_fn fn,
655 struct workqueue_struct *wq,
656 struct closure *parent)
657{
658 closure_init(cl, parent);
659 continue_at_nobarrier(cl, fn, wq);
660}
661
662static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
663 struct workqueue_struct *wq,
664 struct closure *parent)
665{
666 if (closure_trylock(cl, parent))
667 continue_at_nobarrier(cl, fn, wq);
668}
669
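/*
 * closure_call() sketch (names other than the closure API are hypothetical):
 * initialize a closure embedded in some object, with an optional parent, and
 * have its first function run out of a workqueue:
 *
 *	static void read_stuff(struct closure *cl)
 *	{
 *		struct foo *f = container_of(cl, struct foo, cl);
 *		... submit IO, taking refs with closure_get() ...
 *		continue_at(cl, read_stuff_done, system_wq);
 *	}
 *
 *	closure_call(&f->cl, read_stuff, system_wq, parent_cl);
 */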
670#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
new file mode 100644
index 000000000000..4b37ef2b80e5
--- /dev/null
+++ b/drivers/md/bcache/debug.c
@@ -0,0 +1,563 @@
1/*
2 * Assorted bcache debug code
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "btree.h"
10#include "debug.h"
11#include "request.h"
12
13#include <linux/console.h>
14#include <linux/debugfs.h>
15#include <linux/module.h>
16#include <linux/random.h>
17#include <linux/seq_file.h>
18
19static struct dentry *debug;
20
21const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
22{
23 unsigned i;
24
25 for (i = 0; i < KEY_PTRS(k); i++)
26 if (ptr_available(c, k, i)) {
27 struct cache *ca = PTR_CACHE(c, k, i);
28 size_t bucket = PTR_BUCKET_NR(c, k, i);
29 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
30
31 if (KEY_SIZE(k) + r > c->sb.bucket_size)
32 return "bad, length too big";
33 if (bucket < ca->sb.first_bucket)
34 return "bad, short offset";
35 if (bucket >= ca->sb.nbuckets)
36 return "bad, offset past end of device";
37 if (ptr_stale(c, k, i))
38 return "stale";
39 }
40
41 if (!bkey_cmp(k, &ZERO_KEY))
42 return "bad, null key";
43 if (!KEY_PTRS(k))
44 return "bad, no pointers";
45 if (!KEY_SIZE(k))
46 return "zeroed key";
47 return "";
48}
49
50struct keyprint_hack bch_pkey(const struct bkey *k)
51{
52 unsigned i = 0;
53 struct keyprint_hack r;
54 char *out = r.s, *end = r.s + KEYHACK_SIZE;
55
56#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
57
58 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
59
60 if (KEY_PTRS(k))
61 while (1) {
62 p("%llu:%llu gen %llu",
63 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
64
65 if (++i == KEY_PTRS(k))
66 break;
67
68 p(", ");
69 }
70
71 p("]");
72
73 if (KEY_DIRTY(k))
74 p(" dirty");
75 if (KEY_CSUM(k))
76 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
77#undef p
78 return r;
79}
80
81struct keyprint_hack bch_pbtree(const struct btree *b)
82{
83 struct keyprint_hack r;
84
85 snprintf(r.s, 40, "%li level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
86 b->level, b->c->root ? b->c->root->level : -1);
87 return r;
88}
89
90#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
91
92static bool skipped_backwards(struct btree *b, struct bkey *k)
93{
94 return bkey_cmp(k, (!b->level)
95 ? &START_KEY(bkey_next(k))
96 : bkey_next(k)) > 0;
97}
98
99static void dump_bset(struct btree *b, struct bset *i)
100{
101 struct bkey *k;
102 unsigned j;
103
104 for (k = i->start; k < end(i); k = bkey_next(k)) {
105 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
106 (uint64_t *) k - i->d, i->keys, pkey(k));
107
108 for (j = 0; j < KEY_PTRS(k); j++) {
109 size_t n = PTR_BUCKET_NR(b->c, k, j);
110 printk(" bucket %zu", n);
111
112 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
113 printk(" prio %i",
114 PTR_BUCKET(b->c, k, j)->prio);
115 }
116
117 printk(" %s\n", bch_ptr_status(b->c, k));
118
119 if (bkey_next(k) < end(i) &&
120 skipped_backwards(b, k))
121 printk(KERN_ERR "Key skipped backwards\n");
122 }
123}
124
125#endif
126
127#ifdef CONFIG_BCACHE_DEBUG
128
129void bch_btree_verify(struct btree *b, struct bset *new)
130{
131 struct btree *v = b->c->verify_data;
132 struct closure cl;
133 closure_init_stack(&cl);
134
135 if (!b->c->verify)
136 return;
137
138 closure_wait_event(&b->io.wait, &cl,
139 atomic_read(&b->io.cl.remaining) == -1);
140
141 mutex_lock(&b->c->verify_lock);
142
143 bkey_copy(&v->key, &b->key);
144 v->written = 0;
145 v->level = b->level;
146
147 bch_btree_read(v);
148 closure_wait_event(&v->io.wait, &cl,
149			   atomic_read(&v->io.cl.remaining) == -1);
150
151 if (new->keys != v->sets[0].data->keys ||
152 memcmp(new->start,
153 v->sets[0].data->start,
154 (void *) end(new) - (void *) new->start)) {
155 unsigned i, j;
156
157 console_lock();
158
159 printk(KERN_ERR "*** original memory node:\n");
160 for (i = 0; i <= b->nsets; i++)
161 dump_bset(b, b->sets[i].data);
162
163 printk(KERN_ERR "*** sorted memory node:\n");
164 dump_bset(b, new);
165
166 printk(KERN_ERR "*** on disk node:\n");
167 dump_bset(v, v->sets[0].data);
168
169 for (j = 0; j < new->keys; j++)
170 if (new->d[j] != v->sets[0].data->d[j])
171 break;
172
173 console_unlock();
174 panic("verify failed at %u\n", j);
175 }
176
177 mutex_unlock(&b->c->verify_lock);
178}
179
180static void data_verify_endio(struct bio *bio, int error)
181{
182 struct closure *cl = bio->bi_private;
183 closure_put(cl);
184}
185
186void bch_data_verify(struct search *s)
187{
188 char name[BDEVNAME_SIZE];
189 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
190 struct closure *cl = &s->cl;
191 struct bio *check;
192 struct bio_vec *bv;
193 int i;
194
195 if (!s->unaligned_bvec)
196 bio_for_each_segment(bv, s->orig_bio, i)
197 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
198
199 check = bio_clone(s->orig_bio, GFP_NOIO);
200 if (!check)
201 return;
202
203 if (bio_alloc_pages(check, GFP_NOIO))
204 goto out_put;
205
206 check->bi_rw = READ_SYNC;
207 check->bi_private = cl;
208 check->bi_end_io = data_verify_endio;
209
210 closure_bio_submit(check, cl, &dc->disk);
211 closure_sync(cl);
212
213 bio_for_each_segment(bv, s->orig_bio, i) {
214 void *p1 = kmap(bv->bv_page);
215 void *p2 = kmap(check->bi_io_vec[i].bv_page);
216
217 if (memcmp(p1 + bv->bv_offset,
218 p2 + bv->bv_offset,
219 bv->bv_len))
220 printk(KERN_ERR "bcache (%s): verify failed"
221 " at sector %llu\n",
222 bdevname(dc->bdev, name),
223 (uint64_t) s->orig_bio->bi_sector);
224
225 kunmap(bv->bv_page);
226 kunmap(check->bi_io_vec[i].bv_page);
227 }
228
229 __bio_for_each_segment(bv, check, i, 0)
230 __free_page(bv->bv_page);
231out_put:
232 bio_put(check);
233}
234
235#endif
236
237#ifdef CONFIG_BCACHE_EDEBUG
238
239unsigned bch_count_data(struct btree *b)
240{
241 unsigned ret = 0;
242 struct btree_iter iter;
243 struct bkey *k;
244
245 if (!b->level)
246 for_each_key(b, k, &iter)
247 ret += KEY_SIZE(k);
248 return ret;
249}
250
251static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
252 va_list args)
253{
254 unsigned i;
255
256 console_lock();
257
258 for (i = 0; i <= b->nsets; i++)
259 dump_bset(b, b->sets[i].data);
260
261 vprintk(fmt, args);
262
263 console_unlock();
264
265 panic("at %s\n", pbtree(b));
266}
267
268void bch_check_key_order_msg(struct btree *b, struct bset *i,
269 const char *fmt, ...)
270{
271 struct bkey *k;
272
273 if (!i->keys)
274 return;
275
276 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
277 if (skipped_backwards(b, k)) {
278 va_list args;
279 va_start(args, fmt);
280
281 vdump_bucket_and_panic(b, fmt, args);
282 va_end(args);
283 }
284}
285
286void bch_check_keys(struct btree *b, const char *fmt, ...)
287{
288 va_list args;
289 struct bkey *k, *p = NULL;
290 struct btree_iter iter;
291
292 if (b->level)
293 return;
294
295 for_each_key(b, k, &iter) {
296 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
297 printk(KERN_ERR "Keys out of order:\n");
298 goto bug;
299 }
300
301 if (bch_ptr_invalid(b, k))
302 continue;
303
304 if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
305 printk(KERN_ERR "Overlapping keys:\n");
306 goto bug;
307 }
308 p = k;
309 }
310 return;
311bug:
312 va_start(args, fmt);
313 vdump_bucket_and_panic(b, fmt, args);
314 va_end(args);
315}
316
317#endif
318
319#ifdef CONFIG_DEBUG_FS
320
321/* XXX: cache set refcounting */
322
323struct dump_iterator {
324 char buf[PAGE_SIZE];
325 size_t bytes;
326 struct cache_set *c;
327 struct keybuf keys;
328};
329
330static bool dump_pred(struct keybuf *buf, struct bkey *k)
331{
332 return true;
333}
334
335static ssize_t bch_dump_read(struct file *file, char __user *buf,
336 size_t size, loff_t *ppos)
337{
338 struct dump_iterator *i = file->private_data;
339 ssize_t ret = 0;
340
341 while (size) {
342 struct keybuf_key *w;
343 unsigned bytes = min(i->bytes, size);
344
345 int err = copy_to_user(buf, i->buf, bytes);
346 if (err)
347 return err;
348
349 ret += bytes;
350 buf += bytes;
351 size -= bytes;
352 i->bytes -= bytes;
353 memmove(i->buf, i->buf + bytes, i->bytes);
354
355 if (i->bytes)
356 break;
357
358 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
359 if (!w)
360 break;
361
362 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
363 bch_keybuf_del(&i->keys, w);
364 }
365
366 return ret;
367}
368
369static int bch_dump_open(struct inode *inode, struct file *file)
370{
371 struct cache_set *c = inode->i_private;
372 struct dump_iterator *i;
373
374 i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
375 if (!i)
376 return -ENOMEM;
377
378 file->private_data = i;
379 i->c = c;
380 bch_keybuf_init(&i->keys, dump_pred);
381 i->keys.last_scanned = KEY(0, 0, 0);
382
383 return 0;
384}
385
386static int bch_dump_release(struct inode *inode, struct file *file)
387{
388 kfree(file->private_data);
389 return 0;
390}
391
392static const struct file_operations cache_set_debug_ops = {
393 .owner = THIS_MODULE,
394 .open = bch_dump_open,
395 .read = bch_dump_read,
396 .release = bch_dump_release
397};
398
399void bch_debug_init_cache_set(struct cache_set *c)
400{
401 if (!IS_ERR_OR_NULL(debug)) {
402 char name[50];
403 snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
404
405 c->debug = debugfs_create_file(name, 0400, debug, c,
406 &cache_set_debug_ops);
407 }
408}
409
410#endif
411
412#ifdef CONFIG_BCACHE_DEBUG
413static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
414 const char *buffer, size_t size)
415{
416 void dump(struct btree *b)
417 {
418 struct bset *i;
419
420 for (i = b->sets[0].data;
421 index(i, b) < btree_blocks(b) &&
422 i->seq == b->sets[0].data->seq;
423 i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
424 dump_bset(b, i);
425 }
426
427 struct cache_sb *sb;
428 struct cache_set *c;
429 struct btree *all[3], *b, *fill, *orig;
430 int j;
431
432 struct btree_op op;
433 bch_btree_op_init_stack(&op);
434
435 sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
436 if (!sb)
437 return -ENOMEM;
438
439 sb->bucket_size = 128;
440 sb->block_size = 4;
441
442 c = bch_cache_set_alloc(sb);
443 if (!c)
444 return -ENOMEM;
445
446 for (j = 0; j < 3; j++) {
447 BUG_ON(list_empty(&c->btree_cache));
448 all[j] = list_first_entry(&c->btree_cache, struct btree, list);
449 list_del_init(&all[j]->list);
450
451 all[j]->key = KEY(0, 0, c->sb.bucket_size);
452 bkey_copy_key(&all[j]->key, &MAX_KEY);
453 }
454
455 b = all[0];
456 fill = all[1];
457 orig = all[2];
458
459 while (1) {
460 for (j = 0; j < 3; j++)
461 all[j]->written = all[j]->nsets = 0;
462
463 bch_bset_init_next(b);
464
465 while (1) {
466 struct bset *i = write_block(b);
467 struct bkey *k = op.keys.top;
468 unsigned rand;
469
470 bkey_init(k);
471 rand = get_random_int();
472
473 op.type = rand & 1
474 ? BTREE_INSERT
475 : BTREE_REPLACE;
476 rand >>= 1;
477
478 SET_KEY_SIZE(k, bucket_remainder(c, rand));
479 rand >>= c->bucket_bits;
480 rand &= 1024 * 512 - 1;
481 rand += c->sb.bucket_size;
482 SET_KEY_OFFSET(k, rand);
483#if 0
484 SET_KEY_PTRS(k, 1);
485#endif
486 bch_keylist_push(&op.keys);
487 bch_btree_insert_keys(b, &op);
488
489 if (should_split(b) ||
490 set_blocks(i, b->c) !=
491 __set_blocks(i, i->keys + 15, b->c)) {
492 i->csum = csum_set(i);
493
494 memcpy(write_block(fill),
495 i, set_bytes(i));
496
497 b->written += set_blocks(i, b->c);
498 fill->written = b->written;
499 if (b->written == btree_blocks(b))
500 break;
501
502 bch_btree_sort_lazy(b);
503 bch_bset_init_next(b);
504 }
505 }
506
507 memcpy(orig->sets[0].data,
508 fill->sets[0].data,
509 btree_bytes(c));
510
511 bch_btree_sort(b);
512 fill->written = 0;
513 bch_btree_read_done(&fill->io.cl);
514
515 if (b->sets[0].data->keys != fill->sets[0].data->keys ||
516 memcmp(b->sets[0].data->start,
517 fill->sets[0].data->start,
518 b->sets[0].data->keys * sizeof(uint64_t))) {
519 struct bset *i = b->sets[0].data;
520 struct bkey *k, *l;
521
522 for (k = i->start,
523 l = fill->sets[0].data->start;
524 k < end(i);
525 k = bkey_next(k), l = bkey_next(l))
526 if (bkey_cmp(k, l) ||
527 KEY_SIZE(k) != KEY_SIZE(l))
528 pr_err("key %zi differs: %s "
529 "!= %s", (uint64_t *) k - i->d,
530 pkey(k), pkey(l));
531
532 for (j = 0; j < 3; j++) {
533 pr_err("**** Set %i ****", j);
534 dump(all[j]);
535 }
536 panic("\n");
537 }
538
539 pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
540 }
541}
542
543kobj_attribute_write(fuzz, btree_fuzz);
544#endif
545
546void bch_debug_exit(void)
547{
548 if (!IS_ERR_OR_NULL(debug))
549 debugfs_remove_recursive(debug);
550}
551
552int __init bch_debug_init(struct kobject *kobj)
553{
554 int ret = 0;
555#ifdef CONFIG_BCACHE_DEBUG
556 ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
557 if (ret)
558 return ret;
559#endif
560
561 debug = debugfs_create_dir("bcache", NULL);
562 return ret;
563}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
new file mode 100644
index 000000000000..f9378a218148
--- /dev/null
+++ b/drivers/md/bcache/debug.h
@@ -0,0 +1,54 @@
1#ifndef _BCACHE_DEBUG_H
2#define _BCACHE_DEBUG_H
3
4/* Btree/bkey debug printing */
5
6#define KEYHACK_SIZE 80
7struct keyprint_hack {
8 char s[KEYHACK_SIZE];
9};
10
11struct keyprint_hack bch_pkey(const struct bkey *k);
12struct keyprint_hack bch_pbtree(const struct btree *b);
13#define pkey(k) (&bch_pkey(k).s[0])
14#define pbtree(b) (&bch_pbtree(b).s[0])
15
16#ifdef CONFIG_BCACHE_EDEBUG
17
18unsigned bch_count_data(struct btree *);
19void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
20void bch_check_keys(struct btree *, const char *, ...);
21
22#define bch_check_key_order(b, i) \
23 bch_check_key_order_msg(b, i, "keys out of order")
24#define EBUG_ON(cond) BUG_ON(cond)
25
26#else /* EDEBUG */
27
28#define bch_count_data(b) 0
29#define bch_check_key_order(b, i) do {} while (0)
30#define bch_check_key_order_msg(b, i, ...) do {} while (0)
31#define bch_check_keys(b, ...) do {} while (0)
32#define EBUG_ON(cond) do {} while (0)
33
34#endif
35
36#ifdef CONFIG_BCACHE_DEBUG
37
38void bch_btree_verify(struct btree *, struct bset *);
39void bch_data_verify(struct search *);
40
41#else /* DEBUG */
42
43static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
44static inline void bch_data_verify(struct search *s) {};
45
46#endif
47
48#ifdef CONFIG_DEBUG_FS
49void bch_debug_init_cache_set(struct cache_set *);
50#else
51static inline void bch_debug_init_cache_set(struct cache_set *c) {}
52#endif
53
54#endif
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
new file mode 100644
index 000000000000..f565512f6fac
--- /dev/null
+++ b/drivers/md/bcache/io.c
@@ -0,0 +1,390 @@
1/*
2 * Some low level IO code, and hacks for various block layer limitations
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "bset.h"
10#include "debug.h"
11
12static void bch_bi_idx_hack_endio(struct bio *bio, int error)
13{
14 struct bio *p = bio->bi_private;
15
16 bio_endio(p, error);
17 bio_put(bio);
18}
19
20static void bch_generic_make_request_hack(struct bio *bio)
21{
22 if (bio->bi_idx) {
23 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
24
25 memcpy(clone->bi_io_vec,
26 bio_iovec(bio),
27 bio_segments(bio) * sizeof(struct bio_vec));
28
29 clone->bi_sector = bio->bi_sector;
30 clone->bi_bdev = bio->bi_bdev;
31 clone->bi_rw = bio->bi_rw;
32 clone->bi_vcnt = bio_segments(bio);
33 clone->bi_size = bio->bi_size;
34
35 clone->bi_private = bio;
36 clone->bi_end_io = bch_bi_idx_hack_endio;
37
38 bio = clone;
39 }
40
41 generic_make_request(bio);
42}
43
44/**
45 * bch_bio_split - split a bio
46 * @bio: bio to split
47 * @sectors: number of sectors to split from the front of @bio
48 * @gfp: gfp mask
49 * @bs: bio set to allocate from
50 *
51 * Allocates and returns a new bio which represents @sectors from the start of
52 * @bio, and updates @bio to represent the remaining sectors.
53 *
54 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
55 * unchanged.
56 *
57 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
 58 * bvec boundary; it is the caller's responsibility to ensure that @bio is not
59 * freed before the split.
60 *
61 * If bch_bio_split() is running under generic_make_request(), it's not safe to
62 * allocate more than one bio from the same bio set. Therefore, if it is running
63 * under generic_make_request() it masks out __GFP_WAIT when doing the
64 * allocation. The caller must check for failure if there's any possibility of
65 * it being called from under generic_make_request(); it is then the caller's
66 * responsibility to retry from a safe context (by e.g. punting to workqueue).
67 */
68struct bio *bch_bio_split(struct bio *bio, int sectors,
69 gfp_t gfp, struct bio_set *bs)
70{
71 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
72 struct bio_vec *bv;
73 struct bio *ret = NULL;
74
75 BUG_ON(sectors <= 0);
76
77 /*
78 * If we're being called from underneath generic_make_request() and we
79 * already allocated any bios from this bio set, we risk deadlock if we
80 * use the mempool. So instead, we possibly fail and let the caller punt
81 * to workqueue or somesuch and retry in a safe context.
82 */
83 if (current->bio_list)
84 gfp &= ~__GFP_WAIT;
85
86 if (sectors >= bio_sectors(bio))
87 return bio;
88
 89	if (bio->bi_rw & REQ_DISCARD) {
 90		ret = bio_alloc_bioset(gfp, 1, bs);
		if (!ret)
			return NULL;
 91		idx = 0;
 92		goto out;
 93	}
94
95 bio_for_each_segment(bv, bio, idx) {
96 vcnt = idx - bio->bi_idx;
97
98 if (!nbytes) {
99 ret = bio_alloc_bioset(gfp, vcnt, bs);
100 if (!ret)
101 return NULL;
102
103 memcpy(ret->bi_io_vec, bio_iovec(bio),
104 sizeof(struct bio_vec) * vcnt);
105
106 break;
107 } else if (nbytes < bv->bv_len) {
108 ret = bio_alloc_bioset(gfp, ++vcnt, bs);
109 if (!ret)
110 return NULL;
111
112 memcpy(ret->bi_io_vec, bio_iovec(bio),
113 sizeof(struct bio_vec) * vcnt);
114
115 ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
116 bv->bv_offset += nbytes;
117 bv->bv_len -= nbytes;
118 break;
119 }
120
121 nbytes -= bv->bv_len;
122 }
123out:
124 ret->bi_bdev = bio->bi_bdev;
125 ret->bi_sector = bio->bi_sector;
126 ret->bi_size = sectors << 9;
127 ret->bi_rw = bio->bi_rw;
128 ret->bi_vcnt = vcnt;
129 ret->bi_max_vecs = vcnt;
130
131 bio->bi_sector += sectors;
132 bio->bi_size -= sectors << 9;
133 bio->bi_idx = idx;
134
135 if (bio_integrity(bio)) {
136 if (bio_integrity_clone(ret, bio, gfp)) {
137 bio_put(ret);
138 return NULL;
139 }
140
141 bio_integrity_trim(ret, 0, bio_sectors(ret));
142 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
143 }
144
145 return ret;
146}
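
/*
 * A sketch of the caller contract described above (retry_fn and wq are
 * placeholder names, not part of this patch): a caller that may be running
 * under generic_make_request() has to tolerate a NULL return and retry from
 * a safe context, roughly:
 *
 *	n = bch_bio_split(bio, sectors, GFP_NOIO, bs);
 *	if (!n)
 *		continue_at(cl, retry_fn, wq);	// punt to a workqueue and retry
 *
 * __bch_bio_submit_split() below uses exactly this pattern.
 */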
147
148static unsigned bch_bio_max_sectors(struct bio *bio)
149{
150 unsigned ret = bio_sectors(bio);
151 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
152 struct bio_vec *bv, *end = bio_iovec(bio) +
153 min_t(int, bio_segments(bio), queue_max_segments(q));
154
155 struct bvec_merge_data bvm = {
156 .bi_bdev = bio->bi_bdev,
157 .bi_sector = bio->bi_sector,
158 .bi_size = 0,
159 .bi_rw = bio->bi_rw,
160 };
161
162 if (bio->bi_rw & REQ_DISCARD)
163 return min(ret, q->limits.max_discard_sectors);
164
165 if (bio_segments(bio) > queue_max_segments(q) ||
166 q->merge_bvec_fn) {
167 ret = 0;
168
169 for (bv = bio_iovec(bio); bv < end; bv++) {
170 if (q->merge_bvec_fn &&
171 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
172 break;
173
174 ret += bv->bv_len >> 9;
175 bvm.bi_size += bv->bv_len;
176 }
177
178 if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9)
179 return (BIO_MAX_PAGES * PAGE_SIZE) >> 9;
180 }
181
182 ret = min(ret, queue_max_sectors(q));
183
184 WARN_ON(!ret);
185 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
186
187 return ret;
188}
189
190static void bch_bio_submit_split_done(struct closure *cl)
191{
192 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
193
194 s->bio->bi_end_io = s->bi_end_io;
195 s->bio->bi_private = s->bi_private;
196 bio_endio(s->bio, 0);
197
198 closure_debug_destroy(&s->cl);
199 mempool_free(s, s->p->bio_split_hook);
200}
201
202static void bch_bio_submit_split_endio(struct bio *bio, int error)
203{
204 struct closure *cl = bio->bi_private;
205 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
206
207 if (error)
208 clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
209
210 bio_put(bio);
211 closure_put(cl);
212}
213
214static void __bch_bio_submit_split(struct closure *cl)
215{
216 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
217 struct bio *bio = s->bio, *n;
218
219 do {
220 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
221 GFP_NOIO, s->p->bio_split);
222 if (!n)
223 continue_at(cl, __bch_bio_submit_split, system_wq);
224
225 n->bi_end_io = bch_bio_submit_split_endio;
226 n->bi_private = cl;
227
228 closure_get(cl);
229 bch_generic_make_request_hack(n);
230 } while (n != bio);
231
232 continue_at(cl, bch_bio_submit_split_done, NULL);
233}
234
235void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
236{
237 struct bio_split_hook *s;
238
239 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
240 goto submit;
241
242 if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
243 goto submit;
244
245 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
246
247 s->bio = bio;
248 s->p = p;
249 s->bi_end_io = bio->bi_end_io;
250 s->bi_private = bio->bi_private;
251 bio_get(bio);
252
253 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
254 return;
255submit:
256 bch_generic_make_request_hack(bio);
257}
258
259/* Bios with headers */
260
261void bch_bbio_free(struct bio *bio, struct cache_set *c)
262{
263 struct bbio *b = container_of(bio, struct bbio, bio);
264 mempool_free(b, c->bio_meta);
265}
266
267struct bio *bch_bbio_alloc(struct cache_set *c)
268{
269 struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
270 struct bio *bio = &b->bio;
271
272 bio_init(bio);
273 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
274 bio->bi_max_vecs = bucket_pages(c);
275 bio->bi_io_vec = bio->bi_inline_vecs;
276
277 return bio;
278}
279
280void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
281{
282 struct bbio *b = container_of(bio, struct bbio, bio);
283
284 bio->bi_sector = PTR_OFFSET(&b->key, 0);
285 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
286
287 b->submit_time_us = local_clock_us();
288 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
289}
290
291void bch_submit_bbio(struct bio *bio, struct cache_set *c,
292 struct bkey *k, unsigned ptr)
293{
294 struct bbio *b = container_of(bio, struct bbio, bio);
295 bch_bkey_copy_single_ptr(&b->key, k, ptr);
296 __bch_submit_bbio(bio, c);
297}
298
299/* IO errors */
300
301void bch_count_io_errors(struct cache *ca, int error, const char *m)
302{
303 /*
304 * The halflife of an error is:
305 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
306 */
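	/*
	 * Worked out from the constants above: log2(127/128) ~= -0.0113, so
	 * log2(1/2)/log2(127/128) ~= 88 -- i.e. after roughly 88 * refresh
	 * IOs the error count has been rescaled by 127/128 often enough to be
	 * cut in half.
	 */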
307
308 if (ca->set->error_decay) {
309 unsigned count = atomic_inc_return(&ca->io_count);
310
311 while (count > ca->set->error_decay) {
312 unsigned errors;
313 unsigned old = count;
314 unsigned new = count - ca->set->error_decay;
315
316 /*
317 * First we subtract refresh from count; each time we
 318 * successfully do so, we rescale the errors once:
319 */
320
321 count = atomic_cmpxchg(&ca->io_count, old, new);
322
323 if (count == old) {
324 count = new;
325
326 errors = atomic_read(&ca->io_errors);
327 do {
328 old = errors;
329 new = ((uint64_t) errors * 127) / 128;
330 errors = atomic_cmpxchg(&ca->io_errors,
331 old, new);
332 } while (old != errors);
333 }
334 }
335 }
336
337 if (error) {
338 char buf[BDEVNAME_SIZE];
339 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
340 &ca->io_errors);
341 errors >>= IO_ERROR_SHIFT;
342
343 if (errors < ca->set->error_limit)
344 pr_err("%s: IO error on %s, recovering",
345 bdevname(ca->bdev, buf), m);
346 else
347 bch_cache_set_error(ca->set,
348 "%s: too many IO errors %s",
349 bdevname(ca->bdev, buf), m);
350 }
351}
352
353void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
354 int error, const char *m)
355{
356 struct bbio *b = container_of(bio, struct bbio, bio);
357 struct cache *ca = PTR_CACHE(c, &b->key, 0);
358
359 unsigned threshold = bio->bi_rw & REQ_WRITE
360 ? c->congested_write_threshold_us
361 : c->congested_read_threshold_us;
362
363 if (threshold) {
364 unsigned t = local_clock_us();
365
366 int us = t - b->submit_time_us;
367 int congested = atomic_read(&c->congested);
368
369 if (us > (int) threshold) {
370 int ms = us / 1024;
371 c->congested_last_us = t;
372
373 ms = min(ms, CONGESTED_MAX + congested);
374 atomic_sub(ms, &c->congested);
375 } else if (congested < 0)
376 atomic_inc(&c->congested);
377 }
378
379 bch_count_io_errors(ca, error, m);
380}
381
382void bch_bbio_endio(struct cache_set *c, struct bio *bio,
383 int error, const char *m)
384{
385 struct closure *cl = bio->bi_private;
386
387 bch_bbio_count_io_errors(c, bio, error, m);
388 bio_put(bio);
389 closure_put(cl);
390}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
new file mode 100644
index 000000000000..c871ffaabbb0
--- /dev/null
+++ b/drivers/md/bcache/journal.c
@@ -0,0 +1,785 @@
1/*
2 * bcache journalling code, for btree insertions
3 *
4 * Copyright 2012 Google, Inc.
5 */
6
7#include "bcache.h"
8#include "btree.h"
9#include "debug.h"
10#include "request.h"
11
12/*
13 * Journal replay/recovery:
14 *
15 * This code is all driven from run_cache_set(); we first read the journal
16 * entries, do some other stuff, then we mark all the keys in the journal
17 * entries (same as garbage collection would), then we replay them - reinserting
18 * them into the cache in precisely the same order as they appear in the
19 * journal.
20 *
21 * We only journal keys that go in leaf nodes, which simplifies things quite a
22 * bit.
23 */
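
/*
 * Concretely, in this file: bch_journal_read() collects the entries into a
 * list, bch_journal_mark() marks the keys they contain (so bucket accounting
 * matches what garbage collection would compute), and bch_journal_replay()
 * reinserts them with bch_btree_insert().
 */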
24
25static void journal_read_endio(struct bio *bio, int error)
26{
27 struct closure *cl = bio->bi_private;
28 closure_put(cl);
29}
30
31static int journal_read_bucket(struct cache *ca, struct list_head *list,
32 struct btree_op *op, unsigned bucket_index)
33{
34 struct journal_device *ja = &ca->journal;
35 struct bio *bio = &ja->bio;
36
37 struct journal_replay *i;
38 struct jset *j, *data = ca->set->journal.w[0].data;
39 unsigned len, left, offset = 0;
40 int ret = 0;
41 sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
42
43 pr_debug("reading %llu", (uint64_t) bucket);
44
45 while (offset < ca->sb.bucket_size) {
46reread: left = ca->sb.bucket_size - offset;
47 len = min_t(unsigned, left, PAGE_SECTORS * 8);
48
49 bio_reset(bio);
50 bio->bi_sector = bucket + offset;
51 bio->bi_bdev = ca->bdev;
52 bio->bi_rw = READ;
53 bio->bi_size = len << 9;
54
55 bio->bi_end_io = journal_read_endio;
56 bio->bi_private = &op->cl;
57 bio_map(bio, data);
58
59 closure_bio_submit(bio, &op->cl, ca);
60 closure_sync(&op->cl);
61
62 /* This function could be simpler now since we no longer write
63 * journal entries that overlap bucket boundaries; this means
64 * the start of a bucket will always have a valid journal entry
65 * if it has any journal entries at all.
66 */
67
68 j = data;
69 while (len) {
70 struct list_head *where;
71 size_t blocks, bytes = set_bytes(j);
72
73 if (j->magic != jset_magic(ca->set))
74 return ret;
75
76 if (bytes > left << 9)
77 return ret;
78
79 if (bytes > len << 9)
80 goto reread;
81
82 if (j->csum != csum_set(j))
83 return ret;
84
85 blocks = set_blocks(j, ca->set);
86
87 while (!list_empty(list)) {
88 i = list_first_entry(list,
89 struct journal_replay, list);
90 if (i->j.seq >= j->last_seq)
91 break;
92 list_del(&i->list);
93 kfree(i);
94 }
95
96 list_for_each_entry_reverse(i, list, list) {
97 if (j->seq == i->j.seq)
98 goto next_set;
99
100 if (j->seq < i->j.last_seq)
101 goto next_set;
102
103 if (j->seq > i->j.seq) {
104 where = &i->list;
105 goto add;
106 }
107 }
108
109 where = list;
110add:
111 i = kmalloc(offsetof(struct journal_replay, j) +
112 bytes, GFP_KERNEL);
113 if (!i)
114 return -ENOMEM;
115 memcpy(&i->j, j, bytes);
116 list_add(&i->list, where);
117 ret = 1;
118
119 ja->seq[bucket_index] = j->seq;
120next_set:
121 offset += blocks * ca->sb.block_size;
122 len -= blocks * ca->sb.block_size;
123 j = ((void *) j) + blocks * block_bytes(ca);
124 }
125 }
126
127 return ret;
128}
129
130int bch_journal_read(struct cache_set *c, struct list_head *list,
131 struct btree_op *op)
132{
133#define read_bucket(b) \
134 ({ \
135 int ret = journal_read_bucket(ca, list, op, b); \
136 __set_bit(b, bitmap); \
137 if (ret < 0) \
138 return ret; \
139 ret; \
140 })
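	/*
	 * Note that the statement-expression macro above returns from
	 * bch_journal_read() itself when journal_read_bucket() fails, and
	 * marks the bucket as visited in the local bitmap either way.
	 */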
141
142 struct cache *ca;
143 unsigned iter;
144
145 for_each_cache(ca, c, iter) {
146 struct journal_device *ja = &ca->journal;
147 unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
148 unsigned i, l, r, m;
149 uint64_t seq;
150
151 bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
152 pr_debug("%u journal buckets", ca->sb.njournal_buckets);
153
154 /* Read journal buckets ordered by golden ratio hash to quickly
155 * find a sequence of buckets with valid journal entries
156 */
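		/*
		 * (2654435769 is ~2^32 / golden ratio, the usual Fibonacci
		 * hashing multiplier, so successive values of i land spread
		 * roughly evenly across the journal buckets.)
		 */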
157 for (i = 0; i < ca->sb.njournal_buckets; i++) {
158 l = (i * 2654435769U) % ca->sb.njournal_buckets;
159
160 if (test_bit(l, bitmap))
161 break;
162
163 if (read_bucket(l))
164 goto bsearch;
165 }
166
167 /* If that fails, check all the buckets we haven't checked
168 * already
169 */
170 pr_debug("falling back to linear search");
171
172 for (l = 0; l < ca->sb.njournal_buckets; l++) {
173 if (test_bit(l, bitmap))
174 continue;
175
176 if (read_bucket(l))
177 goto bsearch;
178 }
179bsearch:
180 /* Binary search */
181 m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
182 pr_debug("starting binary search, l %u r %u", l, r);
183
184 while (l + 1 < r) {
185 m = (l + r) >> 1;
186
187 if (read_bucket(m))
188 l = m;
189 else
190 r = m;
191 }
192
193 /* Read buckets in reverse order until we stop finding more
194 * journal entries
195 */
196 pr_debug("finishing up");
197 l = m;
198
199 while (1) {
200 if (!l--)
201 l = ca->sb.njournal_buckets - 1;
202
203 if (l == m)
204 break;
205
206 if (test_bit(l, bitmap))
207 continue;
208
209 if (!read_bucket(l))
210 break;
211 }
212
213 seq = 0;
214
215 for (i = 0; i < ca->sb.njournal_buckets; i++)
216 if (ja->seq[i] > seq) {
217 seq = ja->seq[i];
218 ja->cur_idx = ja->discard_idx =
219 ja->last_idx = i;
220
221 }
222 }
223
224 c->journal.seq = list_entry(list->prev,
225 struct journal_replay,
226 list)->j.seq;
227
228 return 0;
229#undef read_bucket
230}
231
232void bch_journal_mark(struct cache_set *c, struct list_head *list)
233{
234 atomic_t p = { 0 };
235 struct bkey *k;
236 struct journal_replay *i;
237 struct journal *j = &c->journal;
238 uint64_t last = j->seq;
239
240 /*
241 * journal.pin should never fill up - we never write a journal
242 * entry when it would fill up. But if for some reason it does, we
243 * iterate over the list in reverse order so that we can just skip that
244 * refcount instead of bugging.
245 */
246
247 list_for_each_entry_reverse(i, list, list) {
248 BUG_ON(last < i->j.seq);
249 i->pin = NULL;
250
251 while (last-- != i->j.seq)
252 if (fifo_free(&j->pin) > 1) {
253 fifo_push_front(&j->pin, p);
254 atomic_set(&fifo_front(&j->pin), 0);
255 }
256
257 if (fifo_free(&j->pin) > 1) {
258 fifo_push_front(&j->pin, p);
259 i->pin = &fifo_front(&j->pin);
260 atomic_set(i->pin, 1);
261 }
262
263 for (k = i->j.start;
264 k < end(&i->j);
265 k = bkey_next(k)) {
266 unsigned j;
267
268 for (j = 0; j < KEY_PTRS(k); j++) {
269 struct bucket *g = PTR_BUCKET(c, k, j);
270 atomic_inc(&g->pin);
271
272 if (g->prio == BTREE_PRIO &&
273 !ptr_stale(c, k, j))
274 g->prio = INITIAL_PRIO;
275 }
276
277 __bch_btree_mark_key(c, 0, k);
278 }
279 }
280}
281
282int bch_journal_replay(struct cache_set *s, struct list_head *list,
283 struct btree_op *op)
284{
285 int ret = 0, keys = 0, entries = 0;
286 struct bkey *k;
287 struct journal_replay *i =
288 list_entry(list->prev, struct journal_replay, list);
289
290 uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
291
292 list_for_each_entry(i, list, list) {
293 BUG_ON(i->pin && atomic_read(i->pin) != 1);
294
295 if (n != i->j.seq)
296 pr_err("journal entries %llu-%llu "
297 "missing! (replaying %llu-%llu)\n",
298 n, i->j.seq - 1, start, end);
299
300 for (k = i->j.start;
301 k < end(&i->j);
302 k = bkey_next(k)) {
303 pr_debug("%s", pkey(k));
304 bkey_copy(op->keys.top, k);
305 bch_keylist_push(&op->keys);
306
307 op->journal = i->pin;
308 atomic_inc(op->journal);
309
310 ret = bch_btree_insert(op, s);
311 if (ret)
312 goto err;
313
314 BUG_ON(!bch_keylist_empty(&op->keys));
315 keys++;
316
317 cond_resched();
318 }
319
320 if (i->pin)
321 atomic_dec(i->pin);
322 n = i->j.seq + 1;
323 entries++;
324 }
325
326 pr_info("journal replay done, %i keys in %i entries, seq %llu",
327 keys, entries, end);
328
329 while (!list_empty(list)) {
330 i = list_first_entry(list, struct journal_replay, list);
331 list_del(&i->list);
332 kfree(i);
333 }
334err:
335 closure_sync(&op->cl);
336 return ret;
337}
338
339/* Journalling */
340
341static void btree_flush_write(struct cache_set *c)
342{
343 /*
 344	 * Try to find the btree node that references the oldest journal
 345	 * entry; best is our current candidate, and it is locked if non-NULL:
346 */
347 struct btree *b, *best = NULL;
348 unsigned iter;
349
350 for_each_cached_btree(b, c, iter) {
351 if (!down_write_trylock(&b->lock))
352 continue;
353
354 if (!btree_node_dirty(b) ||
355 !btree_current_write(b)->journal) {
356 rw_unlock(true, b);
357 continue;
358 }
359
360 if (!best)
361 best = b;
362 else if (journal_pin_cmp(c,
363 btree_current_write(best),
364 btree_current_write(b))) {
365 rw_unlock(true, best);
366 best = b;
367 } else
368 rw_unlock(true, b);
369 }
370
371 if (best)
372 goto out;
373
374 /* We can't find the best btree node, just pick the first */
375 list_for_each_entry(b, &c->btree_cache, list)
376 if (!b->level && btree_node_dirty(b)) {
377 best = b;
378 rw_lock(true, best, best->level);
379 goto found;
380 }
381
382out:
383 if (!best)
384 return;
385found:
386 if (btree_node_dirty(best))
387 bch_btree_write(best, true, NULL);
388 rw_unlock(true, best);
389}
390
391#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
392
393static void journal_discard_endio(struct bio *bio, int error)
394{
395 struct journal_device *ja =
396 container_of(bio, struct journal_device, discard_bio);
397 struct cache *ca = container_of(ja, struct cache, journal);
398
399 atomic_set(&ja->discard_in_flight, DISCARD_DONE);
400
401 closure_wake_up(&ca->set->journal.wait);
402 closure_put(&ca->set->cl);
403}
404
405static void journal_discard_work(struct work_struct *work)
406{
407 struct journal_device *ja =
408 container_of(work, struct journal_device, discard_work);
409
410 submit_bio(0, &ja->discard_bio);
411}
412
413static void do_journal_discard(struct cache *ca)
414{
415 struct journal_device *ja = &ca->journal;
416 struct bio *bio = &ja->discard_bio;
417
418 if (!ca->discard) {
419 ja->discard_idx = ja->last_idx;
420 return;
421 }
422
 423	switch (atomic_read(&ja->discard_in_flight)) {
424 case DISCARD_IN_FLIGHT:
425 return;
426
427 case DISCARD_DONE:
428 ja->discard_idx = (ja->discard_idx + 1) %
429 ca->sb.njournal_buckets;
430
431 atomic_set(&ja->discard_in_flight, DISCARD_READY);
432 /* fallthrough */
433
434 case DISCARD_READY:
435 if (ja->discard_idx == ja->last_idx)
436 return;
437
438 atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
439
440 bio_init(bio);
441 bio->bi_sector = bucket_to_sector(ca->set,
442 ca->sb.d[ja->discard_idx]);
443 bio->bi_bdev = ca->bdev;
444 bio->bi_rw = REQ_WRITE|REQ_DISCARD;
445 bio->bi_max_vecs = 1;
446 bio->bi_io_vec = bio->bi_inline_vecs;
447 bio->bi_size = bucket_bytes(ca);
448 bio->bi_end_io = journal_discard_endio;
449
450 closure_get(&ca->set->cl);
451 INIT_WORK(&ja->discard_work, journal_discard_work);
452 schedule_work(&ja->discard_work);
453 }
454}
455
456static void journal_reclaim(struct cache_set *c)
457{
458 struct bkey *k = &c->journal.key;
459 struct cache *ca;
460 uint64_t last_seq;
461 unsigned iter, n = 0;
462 atomic_t p;
463
464 while (!atomic_read(&fifo_front(&c->journal.pin)))
465 fifo_pop(&c->journal.pin, p);
466
467 last_seq = last_seq(&c->journal);
468
469 /* Update last_idx */
470
471 for_each_cache(ca, c, iter) {
472 struct journal_device *ja = &ca->journal;
473
474 while (ja->last_idx != ja->cur_idx &&
475 ja->seq[ja->last_idx] < last_seq)
476 ja->last_idx = (ja->last_idx + 1) %
477 ca->sb.njournal_buckets;
478 }
479
480 for_each_cache(ca, c, iter)
481 do_journal_discard(ca);
482
483 if (c->journal.blocks_free)
484 return;
485
486 /*
487 * Allocate:
488 * XXX: Sort by free journal space
489 */
490
491 for_each_cache(ca, c, iter) {
492 struct journal_device *ja = &ca->journal;
493 unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
494
495 /* No space available on this device */
496 if (next == ja->discard_idx)
497 continue;
498
499 ja->cur_idx = next;
500 k->ptr[n++] = PTR(0,
501 bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
502 ca->sb.nr_this_dev);
503 }
504
505 bkey_init(k);
506 SET_KEY_PTRS(k, n);
507
508 if (n)
509 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
510
511 if (!journal_full(&c->journal))
512 __closure_wake_up(&c->journal.wait);
513}
514
515void bch_journal_next(struct journal *j)
516{
517 atomic_t p = { 1 };
518
519 j->cur = (j->cur == j->w)
520 ? &j->w[1]
521 : &j->w[0];
522
523 /*
524 * The fifo_push() needs to happen at the same time as j->seq is
525 * incremented for last_seq() to be calculated correctly
526 */
527 BUG_ON(!fifo_push(&j->pin, p));
528 atomic_set(&fifo_back(&j->pin), 1);
529
530 j->cur->data->seq = ++j->seq;
531 j->cur->need_write = false;
532 j->cur->data->keys = 0;
533
534 if (fifo_full(&j->pin))
535 pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
536}
537
538static void journal_write_endio(struct bio *bio, int error)
539{
540 struct journal_write *w = bio->bi_private;
541
542 cache_set_err_on(error, w->c, "journal io error");
543 closure_put(&w->c->journal.io.cl);
544}
545
546static void journal_write(struct closure *);
547
548static void journal_write_done(struct closure *cl)
549{
550 struct journal *j = container_of(cl, struct journal, io.cl);
551 struct cache_set *c = container_of(j, struct cache_set, journal);
552
553 struct journal_write *w = (j->cur == j->w)
554 ? &j->w[1]
555 : &j->w[0];
556
557 __closure_wake_up(&w->wait);
558
559 if (c->journal_delay_ms)
560 closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
561
562 continue_at(cl, journal_write, system_wq);
563}
564
565static void journal_write_unlocked(struct closure *cl)
566{
567 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
568 struct cache *ca;
569 struct journal_write *w = c->journal.cur;
570 struct bkey *k = &c->journal.key;
571 unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
572
573 struct bio *bio;
574 struct bio_list list;
575 bio_list_init(&list);
576
577 if (!w->need_write) {
578 /*
579 * XXX: have to unlock closure before we unlock journal lock,
580 * else we race with bch_journal(). But this way we race
581 * against cache set unregister. Doh.
582 */
583 set_closure_fn(cl, NULL, NULL);
584 closure_sub(cl, CLOSURE_RUNNING + 1);
585 spin_unlock(&c->journal.lock);
586 return;
587 } else if (journal_full(&c->journal)) {
588 journal_reclaim(c);
589 spin_unlock(&c->journal.lock);
590
591 btree_flush_write(c);
592 continue_at(cl, journal_write, system_wq);
593 }
594
595 c->journal.blocks_free -= set_blocks(w->data, c);
596
597 w->data->btree_level = c->root->level;
598
599 bkey_copy(&w->data->btree_root, &c->root->key);
600 bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
601
602 for_each_cache(ca, c, i)
603 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
604
605 w->data->magic = jset_magic(c);
606 w->data->version = BCACHE_JSET_VERSION;
607 w->data->last_seq = last_seq(&c->journal);
608 w->data->csum = csum_set(w->data);
609
610 for (i = 0; i < KEY_PTRS(k); i++) {
611 ca = PTR_CACHE(c, k, i);
612 bio = &ca->journal.bio;
613
614 atomic_long_add(sectors, &ca->meta_sectors_written);
615
616 bio_reset(bio);
617 bio->bi_sector = PTR_OFFSET(k, i);
618 bio->bi_bdev = ca->bdev;
619 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
620 bio->bi_size = sectors << 9;
621
622 bio->bi_end_io = journal_write_endio;
623 bio->bi_private = w;
624 bio_map(bio, w->data);
625
626 trace_bcache_journal_write(bio);
627 bio_list_add(&list, bio);
628
629 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
630
631 ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
632 }
633
634 atomic_dec_bug(&fifo_back(&c->journal.pin));
635 bch_journal_next(&c->journal);
636 journal_reclaim(c);
637
638 spin_unlock(&c->journal.lock);
639
640 while ((bio = bio_list_pop(&list)))
641 closure_bio_submit(bio, cl, c->cache[0]);
642
643 continue_at(cl, journal_write_done, NULL);
644}
645
646static void journal_write(struct closure *cl)
647{
648 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
649
650 spin_lock(&c->journal.lock);
651 journal_write_unlocked(cl);
652}
653
654static void __journal_try_write(struct cache_set *c, bool noflush)
655{
656 struct closure *cl = &c->journal.io.cl;
657
658 if (!closure_trylock(cl, &c->cl))
659 spin_unlock(&c->journal.lock);
660 else if (noflush && journal_full(&c->journal)) {
661 spin_unlock(&c->journal.lock);
662 continue_at(cl, journal_write, system_wq);
663 } else
664 journal_write_unlocked(cl);
665}
666
667#define journal_try_write(c) __journal_try_write(c, false)
668
669void bch_journal_meta(struct cache_set *c, struct closure *cl)
670{
671 struct journal_write *w;
672
673 if (CACHE_SYNC(&c->sb)) {
674 spin_lock(&c->journal.lock);
675
676 w = c->journal.cur;
677 w->need_write = true;
678
679 if (cl)
680 BUG_ON(!closure_wait(&w->wait, cl));
681
682 __journal_try_write(c, true);
683 }
684}
685
686/*
687 * Entry point to the journalling code - bio_insert() and btree_invalidate()
688 * pass bch_journal() a list of keys to be journalled, and then
689 * bch_journal() hands those same keys off to btree_insert_async()
690 */
691
692void bch_journal(struct closure *cl)
693{
694 struct btree_op *op = container_of(cl, struct btree_op, cl);
695 struct cache_set *c = op->c;
696 struct journal_write *w;
697 size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
698
699 if (op->type != BTREE_INSERT ||
700 !CACHE_SYNC(&c->sb))
701 goto out;
702
703 /*
704 * If we're looping because we errored, might already be waiting on
705 * another journal write:
706 */
707 while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
708 closure_sync(cl->parent);
709
710 spin_lock(&c->journal.lock);
711
712 if (journal_full(&c->journal)) {
713 /* XXX: tracepoint */
714 closure_wait(&c->journal.wait, cl);
715
716 journal_reclaim(c);
717 spin_unlock(&c->journal.lock);
718
719 btree_flush_write(c);
720 continue_at(cl, bch_journal, bcache_wq);
721 }
722
723 w = c->journal.cur;
724 w->need_write = true;
725 b = __set_blocks(w->data, w->data->keys + n, c);
726
727 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
728 b > c->journal.blocks_free) {
729 /* XXX: If we were inserting so many keys that they won't fit in
730 * an _empty_ journal write, we'll deadlock. For now, handle
731 * this in bch_keylist_realloc() - but something to think about.
732 */
733 BUG_ON(!w->data->keys);
734
735 /* XXX: tracepoint */
736 BUG_ON(!closure_wait(&w->wait, cl));
737
738 closure_flush(&c->journal.io);
739
740 journal_try_write(c);
741 continue_at(cl, bch_journal, bcache_wq);
742 }
743
744 memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
745 w->data->keys += n;
746
747 op->journal = &fifo_back(&c->journal.pin);
748 atomic_inc(op->journal);
749
750 if (op->flush_journal) {
751 closure_flush(&c->journal.io);
752 closure_wait(&w->wait, cl->parent);
753 }
754
755 journal_try_write(c);
756out:
757 bch_btree_insert_async(cl);
758}
759
760void bch_journal_free(struct cache_set *c)
761{
762 free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
763 free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
764 free_fifo(&c->journal.pin);
765}
766
767int bch_journal_alloc(struct cache_set *c)
768{
769 struct journal *j = &c->journal;
770
771 closure_init_unlocked(&j->io);
772 spin_lock_init(&j->lock);
773
774 c->journal_delay_ms = 100;
775
776 j->w[0].c = c;
777 j->w[1].c = c;
778
779 if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
780 !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
781 !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
782 return -ENOMEM;
783
784 return 0;
785}
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
new file mode 100644
index 000000000000..3d7851274b04
--- /dev/null
+++ b/drivers/md/bcache/journal.h
@@ -0,0 +1,215 @@
1#ifndef _BCACHE_JOURNAL_H
2#define _BCACHE_JOURNAL_H
3
4/*
5 * THE JOURNAL:
6 *
7 * The journal is treated as a circular buffer of buckets - a journal entry
 8 * never spans two buckets. This means (though it's not implemented yet) that the
 9 * journal can be resized at runtime, which will be needed for bcache on raw flash support.
10 *
11 * Journal entries contain a list of keys, ordered by the time they were
12 * inserted; thus journal replay just has to reinsert the keys.
13 *
14 * We also keep some things in the journal header that are logically part of the
15 * superblock - all the things that are frequently updated. This is for future
16 * bcache on raw flash support; the superblock (which will become another
17 * journal) can't be moved or wear leveled, so it contains just enough
18 * information to find the main journal, and the superblock only has to be
19 * rewritten when we want to move/wear level the main journal.
20 *
21 * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
22 * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
23 * from cache misses, which don't have to be journaled, and for writeback and
24 * moving gc we work around it by flushing the btree to disk before updating the
25 * gc information. But it is a potential issue with incremental garbage
26 * collection, and it's fragile.
27 *
28 * OPEN JOURNAL ENTRIES:
29 *
30 * Each journal entry contains, in the header, the sequence number of the last
31 * journal entry still open - i.e. that has keys that haven't been flushed to
32 * disk in the btree.
33 *
34 * We track this by maintaining a refcount for every open journal entry, in a
35 * fifo; each entry in the fifo corresponds to a particular journal
36 * entry/sequence number. When the refcount at the tail of the fifo goes to
37 * zero, we pop it off - thus, the size of the fifo tells us the number of open
38 * journal entries
39 *
40 * We take a refcount on a journal entry when we add some keys to a journal
41 * entry that we're going to insert (held by struct btree_op), and then when we
42 * insert those keys into the btree the btree write we're setting up takes a
43 * copy of that refcount (held by struct btree_write). That refcount is dropped
44 * when the btree write completes.
45 *
46 * A struct btree_write can only hold a refcount on a single journal entry, but
47 * might contain keys for many journal entries - we handle this by making sure
48 * it always has a refcount on the _oldest_ journal entry of all the journal
49 * entries it has keys for.
50 *
51 * JOURNAL RECLAIM:
52 *
53 * As mentioned previously, our fifo of refcounts tells us the number of open
54 * journal entries; from that and the current journal sequence number we compute
55 * last_seq - the oldest journal entry we still need. We write last_seq in each
56 * journal entry, and we also have to keep track of where it exists on disk so
57 * we don't overwrite it when we loop around the journal.
58 *
59 * To do that we track, for each journal bucket, the sequence number of the
60 * newest journal entry it contains - if we don't need that journal entry we
61 * don't need anything in that bucket anymore. From that we track the last
62 * journal bucket we still need; all this is tracked in struct journal_device
63 * and updated by journal_reclaim().
64 *
65 * JOURNAL FILLING UP:
66 *
67 * There are two ways the journal could fill up; either we could run out of
68 * space to write to, or we could have too many open journal entries and run out
69 * of room in the fifo of refcounts. Since those refcounts are decremented
70 * without any locking we can't safely resize that fifo, so we handle it the
71 * same way.
72 *
73 * If the journal fills up, we start flushing dirty btree nodes until we can
74 * allocate space for a journal write again - preferentially flushing btree
75 * nodes that are pinning the oldest journal entries first.
76 */
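
/*
 * A small worked example of the bookkeeping above (numbers are illustrative):
 * suppose journal.seq == 107 and the pin fifo holds refcounts for entries
 * 103..107, so fifo_used(&pin) == 5. Then
 *
 *	last_seq = seq - fifo_used(&pin) + 1 = 107 - 5 + 1 = 103
 *
 * is what gets written into the next journal entry. Once the btree writes
 * holding the refcount for entry 103 complete, that refcount drops to zero,
 * journal_reclaim() pops it off the fifo and last_seq advances to 104.
 */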
77
78#define BCACHE_JSET_VERSION_UUIDv1 1
79/* Always latest UUID format */
80#define BCACHE_JSET_VERSION_UUID 1
81#define BCACHE_JSET_VERSION 1
82
83/*
84 * On disk format for a journal entry:
85 * seq is monotonically increasing; every journal entry has its own unique
86 * sequence number.
87 *
88 * last_seq is the oldest journal entry that still has keys the btree hasn't
89 * flushed to disk yet.
90 *
91 * version is for on disk format changes.
92 */
93struct jset {
94 uint64_t csum;
95 uint64_t magic;
96 uint64_t seq;
97 uint32_t version;
98 uint32_t keys;
99
100 uint64_t last_seq;
101
102 BKEY_PADDED(uuid_bucket);
103 BKEY_PADDED(btree_root);
104 uint16_t btree_level;
105 uint16_t pad[3];
106
107 uint64_t prio_bucket[MAX_CACHES_PER_SET];
108
109 union {
110 struct bkey start[0];
111 uint64_t d[0];
112 };
113};
114
115/*
116 * Only used for holding the journal entries we read in btree_journal_read()
117 * during cache_registration
118 */
119struct journal_replay {
120 struct list_head list;
121 atomic_t *pin;
122 struct jset j;
123};
124
125/*
 126 * We put two of these in struct journal; we use them for writes to the
127 * journal that are being staged or in flight.
128 */
129struct journal_write {
130 struct jset *data;
131#define JSET_BITS 3
132
133 struct cache_set *c;
134 struct closure_waitlist wait;
135 bool need_write;
136};
137
138/* Embedded in struct cache_set */
139struct journal {
140 spinlock_t lock;
141 /* used when waiting because the journal was full */
142 struct closure_waitlist wait;
143 struct closure_with_timer io;
144
145 /* Number of blocks free in the bucket(s) we're currently writing to */
146 unsigned blocks_free;
147 uint64_t seq;
148 DECLARE_FIFO(atomic_t, pin);
149
150 BKEY_PADDED(key);
151
152 struct journal_write w[2], *cur;
153};
154
155/*
156 * Embedded in struct cache. First three fields refer to the array of journal
157 * buckets, in cache_sb.
158 */
159struct journal_device {
160 /*
161 * For each journal bucket, contains the max sequence number of the
162 * journal writes it contains - so we know when a bucket can be reused.
163 */
164 uint64_t seq[SB_JOURNAL_BUCKETS];
165
166 /* Journal bucket we're currently writing to */
167 unsigned cur_idx;
168
169 /* Last journal bucket that still contains an open journal entry */
170 unsigned last_idx;
171
172 /* Next journal bucket to be discarded */
173 unsigned discard_idx;
174
175#define DISCARD_READY 0
176#define DISCARD_IN_FLIGHT 1
177#define DISCARD_DONE 2
 178	/* DISCARD_READY, DISCARD_IN_FLIGHT or DISCARD_DONE (see above) */
179 atomic_t discard_in_flight;
180
181 struct work_struct discard_work;
182 struct bio discard_bio;
183 struct bio_vec discard_bv;
184
185 /* Bio for journal reads/writes to this device */
186 struct bio bio;
187 struct bio_vec bv[8];
188};
189
190#define journal_pin_cmp(c, l, r) \
191 (fifo_idx(&(c)->journal.pin, (l)->journal) > \
192 fifo_idx(&(c)->journal.pin, (r)->journal))
193
194#define JOURNAL_PIN 20000
195
196#define journal_full(j) \
197 (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
198
199struct closure;
200struct cache_set;
201struct btree_op;
202
203void bch_journal(struct closure *);
204void bch_journal_next(struct journal *);
205void bch_journal_mark(struct cache_set *, struct list_head *);
206void bch_journal_meta(struct cache_set *, struct closure *);
207int bch_journal_read(struct cache_set *, struct list_head *,
208 struct btree_op *);
209int bch_journal_replay(struct cache_set *, struct list_head *,
210 struct btree_op *);
211
212void bch_journal_free(struct cache_set *);
213int bch_journal_alloc(struct cache_set *);
214
215#endif /* _BCACHE_JOURNAL_H */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
new file mode 100644
index 000000000000..c69fc92b02cf
--- /dev/null
+++ b/drivers/md/bcache/movinggc.c
@@ -0,0 +1,254 @@
1/*
2 * Moving/copying garbage collector
3 *
4 * Copyright 2012 Google, Inc.
5 */
6
7#include "bcache.h"
8#include "btree.h"
9#include "debug.h"
10#include "request.h"
11
12struct moving_io {
13 struct keybuf_key *w;
14 struct search s;
15 struct bbio bio;
16};
17
18static bool moving_pred(struct keybuf *buf, struct bkey *k)
19{
20 struct cache_set *c = container_of(buf, struct cache_set,
21 moving_gc_keys);
22 unsigned i;
23
24 for (i = 0; i < KEY_PTRS(k); i++) {
25 struct cache *ca = PTR_CACHE(c, k, i);
26 struct bucket *g = PTR_BUCKET(c, k, i);
27
28 if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
29 return true;
30 }
31
32 return false;
33}
34
35/* Moving GC - IO loop */
36
37static void moving_io_destructor(struct closure *cl)
38{
39 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
40 kfree(io);
41}
42
43static void write_moving_finish(struct closure *cl)
44{
45 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
46 struct bio *bio = &io->bio.bio;
47 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
48
49 while (bv-- != bio->bi_io_vec)
50 __free_page(bv->bv_page);
51
52 pr_debug("%s %s", io->s.op.insert_collision
53 ? "collision moving" : "moved",
54 pkey(&io->w->key));
55
56 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
57
58 atomic_dec_bug(&io->s.op.c->in_flight);
59 closure_wake_up(&io->s.op.c->moving_gc_wait);
60
61 closure_return_with_destructor(cl, moving_io_destructor);
62}
63
64static void read_moving_endio(struct bio *bio, int error)
65{
66 struct moving_io *io = container_of(bio->bi_private,
67 struct moving_io, s.cl);
68
69 if (error)
70 io->s.error = error;
71
72 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
73}
74
75static void moving_init(struct moving_io *io)
76{
77 struct bio *bio = &io->bio.bio;
78
79 bio_init(bio);
80 bio_get(bio);
81 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
82
83 bio->bi_size = KEY_SIZE(&io->w->key) << 9;
84 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
85 PAGE_SECTORS);
86 bio->bi_private = &io->s.cl;
87 bio->bi_io_vec = bio->bi_inline_vecs;
88 bio_map(bio, NULL);
89}
90
91static void write_moving(struct closure *cl)
92{
93 struct search *s = container_of(cl, struct search, cl);
94 struct moving_io *io = container_of(s, struct moving_io, s);
95
96 if (!s->error) {
97 trace_bcache_write_moving(&io->bio.bio);
98
99 moving_init(io);
100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
102 s->op.lock = -1;
103 s->op.write_prio = 1;
104 s->op.cache_bio = &io->bio.bio;
105
106 s->writeback = KEY_DIRTY(&io->w->key);
107 s->op.csum = KEY_CSUM(&io->w->key);
108
109 s->op.type = BTREE_REPLACE;
110 bkey_copy(&s->op.replace, &io->w->key);
111
112 closure_init(&s->op.cl, cl);
113 bch_insert_data(&s->op.cl);
114 }
115
116 continue_at(cl, write_moving_finish, NULL);
117}
118
119static void read_moving_submit(struct closure *cl)
120{
121 struct search *s = container_of(cl, struct search, cl);
122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio;
124
125 trace_bcache_read_moving(bio);
126 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
127
128 continue_at(cl, write_moving, bch_gc_wq);
129}
130
131static void read_moving(struct closure *cl)
132{
133 struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
134 struct keybuf_key *w;
135 struct moving_io *io;
136 struct bio *bio;
137
138 /* XXX: if we error, background writeback could stall indefinitely */
139
140 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
141 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
142 if (!w)
143 break;
144
145 io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
146 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
147 GFP_KERNEL);
148 if (!io)
149 goto err;
150
151 w->private = io;
152 io->w = w;
153 io->s.op.inode = KEY_INODE(&w->key);
154 io->s.op.c = c;
155
156 moving_init(io);
157 bio = &io->bio.bio;
158
159 bio->bi_rw = READ;
160 bio->bi_end_io = read_moving_endio;
161
162 if (bio_alloc_pages(bio, GFP_KERNEL))
163 goto err;
164
165 pr_debug("%s", pkey(&w->key));
166
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
168
169 if (atomic_inc_return(&c->in_flight) >= 64) {
170 closure_wait_event(&c->moving_gc_wait, cl,
171 atomic_read(&c->in_flight) < 64);
172 continue_at(cl, read_moving, bch_gc_wq);
173 }
174 }
175
176 if (0) {
177err: if (!IS_ERR_OR_NULL(w->private))
178 kfree(w->private);
179
180 bch_keybuf_del(&c->moving_gc_keys, w);
181 }
182
183 closure_return(cl);
184}
185
186void bch_moving_gc(struct closure *cl)
187{
188 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
189 struct cache *ca;
190 struct bucket *b;
191 unsigned i;
192
193 bool bucket_cmp(struct bucket *l, struct bucket *r)
194 {
195 return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
196 }
197
198 unsigned top(struct cache *ca)
199 {
200 return GC_SECTORS_USED(heap_peek(&ca->heap));
201 }
202
203 if (!c->copy_gc_enabled)
204 closure_return(cl);
205
206 mutex_lock(&c->bucket_lock);
207
208 for_each_cache(ca, c, i) {
209 unsigned sectors_to_move = 0;
210 unsigned reserve_sectors = ca->sb.bucket_size *
211 min(fifo_used(&ca->free), ca->free.size / 2);
212
213 ca->heap.used = 0;
214
215 for_each_bucket(b, ca) {
216 if (!GC_SECTORS_USED(b))
217 continue;
218
219 if (!heap_full(&ca->heap)) {
220 sectors_to_move += GC_SECTORS_USED(b);
221 heap_add(&ca->heap, b, bucket_cmp);
222 } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
223 sectors_to_move -= top(ca);
224 sectors_to_move += GC_SECTORS_USED(b);
225
226 ca->heap.data[0] = b;
227 heap_sift(&ca->heap, 0, bucket_cmp);
228 }
229 }
230
231 while (sectors_to_move > reserve_sectors) {
232 heap_pop(&ca->heap, b, bucket_cmp);
233 sectors_to_move -= GC_SECTORS_USED(b);
234 }
235
236 ca->gc_move_threshold = top(ca);
237
238 pr_debug("threshold %u", ca->gc_move_threshold);
239 }
240
241 mutex_unlock(&c->bucket_lock);
242
243 c->moving_gc_keys.last_scanned = ZERO_KEY;
244
245 closure_init(&c->moving_gc, cl);
246 read_moving(&c->moving_gc);
247
248 closure_return(cl);
249}
250
251void bch_moving_init_cache_set(struct cache_set *c)
252{
253 bch_keybuf_init(&c->moving_gc_keys, moving_pred);
254}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
new file mode 100644
index 000000000000..4f552de49aaa
--- /dev/null
+++ b/drivers/md/bcache/request.c
@@ -0,0 +1,1409 @@
1/*
2 * Main bcache entry point - handle a read or a write request and decide what to
3 * do with it; the make_request functions are called by the block layer.
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "request.h"
13
14#include <linux/cgroup.h>
15#include <linux/module.h>
16#include <linux/hash.h>
17#include <linux/random.h>
18#include "blk-cgroup.h"
19
20#include <trace/events/bcache.h>
21
22#define CUTOFF_CACHE_ADD 95
23#define CUTOFF_CACHE_READA 90
24#define CUTOFF_WRITEBACK 50
25#define CUTOFF_WRITEBACK_SYNC 75
26
27struct kmem_cache *bch_search_cache;
28
29static void check_should_skip(struct cached_dev *, struct search *);
30
31/* Cgroup interface */
32
33#ifdef CONFIG_CGROUP_BCACHE
34static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
35
36static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
37{
38 struct cgroup_subsys_state *css;
39 return cgroup &&
40 (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
41 ? container_of(css, struct bch_cgroup, css)
42 : &bcache_default_cgroup;
43}
44
45struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
46{
47 struct cgroup_subsys_state *css = bio->bi_css
48 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
49 : task_subsys_state(current, bcache_subsys_id);
50
51 return css
52 ? container_of(css, struct bch_cgroup, css)
53 : &bcache_default_cgroup;
54}
55
56static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
57 struct file *file,
58 char __user *buf, size_t nbytes, loff_t *ppos)
59{
60 char tmp[1024];
61 int len = snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
62 cgroup_to_bcache(cgrp)->cache_mode + 1);
63
64 if (len < 0)
65 return len;
66
67 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
68}
69
70static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
71 const char *buf)
72{
73 int v = read_string_list(buf, bch_cache_modes);
74 if (v < 0)
75 return v;
76
77 cgroup_to_bcache(cgrp)->cache_mode = v - 1;
78 return 0;
79}
80
81static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
82{
83 return cgroup_to_bcache(cgrp)->verify;
84}
85
86static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
87{
88 cgroup_to_bcache(cgrp)->verify = val;
89 return 0;
90}
91
92static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
93{
94 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
95 return atomic_read(&bcachecg->stats.cache_hits);
96}
97
98static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
99{
100 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
101 return atomic_read(&bcachecg->stats.cache_misses);
102}
103
104static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
105 struct cftype *cft)
106{
107 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
108 return atomic_read(&bcachecg->stats.cache_bypass_hits);
109}
110
111static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
112 struct cftype *cft)
113{
114 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
115 return atomic_read(&bcachecg->stats.cache_bypass_misses);
116}
117
118static struct cftype bch_files[] = {
119 {
120 .name = "cache_mode",
121 .read = cache_mode_read,
122 .write_string = cache_mode_write,
123 },
124 {
125 .name = "verify",
126 .read_u64 = bch_verify_read,
127 .write_u64 = bch_verify_write,
128 },
129 {
130 .name = "cache_hits",
131 .read_u64 = bch_cache_hits_read,
132 },
133 {
134 .name = "cache_misses",
135 .read_u64 = bch_cache_misses_read,
136 },
137 {
138 .name = "cache_bypass_hits",
139 .read_u64 = bch_cache_bypass_hits_read,
140 },
141 {
142 .name = "cache_bypass_misses",
143 .read_u64 = bch_cache_bypass_misses_read,
144 },
145 { } /* terminate */
146};
147
148static void init_bch_cgroup(struct bch_cgroup *cg)
149{
150 cg->cache_mode = -1;
151}
152
153static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
154{
155 struct bch_cgroup *cg;
156
157 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
158 if (!cg)
159 return ERR_PTR(-ENOMEM);
160 init_bch_cgroup(cg);
161 return &cg->css;
162}
163
164static void bcachecg_destroy(struct cgroup *cgroup)
165{
166 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
167 free_css_id(&bcache_subsys, &cg->css);
168 kfree(cg);
169}
170
171struct cgroup_subsys bcache_subsys = {
172 .create = bcachecg_create,
173 .destroy = bcachecg_destroy,
174 .subsys_id = bcache_subsys_id,
175 .name = "bcache",
176 .module = THIS_MODULE,
177};
178EXPORT_SYMBOL_GPL(bcache_subsys);
179#endif
180
181static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
182{
183#ifdef CONFIG_CGROUP_BCACHE
184 int r = bch_bio_to_cgroup(bio)->cache_mode;
185 if (r >= 0)
186 return r;
187#endif
188 return BDEV_CACHE_MODE(&dc->sb);
189}
190
191static bool verify(struct cached_dev *dc, struct bio *bio)
192{
193#ifdef CONFIG_CGROUP_BCACHE
194 if (bch_bio_to_cgroup(bio)->verify)
195 return true;
196#endif
197 return dc->verify;
198}
199
200static void bio_csum(struct bio *bio, struct bkey *k)
201{
202 struct bio_vec *bv;
203 uint64_t csum = 0;
204 int i;
205
206 bio_for_each_segment(bv, bio, i) {
207 void *d = kmap(bv->bv_page) + bv->bv_offset;
208 csum = crc64_update(csum, d, bv->bv_len);
209 kunmap(bv->bv_page);
210 }
211
212 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
213}
214
215/* Insert data into cache */
216
217static void bio_invalidate(struct closure *cl)
218{
219 struct btree_op *op = container_of(cl, struct btree_op, cl);
220 struct bio *bio = op->cache_bio;
221
222 pr_debug("invalidating %i sectors from %llu",
223 bio_sectors(bio), (uint64_t) bio->bi_sector);
224
225 while (bio_sectors(bio)) {
226 unsigned len = min(bio_sectors(bio), 1U << 14);
227
228 if (bch_keylist_realloc(&op->keys, 0, op->c))
229 goto out;
230
231 bio->bi_sector += len;
232 bio->bi_size -= len << 9;
233
234 bch_keylist_add(&op->keys,
235 &KEY(op->inode, bio->bi_sector, len));
236 }
237
238 op->insert_data_done = true;
239 bio_put(bio);
240out:
241 continue_at(cl, bch_journal, bcache_wq);
242}
243
244struct open_bucket {
245 struct list_head list;
246 struct task_struct *last;
247 unsigned sectors_free;
248 BKEY_PADDED(key);
249};
250
251void bch_open_buckets_free(struct cache_set *c)
252{
253 struct open_bucket *b;
254
255 while (!list_empty(&c->data_buckets)) {
256 b = list_first_entry(&c->data_buckets,
257 struct open_bucket, list);
258 list_del(&b->list);
259 kfree(b);
260 }
261}
262
263int bch_open_buckets_alloc(struct cache_set *c)
264{
265 int i;
266
267 spin_lock_init(&c->data_bucket_lock);
268
269 for (i = 0; i < 6; i++) {
270 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
271 if (!b)
272 return -ENOMEM;
273
274 list_add(&b->list, &c->data_buckets);
275 }
276
277 return 0;
278}
279
280/*
281 * We keep multiple buckets open for writes, and try to segregate different
282 * write streams for better cache utilization: first we look for a bucket where
283 * the last write to it was sequential with the current write, and failing that
284 * we look for a bucket that was last used by the same task.
285 *
 286 * The idea is that if you've got multiple tasks pulling data into the cache at the
287 * same time, you'll get better cache utilization if you try to segregate their
288 * data and preserve locality.
289 *
 290 * For example, say you're starting Firefox at the same time you're copying a
291 * bunch of files. Firefox will likely end up being fairly hot and stay in the
292 * cache awhile, but the data you copied might not be; if you wrote all that
293 * data to the same buckets it'd get invalidated at the same time.
294 *
295 * Both of those tasks will be doing fairly random IO so we can't rely on
296 * detecting sequential IO to segregate their data, but going off of the task
297 * should be a sane heuristic.
298 */
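
/*
 * In short, pick_data_bucket() below prefers, in order:
 *	1. an open bucket whose key matches @search (i.e. this write is
 *	   sequential with the last write to that bucket),
 *	2. an open bucket last written to by the same task,
 *	3. failing both, the least recently used open bucket.
 */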
299static struct open_bucket *pick_data_bucket(struct cache_set *c,
300 const struct bkey *search,
301 struct task_struct *task,
302 struct bkey *alloc)
303{
304 struct open_bucket *ret, *ret_task = NULL;
305
306 list_for_each_entry_reverse(ret, &c->data_buckets, list)
307 if (!bkey_cmp(&ret->key, search))
308 goto found;
309 else if (ret->last == task)
310 ret_task = ret;
311
312 ret = ret_task ?: list_first_entry(&c->data_buckets,
313 struct open_bucket, list);
314found:
315 if (!ret->sectors_free && KEY_PTRS(alloc)) {
316 ret->sectors_free = c->sb.bucket_size;
317 bkey_copy(&ret->key, alloc);
318 bkey_init(alloc);
319 }
320
321 if (!ret->sectors_free)
322 ret = NULL;
323
324 return ret;
325}
326
327/*
 328 * Allocates some space in the cache to write to, sets k to point to the newly
 329 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 330 * end of the newly allocated space).
 331 *
 332 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
 333 * sectors were actually allocated.
334 *
335 * If s->writeback is true, will not fail.
336 */
337static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
338 struct search *s)
339{
340 struct cache_set *c = s->op.c;
341 struct open_bucket *b;
342 BKEY_PADDED(key) alloc;
343 struct closure cl, *w = NULL;
344 unsigned i;
345
346 if (s->writeback) {
347 closure_init_stack(&cl);
348 w = &cl;
349 }
350
351 /*
352 * We might have to allocate a new bucket, which we can't do with a
353 * spinlock held. So if we have to allocate, we drop the lock, allocate
354 * and then retry. KEY_PTRS() indicates whether alloc points to
355 * allocated bucket(s).
356 */
357
358 bkey_init(&alloc.key);
359 spin_lock(&c->data_bucket_lock);
360
361 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
362 unsigned watermark = s->op.write_prio
363 ? WATERMARK_MOVINGGC
364 : WATERMARK_NONE;
365
366 spin_unlock(&c->data_bucket_lock);
367
368 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
369 return false;
370
371 spin_lock(&c->data_bucket_lock);
372 }
373
374 /*
375 * If we had to allocate, we might race and not need to allocate the
375 * second time we call pick_data_bucket(). If we allocated a bucket but
377 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
378 */
379 if (KEY_PTRS(&alloc.key))
380 __bkey_put(c, &alloc.key);
381
382 for (i = 0; i < KEY_PTRS(&b->key); i++)
383 EBUG_ON(ptr_stale(c, &b->key, i));
384
385 /* Set up the pointer to the space we're allocating: */
386
387 for (i = 0; i < KEY_PTRS(&b->key); i++)
388 k->ptr[i] = b->key.ptr[i];
389
390 sectors = min(sectors, b->sectors_free);
391
392 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
393 SET_KEY_SIZE(k, sectors);
394 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
395
396 /*
397 * Move b to the end of the lru, and keep track of what this bucket was
398 * last used for:
399 */
400 list_move_tail(&b->list, &c->data_buckets);
401 bkey_copy_key(&b->key, k);
402 b->last = s->task;
403
404 b->sectors_free -= sectors;
405
406 for (i = 0; i < KEY_PTRS(&b->key); i++) {
407 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
408
409 atomic_long_add(sectors,
410 &PTR_CACHE(c, &b->key, i)->sectors_written);
411 }
412
413 if (b->sectors_free < c->sb.block_size)
414 b->sectors_free = 0;
415
416 /*
417 * k takes refcounts on the buckets it points to until it's inserted
418 * into the btree, but if we're done with this bucket we just transfer
419 * the open bucket's existing refcount (taken when it was allocated).
420 */
421 if (b->sectors_free)
422 for (i = 0; i < KEY_PTRS(&b->key); i++)
423 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
424
425 spin_unlock(&c->data_bucket_lock);
426 return true;
427}
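/*
 * A condensed sketch of how this is used in bch_insert_data_loop() below:
 * initialize a key with the destination inode and the bio's starting sector,
 * let bch_alloc_sectors() fill in the pointers and size, then split off and
 * submit that much of the bio:
 *
 *	k = op->keys.top;
 *	bkey_init(k);
 *	SET_KEY_INODE(k, op->inode);
 *	SET_KEY_OFFSET(k, bio->bi_sector);
 *
 *	if (bch_alloc_sectors(k, bio_sectors(bio), s))
 *		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
 */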
428
429static void bch_insert_data_error(struct closure *cl)
430{
431 struct btree_op *op = container_of(cl, struct btree_op, cl);
432
433 /*
434 * Our data write just errored, which means we've got a bunch of keys to
435 * insert that point to data that wasn't successfully written.
436 *
437 * We don't have to insert those keys but we still have to invalidate
438 * that region of the cache - so, if we just strip off all the pointers
439 * from the keys we'll accomplish just that.
440 */
441
442 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
443
444 while (src != op->keys.top) {
445 struct bkey *n = bkey_next(src);
446
447 SET_KEY_PTRS(src, 0);
448 bkey_copy(dst, src);
449
450 dst = bkey_next(dst);
451 src = n;
452 }
453
454 op->keys.top = dst;
455
456 bch_journal(cl);
457}
458
459static void bch_insert_data_endio(struct bio *bio, int error)
460{
461 struct closure *cl = bio->bi_private;
462 struct btree_op *op = container_of(cl, struct btree_op, cl);
463 struct search *s = container_of(op, struct search, op);
464
465 if (error) {
466 /* TODO: We could try to recover from this. */
467 if (s->writeback)
468 s->error = error;
469 else if (s->write)
470 set_closure_fn(cl, bch_insert_data_error, bcache_wq);
471 else
472 set_closure_fn(cl, NULL, NULL);
473 }
474
475 bch_bbio_endio(op->c, bio, error, "writing data to cache");
476}
477
478static void bch_insert_data_loop(struct closure *cl)
479{
480 struct btree_op *op = container_of(cl, struct btree_op, cl);
481 struct search *s = container_of(op, struct search, op);
482 struct bio *bio = op->cache_bio, *n;
483
484 if (op->skip)
485 return bio_invalidate(cl);
486
487 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
488 set_gc_sectors(op->c);
489 bch_queue_gc(op->c);
490 }
491
492 do {
493 unsigned i;
494 struct bkey *k;
495 struct bio_set *split = s->d
496 ? s->d->bio_split : op->c->bio_split;
497
498 /* 1 for the device pointer and 1 for the chksum */
499 if (bch_keylist_realloc(&op->keys,
500 1 + (op->csum ? 1 : 0),
501 op->c))
502 continue_at(cl, bch_journal, bcache_wq);
503
504 k = op->keys.top;
505 bkey_init(k);
506 SET_KEY_INODE(k, op->inode);
507 SET_KEY_OFFSET(k, bio->bi_sector);
508
509 if (!bch_alloc_sectors(k, bio_sectors(bio), s))
510 goto err;
511
512 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
513 if (!n) {
514 __bkey_put(op->c, k);
515 continue_at(cl, bch_insert_data_loop, bcache_wq);
516 }
517
518 n->bi_end_io = bch_insert_data_endio;
519 n->bi_private = cl;
520
521 if (s->writeback) {
522 SET_KEY_DIRTY(k, true);
523
524 for (i = 0; i < KEY_PTRS(k); i++)
525 SET_GC_MARK(PTR_BUCKET(op->c, k, i),
526 GC_MARK_DIRTY);
527 }
528
529 SET_KEY_CSUM(k, op->csum);
530 if (KEY_CSUM(k))
531 bio_csum(n, k);
532
533 pr_debug("%s", pkey(k));
534 bch_keylist_push(&op->keys);
535
536 trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
537 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0);
539 } while (n != bio);
540
541 op->insert_data_done = true;
542 continue_at(cl, bch_journal, bcache_wq);
543err:
544 /* bch_alloc_sectors() blocks if s->writeback = true */
545 BUG_ON(s->writeback);
546
547 /*
548 * But if it's not a writeback write we'd rather just bail out if
549 * there aren't any buckets ready to write to - it might take a while and
550 * we might be starving btree writes for gc or something.
551 */
552
553 if (s->write) {
554 /*
555 * Writethrough write: We can't complete the write until we've
556 * updated the index. But we don't want to delay the write while
557 * we wait for buckets to be freed up, so just invalidate the
558 * rest of the write.
559 */
560 op->skip = true;
561 return bio_invalidate(cl);
562 } else {
563 /*
564 * From a cache miss, we can just insert the keys for the data
565 * we have written or bail out if we didn't do anything.
566 */
567 op->insert_data_done = true;
568 bio_put(bio);
569
570 if (!bch_keylist_empty(&op->keys))
571 continue_at(cl, bch_journal, bcache_wq);
572 else
573 closure_return(cl);
574 }
575}
576
577/**
578 * bch_insert_data - stick some data in the cache
579 *
580 * This is the starting point for any data to end up in a cache device; it could
581 * be from a normal write, or a writeback write, or a write to a flash only
582 * volume - it's also used by the moving garbage collector to compact data in
583 * mostly empty buckets.
584 *
585 * It first writes the data to the cache, creating a list of keys to be inserted
586 * (if the data had to be fragmented there will be multiple keys); after the
587 * data is written it calls bch_journal, and after the keys have been added to
588 * the next journal write they're inserted into the btree.
589 *
590 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
591 * and op->inode is used for the key inode.
592 *
593 * If op->skip is true, instead of inserting the data it invalidates the region
594 * of the cache represented by op->cache_bio and op->inode.
595 */
596void bch_insert_data(struct closure *cl)
597{
598 struct btree_op *op = container_of(cl, struct btree_op, cl);
599
600 bch_keylist_init(&op->keys);
601 bio_get(op->cache_bio);
602 bch_insert_data_loop(cl);
603}
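/*
 * Callers in this file kick off this path with
 * closure_call(&s->op.cl, bch_insert_data, NULL, cl); see request_read_done(),
 * request_write() and flash_dev_make_request() below.
 */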
604
605void bch_btree_insert_async(struct closure *cl)
606{
607 struct btree_op *op = container_of(cl, struct btree_op, cl);
608 struct search *s = container_of(op, struct search, op);
609
610 if (bch_btree_insert(op, op->c)) {
611 s->error = -ENOMEM;
612 op->insert_data_done = true;
613 }
614
615 if (op->insert_data_done) {
616 bch_keylist_free(&op->keys);
617 closure_return(cl);
618 } else
619 continue_at(cl, bch_insert_data_loop, bcache_wq);
620}
621
622/* Common code for the make_request functions */
623
624static void request_endio(struct bio *bio, int error)
625{
626 struct closure *cl = bio->bi_private;
627
628 if (error) {
629 struct search *s = container_of(cl, struct search, cl);
630 s->error = error;
631 /* Only cache read errors are recoverable */
632 s->recoverable = false;
633 }
634
635 bio_put(bio);
636 closure_put(cl);
637}
638
639void bch_cache_read_endio(struct bio *bio, int error)
640{
641 struct bbio *b = container_of(bio, struct bbio, bio);
642 struct closure *cl = bio->bi_private;
643 struct search *s = container_of(cl, struct search, cl);
644
645 /*
646 * If the bucket was reused while our bio was in flight, we might have
647 * read the wrong data. Set s->error but not error, so the failure isn't
648 * counted against the cache device; we'll still reread the data
649 * from the backing device.
650 */
651
652 if (error)
653 s->error = error;
654 else if (ptr_stale(s->op.c, &b->key, 0)) {
655 atomic_long_inc(&s->op.c->cache_read_races);
656 s->error = -EINTR;
657 }
658
659 bch_bbio_endio(s->op.c, bio, error, "reading from cache");
660}
661
662static void bio_complete(struct search *s)
663{
664 if (s->orig_bio) {
665 int cpu, rw = bio_data_dir(s->orig_bio);
666 unsigned long duration = jiffies - s->start_time;
667
668 cpu = part_stat_lock();
669 part_round_stats(cpu, &s->d->disk->part0);
670 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
671 part_stat_unlock();
672
673 trace_bcache_request_end(s, s->orig_bio);
674 bio_endio(s->orig_bio, s->error);
675 s->orig_bio = NULL;
676 }
677}
678
679static void do_bio_hook(struct search *s)
680{
681 struct bio *bio = &s->bio.bio;
682 memcpy(bio, s->orig_bio, sizeof(struct bio));
683
684 bio->bi_end_io = request_endio;
685 bio->bi_private = &s->cl;
686 atomic_set(&bio->bi_cnt, 3);
687}
688
689static void search_free(struct closure *cl)
690{
691 struct search *s = container_of(cl, struct search, cl);
692 bio_complete(s);
693
694 if (s->op.cache_bio)
695 bio_put(s->op.cache_bio);
696
697 if (s->unaligned_bvec)
698 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
699
700 closure_debug_destroy(cl);
701 mempool_free(s, s->d->c->search);
702}
703
704static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
705{
706 struct bio_vec *bv;
707 struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
708 memset(s, 0, offsetof(struct search, op.keys));
709
710 __closure_init(&s->cl, NULL);
711
712 s->op.inode = d->id;
713 s->op.c = d->c;
714 s->d = d;
715 s->op.lock = -1;
716 s->task = current;
717 s->orig_bio = bio;
718 s->write = (bio->bi_rw & REQ_WRITE) != 0;
719 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
720 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
721 s->recoverable = 1;
722 s->start_time = jiffies;
723 do_bio_hook(s);
724
725 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
726 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
727 memcpy(bv, bio_iovec(bio),
728 sizeof(struct bio_vec) * bio_segments(bio));
729
730 s->bio.bio.bi_io_vec = bv;
731 s->unaligned_bvec = 1;
732 }
733
734 return s;
735}
736
737static void btree_read_async(struct closure *cl)
738{
739 struct btree_op *op = container_of(cl, struct btree_op, cl);
740
741 int ret = btree_root(search_recurse, op->c, op);
742
743 if (ret == -EAGAIN)
744 continue_at(cl, btree_read_async, bcache_wq);
745
746 closure_return(cl);
747}
748
749/* Cached devices */
750
751static void cached_dev_bio_complete(struct closure *cl)
752{
753 struct search *s = container_of(cl, struct search, cl);
754 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
755
756 search_free(cl);
757 cached_dev_put(dc);
758}
759
760/* Process reads */
761
762static void cached_dev_read_complete(struct closure *cl)
763{
764 struct search *s = container_of(cl, struct search, cl);
765
766 if (s->op.insert_collision)
767 bch_mark_cache_miss_collision(s);
768
769 if (s->op.cache_bio) {
770 int i;
771 struct bio_vec *bv;
772
773 __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
774 __free_page(bv->bv_page);
775 }
776
777 cached_dev_bio_complete(cl);
778}
779
780static void request_read_error(struct closure *cl)
781{
782 struct search *s = container_of(cl, struct search, cl);
783 struct bio_vec *bv;
784 int i;
785
786 if (s->recoverable) {
787 /* The cache read failed, but we can retry from the backing
788 * device.
789 */
790 pr_debug("recovering at sector %llu",
791 (uint64_t) s->orig_bio->bi_sector);
792
793 s->error = 0;
794 bv = s->bio.bio.bi_io_vec;
795 do_bio_hook(s);
796 s->bio.bio.bi_io_vec = bv;
797
798 if (!s->unaligned_bvec)
799 bio_for_each_segment(bv, s->orig_bio, i)
800 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
801 else
802 memcpy(s->bio.bio.bi_io_vec,
803 bio_iovec(s->orig_bio),
804 sizeof(struct bio_vec) *
805 bio_segments(s->orig_bio));
806
807 /* XXX: invalidate cache */
808
809 trace_bcache_read_retry(&s->bio.bio);
810 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
811 }
812
813 continue_at(cl, cached_dev_read_complete, NULL);
814}
815
816static void request_read_done(struct closure *cl)
817{
818 struct search *s = container_of(cl, struct search, cl);
819 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
820
821 /*
822 * s->op.cache_bio != NULL implies that we had a cache miss; cache_bio now
823 * contains data ready to be inserted into the cache.
824 *
825 * First, we copy the data we just read from cache_bio's bounce buffers
826 * to the buffers the original bio pointed to:
827 */
828
829 if (s->op.cache_bio) {
830 struct bio_vec *src, *dst;
831 unsigned src_offset, dst_offset, bytes;
832 void *dst_ptr;
833
834 bio_reset(s->op.cache_bio);
835 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
836 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
837 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
838 bio_map(s->op.cache_bio, NULL);
839
840 src = bio_iovec(s->op.cache_bio);
841 dst = bio_iovec(s->cache_miss);
842 src_offset = src->bv_offset;
843 dst_offset = dst->bv_offset;
844 dst_ptr = kmap(dst->bv_page);
845
846 while (1) {
847 if (dst_offset == dst->bv_offset + dst->bv_len) {
848 kunmap(dst->bv_page);
849 dst++;
850 if (dst == bio_iovec_idx(s->cache_miss,
851 s->cache_miss->bi_vcnt))
852 break;
853
854 dst_offset = dst->bv_offset;
855 dst_ptr = kmap(dst->bv_page);
856 }
857
858 if (src_offset == src->bv_offset + src->bv_len) {
859 src++;
860 if (src == bio_iovec_idx(s->op.cache_bio,
861 s->op.cache_bio->bi_vcnt))
862 BUG();
863
864 src_offset = src->bv_offset;
865 }
866
867 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
868 src->bv_offset + src->bv_len - src_offset);
869
870 memcpy(dst_ptr + dst_offset,
871 page_address(src->bv_page) + src_offset,
872 bytes);
873
874 src_offset += bytes;
875 dst_offset += bytes;
876 }
877
878 bio_put(s->cache_miss);
879 s->cache_miss = NULL;
880 }
881
882 if (verify(dc, &s->bio.bio) && s->recoverable)
883 bch_data_verify(s);
884
885 bio_complete(s);
886
887 if (s->op.cache_bio &&
888 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
889 s->op.type = BTREE_REPLACE;
890 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
891 }
892
893 continue_at(cl, cached_dev_read_complete, NULL);
894}
895
896static void request_read_done_bh(struct closure *cl)
897{
898 struct search *s = container_of(cl, struct search, cl);
899 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
900
901 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
902
903 if (s->error)
904 continue_at_nobarrier(cl, request_read_error, bcache_wq);
905 else if (s->op.cache_bio || verify(dc, &s->bio.bio))
906 continue_at_nobarrier(cl, request_read_done, bcache_wq);
907 else
908 continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
909}
910
911static int cached_dev_cache_miss(struct btree *b, struct search *s,
912 struct bio *bio, unsigned sectors)
913{
914 int ret = 0;
915 unsigned reada;
916 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
917 struct bio *miss;
918
919 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
920 if (!miss)
921 return -EAGAIN;
922
923 if (miss == bio)
924 s->op.lookup_done = true;
925
926 miss->bi_end_io = request_endio;
927 miss->bi_private = &s->cl;
928
929 if (s->cache_miss || s->op.skip)
930 goto out_submit;
931
932 if (miss != bio ||
933 (bio->bi_rw & REQ_RAHEAD) ||
934 (bio->bi_rw & REQ_META) ||
935 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
936 reada = 0;
937 else {
938 reada = min(dc->readahead >> 9,
939 sectors - bio_sectors(miss));
940
941 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
942 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
943 }
944
945 s->cache_bio_sectors = bio_sectors(miss) + reada;
946 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
947 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
948 dc->disk.bio_split);
949
950 if (!s->op.cache_bio)
951 goto out_submit;
952
953 s->op.cache_bio->bi_sector = miss->bi_sector;
954 s->op.cache_bio->bi_bdev = miss->bi_bdev;
955 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
956
957 s->op.cache_bio->bi_end_io = request_endio;
958 s->op.cache_bio->bi_private = &s->cl;
959
960 /* btree_search_recurse()'s btree iterator is no good anymore */
961 ret = -EINTR;
962 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
963 goto out_put;
964
965 bio_map(s->op.cache_bio, NULL);
966 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
967 goto out_put;
968
969 s->cache_miss = miss;
970 bio_get(s->op.cache_bio);
971
972 trace_bcache_cache_miss(s->orig_bio);
973 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
974
975 return ret;
976out_put:
977 bio_put(s->op.cache_bio);
978 s->op.cache_bio = NULL;
979out_submit:
980 closure_bio_submit(miss, &s->cl, s->d);
981 return ret;
982}
983
984static void request_read(struct cached_dev *dc, struct search *s)
985{
986 struct closure *cl = &s->cl;
987
988 check_should_skip(dc, s);
989 closure_call(&s->op.cl, btree_read_async, NULL, cl);
990
991 continue_at(cl, request_read_done_bh, NULL);
992}
993
994/* Process writes */
995
996static void cached_dev_write_complete(struct closure *cl)
997{
998 struct search *s = container_of(cl, struct search, cl);
999 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
1000
1001 up_read_non_owner(&dc->writeback_lock);
1002 cached_dev_bio_complete(cl);
1003}
1004
1005static bool should_writeback(struct cached_dev *dc, struct bio *bio)
1006{
1007 unsigned threshold = (bio->bi_rw & REQ_SYNC)
1008 ? CUTOFF_WRITEBACK_SYNC
1009 : CUTOFF_WRITEBACK;
1010
1011 return !atomic_read(&dc->disk.detaching) &&
1012 cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
1013 dc->disk.c->gc_stats.in_use < threshold;
1014}
1015
1016static void request_write(struct cached_dev *dc, struct search *s)
1017{
1018 struct closure *cl = &s->cl;
1019 struct bio *bio = &s->bio.bio;
1020 struct bkey start, end;
1021 start = KEY(dc->disk.id, bio->bi_sector, 0);
1022 end = KEY(dc->disk.id, bio_end(bio), 0);
1023
1024 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1025
1026 check_should_skip(dc, s);
1027 down_read_non_owner(&dc->writeback_lock);
1028
1029 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
1030 s->op.skip = false;
1031 s->writeback = true;
1032 }
1033
1034 if (bio->bi_rw & REQ_DISCARD)
1035 goto skip;
1036
1037 if (s->op.skip)
1038 goto skip;
1039
1040 if (should_writeback(dc, s->orig_bio))
1041 s->writeback = true;
1042
1043 if (!s->writeback) {
1044 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1045 dc->disk.bio_split);
1046
1047 trace_bcache_writethrough(s->orig_bio);
1048 closure_bio_submit(bio, cl, s->d);
1049 } else {
1050 s->op.cache_bio = bio;
1051 trace_bcache_writeback(s->orig_bio);
1052 bch_writeback_add(dc, bio_sectors(bio));
1053 }
1054out:
1055 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1056 continue_at(cl, cached_dev_write_complete, NULL);
1057skip:
1058 s->op.skip = true;
1059 s->op.cache_bio = s->orig_bio;
1060 bio_get(s->op.cache_bio);
1061 trace_bcache_write_skip(s->orig_bio);
1062
1063 if ((bio->bi_rw & REQ_DISCARD) &&
1064 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1065 goto out;
1066
1067 closure_bio_submit(bio, cl, s->d);
1068 goto out;
1069}
1070
1071static void request_nodata(struct cached_dev *dc, struct search *s)
1072{
1073 struct closure *cl = &s->cl;
1074 struct bio *bio = &s->bio.bio;
1075
1076 if (bio->bi_rw & REQ_DISCARD) {
1077 request_write(dc, s);
1078 return;
1079 }
1080
1081 if (s->op.flush_journal)
1082 bch_journal_meta(s->op.c, cl);
1083
1084 closure_bio_submit(bio, cl, s->d);
1085
1086 continue_at(cl, cached_dev_bio_complete, NULL);
1087}
1088
1089/* Cached devices - read & write stuff */
1090
1091int bch_get_congested(struct cache_set *c)
1092{
1093 int i;
1094
1095 if (!c->congested_read_threshold_us &&
1096 !c->congested_write_threshold_us)
1097 return 0;
1098
1099 i = (local_clock_us() - c->congested_last_us) / 1024;
1100 if (i < 0)
1101 return 0;
1102
1103 i += atomic_read(&c->congested);
1104 if (i >= 0)
1105 return 0;
1106
1107 i += CONGESTED_MAX;
1108
1109 return i <= 0 ? 1 : fract_exp_two(i, 6);
1110}
1111
1112static void add_sequential(struct task_struct *t)
1113{
1114 ewma_add(t->sequential_io_avg,
1115 t->sequential_io, 8, 0);
1116
1117 t->sequential_io = 0;
1118}
1119
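/*
 * Rough summary of the policy implemented below: IO is bypassed while
 * detaching, when the cache is nearly full, for discards, for unaligned IO,
 * when caching is off, and for writes in writearound mode. Otherwise recent
 * IO is tracked in dc->io_hash so that long sequential streams - judged
 * against dc->sequential_cutoff, or against a congestion-derived threshold
 * when the cache is congested - get bypassed as well.
 */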
1120static void check_should_skip(struct cached_dev *dc, struct search *s)
1121{
1122 struct hlist_head *iohash(uint64_t k)
1123 { return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; }
1124
1125 struct cache_set *c = s->op.c;
1126 struct bio *bio = &s->bio.bio;
1127
1128 long rand;
1129 int cutoff = bch_get_congested(c);
1130 unsigned mode = cache_mode(dc, bio);
1131
1132 if (atomic_read(&dc->disk.detaching) ||
1133 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1134 (bio->bi_rw & REQ_DISCARD))
1135 goto skip;
1136
1137 if (mode == CACHE_MODE_NONE ||
1138 (mode == CACHE_MODE_WRITEAROUND &&
1139 (bio->bi_rw & REQ_WRITE)))
1140 goto skip;
1141
1142 if (bio->bi_sector & (c->sb.block_size - 1) ||
1143 bio_sectors(bio) & (c->sb.block_size - 1)) {
1144 pr_debug("skipping unaligned io");
1145 goto skip;
1146 }
1147
1148 if (!cutoff) {
1149 cutoff = dc->sequential_cutoff >> 9;
1150
1151 if (!cutoff)
1152 goto rescale;
1153
1154 if (mode == CACHE_MODE_WRITEBACK &&
1155 (bio->bi_rw & REQ_WRITE) &&
1156 (bio->bi_rw & REQ_SYNC))
1157 goto rescale;
1158 }
1159
1160 if (dc->sequential_merge) {
1161 struct io *i;
1162
1163 spin_lock(&dc->io_lock);
1164
1165 hlist_for_each_entry(i, iohash(bio->bi_sector), hash)
1166 if (i->last == bio->bi_sector &&
1167 time_before(jiffies, i->jiffies))
1168 goto found;
1169
1170 i = list_first_entry(&dc->io_lru, struct io, lru);
1171
1172 add_sequential(s->task);
1173 i->sequential = 0;
1174found:
1175 if (i->sequential + bio->bi_size > i->sequential)
1176 i->sequential += bio->bi_size;
1177
1178 i->last = bio_end(bio);
1179 i->jiffies = jiffies + msecs_to_jiffies(5000);
1180 s->task->sequential_io = i->sequential;
1181
1182 hlist_del(&i->hash);
1183 hlist_add_head(&i->hash, iohash(i->last));
1184 list_move_tail(&i->lru, &dc->io_lru);
1185
1186 spin_unlock(&dc->io_lock);
1187 } else {
1188 s->task->sequential_io = bio->bi_size;
1189
1190 add_sequential(s->task);
1191 }
1192
1193 rand = get_random_int();
1194 cutoff -= bitmap_weight(&rand, BITS_PER_LONG);
1195
1196 if (cutoff <= (int) (max(s->task->sequential_io,
1197 s->task->sequential_io_avg) >> 9))
1198 goto skip;
1199
1200rescale:
1201 bch_rescale_priorities(c, bio_sectors(bio));
1202 return;
1203skip:
1204 bch_mark_sectors_bypassed(s, bio_sectors(bio));
1205 s->op.skip = true;
1206}
1207
1208static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1209{
1210 struct search *s;
1211 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1212 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1213 int cpu, rw = bio_data_dir(bio);
1214
1215 cpu = part_stat_lock();
1216 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1217 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1218 part_stat_unlock();
1219
1220 bio->bi_bdev = dc->bdev;
1221 bio->bi_sector += BDEV_DATA_START;
1222
1223 if (cached_dev_get(dc)) {
1224 s = search_alloc(bio, d);
1225 trace_bcache_request_start(s, bio);
1226
1227 if (!bio_has_data(bio))
1228 request_nodata(dc, s);
1229 else if (rw)
1230 request_write(dc, s);
1231 else
1232 request_read(dc, s);
1233 } else {
1234 if ((bio->bi_rw & REQ_DISCARD) &&
1235 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1236 bio_endio(bio, 0);
1237 else
1238 bch_generic_make_request(bio, &d->bio_split_hook);
1239 }
1240}
1241
1242static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1243 unsigned int cmd, unsigned long arg)
1244{
1245 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1246 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1247}
1248
1249static int cached_dev_congested(void *data, int bits)
1250{
1251 struct bcache_device *d = data;
1252 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1253 struct request_queue *q = bdev_get_queue(dc->bdev);
1254 int ret = 0;
1255
1256 if (bdi_congested(&q->backing_dev_info, bits))
1257 return 1;
1258
1259 if (cached_dev_get(dc)) {
1260 unsigned i;
1261 struct cache *ca;
1262
1263 for_each_cache(ca, d->c, i) {
1264 q = bdev_get_queue(ca->bdev);
1265 ret |= bdi_congested(&q->backing_dev_info, bits);
1266 }
1267
1268 cached_dev_put(dc);
1269 }
1270
1271 return ret;
1272}
1273
1274void bch_cached_dev_request_init(struct cached_dev *dc)
1275{
1276 struct gendisk *g = dc->disk.disk;
1277
1278 g->queue->make_request_fn = cached_dev_make_request;
1279 g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1280 dc->disk.cache_miss = cached_dev_cache_miss;
1281 dc->disk.ioctl = cached_dev_ioctl;
1282}
1283
1284/* Flash backed devices */
1285
1286static int flash_dev_cache_miss(struct btree *b, struct search *s,
1287 struct bio *bio, unsigned sectors)
1288{
1289 /* Zero fill bio */
1290
1291 while (bio->bi_idx != bio->bi_vcnt) {
1292 struct bio_vec *bv = bio_iovec(bio);
1293 unsigned j = min(bv->bv_len >> 9, sectors);
1294
1295 void *p = kmap(bv->bv_page);
1296 memset(p + bv->bv_offset, 0, j << 9);
1297 kunmap(bv->bv_page);
1298
1299 bv->bv_len -= j << 9;
1300 bv->bv_offset += j << 9;
1301
1302 if (bv->bv_len)
1303 return 0;
1304
1305 bio->bi_sector += j;
1306 bio->bi_size -= j << 9;
1307
1308 bio->bi_idx++;
1309 sectors -= j;
1310 }
1311
1312 s->op.lookup_done = true;
1313
1314 return 0;
1315}
1316
1317static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1318{
1319 struct search *s;
1320 struct closure *cl;
1321 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1322 int cpu, rw = bio_data_dir(bio);
1323
1324 cpu = part_stat_lock();
1325 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1326 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1327 part_stat_unlock();
1328
1329 s = search_alloc(bio, d);
1330 cl = &s->cl;
1331 bio = &s->bio.bio;
1332
1333 trace_bcache_request_start(s, bio);
1334
1335 if (bio_has_data(bio) && !rw) {
1336 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1337 } else if (bio_has_data(bio) || s->op.skip) {
1338 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1339 &KEY(d->id, bio->bi_sector, 0),
1340 &KEY(d->id, bio_end(bio), 0));
1341
1342 s->writeback = true;
1343 s->op.cache_bio = bio;
1344
1345 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1346 } else {
1347 /* No data - probably a cache flush */
1348 if (s->op.flush_journal)
1349 bch_journal_meta(s->op.c, cl);
1350 }
1351
1352 continue_at(cl, search_free, NULL);
1353}
1354
1355static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1356 unsigned int cmd, unsigned long arg)
1357{
1358 return -ENOTTY;
1359}
1360
1361static int flash_dev_congested(void *data, int bits)
1362{
1363 struct bcache_device *d = data;
1364 struct request_queue *q;
1365 struct cache *ca;
1366 unsigned i;
1367 int ret = 0;
1368
1369 for_each_cache(ca, d->c, i) {
1370 q = bdev_get_queue(ca->bdev);
1371 ret |= bdi_congested(&q->backing_dev_info, bits);
1372 }
1373
1374 return ret;
1375}
1376
1377void bch_flash_dev_request_init(struct bcache_device *d)
1378{
1379 struct gendisk *g = d->disk;
1380
1381 g->queue->make_request_fn = flash_dev_make_request;
1382 g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1383 d->cache_miss = flash_dev_cache_miss;
1384 d->ioctl = flash_dev_ioctl;
1385}
1386
1387void bch_request_exit(void)
1388{
1389#ifdef CONFIG_CGROUP_BCACHE
1390 cgroup_unload_subsys(&bcache_subsys);
1391#endif
1392 if (bch_search_cache)
1393 kmem_cache_destroy(bch_search_cache);
1394}
1395
1396int __init bch_request_init(void)
1397{
1398 bch_search_cache = KMEM_CACHE(search, 0);
1399 if (!bch_search_cache)
1400 return -ENOMEM;
1401
1402#ifdef CONFIG_CGROUP_BCACHE
1403 cgroup_load_subsys(&bcache_subsys);
1404 init_bch_cgroup(&bcache_default_cgroup);
1405
1406 cgroup_add_cftypes(&bcache_subsys, bch_files);
1407#endif
1408 return 0;
1409}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
new file mode 100644
index 000000000000..254d9ab5707c
--- /dev/null
+++ b/drivers/md/bcache/request.h
@@ -0,0 +1,62 @@
1#ifndef _BCACHE_REQUEST_H_
2#define _BCACHE_REQUEST_H_
3
4#include <linux/cgroup.h>
5
6struct search {
7 /* Stack frame for bio_complete */
8 struct closure cl;
9
10 struct bcache_device *d;
11 struct task_struct *task;
12
13 struct bbio bio;
14 struct bio *orig_bio;
15 struct bio *cache_miss;
16 unsigned cache_bio_sectors;
17
18 unsigned recoverable:1;
19 unsigned unaligned_bvec:1;
20
21 unsigned write:1;
22 unsigned writeback:1;
23
24 /* IO error returned to s->bio */
25 short error;
26 unsigned long start_time;
27
28 /* Anything past op->keys won't get zeroed in do_bio_hook */
29 struct btree_op op;
30};
31
32void bch_cache_read_endio(struct bio *, int);
33int bch_get_congested(struct cache_set *);
34void bch_insert_data(struct closure *cl);
35void bch_btree_insert_async(struct closure *);
36void bch_cache_read_endio(struct bio *, int);
37
38void bch_open_buckets_free(struct cache_set *);
39int bch_open_buckets_alloc(struct cache_set *);
40
41void bch_cached_dev_request_init(struct cached_dev *dc);
42void bch_flash_dev_request_init(struct bcache_device *d);
43
44extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
45
46struct bch_cgroup {
47#ifdef CONFIG_CGROUP_BCACHE
48 struct cgroup_subsys_state css;
49#endif
50 /*
51 * We subtract one from the index into bch_cache_modes[], so that
52 * default == -1; this makes it so the rest match up with d->cache_mode,
53 * and we use d->cache_mode if cgrp->cache_mode < 0
54 */
55 short cache_mode;
56 bool verify;
57 struct cache_stat_collector stats;
58};
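/*
 * For illustration, assuming the bch_cache_modes[] table in super.c
 * ("default", "writethrough", "writeback", "writearound", "none"), the
 * stored values work out to:
 *
 *   cache_mode == -1  ->  "default"       (fall back to d->cache_mode)
 *   cache_mode ==  0  ->  "writethrough"
 *   cache_mode ==  1  ->  "writeback"
 *   cache_mode ==  2  ->  "writearound"
 *   cache_mode ==  3  ->  "none"
 */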
59
60struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
61
62#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
new file mode 100644
index 000000000000..bf6cf9518c89
--- /dev/null
+++ b/drivers/md/bcache/stats.c
@@ -0,0 +1,245 @@
1/*
2 * bcache stats code
3 *
4 * Copyright 2012 Google, Inc.
5 */
6
7#include "bcache.h"
8#include "stats.h"
9#include "btree.h"
10#include "request.h"
11#include "sysfs.h"
12
13/*
14 * We keep absolute totals of various statistics, and additionally a set of three
15 * rolling averages.
16 *
17 * Every so often, a timer goes off and rescales the rolling averages.
18 * The *_RESCALE constants below are how many times the timer has to go off
19 * before we rescale each set of numbers; that gets us half-lives of 5 minutes,
20 * one hour, and one day.
21 *
22 * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
23 * and accounting_weight is what we use to rescale:
24 *
25 * pow(31 / 32, 22) ~= 1/2
26 *
27 * So that we don't have to increment each set of numbers every time we (say)
28 * get a cache hit, we increment a single atomic_t in acc->collector, and when
29 * the rescale function runs it resets the atomic counter to 0 and adds its
30 * old value to each of the exported numbers.
31 *
32 * To reduce rounding error, the numbers in struct cache_stats are all
33 * stored left shifted by 16, and scaled back in the sysfs show() function.
34 */
35
36static const unsigned DAY_RESCALE = 288;
37static const unsigned HOUR_RESCALE = 12;
38static const unsigned FIVE_MINUTE_RESCALE = 1;
39static const unsigned accounting_delay = (HZ * 300) / 22;
40static const unsigned accounting_weight = 32;
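/*
 * Rough check of the half-life claim above, assuming ewma_add() (util.h) with
 * a weight of 32 scales the old value by 31/32 on each rescale, as the
 * pow(31/32, 22) note implies:
 *
 *   timer period  = 300 * HZ / 22                 ~= 13.6 seconds
 *   (31/32)^22    = e^(22 * ln(31/32)) ~= e^-0.70 ~= 0.497 ~= 1/2
 *
 *   five_minute: rescaled every firing      -> halved after   22 * 13.6s ~= 5 min
 *   hour:        rescaled every 12 firings  -> halved after  264 * 13.6s ~= 1 hour
 *   day:         rescaled every 288 firings -> halved after 6336 * 13.6s ~= 1 day
 */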
41
42/* sysfs reading/writing */
43
44read_attribute(cache_hits);
45read_attribute(cache_misses);
46read_attribute(cache_bypass_hits);
47read_attribute(cache_bypass_misses);
48read_attribute(cache_hit_ratio);
49read_attribute(cache_readaheads);
50read_attribute(cache_miss_collisions);
51read_attribute(bypassed);
52
53SHOW(bch_stats)
54{
55 struct cache_stats *s =
56 container_of(kobj, struct cache_stats, kobj);
57#define var(stat) (s->stat >> 16)
58 var_print(cache_hits);
59 var_print(cache_misses);
60 var_print(cache_bypass_hits);
61 var_print(cache_bypass_misses);
62
63 sysfs_print(cache_hit_ratio,
64 DIV_SAFE(var(cache_hits) * 100,
65 var(cache_hits) + var(cache_misses)));
66
67 var_print(cache_readaheads);
68 var_print(cache_miss_collisions);
69 sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
70#undef var
71 return 0;
72}
73
74STORE(bch_stats)
75{
76 return size;
77}
78
79static void bch_stats_release(struct kobject *k)
80{
81}
82
83static struct attribute *bch_stats_files[] = {
84 &sysfs_cache_hits,
85 &sysfs_cache_misses,
86 &sysfs_cache_bypass_hits,
87 &sysfs_cache_bypass_misses,
88 &sysfs_cache_hit_ratio,
89 &sysfs_cache_readaheads,
90 &sysfs_cache_miss_collisions,
91 &sysfs_bypassed,
92 NULL
93};
94static KTYPE(bch_stats);
95
96static void scale_accounting(unsigned long data);
97
98void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent)
99{
100 kobject_init(&acc->total.kobj, &bch_stats_ktype);
101 kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
102 kobject_init(&acc->hour.kobj, &bch_stats_ktype);
103 kobject_init(&acc->day.kobj, &bch_stats_ktype);
104
105 closure_init(&acc->cl, parent);
106 init_timer(&acc->timer);
107 acc->timer.expires = jiffies + accounting_delay;
108 acc->timer.data = (unsigned long) acc;
109 acc->timer.function = scale_accounting;
110 add_timer(&acc->timer);
111}
112
113int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
114 struct kobject *parent)
115{
116 int ret = kobject_add(&acc->total.kobj, parent,
117 "stats_total");
118 ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
119 "stats_five_minute");
120 ret = ret ?: kobject_add(&acc->hour.kobj, parent,
121 "stats_hour");
122 ret = ret ?: kobject_add(&acc->day.kobj, parent,
123 "stats_day");
124 return ret;
125}
126
127void bch_cache_accounting_clear(struct cache_accounting *acc)
128{
129 memset(&acc->total.cache_hits,
130 0,
131 sizeof(unsigned long) * 7);
132}
133
134void bch_cache_accounting_destroy(struct cache_accounting *acc)
135{
136 kobject_put(&acc->total.kobj);
137 kobject_put(&acc->five_minute.kobj);
138 kobject_put(&acc->hour.kobj);
139 kobject_put(&acc->day.kobj);
140
141 atomic_set(&acc->closing, 1);
142 if (del_timer_sync(&acc->timer))
143 closure_return(&acc->cl);
144}
145
146/* EWMA scaling */
147
148static void scale_stat(unsigned long *stat)
149{
150 *stat = ewma_add(*stat, 0, accounting_weight, 0);
151}
152
153static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
154{
155 if (++stats->rescale == rescale_at) {
156 stats->rescale = 0;
157 scale_stat(&stats->cache_hits);
158 scale_stat(&stats->cache_misses);
159 scale_stat(&stats->cache_bypass_hits);
160 scale_stat(&stats->cache_bypass_misses);
161 scale_stat(&stats->cache_readaheads);
162 scale_stat(&stats->cache_miss_collisions);
163 scale_stat(&stats->sectors_bypassed);
164 }
165}
166
167static void scale_accounting(unsigned long data)
168{
169 struct cache_accounting *acc = (struct cache_accounting *) data;
170
171#define move_stat(name) do { \
172 unsigned t = atomic_xchg(&acc->collector.name, 0); \
173 t <<= 16; \
174 acc->five_minute.name += t; \
175 acc->hour.name += t; \
176 acc->day.name += t; \
177 acc->total.name += t; \
178} while (0)
179
180 move_stat(cache_hits);
181 move_stat(cache_misses);
182 move_stat(cache_bypass_hits);
183 move_stat(cache_bypass_misses);
184 move_stat(cache_readaheads);
185 move_stat(cache_miss_collisions);
186 move_stat(sectors_bypassed);
187
188 scale_stats(&acc->total, 0);
189 scale_stats(&acc->day, DAY_RESCALE);
190 scale_stats(&acc->hour, HOUR_RESCALE);
191 scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
192
193 acc->timer.expires += accounting_delay;
194
195 if (!atomic_read(&acc->closing))
196 add_timer(&acc->timer);
197 else
198 closure_return(&acc->cl);
199}
200
201static void mark_cache_stats(struct cache_stat_collector *stats,
202 bool hit, bool bypass)
203{
204 if (!bypass)
205 if (hit)
206 atomic_inc(&stats->cache_hits);
207 else
208 atomic_inc(&stats->cache_misses);
209 else
210 if (hit)
211 atomic_inc(&stats->cache_bypass_hits);
212 else
213 atomic_inc(&stats->cache_bypass_misses);
214}
215
216void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
217{
218 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
219 mark_cache_stats(&dc->accounting.collector, hit, bypass);
220 mark_cache_stats(&s->op.c->accounting.collector, hit, bypass);
221#ifdef CONFIG_CGROUP_BCACHE
222 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
223#endif
224}
225
226void bch_mark_cache_readahead(struct search *s)
227{
228 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
229 atomic_inc(&dc->accounting.collector.cache_readaheads);
230 atomic_inc(&s->op.c->accounting.collector.cache_readaheads);
231}
232
233void bch_mark_cache_miss_collision(struct search *s)
234{
235 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
236 atomic_inc(&dc->accounting.collector.cache_miss_collisions);
237 atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions);
238}
239
240void bch_mark_sectors_bypassed(struct search *s, int sectors)
241{
242 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
243 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
244 atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed);
245}
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
new file mode 100644
index 000000000000..c7c7a8fd29fe
--- /dev/null
+++ b/drivers/md/bcache/stats.h
@@ -0,0 +1,58 @@
1#ifndef _BCACHE_STATS_H_
2#define _BCACHE_STATS_H_
3
4struct cache_stat_collector {
5 atomic_t cache_hits;
6 atomic_t cache_misses;
7 atomic_t cache_bypass_hits;
8 atomic_t cache_bypass_misses;
9 atomic_t cache_readaheads;
10 atomic_t cache_miss_collisions;
11 atomic_t sectors_bypassed;
12};
13
14struct cache_stats {
15 struct kobject kobj;
16
17 unsigned long cache_hits;
18 unsigned long cache_misses;
19 unsigned long cache_bypass_hits;
20 unsigned long cache_bypass_misses;
21 unsigned long cache_readaheads;
22 unsigned long cache_miss_collisions;
23 unsigned long sectors_bypassed;
24
25 unsigned rescale;
26};
27
28struct cache_accounting {
29 struct closure cl;
30 struct timer_list timer;
31 atomic_t closing;
32
33 struct cache_stat_collector collector;
34
35 struct cache_stats total;
36 struct cache_stats five_minute;
37 struct cache_stats hour;
38 struct cache_stats day;
39};
40
41struct search;
42
43void bch_cache_accounting_init(struct cache_accounting *acc,
44 struct closure *parent);
45
46int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
47 struct kobject *parent);
48
49void bch_cache_accounting_clear(struct cache_accounting *acc);
50
51void bch_cache_accounting_destroy(struct cache_accounting *acc);
52
53void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass);
54void bch_mark_cache_readahead(struct search *s);
55void bch_mark_cache_miss_collision(struct search *s);
56void bch_mark_sectors_bypassed(struct search *s, int sectors);
57
58#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
new file mode 100644
index 000000000000..31ef47f1f3b6
--- /dev/null
+++ b/drivers/md/bcache/super.c
@@ -0,0 +1,1941 @@
1/*
2 * bcache setup/teardown code, and some metadata io - read a superblock and
3 * figure out what to do with it.
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "request.h"
13
14#include <linux/buffer_head.h>
15#include <linux/debugfs.h>
16#include <linux/genhd.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/reboot.h>
20#include <linux/sysfs.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
24
25static const char bcache_magic[] = {
26 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
27 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
28};
29
30static const char invalid_uuid[] = {
31 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
32 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
33};
34
35/* Default is -1; we skip past it for struct cached_dev's cache mode */
36const char * const bch_cache_modes[] = {
37 "default",
38 "writethrough",
39 "writeback",
40 "writearound",
41 "none",
42 NULL
43};
44
45struct uuid_entry_v0 {
46 uint8_t uuid[16];
47 uint8_t label[32];
48 uint32_t first_reg;
49 uint32_t last_reg;
50 uint32_t invalidated;
51 uint32_t pad;
52};
53
54static struct kobject *bcache_kobj;
55struct mutex bch_register_lock;
56LIST_HEAD(bch_cache_sets);
57static LIST_HEAD(uncached_devices);
58
59static int bcache_major, bcache_minor;
60static wait_queue_head_t unregister_wait;
61struct workqueue_struct *bcache_wq;
62
63#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
64
65static void bio_split_pool_free(struct bio_split_pool *p)
66{
67 if (p->bio_split)
68 bioset_free(p->bio_split);
69
70}
71
72static int bio_split_pool_init(struct bio_split_pool *p)
73{
74 p->bio_split = bioset_create(4, 0);
75 if (!p->bio_split)
76 return -ENOMEM;
77
78 p->bio_split_hook = mempool_create_kmalloc_pool(4,
79 sizeof(struct bio_split_hook));
80 if (!p->bio_split_hook)
81 return -ENOMEM;
82
83 return 0;
84}
85
86/* Superblock */
87
88static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
89 struct page **res)
90{
91 const char *err;
92 struct cache_sb *s;
93 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
94 unsigned i;
95
96 if (!bh)
97 return "IO error";
98
99 s = (struct cache_sb *) bh->b_data;
100
101 sb->offset = le64_to_cpu(s->offset);
102 sb->version = le64_to_cpu(s->version);
103
104 memcpy(sb->magic, s->magic, 16);
105 memcpy(sb->uuid, s->uuid, 16);
106 memcpy(sb->set_uuid, s->set_uuid, 16);
107 memcpy(sb->label, s->label, SB_LABEL_SIZE);
108
109 sb->flags = le64_to_cpu(s->flags);
110 sb->seq = le64_to_cpu(s->seq);
111
112 sb->nbuckets = le64_to_cpu(s->nbuckets);
113 sb->block_size = le16_to_cpu(s->block_size);
114 sb->bucket_size = le16_to_cpu(s->bucket_size);
115
116 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
117 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
118 sb->last_mount = le32_to_cpu(s->last_mount);
119
120 sb->first_bucket = le16_to_cpu(s->first_bucket);
121 sb->keys = le16_to_cpu(s->keys);
122
123 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
124 sb->d[i] = le64_to_cpu(s->d[i]);
125
126 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
127 sb->version, sb->flags, sb->seq, sb->keys);
128
129 err = "Not a bcache superblock";
130 if (sb->offset != SB_SECTOR)
131 goto err;
132
133 if (memcmp(sb->magic, bcache_magic, 16))
134 goto err;
135
136 err = "Too many journal buckets";
137 if (sb->keys > SB_JOURNAL_BUCKETS)
138 goto err;
139
140 err = "Bad checksum";
141 if (s->csum != csum_set(s))
142 goto err;
143
144 err = "Bad UUID";
145 if (is_zero(sb->uuid, 16))
146 goto err;
147
148 err = "Unsupported superblock version";
149 if (sb->version > BCACHE_SB_VERSION)
150 goto err;
151
152 err = "Bad block/bucket size";
153 if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS ||
154 !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS)
155 goto err;
156
157 err = "Too many buckets";
158 if (sb->nbuckets > LONG_MAX)
159 goto err;
160
161 err = "Not enough buckets";
162 if (sb->nbuckets < 1 << 7)
163 goto err;
164
165 err = "Invalid superblock: device too small";
166 if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
167 goto err;
168
169 if (sb->version == CACHE_BACKING_DEV)
170 goto out;
171
172 err = "Bad UUID";
173 if (is_zero(sb->set_uuid, 16))
174 goto err;
175
176 err = "Bad cache device number in set";
177 if (!sb->nr_in_set ||
178 sb->nr_in_set <= sb->nr_this_dev ||
179 sb->nr_in_set > MAX_CACHES_PER_SET)
180 goto err;
181
182 err = "Journal buckets not sequential";
183 for (i = 0; i < sb->keys; i++)
184 if (sb->d[i] != sb->first_bucket + i)
185 goto err;
186
187 err = "Too many journal buckets";
188 if (sb->first_bucket + sb->keys > sb->nbuckets)
189 goto err;
190
191 err = "Invalid superblock: first bucket comes before end of super";
192 if (sb->first_bucket * sb->bucket_size < 16)
193 goto err;
194out:
195 sb->last_mount = get_seconds();
196 err = NULL;
197
198 get_page(bh->b_page);
199 *res = bh->b_page;
200err:
201 put_bh(bh);
202 return err;
203}
204
205static void write_bdev_super_endio(struct bio *bio, int error)
206{
207 struct cached_dev *dc = bio->bi_private;
208 /* XXX: error checking */
209
210 closure_put(&dc->sb_write.cl);
211}
212
213static void __write_super(struct cache_sb *sb, struct bio *bio)
214{
215 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
216 unsigned i;
217
218 bio->bi_sector = SB_SECTOR;
219 bio->bi_rw = REQ_SYNC|REQ_META;
220 bio->bi_size = SB_SIZE;
221 bio_map(bio, NULL);
222
223 out->offset = cpu_to_le64(sb->offset);
224 out->version = cpu_to_le64(sb->version);
225
226 memcpy(out->uuid, sb->uuid, 16);
227 memcpy(out->set_uuid, sb->set_uuid, 16);
228 memcpy(out->label, sb->label, SB_LABEL_SIZE);
229
230 out->flags = cpu_to_le64(sb->flags);
231 out->seq = cpu_to_le64(sb->seq);
232
233 out->last_mount = cpu_to_le32(sb->last_mount);
234 out->first_bucket = cpu_to_le16(sb->first_bucket);
235 out->keys = cpu_to_le16(sb->keys);
236
237 for (i = 0; i < sb->keys; i++)
238 out->d[i] = cpu_to_le64(sb->d[i]);
239
240 out->csum = csum_set(out);
241
242 pr_debug("ver %llu, flags %llu, seq %llu",
243 sb->version, sb->flags, sb->seq);
244
245 submit_bio(REQ_WRITE, bio);
246}
247
248void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
249{
250 struct closure *cl = &dc->sb_write.cl;
251 struct bio *bio = &dc->sb_bio;
252
253 closure_lock(&dc->sb_write, parent);
254
255 bio_reset(bio);
256 bio->bi_bdev = dc->bdev;
257 bio->bi_end_io = write_bdev_super_endio;
258 bio->bi_private = dc;
259
260 closure_get(cl);
261 __write_super(&dc->sb, bio);
262
263 closure_return(cl);
264}
265
266static void write_super_endio(struct bio *bio, int error)
267{
268 struct cache *ca = bio->bi_private;
269
270 bch_count_io_errors(ca, error, "writing superblock");
271 closure_put(&ca->set->sb_write.cl);
272}
273
274void bcache_write_super(struct cache_set *c)
275{
276 struct closure *cl = &c->sb_write.cl;
277 struct cache *ca;
278 unsigned i;
279
280 closure_lock(&c->sb_write, &c->cl);
281
282 c->sb.seq++;
283
284 for_each_cache(ca, c, i) {
285 struct bio *bio = &ca->sb_bio;
286
287 ca->sb.version = BCACHE_SB_VERSION;
288 ca->sb.seq = c->sb.seq;
289 ca->sb.last_mount = c->sb.last_mount;
290
291 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
292
293 bio_reset(bio);
294 bio->bi_bdev = ca->bdev;
295 bio->bi_end_io = write_super_endio;
296 bio->bi_private = ca;
297
298 closure_get(cl);
299 __write_super(&ca->sb, bio);
300 }
301
302 closure_return(cl);
303}
304
305/* UUID io */
306
307static void uuid_endio(struct bio *bio, int error)
308{
309 struct closure *cl = bio->bi_private;
310 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
311
312 cache_set_err_on(error, c, "accessing uuids");
313 bch_bbio_free(bio, c);
314 closure_put(cl);
315}
316
317static void uuid_io(struct cache_set *c, unsigned long rw,
318 struct bkey *k, struct closure *parent)
319{
320 struct closure *cl = &c->uuid_write.cl;
321 struct uuid_entry *u;
322 unsigned i;
323
324 BUG_ON(!parent);
325 closure_lock(&c->uuid_write, parent);
326
327 for (i = 0; i < KEY_PTRS(k); i++) {
328 struct bio *bio = bch_bbio_alloc(c);
329
330 bio->bi_rw = REQ_SYNC|REQ_META|rw;
331 bio->bi_size = KEY_SIZE(k) << 9;
332
333 bio->bi_end_io = uuid_endio;
334 bio->bi_private = cl;
335 bio_map(bio, c->uuids);
336
337 bch_submit_bbio(bio, c, k, i);
338
339 if (!(rw & WRITE))
340 break;
341 }
342
343 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
344 pkey(&c->uuid_bucket));
345
346 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
347 if (!is_zero(u->uuid, 16))
348 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
349 u - c->uuids, u->uuid, u->label,
350 u->first_reg, u->last_reg, u->invalidated);
351
352 closure_return(cl);
353}
354
355static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
356{
357 struct bkey *k = &j->uuid_bucket;
358
359 if (__bch_ptr_invalid(c, 1, k))
360 return "bad uuid pointer";
361
362 bkey_copy(&c->uuid_bucket, k);
363 uuid_io(c, READ_SYNC, k, cl);
364
365 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
366 struct uuid_entry_v0 *u0 = (void *) c->uuids;
367 struct uuid_entry *u1 = (void *) c->uuids;
368 int i;
369
370 closure_sync(cl);
371
372 /*
373 * Since the new uuid entry is bigger than the old, we have to
374 * convert starting at the highest memory address and work down
375 * in order to do it in place
376 */
377
378 for (i = c->nr_uuids - 1;
379 i >= 0;
380 --i) {
381 memcpy(u1[i].uuid, u0[i].uuid, 16);
382 memcpy(u1[i].label, u0[i].label, 32);
383
384 u1[i].first_reg = u0[i].first_reg;
385 u1[i].last_reg = u0[i].last_reg;
386 u1[i].invalidated = u0[i].invalidated;
387
388 u1[i].flags = 0;
389 u1[i].sectors = 0;
390 }
391 }
392
393 return NULL;
394}
395
396static int __uuid_write(struct cache_set *c)
397{
398 BKEY_PADDED(key) k;
399 struct closure cl;
400 closure_init_stack(&cl);
401
402 lockdep_assert_held(&bch_register_lock);
403
404 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
405 return 1;
406
407 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
408 uuid_io(c, REQ_WRITE, &k.key, &cl);
409 closure_sync(&cl);
410
411 bkey_copy(&c->uuid_bucket, &k.key);
412 __bkey_put(c, &k.key);
413 return 0;
414}
415
416int bch_uuid_write(struct cache_set *c)
417{
418 int ret = __uuid_write(c);
419
420 if (!ret)
421 bch_journal_meta(c, NULL);
422
423 return ret;
424}
425
426static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
427{
428 struct uuid_entry *u;
429
430 for (u = c->uuids;
431 u < c->uuids + c->nr_uuids; u++)
432 if (!memcmp(u->uuid, uuid, 16))
433 return u;
434
435 return NULL;
436}
437
438static struct uuid_entry *uuid_find_empty(struct cache_set *c)
439{
440 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
441 return uuid_find(c, zero_uuid);
442}
443
444/*
445 * Bucket priorities/gens:
446 *
447 * For each bucket, we store on disk its
448 * 8 bit gen
449 * 16 bit priority
450 *
451 * See alloc.c for an explanation of the gen. The priority is used to implement
452 * lru (and in the future other) cache replacement policies; for most purposes
453 * it's just an opaque integer.
454 *
455 * The gens and the priorities don't have a whole lot to do with each other, and
456 * it's actually the gens that must be written out at specific times - it's no
458 * big deal if the priorities don't get written; if we lose them we just reuse
458 * buckets in suboptimal order.
459 *
460 * On disk they're stored in a packed array, in as many buckets as are required
461 * to fit them all. The buckets we use to store them form a list; the journal
462 * header points to the first bucket, the first bucket points to the second
463 * bucket, et cetera.
464 *
465 * This code is used by the allocation code; periodically (whenever it runs out
466 * of buckets to allocate from) the allocation code will invalidate some
467 * buckets, but it can't use those buckets until their new gens are safely on
468 * disk.
469 */
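/*
 * Concretely (as written by bch_prio_write() and read back by prio_read()
 * below), each prio bucket appears to carry a csum (a crc64 over the rest of
 * the bucket), a magic number, the location of the next bucket in the chain,
 * and a packed array of struct bucket_disk entries holding each bucket's
 * prio and gen.
 */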
470
471static void prio_endio(struct bio *bio, int error)
472{
473 struct cache *ca = bio->bi_private;
474
475 cache_set_err_on(error, ca->set, "accessing priorities");
476 bch_bbio_free(bio, ca->set);
477 closure_put(&ca->prio);
478}
479
480static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
481{
482 struct closure *cl = &ca->prio;
483 struct bio *bio = bch_bbio_alloc(ca->set);
484
485 closure_init_stack(cl);
486
487 bio->bi_sector = bucket * ca->sb.bucket_size;
488 bio->bi_bdev = ca->bdev;
489 bio->bi_rw = REQ_SYNC|REQ_META|rw;
490 bio->bi_size = bucket_bytes(ca);
491
492 bio->bi_end_io = prio_endio;
493 bio->bi_private = ca;
494 bio_map(bio, ca->disk_buckets);
495
496 closure_bio_submit(bio, &ca->prio, ca);
497 closure_sync(cl);
498}
499
500#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \
501 fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
502
503void bch_prio_write(struct cache *ca)
504{
505 int i;
506 struct bucket *b;
507 struct closure cl;
508
509 closure_init_stack(&cl);
510
511 lockdep_assert_held(&ca->set->bucket_lock);
512
513 for (b = ca->buckets;
514 b < ca->buckets + ca->sb.nbuckets; b++)
515 b->disk_gen = b->gen;
516
517 ca->disk_buckets->seq++;
518
519 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
520 &ca->meta_sectors_written);
521
522 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
523 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
524 blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
525
526 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
527 long bucket;
528 struct prio_set *p = ca->disk_buckets;
529 struct bucket_disk *d = p->data, *end = d + prios_per_bucket(ca);
530
531 for (b = ca->buckets + i * prios_per_bucket(ca);
532 b < ca->buckets + ca->sb.nbuckets && d < end;
533 b++, d++) {
534 d->prio = cpu_to_le16(b->prio);
535 d->gen = b->gen;
536 }
537
538 p->next_bucket = ca->prio_buckets[i + 1];
539 p->magic = pset_magic(ca);
540 p->csum = crc64(&p->magic, bucket_bytes(ca) - 8);
541
542 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
543 BUG_ON(bucket == -1);
544
545 mutex_unlock(&ca->set->bucket_lock);
546 prio_io(ca, bucket, REQ_WRITE);
547 mutex_lock(&ca->set->bucket_lock);
548
549 ca->prio_buckets[i] = bucket;
550 atomic_dec_bug(&ca->buckets[bucket].pin);
551 }
552
553 mutex_unlock(&ca->set->bucket_lock);
554
555 bch_journal_meta(ca->set, &cl);
556 closure_sync(&cl);
557
558 mutex_lock(&ca->set->bucket_lock);
559
560 ca->need_save_prio = 0;
561
562 /*
563 * Don't want the old priorities to get garbage collected until after we
564 * finish writing the new ones and they've been journalled
565 */
566 for (i = 0; i < prio_buckets(ca); i++)
567 ca->prio_last_buckets[i] = ca->prio_buckets[i];
568}
569
570static void prio_read(struct cache *ca, uint64_t bucket)
571{
572 struct prio_set *p = ca->disk_buckets;
573 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
574 struct bucket *b;
575 unsigned bucket_nr = 0;
576
577 for (b = ca->buckets;
578 b < ca->buckets + ca->sb.nbuckets;
579 b++, d++) {
580 if (d == end) {
581 ca->prio_buckets[bucket_nr] = bucket;
582 ca->prio_last_buckets[bucket_nr] = bucket;
583 bucket_nr++;
584
585 prio_io(ca, bucket, READ_SYNC);
586
587 if (p->csum != crc64(&p->magic, bucket_bytes(ca) - 8))
588 pr_warn("bad csum reading priorities");
589
590 if (p->magic != pset_magic(ca))
591 pr_warn("bad magic reading priorities");
592
593 bucket = p->next_bucket;
594 d = p->data;
595 }
596
597 b->prio = le16_to_cpu(d->prio);
598 b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
599 }
600}
601
602/* Bcache device */
603
604static int open_dev(struct block_device *b, fmode_t mode)
605{
606 struct bcache_device *d = b->bd_disk->private_data;
607 if (atomic_read(&d->closing))
608 return -ENXIO;
609
610 closure_get(&d->cl);
611 return 0;
612}
613
614static int release_dev(struct gendisk *b, fmode_t mode)
615{
616 struct bcache_device *d = b->private_data;
617 closure_put(&d->cl);
618 return 0;
619}
620
621static int ioctl_dev(struct block_device *b, fmode_t mode,
622 unsigned int cmd, unsigned long arg)
623{
624 struct bcache_device *d = b->bd_disk->private_data;
625 return d->ioctl(d, mode, cmd, arg);
626}
627
628static const struct block_device_operations bcache_ops = {
629 .open = open_dev,
630 .release = release_dev,
631 .ioctl = ioctl_dev,
632 .owner = THIS_MODULE,
633};
634
635void bcache_device_stop(struct bcache_device *d)
636{
637 if (!atomic_xchg(&d->closing, 1))
638 closure_queue(&d->cl);
639}
640
641static void bcache_device_detach(struct bcache_device *d)
642{
643 lockdep_assert_held(&bch_register_lock);
644
645 if (atomic_read(&d->detaching)) {
646 struct uuid_entry *u = d->c->uuids + d->id;
647
648 SET_UUID_FLASH_ONLY(u, 0);
649 memcpy(u->uuid, invalid_uuid, 16);
650 u->invalidated = cpu_to_le32(get_seconds());
651 bch_uuid_write(d->c);
652
653 atomic_set(&d->detaching, 0);
654 }
655
656 d->c->devices[d->id] = NULL;
657 closure_put(&d->c->caching);
658 d->c = NULL;
659}
660
661static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
662 unsigned id)
663{
664 BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
665
666 d->id = id;
667 d->c = c;
668 c->devices[id] = d;
669
670 closure_get(&c->caching);
671}
672
673static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
674 const char *name)
675{
676 snprintf(d->name, BCACHEDEVNAME_SIZE,
677 "%s%u", name, d->id);
678
679 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
680 sysfs_create_link(&c->kobj, &d->kobj, d->name),
681 "Couldn't create device <-> cache set symlinks");
682}
683
684static void bcache_device_free(struct bcache_device *d)
685{
686 lockdep_assert_held(&bch_register_lock);
687
688 pr_info("%s stopped", d->disk->disk_name);
689
690 if (d->c)
691 bcache_device_detach(d);
692
693 if (d->disk)
694 del_gendisk(d->disk);
695 if (d->disk && d->disk->queue)
696 blk_cleanup_queue(d->disk->queue);
697 if (d->disk)
698 put_disk(d->disk);
699
700 bio_split_pool_free(&d->bio_split_hook);
701 if (d->unaligned_bvec)
702 mempool_destroy(d->unaligned_bvec);
703 if (d->bio_split)
704 bioset_free(d->bio_split);
705
706 closure_debug_destroy(&d->cl);
707}
708
709static int bcache_device_init(struct bcache_device *d, unsigned block_size)
710{
711 struct request_queue *q;
712
713 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
714 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
715 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
716 bio_split_pool_init(&d->bio_split_hook))
717
718 return -ENOMEM;
719
720 d->disk = alloc_disk(1);
721 if (!d->disk)
722 return -ENOMEM;
723
724 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
725
726 d->disk->major = bcache_major;
727 d->disk->first_minor = bcache_minor++;
728 d->disk->fops = &bcache_ops;
729 d->disk->private_data = d;
730
731 q = blk_alloc_queue(GFP_KERNEL);
732 if (!q)
733 return -ENOMEM;
734
735 blk_queue_make_request(q, NULL);
736 d->disk->queue = q;
737 q->queuedata = d;
738 q->backing_dev_info.congested_data = d;
739 q->limits.max_hw_sectors = UINT_MAX;
740 q->limits.max_sectors = UINT_MAX;
741 q->limits.max_segment_size = UINT_MAX;
742 q->limits.max_segments = BIO_MAX_PAGES;
743 q->limits.max_discard_sectors = UINT_MAX;
744 q->limits.io_min = block_size;
745 q->limits.logical_block_size = block_size;
746 q->limits.physical_block_size = block_size;
747 set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
748 set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
749
750 return 0;
751}
752
753/* Cached device */
754
755static void calc_cached_dev_sectors(struct cache_set *c)
756{
757 uint64_t sectors = 0;
758 struct cached_dev *dc;
759
760 list_for_each_entry(dc, &c->cached_devs, list)
761 sectors += bdev_sectors(dc->bdev);
762
763 c->cached_dev_sectors = sectors;
764}
765
766void bch_cached_dev_run(struct cached_dev *dc)
767{
768 struct bcache_device *d = &dc->disk;
769
770 if (atomic_xchg(&dc->running, 1))
771 return;
772
773 if (!d->c &&
774 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
775 struct closure cl;
776 closure_init_stack(&cl);
777
778 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
779 bch_write_bdev_super(dc, &cl);
780 closure_sync(&cl);
781 }
782
783 add_disk(d->disk);
784#if 0
785 char *env[] = { "SYMLINK=label" , NULL };
786 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
787#endif
788 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
789 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
790 pr_debug("error creating sysfs link");
791}
792
793static void cached_dev_detach_finish(struct work_struct *w)
794{
795 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
796 char buf[BDEVNAME_SIZE];
797 struct closure cl;
798 closure_init_stack(&cl);
799
800 BUG_ON(!atomic_read(&dc->disk.detaching));
801 BUG_ON(atomic_read(&dc->count));
802
803 sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name);
804 sysfs_remove_link(&dc->disk.kobj, "cache");
805
806 mutex_lock(&bch_register_lock);
807
808 memset(&dc->sb.set_uuid, 0, 16);
809 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
810
811 bch_write_bdev_super(dc, &cl);
812 closure_sync(&cl);
813
814 bcache_device_detach(&dc->disk);
815 list_move(&dc->list, &uncached_devices);
816
817 mutex_unlock(&bch_register_lock);
818
819 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
820
821 /* Drop ref we took in cached_dev_detach() */
822 closure_put(&dc->disk.cl);
823}
824
825void bch_cached_dev_detach(struct cached_dev *dc)
826{
827 lockdep_assert_held(&bch_register_lock);
828
829 if (atomic_read(&dc->disk.closing))
830 return;
831
832 if (atomic_xchg(&dc->disk.detaching, 1))
833 return;
834
835 /*
836 * Block the device from being closed and freed until we're finished
837 * detaching
838 */
839 closure_get(&dc->disk.cl);
840
841 bch_writeback_queue(dc);
842 cached_dev_put(dc);
843}
844
845int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
846{
847 uint32_t rtime = cpu_to_le32(get_seconds());
848 struct uuid_entry *u;
849 char buf[BDEVNAME_SIZE];
850
851 bdevname(dc->bdev, buf);
852
853 if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
854 return -ENOENT;
855
856 if (dc->disk.c) {
857 pr_err("Can't attach %s: already attached", buf);
858 return -EINVAL;
859 }
860
861 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
862 pr_err("Can't attach %s: shutting down", buf);
863 return -EINVAL;
864 }
865
866 if (dc->sb.block_size < c->sb.block_size) {
867 /* Will die */
868 pr_err("Couldn't attach %s: block size "
869 "less than set's block size", buf);
870 return -EINVAL;
871 }
872
873 u = uuid_find(c, dc->sb.uuid);
874
875 if (u &&
876 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
877 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
878 memcpy(u->uuid, invalid_uuid, 16);
879 u->invalidated = cpu_to_le32(get_seconds());
880 u = NULL;
881 }
882
883 if (!u) {
884 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
885 pr_err("Couldn't find uuid for %s in set", buf);
886 return -ENOENT;
887 }
888
889 u = uuid_find_empty(c);
890 if (!u) {
891 pr_err("Not caching %s, no room for UUID", buf);
892 return -EINVAL;
893 }
894 }
895
896 /* Deadlocks since we're called via sysfs...
897 sysfs_remove_file(&dc->kobj, &sysfs_attach);
898 */
899
900 if (is_zero(u->uuid, 16)) {
901 struct closure cl;
902 closure_init_stack(&cl);
903
904 memcpy(u->uuid, dc->sb.uuid, 16);
905 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
906 u->first_reg = u->last_reg = rtime;
907 bch_uuid_write(c);
908
909 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
910 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
911
912 bch_write_bdev_super(dc, &cl);
913 closure_sync(&cl);
914 } else {
915 u->last_reg = rtime;
916 bch_uuid_write(c);
917 }
918
919 bcache_device_attach(&dc->disk, c, u - c->uuids);
920 bcache_device_link(&dc->disk, c, "bdev");
921 list_move(&dc->list, &c->cached_devs);
922 calc_cached_dev_sectors(c);
923
924 smp_wmb();
925 /*
926 * dc->c must be set before dc->count != 0 - paired with the mb in
927 * cached_dev_get()
928 */
929 atomic_set(&dc->count, 1);
930
931 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
932 atomic_set(&dc->has_dirty, 1);
933 atomic_inc(&dc->count);
934 bch_writeback_queue(dc);
935 }
936
937 bch_cached_dev_run(dc);
938
939 pr_info("Caching %s as %s on set %pU",
940 bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
941 dc->disk.c->sb.set_uuid);
942 return 0;
943}
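The smp_wmb() and the comment above refer to a matching barrier in cached_dev_get(), which is not part of this file. A minimal sketch of what that reader side presumably looks like; the body below is an assumption for illustration, not code from this patch:

/* Sketch only: presumed reader side of the smp_wmb() pairing above. */
static inline bool cached_dev_get_sketch(struct cached_dev *dc)
{
	if (!atomic_inc_not_zero(&dc->count))
		return false;

	/*
	 * Pairs with the smp_wmb() in bch_cached_dev_attach(): once a
	 * non-zero count is observed, dc->disk.c is visible as well.
	 */
	smp_mb__after_atomic_inc();
	return true;
}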
944
945void bch_cached_dev_release(struct kobject *kobj)
946{
947 struct cached_dev *dc = container_of(kobj, struct cached_dev,
948 disk.kobj);
949 kfree(dc);
950 module_put(THIS_MODULE);
951}
952
953static void cached_dev_free(struct closure *cl)
954{
955 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
956
957 cancel_delayed_work_sync(&dc->writeback_rate_update);
958
959 mutex_lock(&bch_register_lock);
960
961 bcache_device_free(&dc->disk);
962 list_del(&dc->list);
963
964 mutex_unlock(&bch_register_lock);
965
966 if (!IS_ERR_OR_NULL(dc->bdev)) {
967 blk_sync_queue(bdev_get_queue(dc->bdev));
968 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
969 }
970
971 wake_up(&unregister_wait);
972
973 kobject_put(&dc->disk.kobj);
974}
975
976static void cached_dev_flush(struct closure *cl)
977{
978 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
979 struct bcache_device *d = &dc->disk;
980
981 bch_cache_accounting_destroy(&dc->accounting);
982 kobject_del(&d->kobj);
983
984 continue_at(cl, cached_dev_free, system_wq);
985}
986
987static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
988{
989 int err;
990 struct io *io;
991
992 closure_init(&dc->disk.cl, NULL);
993 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
994
995 __module_get(THIS_MODULE);
996 INIT_LIST_HEAD(&dc->list);
997 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
998
999 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1000
1001 err = bcache_device_init(&dc->disk, block_size);
1002 if (err)
1003 goto err;
1004
1005 spin_lock_init(&dc->io_lock);
1006 closure_init_unlocked(&dc->sb_write);
1007 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1008
1009 dc->sequential_merge = true;
1010 dc->sequential_cutoff = 4 << 20;
1011
1012 INIT_LIST_HEAD(&dc->io_lru);
1013 dc->sb_bio.bi_max_vecs = 1;
1014 dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
1015
1016 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1017 list_add(&io->lru, &dc->io_lru);
1018 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1019 }
1020
1021 bch_writeback_init_cached_dev(dc);
1022 return 0;
1023err:
1024 bcache_device_stop(&dc->disk);
1025 return err;
1026}
1027
1028/* Cached device - bcache superblock */
1029
1030static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
1031 struct block_device *bdev,
1032 struct cached_dev *dc)
1033{
1034 char name[BDEVNAME_SIZE];
1035 const char *err = "cannot allocate memory";
1036 struct gendisk *g;
1037 struct cache_set *c;
1038
1039 if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0)
1040 return err;
1041
1042 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1043 dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
1044 dc->bdev = bdev;
1045 dc->bdev->bd_holder = dc;
1046
1047 g = dc->disk.disk;
1048
1049 set_capacity(g, dc->bdev->bd_part->nr_sects - 16);
1050
1051 bch_cached_dev_request_init(dc);
1052
1053 err = "error creating kobject";
1054 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1055 "bcache"))
1056 goto err;
1057 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1058 goto err;
1059
1060 list_add(&dc->list, &uncached_devices);
1061 list_for_each_entry(c, &bch_cache_sets, list)
1062 bch_cached_dev_attach(dc, c);
1063
1064 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1065 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1066 bch_cached_dev_run(dc);
1067
1068 return NULL;
1069err:
1070 kobject_put(&dc->disk.kobj);
1071 pr_notice("error opening %s: %s", bdevname(bdev, name), err);
1072 /*
1073 * Return NULL instead of an error because kobject_put() cleans
1074 * everything up
1075 */
1076 return NULL;
1077}
1078
1079/* Flash only volumes */
1080
1081void bch_flash_dev_release(struct kobject *kobj)
1082{
1083 struct bcache_device *d = container_of(kobj, struct bcache_device,
1084 kobj);
1085 kfree(d);
1086}
1087
1088static void flash_dev_free(struct closure *cl)
1089{
1090 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1091 bcache_device_free(d);
1092 kobject_put(&d->kobj);
1093}
1094
1095static void flash_dev_flush(struct closure *cl)
1096{
1097 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1098
1099 sysfs_remove_link(&d->c->kobj, d->name);
1100 sysfs_remove_link(&d->kobj, "cache");
1101 kobject_del(&d->kobj);
1102 continue_at(cl, flash_dev_free, system_wq);
1103}
1104
1105static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1106{
1107 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1108 GFP_KERNEL);
1109 if (!d)
1110 return -ENOMEM;
1111
1112 closure_init(&d->cl, NULL);
1113 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1114
1115 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1116
1117 if (bcache_device_init(d, block_bytes(c)))
1118 goto err;
1119
1120 bcache_device_attach(d, c, u - c->uuids);
1121 set_capacity(d->disk, u->sectors);
1122 bch_flash_dev_request_init(d);
1123 add_disk(d->disk);
1124
1125 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1126 goto err;
1127
1128 bcache_device_link(d, c, "volume");
1129
1130 return 0;
1131err:
1132 kobject_put(&d->kobj);
1133 return -ENOMEM;
1134}
1135
1136static int flash_devs_run(struct cache_set *c)
1137{
1138 int ret = 0;
1139 struct uuid_entry *u;
1140
1141 for (u = c->uuids;
1142 u < c->uuids + c->nr_uuids && !ret;
1143 u++)
1144 if (UUID_FLASH_ONLY(u))
1145 ret = flash_dev_run(c, u);
1146
1147 return ret;
1148}
1149
1150int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1151{
1152 struct uuid_entry *u;
1153
1154 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1155 return -EINTR;
1156
1157 u = uuid_find_empty(c);
1158 if (!u) {
1159 pr_err("Can't create volume, no room for UUID");
1160 return -EINVAL;
1161 }
1162
1163 get_random_bytes(u->uuid, 16);
1164 memset(u->label, 0, 32);
1165 u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
1166
1167 SET_UUID_FLASH_ONLY(u, 1);
1168 u->sectors = size >> 9;
1169
1170 bch_uuid_write(c);
1171
1172 return flash_dev_run(c, u);
1173}
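bch_flash_dev_create() is reached from userspace through the flash_vol_create attribute wired up in sysfs.c further down. A rough userspace sketch, assuming the sysfs path follows the kobject names used in this patch (/sys/fs/bcache/<set-uuid>/) and that strtoi_h() accepts a human-readable size suffix:

/* Illustration only: create a flash-only volume of 100M. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* replace <set-uuid> with the cache set's UUID directory */
	const char *path = "/sys/fs/bcache/<set-uuid>/flash_vol_create";
	const char *size = "100M";	/* parsed by strtoi_h() in the kernel */
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, size, strlen(size)) < 0)
		perror("flash_vol_create");
	if (fd >= 0)
		close(fd);
	return 0;
}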
1174
1175/* Cache set */
1176
1177__printf(2, 3)
1178bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1179{
1180 va_list args;
1181
1182 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1183 return false;
1184
1185 /* XXX: we can be called from atomic context
1186 acquire_console_sem();
1187 */
1188
1189 printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
1190
1191 va_start(args, fmt);
1192 vprintk(fmt, args);
1193 va_end(args);
1194
1195 printk(", disabling caching\n");
1196
1197 bch_cache_set_unregister(c);
1198 return true;
1199}
1200
1201void bch_cache_set_release(struct kobject *kobj)
1202{
1203 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1204 kfree(c);
1205 module_put(THIS_MODULE);
1206}
1207
1208static void cache_set_free(struct closure *cl)
1209{
1210 struct cache_set *c = container_of(cl, struct cache_set, cl);
1211 struct cache *ca;
1212 unsigned i;
1213
1214 if (!IS_ERR_OR_NULL(c->debug))
1215 debugfs_remove(c->debug);
1216
1217 bch_open_buckets_free(c);
1218 bch_btree_cache_free(c);
1219 bch_journal_free(c);
1220
1221 for_each_cache(ca, c, i)
1222 if (ca)
1223 kobject_put(&ca->kobj);
1224
1225 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1226 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1227
1228 kfree(c->fill_iter);
1229 if (c->bio_split)
1230 bioset_free(c->bio_split);
1231 if (c->bio_meta)
1232 mempool_destroy(c->bio_meta);
1233 if (c->search)
1234 mempool_destroy(c->search);
1235 kfree(c->devices);
1236
1237 mutex_lock(&bch_register_lock);
1238 list_del(&c->list);
1239 mutex_unlock(&bch_register_lock);
1240
1241 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1242 wake_up(&unregister_wait);
1243
1244 closure_debug_destroy(&c->cl);
1245 kobject_put(&c->kobj);
1246}
1247
1248static void cache_set_flush(struct closure *cl)
1249{
1250 struct cache_set *c = container_of(cl, struct cache_set, caching);
1251 struct btree *b;
1252
1253 /* Shut down allocator threads */
1254 set_bit(CACHE_SET_STOPPING_2, &c->flags);
1255 wake_up(&c->alloc_wait);
1256
1257 bch_cache_accounting_destroy(&c->accounting);
1258
1259 kobject_put(&c->internal);
1260 kobject_del(&c->kobj);
1261
1262 if (!IS_ERR_OR_NULL(c->root))
1263 list_add(&c->root->list, &c->btree_cache);
1264
1265 /* Should skip this if we're unregistering because of an error */
1266 list_for_each_entry(b, &c->btree_cache, list)
1267 if (btree_node_dirty(b))
1268 bch_btree_write(b, true, NULL);
1269
1270 closure_return(cl);
1271}
1272
1273static void __cache_set_unregister(struct closure *cl)
1274{
1275 struct cache_set *c = container_of(cl, struct cache_set, caching);
1276 struct cached_dev *dc, *t;
1277 size_t i;
1278
1279 mutex_lock(&bch_register_lock);
1280
1281 if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
1282 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1283 bch_cached_dev_detach(dc);
1284
1285 for (i = 0; i < c->nr_uuids; i++)
1286 if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
1287 bcache_device_stop(c->devices[i]);
1288
1289 mutex_unlock(&bch_register_lock);
1290
1291 continue_at(cl, cache_set_flush, system_wq);
1292}
1293
1294void bch_cache_set_stop(struct cache_set *c)
1295{
1296 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1297 closure_queue(&c->caching);
1298}
1299
1300void bch_cache_set_unregister(struct cache_set *c)
1301{
1302 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1303 bch_cache_set_stop(c);
1304}
1305
1306#define alloc_bucket_pages(gfp, c) \
1307 ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
1308
1309struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1310{
1311 int iter_size;
1312 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1313 if (!c)
1314 return NULL;
1315
1316 __module_get(THIS_MODULE);
1317 closure_init(&c->cl, NULL);
1318 set_closure_fn(&c->cl, cache_set_free, system_wq);
1319
1320 closure_init(&c->caching, &c->cl);
1321 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1322
1323 /* Maybe create continue_at_noreturn() and use it here? */
1324 closure_set_stopped(&c->cl);
1325 closure_put(&c->cl);
1326
1327 kobject_init(&c->kobj, &bch_cache_set_ktype);
1328 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1329
1330 bch_cache_accounting_init(&c->accounting, &c->cl);
1331
1332 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1333 c->sb.block_size = sb->block_size;
1334 c->sb.bucket_size = sb->bucket_size;
1335 c->sb.nr_in_set = sb->nr_in_set;
1336 c->sb.last_mount = sb->last_mount;
1337 c->bucket_bits = ilog2(sb->bucket_size);
1338 c->block_bits = ilog2(sb->block_size);
1339 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1340
1341 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
1342 if (c->btree_pages > BTREE_MAX_PAGES)
1343 c->btree_pages = max_t(int, c->btree_pages / 4,
1344 BTREE_MAX_PAGES);
1345
1346 init_waitqueue_head(&c->alloc_wait);
1347 mutex_init(&c->bucket_lock);
1348 mutex_init(&c->fill_lock);
1349 mutex_init(&c->sort_lock);
1350 spin_lock_init(&c->sort_time_lock);
1351 closure_init_unlocked(&c->sb_write);
1352 closure_init_unlocked(&c->uuid_write);
1353 spin_lock_init(&c->btree_read_time_lock);
1354 bch_moving_init_cache_set(c);
1355
1356 INIT_LIST_HEAD(&c->list);
1357 INIT_LIST_HEAD(&c->cached_devs);
1358 INIT_LIST_HEAD(&c->btree_cache);
1359 INIT_LIST_HEAD(&c->btree_cache_freeable);
1360 INIT_LIST_HEAD(&c->btree_cache_freed);
1361 INIT_LIST_HEAD(&c->data_buckets);
1362
1363 c->search = mempool_create_slab_pool(32, bch_search_cache);
1364 if (!c->search)
1365 goto err;
1366
1367 iter_size = (sb->bucket_size / sb->block_size + 1) *
1368 sizeof(struct btree_iter_set);
1369
1370 if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
1371 !(c->bio_meta = mempool_create_kmalloc_pool(2,
1372 sizeof(struct bbio) + sizeof(struct bio_vec) *
1373 bucket_pages(c))) ||
1374 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1375 !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
1376 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1377 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1378 bch_journal_alloc(c) ||
1379 bch_btree_cache_alloc(c) ||
1380 bch_open_buckets_alloc(c))
1381 goto err;
1382
1383 c->fill_iter->size = sb->bucket_size / sb->block_size;
1384
1385 c->congested_read_threshold_us = 2000;
1386 c->congested_write_threshold_us = 20000;
1387 c->error_limit = 8 << IO_ERROR_SHIFT;
1388
1389 return c;
1390err:
1391 bch_cache_set_unregister(c);
1392 return NULL;
1393}
1394
1395static void run_cache_set(struct cache_set *c)
1396{
1397 const char *err = "cannot allocate memory";
1398 struct cached_dev *dc, *t;
1399 struct cache *ca;
1400 unsigned i;
1401
1402 struct btree_op op;
1403 bch_btree_op_init_stack(&op);
1404 op.lock = SHRT_MAX;
1405
1406 for_each_cache(ca, c, i)
1407 c->nbuckets += ca->sb.nbuckets;
1408
1409 if (CACHE_SYNC(&c->sb)) {
1410 LIST_HEAD(journal);
1411 struct bkey *k;
1412 struct jset *j;
1413
1414 err = "cannot allocate memory for journal";
1415 if (bch_journal_read(c, &journal, &op))
1416 goto err;
1417
1418 pr_debug("btree_journal_read() done");
1419
1420 err = "no journal entries found";
1421 if (list_empty(&journal))
1422 goto err;
1423
1424 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1425
1426 err = "IO error reading priorities";
1427 for_each_cache(ca, c, i)
1428 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1429
1430 /*
 1431		 * If prio_read() fails it'll call cache_set_error and we'll
 1432		 * tear everything down right away, but if we checked for errors
 1433		 * sooner we could avoid the journal replay.
1434 */
1435
1436 k = &j->btree_root;
1437
1438 err = "bad btree root";
1439 if (__bch_ptr_invalid(c, j->btree_level + 1, k))
1440 goto err;
1441
1442 err = "error reading btree root";
1443 c->root = bch_btree_node_get(c, k, j->btree_level, &op);
1444 if (IS_ERR_OR_NULL(c->root))
1445 goto err;
1446
1447 list_del_init(&c->root->list);
1448 rw_unlock(true, c->root);
1449
1450 err = uuid_read(c, j, &op.cl);
1451 if (err)
1452 goto err;
1453
1454 err = "error in recovery";
1455 if (bch_btree_check(c, &op))
1456 goto err;
1457
1458 bch_journal_mark(c, &journal);
1459 bch_btree_gc_finish(c);
1460 pr_debug("btree_check() done");
1461
1462 /*
1463 * bcache_journal_next() can't happen sooner, or
1464 * btree_gc_finish() will give spurious errors about last_gc >
1465 * gc_gen - this is a hack but oh well.
1466 */
1467 bch_journal_next(&c->journal);
1468
1469 for_each_cache(ca, c, i)
1470 closure_call(&ca->alloc, bch_allocator_thread,
1471 system_wq, &c->cl);
1472
1473 /*
1474 * First place it's safe to allocate: btree_check() and
1475 * btree_gc_finish() have to run before we have buckets to
1476 * allocate, and bch_bucket_alloc_set() might cause a journal
1477 * entry to be written so bcache_journal_next() has to be called
1478 * first.
1479 *
1480 * If the uuids were in the old format we have to rewrite them
1481 * before the next journal entry is written:
1482 */
1483 if (j->version < BCACHE_JSET_VERSION_UUID)
1484 __uuid_write(c);
1485
1486 bch_journal_replay(c, &journal, &op);
1487 } else {
1488 pr_notice("invalidating existing data");
1489 /* Don't want invalidate_buckets() to queue a gc yet */
1490 closure_lock(&c->gc, NULL);
1491
1492 for_each_cache(ca, c, i) {
1493 unsigned j;
1494
1495 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1496 2, SB_JOURNAL_BUCKETS);
1497
1498 for (j = 0; j < ca->sb.keys; j++)
1499 ca->sb.d[j] = ca->sb.first_bucket + j;
1500 }
1501
1502 bch_btree_gc_finish(c);
1503
1504 for_each_cache(ca, c, i)
1505 closure_call(&ca->alloc, bch_allocator_thread,
1506 ca->alloc_workqueue, &c->cl);
1507
1508 mutex_lock(&c->bucket_lock);
1509 for_each_cache(ca, c, i)
1510 bch_prio_write(ca);
1511 mutex_unlock(&c->bucket_lock);
1512
1513 wake_up(&c->alloc_wait);
1514
1515 err = "cannot allocate new UUID bucket";
1516 if (__uuid_write(c))
1517 goto err_unlock_gc;
1518
1519 err = "cannot allocate new btree root";
1520 c->root = bch_btree_node_alloc(c, 0, &op.cl);
1521 if (IS_ERR_OR_NULL(c->root))
1522 goto err_unlock_gc;
1523
1524 bkey_copy_key(&c->root->key, &MAX_KEY);
1525 bch_btree_write(c->root, true, &op);
1526
1527 bch_btree_set_root(c->root);
1528 rw_unlock(true, c->root);
1529
1530 /*
1531 * We don't want to write the first journal entry until
1532 * everything is set up - fortunately journal entries won't be
1533 * written until the SET_CACHE_SYNC() here:
1534 */
1535 SET_CACHE_SYNC(&c->sb, true);
1536
1537 bch_journal_next(&c->journal);
1538 bch_journal_meta(c, &op.cl);
1539
1540 /* Unlock */
1541 closure_set_stopped(&c->gc.cl);
1542 closure_put(&c->gc.cl);
1543 }
1544
1545 closure_sync(&op.cl);
1546 c->sb.last_mount = get_seconds();
1547 bcache_write_super(c);
1548
1549 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1550 bch_cached_dev_attach(dc, c);
1551
1552 flash_devs_run(c);
1553
1554 return;
1555err_unlock_gc:
1556 closure_set_stopped(&c->gc.cl);
1557 closure_put(&c->gc.cl);
1558err:
1559 closure_sync(&op.cl);
1560 /* XXX: test this, it's broken */
1561 bch_cache_set_error(c, err);
1562}
1563
1564static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1565{
1566 return ca->sb.block_size == c->sb.block_size &&
 1567		ca->sb.bucket_size	== c->sb.bucket_size &&
1568 ca->sb.nr_in_set == c->sb.nr_in_set;
1569}
1570
1571static const char *register_cache_set(struct cache *ca)
1572{
1573 char buf[12];
1574 const char *err = "cannot allocate memory";
1575 struct cache_set *c;
1576
1577 list_for_each_entry(c, &bch_cache_sets, list)
1578 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1579 if (c->cache[ca->sb.nr_this_dev])
1580 return "duplicate cache set member";
1581
1582 if (!can_attach_cache(ca, c))
1583 return "cache sb does not match set";
1584
1585 if (!CACHE_SYNC(&ca->sb))
1586 SET_CACHE_SYNC(&c->sb, false);
1587
1588 goto found;
1589 }
1590
1591 c = bch_cache_set_alloc(&ca->sb);
1592 if (!c)
1593 return err;
1594
1595 err = "error creating kobject";
1596 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1597 kobject_add(&c->internal, &c->kobj, "internal"))
1598 goto err;
1599
1600 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1601 goto err;
1602
1603 bch_debug_init_cache_set(c);
1604
1605 list_add(&c->list, &bch_cache_sets);
1606found:
1607 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
1608 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
1609 sysfs_create_link(&c->kobj, &ca->kobj, buf))
1610 goto err;
1611
1612 if (ca->sb.seq > c->sb.seq) {
1613 c->sb.version = ca->sb.version;
1614 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
1615 c->sb.flags = ca->sb.flags;
1616 c->sb.seq = ca->sb.seq;
1617 pr_debug("set version = %llu", c->sb.version);
1618 }
1619
1620 ca->set = c;
1621 ca->set->cache[ca->sb.nr_this_dev] = ca;
1622 c->cache_by_alloc[c->caches_loaded++] = ca;
1623
1624 if (c->caches_loaded == c->sb.nr_in_set)
1625 run_cache_set(c);
1626
1627 return NULL;
1628err:
1629 bch_cache_set_unregister(c);
1630 return err;
1631}
1632
1633/* Cache device */
1634
1635void bch_cache_release(struct kobject *kobj)
1636{
1637 struct cache *ca = container_of(kobj, struct cache, kobj);
1638
1639 if (ca->set)
1640 ca->set->cache[ca->sb.nr_this_dev] = NULL;
1641
1642 bch_cache_allocator_exit(ca);
1643
1644 bio_split_pool_free(&ca->bio_split_hook);
1645
1646 if (ca->alloc_workqueue)
1647 destroy_workqueue(ca->alloc_workqueue);
1648
1649 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1650 kfree(ca->prio_buckets);
1651 vfree(ca->buckets);
1652
1653 free_heap(&ca->heap);
1654 free_fifo(&ca->unused);
1655 free_fifo(&ca->free_inc);
1656 free_fifo(&ca->free);
1657
1658 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1659 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
1660
1661 if (!IS_ERR_OR_NULL(ca->bdev)) {
1662 blk_sync_queue(bdev_get_queue(ca->bdev));
1663 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1664 }
1665
1666 kfree(ca);
1667 module_put(THIS_MODULE);
1668}
1669
1670static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1671{
1672 size_t free;
1673 struct bucket *b;
1674
1675 if (!ca)
1676 return -ENOMEM;
1677
1678 __module_get(THIS_MODULE);
1679 kobject_init(&ca->kobj, &bch_cache_ktype);
1680
1681 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
1682
1683 INIT_LIST_HEAD(&ca->discards);
1684
1685 bio_init(&ca->sb_bio);
1686 ca->sb_bio.bi_max_vecs = 1;
1687 ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
1688
1689 bio_init(&ca->journal.bio);
1690 ca->journal.bio.bi_max_vecs = 8;
1691 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1692
1693 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
1694 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1695
1696 if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
1697 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1698 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1699 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
1700 !(ca->buckets = vmalloc(sizeof(struct bucket) *
1701 ca->sb.nbuckets)) ||
1702 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1703 2, GFP_KERNEL)) ||
1704 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1705 !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
1706 bio_split_pool_init(&ca->bio_split_hook))
1707 goto err;
1708
1709 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1710
1711 memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket));
1712 for_each_bucket(b, ca)
1713 atomic_set(&b->pin, 0);
1714
1715 if (bch_cache_allocator_init(ca))
1716 goto err;
1717
1718 return 0;
1719err:
1720 kobject_put(&ca->kobj);
1721 return -ENOMEM;
1722}
1723
1724static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
1725 struct block_device *bdev, struct cache *ca)
1726{
1727 char name[BDEVNAME_SIZE];
1728 const char *err = "cannot allocate memory";
1729
1730 if (cache_alloc(sb, ca) != 0)
1731 return err;
1732
1733 ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
1734 ca->bdev = bdev;
1735 ca->bdev->bd_holder = ca;
1736
1737 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
1738 ca->discard = CACHE_DISCARD(&ca->sb);
1739
1740 err = "error creating kobject";
1741 if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
1742 goto err;
1743
1744 err = register_cache_set(ca);
1745 if (err)
1746 goto err;
1747
1748 pr_info("registered cache device %s", bdevname(bdev, name));
1749
1750 return NULL;
1751err:
1752 kobject_put(&ca->kobj);
1753 pr_info("error opening %s: %s", bdevname(bdev, name), err);
1754 /* Return NULL instead of an error because kobject_put() cleans
1755 * everything up
1756 */
1757 return NULL;
1758}
1759
1760/* Global interfaces/init */
1761
1762static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1763 const char *, size_t);
1764
1765kobj_attribute_write(register, register_bcache);
1766kobj_attribute_write(register_quiet, register_bcache);
1767
1768static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1769 const char *buffer, size_t size)
1770{
1771 ssize_t ret = size;
1772 const char *err = "cannot allocate memory";
1773 char *path = NULL;
1774 struct cache_sb *sb = NULL;
1775 struct block_device *bdev = NULL;
1776 struct page *sb_page = NULL;
1777
1778 if (!try_module_get(THIS_MODULE))
1779 return -EBUSY;
1780
1781 mutex_lock(&bch_register_lock);
1782
1783 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
1784 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
1785 goto err;
1786
1787 err = "failed to open device";
1788 bdev = blkdev_get_by_path(strim(path),
1789 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1790 sb);
1791 if (bdev == ERR_PTR(-EBUSY))
1792 err = "device busy";
1793
1794 if (IS_ERR(bdev) ||
1795 set_blocksize(bdev, 4096))
1796 goto err;
1797
1798 err = read_super(sb, bdev, &sb_page);
1799 if (err)
1800 goto err_close;
1801
1802 if (sb->version == CACHE_BACKING_DEV) {
1803 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
1804
1805 err = register_bdev(sb, sb_page, bdev, dc);
1806 } else {
1807 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1808
1809 err = register_cache(sb, sb_page, bdev, ca);
1810 }
1811
1812 if (err) {
1813 /* register_(bdev|cache) will only return an error if they
1814 * didn't get far enough to create the kobject - if they did,
1815 * the kobject destructor will do this cleanup.
1816 */
1817 put_page(sb_page);
1818err_close:
1819 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1820err:
1821 if (attr != &ksysfs_register_quiet)
1822 pr_info("error opening %s: %s", path, err);
1823 ret = -EINVAL;
1824 }
1825
1826 kfree(sb);
1827 kfree(path);
1828 mutex_unlock(&bch_register_lock);
1829 module_put(THIS_MODULE);
1830 return ret;
1831}
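From userspace, registration amounts to writing a device path into the register file that bcache_init() below creates under /sys/fs/bcache. A minimal sketch; /dev/sdb is just an example device that is assumed to already carry a bcache superblock:

/* Illustration only: hand a formatted device to register_bcache(). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *dev = "/dev/sdb";
	int fd = open("/sys/fs/bcache/register", O_WRONLY);

	if (fd < 0 || write(fd, dev, strlen(dev)) < 0)
		perror("register");
	if (fd >= 0)
		close(fd);
	return 0;
}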
1832
1833static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1834{
1835 if (code == SYS_DOWN ||
1836 code == SYS_HALT ||
1837 code == SYS_POWER_OFF) {
1838 DEFINE_WAIT(wait);
1839 unsigned long start = jiffies;
1840 bool stopped = false;
1841
1842 struct cache_set *c, *tc;
1843 struct cached_dev *dc, *tdc;
1844
1845 mutex_lock(&bch_register_lock);
1846
1847 if (list_empty(&bch_cache_sets) &&
1848 list_empty(&uncached_devices))
1849 goto out;
1850
1851 pr_info("Stopping all devices:");
1852
1853 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1854 bch_cache_set_stop(c);
1855
1856 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
1857 bcache_device_stop(&dc->disk);
1858
1859 /* What's a condition variable? */
1860 while (1) {
1861 long timeout = start + 2 * HZ - jiffies;
1862
1863 stopped = list_empty(&bch_cache_sets) &&
1864 list_empty(&uncached_devices);
1865
1866 if (timeout < 0 || stopped)
1867 break;
1868
1869 prepare_to_wait(&unregister_wait, &wait,
1870 TASK_UNINTERRUPTIBLE);
1871
1872 mutex_unlock(&bch_register_lock);
1873 schedule_timeout(timeout);
1874 mutex_lock(&bch_register_lock);
1875 }
1876
1877 finish_wait(&unregister_wait, &wait);
1878
1879 if (stopped)
1880 pr_info("All devices stopped");
1881 else
1882 pr_notice("Timeout waiting for devices to be closed");
1883out:
1884 mutex_unlock(&bch_register_lock);
1885 }
1886
1887 return NOTIFY_DONE;
1888}
1889
1890static struct notifier_block reboot = {
1891 .notifier_call = bcache_reboot,
1892 .priority = INT_MAX, /* before any real devices */
1893};
1894
1895static void bcache_exit(void)
1896{
1897 bch_debug_exit();
1898 bch_writeback_exit();
1899 bch_request_exit();
1900 bch_btree_exit();
1901 if (bcache_kobj)
1902 kobject_put(bcache_kobj);
1903 if (bcache_wq)
1904 destroy_workqueue(bcache_wq);
1905 unregister_blkdev(bcache_major, "bcache");
1906 unregister_reboot_notifier(&reboot);
1907}
1908
1909static int __init bcache_init(void)
1910{
1911 static const struct attribute *files[] = {
1912 &ksysfs_register.attr,
1913 &ksysfs_register_quiet.attr,
1914 NULL
1915 };
1916
1917 mutex_init(&bch_register_lock);
1918 init_waitqueue_head(&unregister_wait);
1919 register_reboot_notifier(&reboot);
1920
1921 bcache_major = register_blkdev(0, "bcache");
1922 if (bcache_major < 0)
1923 return bcache_major;
1924
1925 if (!(bcache_wq = create_workqueue("bcache")) ||
1926 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
1927 sysfs_create_files(bcache_kobj, files) ||
1928 bch_btree_init() ||
1929 bch_request_init() ||
1930 bch_writeback_init() ||
1931 bch_debug_init(bcache_kobj))
1932 goto err;
1933
1934 return 0;
1935err:
1936 bcache_exit();
1937 return -ENOMEM;
1938}
1939
1940module_exit(bcache_exit);
1941module_init(bcache_init);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
new file mode 100644
index 000000000000..5c7e77073b1f
--- /dev/null
+++ b/drivers/md/bcache/sysfs.c
@@ -0,0 +1,817 @@
1/*
2 * bcache sysfs interfaces
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "sysfs.h"
10#include "btree.h"
11#include "request.h"
12
13#include <linux/sort.h>
14
15static const char * const cache_replacement_policies[] = {
16 "lru",
17 "fifo",
18 "random",
19 NULL
20};
21
22write_attribute(attach);
23write_attribute(detach);
24write_attribute(unregister);
25write_attribute(stop);
26write_attribute(clear_stats);
27write_attribute(trigger_gc);
28write_attribute(prune_cache);
29write_attribute(flash_vol_create);
30
31read_attribute(bucket_size);
32read_attribute(block_size);
33read_attribute(nbuckets);
34read_attribute(tree_depth);
35read_attribute(root_usage_percent);
36read_attribute(priority_stats);
37read_attribute(btree_cache_size);
38read_attribute(btree_cache_max_chain);
39read_attribute(cache_available_percent);
40read_attribute(written);
41read_attribute(btree_written);
42read_attribute(metadata_written);
43read_attribute(active_journal_entries);
44
45sysfs_time_stats_attribute(btree_gc, sec, ms);
46sysfs_time_stats_attribute(btree_split, sec, us);
47sysfs_time_stats_attribute(btree_sort, ms, us);
48sysfs_time_stats_attribute(btree_read, ms, us);
49sysfs_time_stats_attribute(try_harder, ms, us);
50
51read_attribute(btree_nodes);
52read_attribute(btree_used_percent);
53read_attribute(average_key_size);
54read_attribute(dirty_data);
55read_attribute(bset_tree_stats);
56
57read_attribute(state);
58read_attribute(cache_read_races);
59read_attribute(writeback_keys_done);
60read_attribute(writeback_keys_failed);
61read_attribute(io_errors);
62read_attribute(congested);
63rw_attribute(congested_read_threshold_us);
64rw_attribute(congested_write_threshold_us);
65
66rw_attribute(sequential_cutoff);
67rw_attribute(sequential_merge);
68rw_attribute(data_csum);
69rw_attribute(cache_mode);
70rw_attribute(writeback_metadata);
71rw_attribute(writeback_running);
72rw_attribute(writeback_percent);
73rw_attribute(writeback_delay);
74rw_attribute(writeback_rate);
75
76rw_attribute(writeback_rate_update_seconds);
77rw_attribute(writeback_rate_d_term);
78rw_attribute(writeback_rate_p_term_inverse);
79rw_attribute(writeback_rate_d_smooth);
80read_attribute(writeback_rate_debug);
81
82rw_attribute(synchronous);
83rw_attribute(journal_delay_ms);
84rw_attribute(discard);
85rw_attribute(running);
86rw_attribute(label);
87rw_attribute(readahead);
88rw_attribute(io_error_limit);
89rw_attribute(io_error_halflife);
90rw_attribute(verify);
91rw_attribute(key_merging_disabled);
92rw_attribute(gc_always_rewrite);
93rw_attribute(freelist_percent);
94rw_attribute(cache_replacement_policy);
95rw_attribute(btree_shrinker_disabled);
96rw_attribute(copy_gc_enabled);
97rw_attribute(size);
98
99SHOW(__bch_cached_dev)
100{
101 struct cached_dev *dc = container_of(kobj, struct cached_dev,
102 disk.kobj);
103 const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
104
105#define var(stat) (dc->stat)
106
107 if (attr == &sysfs_cache_mode)
108 return snprint_string_list(buf, PAGE_SIZE,
109 bch_cache_modes + 1,
110 BDEV_CACHE_MODE(&dc->sb));
111
112 sysfs_printf(data_csum, "%i", dc->disk.data_csum);
113 var_printf(verify, "%i");
114 var_printf(writeback_metadata, "%i");
115 var_printf(writeback_running, "%i");
116 var_print(writeback_delay);
117 var_print(writeback_percent);
118 sysfs_print(writeback_rate, dc->writeback_rate.rate);
119
120 var_print(writeback_rate_update_seconds);
121 var_print(writeback_rate_d_term);
122 var_print(writeback_rate_p_term_inverse);
123 var_print(writeback_rate_d_smooth);
124
125 if (attr == &sysfs_writeback_rate_debug) {
126 char dirty[20];
127 char derivative[20];
128 char target[20];
129 hprint(dirty,
130 atomic_long_read(&dc->disk.sectors_dirty) << 9);
131 hprint(derivative, dc->writeback_rate_derivative << 9);
132 hprint(target, dc->writeback_rate_target << 9);
133
134 return sprintf(buf,
135 "rate:\t\t%u\n"
136 "change:\t\t%i\n"
137 "dirty:\t\t%s\n"
138 "derivative:\t%s\n"
139 "target:\t\t%s\n",
140 dc->writeback_rate.rate,
141 dc->writeback_rate_change,
142 dirty, derivative, target);
143 }
144
145 sysfs_hprint(dirty_data,
146 atomic_long_read(&dc->disk.sectors_dirty) << 9);
147
148 var_printf(sequential_merge, "%i");
149 var_hprint(sequential_cutoff);
150 var_hprint(readahead);
151
152 sysfs_print(running, atomic_read(&dc->running));
153 sysfs_print(state, states[BDEV_STATE(&dc->sb)]);
154
155 if (attr == &sysfs_label) {
156 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
 157		buf[SB_LABEL_SIZE] = '\0';
158 strcat(buf, "\n");
159 return strlen(buf);
160 }
161
162#undef var
163 return 0;
164}
165SHOW_LOCKED(bch_cached_dev)
166
167STORE(__cached_dev)
168{
169 struct cached_dev *dc = container_of(kobj, struct cached_dev,
170 disk.kobj);
 171	ssize_t v = size;
172 struct cache_set *c;
173
174#define d_strtoul(var) sysfs_strtoul(var, dc->var)
175#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
176
177 sysfs_strtoul(data_csum, dc->disk.data_csum);
178 d_strtoul(verify);
179 d_strtoul(writeback_metadata);
180 d_strtoul(writeback_running);
181 d_strtoul(writeback_delay);
182 sysfs_strtoul_clamp(writeback_rate,
183 dc->writeback_rate.rate, 1, 1000000);
184 sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
185
186 d_strtoul(writeback_rate_update_seconds);
187 d_strtoul(writeback_rate_d_term);
188 d_strtoul(writeback_rate_p_term_inverse);
189 sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
190 dc->writeback_rate_p_term_inverse, 1, INT_MAX);
191 d_strtoul(writeback_rate_d_smooth);
192
193 d_strtoul(sequential_merge);
194 d_strtoi_h(sequential_cutoff);
195 d_strtoi_h(readahead);
196
197 if (attr == &sysfs_clear_stats)
198 bch_cache_accounting_clear(&dc->accounting);
199
200 if (attr == &sysfs_running &&
201 strtoul_or_return(buf))
202 bch_cached_dev_run(dc);
203
204 if (attr == &sysfs_cache_mode) {
205 ssize_t v = read_string_list(buf, bch_cache_modes + 1);
206
207 if (v < 0)
208 return v;
209
210 if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
211 SET_BDEV_CACHE_MODE(&dc->sb, v);
212 bch_write_bdev_super(dc, NULL);
213 }
214 }
215
216 if (attr == &sysfs_label) {
217 memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
218 bch_write_bdev_super(dc, NULL);
219 if (dc->disk.c) {
220 memcpy(dc->disk.c->uuids[dc->disk.id].label,
221 buf, SB_LABEL_SIZE);
222 bch_uuid_write(dc->disk.c);
223 }
224 }
225
226 if (attr == &sysfs_attach) {
227 if (parse_uuid(buf, dc->sb.set_uuid) < 16)
228 return -EINVAL;
229
230 list_for_each_entry(c, &bch_cache_sets, list) {
231 v = bch_cached_dev_attach(dc, c);
232 if (!v)
233 return size;
234 }
235
236 pr_err("Can't attach %s: cache set not found", buf);
237 size = v;
238 }
239
240 if (attr == &sysfs_detach && dc->disk.c)
241 bch_cached_dev_detach(dc);
242
243 if (attr == &sysfs_stop)
244 bcache_device_stop(&dc->disk);
245
246 return size;
247}
248
249STORE(bch_cached_dev)
250{
251 struct cached_dev *dc = container_of(kobj, struct cached_dev,
252 disk.kobj);
253
254 mutex_lock(&bch_register_lock);
255 size = __cached_dev_store(kobj, attr, buf, size);
256
257 if (attr == &sysfs_writeback_running)
258 bch_writeback_queue(dc);
259
260 if (attr == &sysfs_writeback_percent)
261 schedule_delayed_work(&dc->writeback_rate_update,
262 dc->writeback_rate_update_seconds * HZ);
263
264 mutex_unlock(&bch_register_lock);
265 return size;
266}
267
268static struct attribute *bch_cached_dev_files[] = {
269 &sysfs_attach,
270 &sysfs_detach,
271 &sysfs_stop,
272#if 0
273 &sysfs_data_csum,
274#endif
275 &sysfs_cache_mode,
276 &sysfs_writeback_metadata,
277 &sysfs_writeback_running,
278 &sysfs_writeback_delay,
279 &sysfs_writeback_percent,
280 &sysfs_writeback_rate,
281 &sysfs_writeback_rate_update_seconds,
282 &sysfs_writeback_rate_d_term,
283 &sysfs_writeback_rate_p_term_inverse,
284 &sysfs_writeback_rate_d_smooth,
285 &sysfs_writeback_rate_debug,
286 &sysfs_dirty_data,
287 &sysfs_sequential_cutoff,
288 &sysfs_sequential_merge,
289 &sysfs_clear_stats,
290 &sysfs_running,
291 &sysfs_state,
292 &sysfs_label,
293 &sysfs_readahead,
294#ifdef CONFIG_BCACHE_DEBUG
295 &sysfs_verify,
296#endif
297 NULL
298};
299KTYPE(bch_cached_dev);
300
301SHOW(bch_flash_dev)
302{
303 struct bcache_device *d = container_of(kobj, struct bcache_device,
304 kobj);
305 struct uuid_entry *u = &d->c->uuids[d->id];
306
307 sysfs_printf(data_csum, "%i", d->data_csum);
308 sysfs_hprint(size, u->sectors << 9);
309
310 if (attr == &sysfs_label) {
311 memcpy(buf, u->label, SB_LABEL_SIZE);
312 buf[SB_LABEL_SIZE + 1] = '\0';
313 strcat(buf, "\n");
314 return strlen(buf);
315 }
316
317 return 0;
318}
319
320STORE(__bch_flash_dev)
321{
322 struct bcache_device *d = container_of(kobj, struct bcache_device,
323 kobj);
324 struct uuid_entry *u = &d->c->uuids[d->id];
325
326 sysfs_strtoul(data_csum, d->data_csum);
327
328 if (attr == &sysfs_size) {
329 uint64_t v;
330 strtoi_h_or_return(buf, v);
331
332 u->sectors = v >> 9;
333 bch_uuid_write(d->c);
334 set_capacity(d->disk, u->sectors);
335 }
336
337 if (attr == &sysfs_label) {
338 memcpy(u->label, buf, SB_LABEL_SIZE);
339 bch_uuid_write(d->c);
340 }
341
342 if (attr == &sysfs_unregister) {
343 atomic_set(&d->detaching, 1);
344 bcache_device_stop(d);
345 }
346
347 return size;
348}
349STORE_LOCKED(bch_flash_dev)
350
351static struct attribute *bch_flash_dev_files[] = {
352 &sysfs_unregister,
353#if 0
354 &sysfs_data_csum,
355#endif
356 &sysfs_label,
357 &sysfs_size,
358 NULL
359};
360KTYPE(bch_flash_dev);
361
362SHOW(__bch_cache_set)
363{
364 unsigned root_usage(struct cache_set *c)
365 {
366 unsigned bytes = 0;
367 struct bkey *k;
368 struct btree *b;
369 struct btree_iter iter;
370
371 goto lock_root;
372
373 do {
374 rw_unlock(false, b);
375lock_root:
376 b = c->root;
377 rw_lock(false, b, b->level);
378 } while (b != c->root);
379
380 for_each_key_filter(b, k, &iter, bch_ptr_bad)
381 bytes += bkey_bytes(k);
382
383 rw_unlock(false, b);
384
385 return (bytes * 100) / btree_bytes(c);
386 }
387
388 size_t cache_size(struct cache_set *c)
389 {
390 size_t ret = 0;
391 struct btree *b;
392
393 mutex_lock(&c->bucket_lock);
394 list_for_each_entry(b, &c->btree_cache, list)
395 ret += 1 << (b->page_order + PAGE_SHIFT);
396
397 mutex_unlock(&c->bucket_lock);
398 return ret;
399 }
400
401 unsigned cache_max_chain(struct cache_set *c)
402 {
403 unsigned ret = 0;
404 struct hlist_head *h;
405
406 mutex_lock(&c->bucket_lock);
407
408 for (h = c->bucket_hash;
409 h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
410 h++) {
411 unsigned i = 0;
412 struct hlist_node *p;
413
414 hlist_for_each(p, h)
415 i++;
416
417 ret = max(ret, i);
418 }
419
420 mutex_unlock(&c->bucket_lock);
421 return ret;
422 }
423
424 unsigned btree_used(struct cache_set *c)
425 {
426 return div64_u64(c->gc_stats.key_bytes * 100,
427 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
428 }
429
430 unsigned average_key_size(struct cache_set *c)
431 {
432 return c->gc_stats.nkeys
433 ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
434 : 0;
435 }
436
437 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
438
439 sysfs_print(synchronous, CACHE_SYNC(&c->sb));
440 sysfs_print(journal_delay_ms, c->journal_delay_ms);
441 sysfs_hprint(bucket_size, bucket_bytes(c));
442 sysfs_hprint(block_size, block_bytes(c));
443 sysfs_print(tree_depth, c->root->level);
444 sysfs_print(root_usage_percent, root_usage(c));
445
446 sysfs_hprint(btree_cache_size, cache_size(c));
447 sysfs_print(btree_cache_max_chain, cache_max_chain(c));
448 sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
449
450 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
451 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
452 sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
453 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
454 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
455
456 sysfs_print(btree_used_percent, btree_used(c));
457 sysfs_print(btree_nodes, c->gc_stats.nodes);
458 sysfs_hprint(dirty_data, c->gc_stats.dirty);
459 sysfs_hprint(average_key_size, average_key_size(c));
460
461 sysfs_print(cache_read_races,
462 atomic_long_read(&c->cache_read_races));
463
464 sysfs_print(writeback_keys_done,
465 atomic_long_read(&c->writeback_keys_done));
466 sysfs_print(writeback_keys_failed,
467 atomic_long_read(&c->writeback_keys_failed));
468
469 /* See count_io_errors for why 88 */
470 sysfs_print(io_error_halflife, c->error_decay * 88);
471 sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
472
473 sysfs_hprint(congested,
474 ((uint64_t) bch_get_congested(c)) << 9);
475 sysfs_print(congested_read_threshold_us,
476 c->congested_read_threshold_us);
477 sysfs_print(congested_write_threshold_us,
478 c->congested_write_threshold_us);
479
480 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
481 sysfs_printf(verify, "%i", c->verify);
482 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
483 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
484 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
485 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
486
487 if (attr == &sysfs_bset_tree_stats)
488 return bch_bset_print_stats(c, buf);
489
490 return 0;
491}
492SHOW_LOCKED(bch_cache_set)
493
494STORE(__bch_cache_set)
495{
496 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
497
498 if (attr == &sysfs_unregister)
499 bch_cache_set_unregister(c);
500
501 if (attr == &sysfs_stop)
502 bch_cache_set_stop(c);
503
504 if (attr == &sysfs_synchronous) {
505 bool sync = strtoul_or_return(buf);
506
507 if (sync != CACHE_SYNC(&c->sb)) {
508 SET_CACHE_SYNC(&c->sb, sync);
509 bcache_write_super(c);
510 }
511 }
512
513 if (attr == &sysfs_flash_vol_create) {
514 int r;
515 uint64_t v;
516 strtoi_h_or_return(buf, v);
517
518 r = bch_flash_dev_create(c, v);
519 if (r)
520 return r;
521 }
522
523 if (attr == &sysfs_clear_stats) {
524 atomic_long_set(&c->writeback_keys_done, 0);
525 atomic_long_set(&c->writeback_keys_failed, 0);
526
527 memset(&c->gc_stats, 0, sizeof(struct gc_stat));
528 bch_cache_accounting_clear(&c->accounting);
529 }
530
531 if (attr == &sysfs_trigger_gc)
532 bch_queue_gc(c);
533
534 if (attr == &sysfs_prune_cache) {
535 struct shrink_control sc;
536 sc.gfp_mask = GFP_KERNEL;
537 sc.nr_to_scan = strtoul_or_return(buf);
538 c->shrink.shrink(&c->shrink, &sc);
539 }
540
541 sysfs_strtoul(congested_read_threshold_us,
542 c->congested_read_threshold_us);
543 sysfs_strtoul(congested_write_threshold_us,
544 c->congested_write_threshold_us);
545
546 if (attr == &sysfs_io_error_limit)
547 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
548
549 /* See count_io_errors() for why 88 */
550 if (attr == &sysfs_io_error_halflife)
551 c->error_decay = strtoul_or_return(buf) / 88;
552
553 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
554 sysfs_strtoul(verify, c->verify);
555 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
556 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
557 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
558 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
559
560 return size;
561}
562STORE_LOCKED(bch_cache_set)
563
564SHOW(bch_cache_set_internal)
565{
566 struct cache_set *c = container_of(kobj, struct cache_set, internal);
567 return bch_cache_set_show(&c->kobj, attr, buf);
568}
569
570STORE(bch_cache_set_internal)
571{
572 struct cache_set *c = container_of(kobj, struct cache_set, internal);
573 return bch_cache_set_store(&c->kobj, attr, buf, size);
574}
575
576static void bch_cache_set_internal_release(struct kobject *k)
577{
578}
579
580static struct attribute *bch_cache_set_files[] = {
581 &sysfs_unregister,
582 &sysfs_stop,
583 &sysfs_synchronous,
584 &sysfs_journal_delay_ms,
585 &sysfs_flash_vol_create,
586
587 &sysfs_bucket_size,
588 &sysfs_block_size,
589 &sysfs_tree_depth,
590 &sysfs_root_usage_percent,
591 &sysfs_btree_cache_size,
592 &sysfs_cache_available_percent,
593
594 &sysfs_average_key_size,
595 &sysfs_dirty_data,
596
597 &sysfs_io_error_limit,
598 &sysfs_io_error_halflife,
599 &sysfs_congested,
600 &sysfs_congested_read_threshold_us,
601 &sysfs_congested_write_threshold_us,
602 &sysfs_clear_stats,
603 NULL
604};
605KTYPE(bch_cache_set);
606
607static struct attribute *bch_cache_set_internal_files[] = {
608 &sysfs_active_journal_entries,
609
610 sysfs_time_stats_attribute_list(btree_gc, sec, ms)
611 sysfs_time_stats_attribute_list(btree_split, sec, us)
612 sysfs_time_stats_attribute_list(btree_sort, ms, us)
613 sysfs_time_stats_attribute_list(btree_read, ms, us)
614 sysfs_time_stats_attribute_list(try_harder, ms, us)
615
616 &sysfs_btree_nodes,
617 &sysfs_btree_used_percent,
618 &sysfs_btree_cache_max_chain,
619
620 &sysfs_bset_tree_stats,
621 &sysfs_cache_read_races,
622 &sysfs_writeback_keys_done,
623 &sysfs_writeback_keys_failed,
624
625 &sysfs_trigger_gc,
626 &sysfs_prune_cache,
627#ifdef CONFIG_BCACHE_DEBUG
628 &sysfs_verify,
629 &sysfs_key_merging_disabled,
630#endif
631 &sysfs_gc_always_rewrite,
632 &sysfs_btree_shrinker_disabled,
633 &sysfs_copy_gc_enabled,
634 NULL
635};
636KTYPE(bch_cache_set_internal);
637
638SHOW(__bch_cache)
639{
640 struct cache *ca = container_of(kobj, struct cache, kobj);
641
642 sysfs_hprint(bucket_size, bucket_bytes(ca));
643 sysfs_hprint(block_size, block_bytes(ca));
644 sysfs_print(nbuckets, ca->sb.nbuckets);
645 sysfs_print(discard, ca->discard);
646 sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
647 sysfs_hprint(btree_written,
648 atomic_long_read(&ca->btree_sectors_written) << 9);
649 sysfs_hprint(metadata_written,
650 (atomic_long_read(&ca->meta_sectors_written) +
651 atomic_long_read(&ca->btree_sectors_written)) << 9);
652
653 sysfs_print(io_errors,
654 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
655
656 sysfs_print(freelist_percent, ca->free.size * 100 /
657 ((size_t) ca->sb.nbuckets));
658
659 if (attr == &sysfs_cache_replacement_policy)
660 return snprint_string_list(buf, PAGE_SIZE,
661 cache_replacement_policies,
662 CACHE_REPLACEMENT(&ca->sb));
663
664 if (attr == &sysfs_priority_stats) {
665 int cmp(const void *l, const void *r)
666 { return *((uint16_t *) r) - *((uint16_t *) l); }
667
668 /* Number of quantiles we compute */
669 const unsigned nq = 31;
670
671 size_t n = ca->sb.nbuckets, i, unused, btree;
672 uint64_t sum = 0;
673 uint16_t q[nq], *p, *cached;
674 ssize_t ret;
675
676 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
677 if (!p)
678 return -ENOMEM;
679
680 mutex_lock(&ca->set->bucket_lock);
681 for (i = ca->sb.first_bucket; i < n; i++)
682 p[i] = ca->buckets[i].prio;
683 mutex_unlock(&ca->set->bucket_lock);
684
685 sort(p, n, sizeof(uint16_t), cmp, NULL);
686
687 while (n &&
688 !cached[n - 1])
689 --n;
690
691 unused = ca->sb.nbuckets - n;
692
693 while (cached < p + n &&
694 *cached == BTREE_PRIO)
695 cached++;
696
697 btree = cached - p;
698 n -= btree;
699
700 for (i = 0; i < n; i++)
701 sum += INITIAL_PRIO - cached[i];
702
703 if (n)
704 do_div(sum, n);
705
706 for (i = 0; i < nq; i++)
707 q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
708
709 vfree(p);
710
711 ret = snprintf(buf, PAGE_SIZE,
712 "Unused: %zu%%\n"
713 "Metadata: %zu%%\n"
714 "Average: %llu\n"
715 "Sectors per Q: %zu\n"
716 "Quantiles: [",
717 unused * 100 / (size_t) ca->sb.nbuckets,
718 btree * 100 / (size_t) ca->sb.nbuckets, sum,
719 n * ca->sb.bucket_size / (nq + 1));
720
721 for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
722 ret += snprintf(buf + ret, PAGE_SIZE - ret,
723 i < nq - 1 ? "%u " : "%u]\n", q[i]);
724
725 buf[PAGE_SIZE - 1] = '\0';
726 return ret;
727 }
728
729 return 0;
730}
731SHOW_LOCKED(bch_cache)
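The priority_stats branch above sorts every bucket priority and then samples nq = 31 evenly spaced quantiles with the index formula n * (i + 1) / (nq + 1), reporting INITIAL_PRIO minus each sampled value. A standalone demonstration of the sampling itself, with nothing bcache-specific:

/* Demo of the quantile sampling used by priority_stats. */
#include <stdio.h>

int main(void)
{
	/* 12 priorities, already sorted in descending order as in the sysfs code */
	unsigned sorted[] = { 96, 90, 85, 80, 74, 70, 61, 55, 43, 30, 22, 10 };
	unsigned n = 12, nq = 3, i;

	for (i = 0; i < nq; i++)
		printf("quantile %u: index %u -> priority %u\n",
		       i, n * (i + 1) / (nq + 1), sorted[n * (i + 1) / (nq + 1)]);
	return 0;
}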
732
733STORE(__bch_cache)
734{
735 struct cache *ca = container_of(kobj, struct cache, kobj);
736
737 if (attr == &sysfs_discard) {
738 bool v = strtoul_or_return(buf);
739
740 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
741 ca->discard = v;
742
743 if (v != CACHE_DISCARD(&ca->sb)) {
744 SET_CACHE_DISCARD(&ca->sb, v);
745 bcache_write_super(ca->set);
746 }
747 }
748
749 if (attr == &sysfs_cache_replacement_policy) {
750 ssize_t v = read_string_list(buf, cache_replacement_policies);
751
752 if (v < 0)
753 return v;
754
755 if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
756 mutex_lock(&ca->set->bucket_lock);
757 SET_CACHE_REPLACEMENT(&ca->sb, v);
758 mutex_unlock(&ca->set->bucket_lock);
759
760 bcache_write_super(ca->set);
761 }
762 }
763
764 if (attr == &sysfs_freelist_percent) {
765 DECLARE_FIFO(long, free);
766 long i;
767 size_t p = strtoul_or_return(buf);
768
769 p = clamp_t(size_t,
770 ((size_t) ca->sb.nbuckets * p) / 100,
771 roundup_pow_of_two(ca->sb.nbuckets) >> 9,
772 ca->sb.nbuckets / 2);
773
774 if (!init_fifo_exact(&free, p, GFP_KERNEL))
775 return -ENOMEM;
776
777 mutex_lock(&ca->set->bucket_lock);
778
779 fifo_move(&free, &ca->free);
780 fifo_swap(&free, &ca->free);
781
782 mutex_unlock(&ca->set->bucket_lock);
783
784 while (fifo_pop(&free, i))
785 atomic_dec(&ca->buckets[i].pin);
786
787 free_fifo(&free);
788 }
789
790 if (attr == &sysfs_clear_stats) {
791 atomic_long_set(&ca->sectors_written, 0);
792 atomic_long_set(&ca->btree_sectors_written, 0);
793 atomic_long_set(&ca->meta_sectors_written, 0);
794 atomic_set(&ca->io_count, 0);
795 atomic_set(&ca->io_errors, 0);
796 }
797
798 return size;
799}
800STORE_LOCKED(bch_cache)
801
802static struct attribute *bch_cache_files[] = {
803 &sysfs_bucket_size,
804 &sysfs_block_size,
805 &sysfs_nbuckets,
806 &sysfs_priority_stats,
807 &sysfs_discard,
808 &sysfs_written,
809 &sysfs_btree_written,
810 &sysfs_metadata_written,
811 &sysfs_io_errors,
812 &sysfs_clear_stats,
813 &sysfs_freelist_percent,
814 &sysfs_cache_replacement_policy,
815 NULL
816};
817KTYPE(bch_cache);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
new file mode 100644
index 000000000000..34e4ba1184fe
--- /dev/null
+++ b/drivers/md/bcache/sysfs.h
@@ -0,0 +1,110 @@
1#ifndef _BCACHE_SYSFS_H_
2#define _BCACHE_SYSFS_H_
3
4#define KTYPE(type) \
5struct kobj_type type ## _ktype = { \
6 .release = type ## _release, \
7 .sysfs_ops = &((const struct sysfs_ops) { \
8 .show = type ## _show, \
9 .store = type ## _store \
10 }), \
11 .default_attrs = type ## _files \
12}
13
14#define SHOW(fn) \
15static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
16 char *buf) \
17
18#define STORE(fn) \
19static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
20 const char *buf, size_t size) \
21
22#define SHOW_LOCKED(fn) \
23SHOW(fn) \
24{ \
25 ssize_t ret; \
26 mutex_lock(&bch_register_lock); \
27 ret = __ ## fn ## _show(kobj, attr, buf); \
28 mutex_unlock(&bch_register_lock); \
29 return ret; \
30}
31
32#define STORE_LOCKED(fn) \
33STORE(fn) \
34{ \
35 ssize_t ret; \
36 mutex_lock(&bch_register_lock); \
37 ret = __ ## fn ## _store(kobj, attr, buf, size); \
38 mutex_unlock(&bch_register_lock); \
39 return ret; \
40}
41
42#define __sysfs_attribute(_name, _mode) \
43 static struct attribute sysfs_##_name = \
44 { .name = #_name, .mode = _mode }
45
46#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
47#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
48#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
49
50#define sysfs_printf(file, fmt, ...) \
51do { \
52 if (attr == &sysfs_ ## file) \
53 return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
54} while (0)
55
56#define sysfs_print(file, var) \
57do { \
58 if (attr == &sysfs_ ## file) \
59 return snprint(buf, PAGE_SIZE, var); \
60} while (0)
61
62#define sysfs_hprint(file, val) \
63do { \
64 if (attr == &sysfs_ ## file) { \
65 ssize_t ret = hprint(buf, val); \
66 strcat(buf, "\n"); \
67 return ret + 1; \
68 } \
69} while (0)
70
71#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
72#define var_print(_var) sysfs_print(_var, var(_var))
73#define var_hprint(_var) sysfs_hprint(_var, var(_var))
74
75#define sysfs_strtoul(file, var) \
76do { \
77 if (attr == &sysfs_ ## file) \
78 return strtoul_safe(buf, var) ?: (ssize_t) size; \
79} while (0)
80
81#define sysfs_strtoul_clamp(file, var, min, max) \
82do { \
83 if (attr == &sysfs_ ## file) \
84 return strtoul_safe_clamp(buf, var, min, max) \
85 ?: (ssize_t) size; \
86} while (0)
87
88#define strtoul_or_return(cp) \
89({ \
90 unsigned long _v; \
91 int _r = kstrtoul(cp, 10, &_v); \
92 if (_r) \
93 return _r; \
94 _v; \
95})
96
97#define strtoi_h_or_return(cp, v) \
98do { \
99 int _r = strtoi_h(cp, &v); \
100 if (_r) \
101 return _r; \
102} while (0)
103
104#define sysfs_hatoi(file, var) \
105do { \
106 if (attr == &sysfs_ ## file) \
107 return strtoi_h(buf, &var) ?: (ssize_t) size; \
108} while (0)
109
110#endif /* _BCACHE_SYSFS_H_ */
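
For illustration only (not part of the patch), a minimal sketch of how these macros combine for a hypothetical kobject type "foo" with a single read/write attribute. It assumes the code lives inside bcache (bch_register_lock is the mutex the *_LOCKED wrappers take), and "example", example_val and foo_release() are made-up names:

rw_attribute(example);

static unsigned example_val;

SHOW(__foo)
{
	sysfs_print(example, example_val);
	return 0;
}
SHOW_LOCKED(foo)

STORE(__foo)
{
	sysfs_strtoul(example, example_val);
	return size;
}
STORE_LOCKED(foo)

static void foo_release(struct kobject *kobj)
{
}

static struct attribute *foo_files[] = {
	&sysfs_example,
	NULL
};
KTYPE(foo);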
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
new file mode 100644
index 000000000000..983f9bb411bc
--- /dev/null
+++ b/drivers/md/bcache/trace.c
@@ -0,0 +1,26 @@
1#include "bcache.h"
2#include "btree.h"
3#include "request.h"
4
5#include <linux/module.h>
6
7#define CREATE_TRACE_POINTS
8#include <trace/events/bcache.h>
9
10EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
13EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
16EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
20EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
22EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
new file mode 100644
index 000000000000..dcec2e4f84ad
--- /dev/null
+++ b/drivers/md/bcache/util.c
@@ -0,0 +1,389 @@
1/*
  2 * random utility code, for bcache but in theory not specific to bcache
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include <linux/bio.h>
9#include <linux/blkdev.h>
10#include <linux/ctype.h>
11#include <linux/debugfs.h>
12#include <linux/module.h>
13#include <linux/seq_file.h>
14#include <linux/types.h>
15
16#include "util.h"
17
18#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
19#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
20
21#define STRTO_H(name, type) \
22int name ## _h(const char *cp, type *res) \
23{ \
24 int u = 0; \
25 char *e; \
26 type i = simple_ ## name(cp, &e, 10); \
27 \
28 switch (tolower(*e)) { \
29 default: \
30 return -EINVAL; \
31 case 'y': \
32 case 'z': \
33 u++; \
34 case 'e': \
35 u++; \
36 case 'p': \
37 u++; \
38 case 't': \
39 u++; \
40 case 'g': \
41 u++; \
42 case 'm': \
43 u++; \
44 case 'k': \
45 u++; \
46 if (e++ == cp) \
47 return -EINVAL; \
48 case '\n': \
49 case '\0': \
50 if (*e == '\n') \
51 e++; \
52 } \
53 \
54 if (*e) \
55 return -EINVAL; \
56 \
57 while (u--) { \
58 if ((type) ~0 > 0 && \
59 (type) ~0 / 1024 <= i) \
60 return -EINVAL; \
61 if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
62 (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
63 return -EINVAL; \
64 i *= 1024; \
65 } \
66 \
67 *res = i; \
68 return 0; \
69} \
70EXPORT_SYMBOL_GPL(name ## _h);
71
72STRTO_H(strtoint, int)
73STRTO_H(strtouint, unsigned int)
74STRTO_H(strtoll, long long)
75STRTO_H(strtoull, unsigned long long)
76
77ssize_t hprint(char *buf, int64_t v)
78{
79 static const char units[] = "?kMGTPEZY";
80 char dec[3] = "";
81 int u, t = 0;
82
83 for (u = 0; v >= 1024 || v <= -1024; u++) {
84 t = v & ~(~0 << 10);
85 v >>= 10;
86 }
87
88 if (!u)
 89		return sprintf(buf, "%lli", v);
90
91 if (v < 100 && v > -100)
92 sprintf(dec, ".%i", t / 100);
93
94 return sprintf(buf, "%lli%s%c", v, dec, units[u]);
95}
96EXPORT_SYMBOL_GPL(hprint);
97
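For illustration only (not part of the patch), a sketch of how the size-suffix helpers round-trip a value; buf is assumed to be large enough for the formatted result:

static void __maybe_unused size_suffix_example(char *buf)
{
	unsigned long long bytes;

	/* "8k" (with the trailing newline a sysfs write usually carries)
	 * parses as 8 * 1024; hprint() formats it back as "8.0k". */
	if (!strtoull_h("8k\n", &bytes))
		hprint(buf, bytes);
}
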
98ssize_t snprint_string_list(char *buf, size_t size, const char * const list[],
99 size_t selected)
100{
101 char *out = buf;
102 size_t i;
103
104 for (i = 0; list[i]; i++)
105 out += snprintf(out, buf + size - out,
106 i == selected ? "[%s] " : "%s ", list[i]);
107
108 out[-1] = '\n';
109 return out - buf;
110}
111EXPORT_SYMBOL_GPL(snprint_string_list);
112
113ssize_t read_string_list(const char *buf, const char * const list[])
114{
115 size_t i;
116 char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
117 if (!d)
118 return -ENOMEM;
119
120 s = strim(d);
121
122 for (i = 0; list[i]; i++)
123 if (!strcmp(list[i], s))
124 break;
125
126 kfree(d);
127
128 if (!list[i])
129 return -EINVAL;
130
131 return i;
132}
133EXPORT_SYMBOL_GPL(read_string_list);
134
135bool is_zero(const char *p, size_t n)
136{
137 size_t i;
138
139 for (i = 0; i < n; i++)
140 if (p[i])
141 return false;
142 return true;
143}
144EXPORT_SYMBOL_GPL(is_zero);
145
146int parse_uuid(const char *s, char *uuid)
147{
148 size_t i, j, x;
149 memset(uuid, 0, 16);
150
151 for (i = 0, j = 0;
152 i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
153 i++) {
154 x = s[i] | 32;
155
156 switch (x) {
157 case '0'...'9':
158 x -= '0';
159 break;
160 case 'a'...'f':
161 x -= 'a' - 10;
162 break;
163 default:
164 continue;
165 }
166
167 if (!(j & 1))
168 x <<= 4;
169 uuid[j++ >> 1] |= x;
170 }
171 return i;
172}
173EXPORT_SYMBOL_GPL(parse_uuid);
174
175void time_stats_update(struct time_stats *stats, uint64_t start_time)
176{
177 uint64_t now = local_clock();
178 uint64_t duration = time_after64(now, start_time)
179 ? now - start_time : 0;
180 uint64_t last = time_after64(now, stats->last)
181 ? now - stats->last : 0;
182
183 stats->max_duration = max(stats->max_duration, duration);
184
185 if (stats->last) {
186 ewma_add(stats->average_duration, duration, 8, 8);
187
188 if (stats->average_frequency)
189 ewma_add(stats->average_frequency, last, 8, 8);
190 else
191 stats->average_frequency = last << 8;
192 } else {
193 stats->average_duration = duration << 8;
194 }
195
196 stats->last = now ?: 1;
197}
198EXPORT_SYMBOL_GPL(time_stats_update);
199
200unsigned next_delay(struct ratelimit *d, uint64_t done)
201{
202 uint64_t now = local_clock();
203
204 d->next += div_u64(done, d->rate);
205
206 return time_after64(d->next, now)
207 ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
208 : 0;
209}
210EXPORT_SYMBOL_GPL(next_delay);
211
212void bio_map(struct bio *bio, void *base)
213{
214 size_t size = bio->bi_size;
215 struct bio_vec *bv = bio->bi_io_vec;
216
217 BUG_ON(!bio->bi_size);
218 BUG_ON(bio->bi_vcnt);
219
220 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
221 goto start;
222
223 for (; size; bio->bi_vcnt++, bv++) {
224 bv->bv_offset = 0;
225start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
226 size);
227 if (base) {
228 bv->bv_page = is_vmalloc_addr(base)
229 ? vmalloc_to_page(base)
230 : virt_to_page(base);
231
232 base += bv->bv_len;
233 }
234
235 size -= bv->bv_len;
236 }
237}
238EXPORT_SYMBOL_GPL(bio_map);
239
240int bio_alloc_pages(struct bio *bio, gfp_t gfp)
241{
242 int i;
243 struct bio_vec *bv;
244
245 bio_for_each_segment(bv, bio, i) {
246 bv->bv_page = alloc_page(gfp);
247 if (!bv->bv_page) {
248 while (bv-- != bio->bi_io_vec + bio->bi_idx)
249 __free_page(bv->bv_page);
250 return -ENOMEM;
251 }
252 }
253
254 return 0;
255}
256EXPORT_SYMBOL_GPL(bio_alloc_pages);
257
258/*
259 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
260 * use permitted, subject to terms of PostgreSQL license; see.)
261
262 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
263 * usual sort of implementation. (See Ross Williams' excellent introduction
264 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
265 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
266 * If we have no working 64-bit type, then fake it with two 32-bit registers.
267 *
268 * The present implementation is a normal (not "reflected", in Williams'
269 * terms) 64-bit CRC, using initial all-ones register contents and a final
270 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
271 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
272 *
273 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
274 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
275 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
276 * x^7 + x^4 + x + 1
277*/
278
279static const uint64_t crc_table[256] = {
280 0x0000000000000000, 0x42F0E1EBA9EA3693, 0x85E1C3D753D46D26,
281 0xC711223CFA3E5BB5, 0x493366450E42ECDF, 0x0BC387AEA7A8DA4C,
282 0xCCD2A5925D9681F9, 0x8E224479F47CB76A, 0x9266CC8A1C85D9BE,
283 0xD0962D61B56FEF2D, 0x17870F5D4F51B498, 0x5577EEB6E6BB820B,
284 0xDB55AACF12C73561, 0x99A54B24BB2D03F2, 0x5EB4691841135847,
285 0x1C4488F3E8F96ED4, 0x663D78FF90E185EF, 0x24CD9914390BB37C,
286 0xE3DCBB28C335E8C9, 0xA12C5AC36ADFDE5A, 0x2F0E1EBA9EA36930,
287 0x6DFEFF5137495FA3, 0xAAEFDD6DCD770416, 0xE81F3C86649D3285,
288 0xF45BB4758C645C51, 0xB6AB559E258E6AC2, 0x71BA77A2DFB03177,
289 0x334A9649765A07E4, 0xBD68D2308226B08E, 0xFF9833DB2BCC861D,
290 0x388911E7D1F2DDA8, 0x7A79F00C7818EB3B, 0xCC7AF1FF21C30BDE,
291 0x8E8A101488293D4D, 0x499B3228721766F8, 0x0B6BD3C3DBFD506B,
292 0x854997BA2F81E701, 0xC7B97651866BD192, 0x00A8546D7C558A27,
293 0x4258B586D5BFBCB4, 0x5E1C3D753D46D260, 0x1CECDC9E94ACE4F3,
294 0xDBFDFEA26E92BF46, 0x990D1F49C77889D5, 0x172F5B3033043EBF,
295 0x55DFBADB9AEE082C, 0x92CE98E760D05399, 0xD03E790CC93A650A,
296 0xAA478900B1228E31, 0xE8B768EB18C8B8A2, 0x2FA64AD7E2F6E317,
297 0x6D56AB3C4B1CD584, 0xE374EF45BF6062EE, 0xA1840EAE168A547D,
298 0x66952C92ECB40FC8, 0x2465CD79455E395B, 0x3821458AADA7578F,
299 0x7AD1A461044D611C, 0xBDC0865DFE733AA9, 0xFF3067B657990C3A,
300 0x711223CFA3E5BB50, 0x33E2C2240A0F8DC3, 0xF4F3E018F031D676,
301 0xB60301F359DBE0E5, 0xDA050215EA6C212F, 0x98F5E3FE438617BC,
302 0x5FE4C1C2B9B84C09, 0x1D14202910527A9A, 0x93366450E42ECDF0,
303 0xD1C685BB4DC4FB63, 0x16D7A787B7FAA0D6, 0x5427466C1E109645,
304 0x4863CE9FF6E9F891, 0x0A932F745F03CE02, 0xCD820D48A53D95B7,
305 0x8F72ECA30CD7A324, 0x0150A8DAF8AB144E, 0x43A04931514122DD,
306 0x84B16B0DAB7F7968, 0xC6418AE602954FFB, 0xBC387AEA7A8DA4C0,
307 0xFEC89B01D3679253, 0x39D9B93D2959C9E6, 0x7B2958D680B3FF75,
308 0xF50B1CAF74CF481F, 0xB7FBFD44DD257E8C, 0x70EADF78271B2539,
309 0x321A3E938EF113AA, 0x2E5EB66066087D7E, 0x6CAE578BCFE24BED,
310 0xABBF75B735DC1058, 0xE94F945C9C3626CB, 0x676DD025684A91A1,
311 0x259D31CEC1A0A732, 0xE28C13F23B9EFC87, 0xA07CF2199274CA14,
312 0x167FF3EACBAF2AF1, 0x548F120162451C62, 0x939E303D987B47D7,
313 0xD16ED1D631917144, 0x5F4C95AFC5EDC62E, 0x1DBC74446C07F0BD,
314 0xDAAD56789639AB08, 0x985DB7933FD39D9B, 0x84193F60D72AF34F,
315 0xC6E9DE8B7EC0C5DC, 0x01F8FCB784FE9E69, 0x43081D5C2D14A8FA,
316 0xCD2A5925D9681F90, 0x8FDAB8CE70822903, 0x48CB9AF28ABC72B6,
317 0x0A3B7B1923564425, 0x70428B155B4EAF1E, 0x32B26AFEF2A4998D,
318 0xF5A348C2089AC238, 0xB753A929A170F4AB, 0x3971ED50550C43C1,
319 0x7B810CBBFCE67552, 0xBC902E8706D82EE7, 0xFE60CF6CAF321874,
320 0xE224479F47CB76A0, 0xA0D4A674EE214033, 0x67C58448141F1B86,
321 0x253565A3BDF52D15, 0xAB1721DA49899A7F, 0xE9E7C031E063ACEC,
322 0x2EF6E20D1A5DF759, 0x6C0603E6B3B7C1CA, 0xF6FAE5C07D3274CD,
323 0xB40A042BD4D8425E, 0x731B26172EE619EB, 0x31EBC7FC870C2F78,
324 0xBFC9838573709812, 0xFD39626EDA9AAE81, 0x3A28405220A4F534,
325 0x78D8A1B9894EC3A7, 0x649C294A61B7AD73, 0x266CC8A1C85D9BE0,
326 0xE17DEA9D3263C055, 0xA38D0B769B89F6C6, 0x2DAF4F0F6FF541AC,
327 0x6F5FAEE4C61F773F, 0xA84E8CD83C212C8A, 0xEABE6D3395CB1A19,
328 0x90C79D3FEDD3F122, 0xD2377CD44439C7B1, 0x15265EE8BE079C04,
329 0x57D6BF0317EDAA97, 0xD9F4FB7AE3911DFD, 0x9B041A914A7B2B6E,
330 0x5C1538ADB04570DB, 0x1EE5D94619AF4648, 0x02A151B5F156289C,
331 0x4051B05E58BC1E0F, 0x87409262A28245BA, 0xC5B073890B687329,
332 0x4B9237F0FF14C443, 0x0962D61B56FEF2D0, 0xCE73F427ACC0A965,
333 0x8C8315CC052A9FF6, 0x3A80143F5CF17F13, 0x7870F5D4F51B4980,
334 0xBF61D7E80F251235, 0xFD913603A6CF24A6, 0x73B3727A52B393CC,
335 0x31439391FB59A55F, 0xF652B1AD0167FEEA, 0xB4A25046A88DC879,
336 0xA8E6D8B54074A6AD, 0xEA16395EE99E903E, 0x2D071B6213A0CB8B,
337 0x6FF7FA89BA4AFD18, 0xE1D5BEF04E364A72, 0xA3255F1BE7DC7CE1,
338 0x64347D271DE22754, 0x26C49CCCB40811C7, 0x5CBD6CC0CC10FAFC,
339 0x1E4D8D2B65FACC6F, 0xD95CAF179FC497DA, 0x9BAC4EFC362EA149,
340 0x158E0A85C2521623, 0x577EEB6E6BB820B0, 0x906FC95291867B05,
341 0xD29F28B9386C4D96, 0xCEDBA04AD0952342, 0x8C2B41A1797F15D1,
342 0x4B3A639D83414E64, 0x09CA82762AAB78F7, 0x87E8C60FDED7CF9D,
343 0xC51827E4773DF90E, 0x020905D88D03A2BB, 0x40F9E43324E99428,
344 0x2CFFE7D5975E55E2, 0x6E0F063E3EB46371, 0xA91E2402C48A38C4,
345 0xEBEEC5E96D600E57, 0x65CC8190991CB93D, 0x273C607B30F68FAE,
346 0xE02D4247CAC8D41B, 0xA2DDA3AC6322E288, 0xBE992B5F8BDB8C5C,
347 0xFC69CAB42231BACF, 0x3B78E888D80FE17A, 0x7988096371E5D7E9,
348 0xF7AA4D1A85996083, 0xB55AACF12C735610, 0x724B8ECDD64D0DA5,
349 0x30BB6F267FA73B36, 0x4AC29F2A07BFD00D, 0x08327EC1AE55E69E,
350 0xCF235CFD546BBD2B, 0x8DD3BD16FD818BB8, 0x03F1F96F09FD3CD2,
351 0x41011884A0170A41, 0x86103AB85A2951F4, 0xC4E0DB53F3C36767,
352 0xD8A453A01B3A09B3, 0x9A54B24BB2D03F20, 0x5D45907748EE6495,
353 0x1FB5719CE1045206, 0x919735E51578E56C, 0xD367D40EBC92D3FF,
354 0x1476F63246AC884A, 0x568617D9EF46BED9, 0xE085162AB69D5E3C,
355 0xA275F7C11F7768AF, 0x6564D5FDE549331A, 0x279434164CA30589,
356 0xA9B6706FB8DFB2E3, 0xEB46918411358470, 0x2C57B3B8EB0BDFC5,
357 0x6EA7525342E1E956, 0x72E3DAA0AA188782, 0x30133B4B03F2B111,
358 0xF7021977F9CCEAA4, 0xB5F2F89C5026DC37, 0x3BD0BCE5A45A6B5D,
359 0x79205D0E0DB05DCE, 0xBE317F32F78E067B, 0xFCC19ED95E6430E8,
360 0x86B86ED5267CDBD3, 0xC4488F3E8F96ED40, 0x0359AD0275A8B6F5,
361 0x41A94CE9DC428066, 0xCF8B0890283E370C, 0x8D7BE97B81D4019F,
362 0x4A6ACB477BEA5A2A, 0x089A2AACD2006CB9, 0x14DEA25F3AF9026D,
363 0x562E43B4931334FE, 0x913F6188692D6F4B, 0xD3CF8063C0C759D8,
364 0x5DEDC41A34BBEEB2, 0x1F1D25F19D51D821, 0xD80C07CD676F8394,
365 0x9AFCE626CE85B507
366};
367
368uint64_t crc64_update(uint64_t crc, const void *_data, size_t len)
369{
370 const unsigned char *data = _data;
371
372 while (len--) {
373 int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
374 crc = crc_table[i] ^ (crc << 8);
375 }
376
377 return crc;
378}
379EXPORT_SYMBOL(crc64_update);
380
381uint64_t crc64(const void *data, size_t len)
382{
383 uint64_t crc = 0xffffffffffffffff;
384
385 crc = crc64_update(crc, data, len);
386
387 return crc ^ 0xffffffffffffffff;
388}
389EXPORT_SYMBOL(crc64);
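
For illustration only (not part of the patch), a sketch showing that incremental use of crc64_update() (seed with all ones, invert at the end) matches the one-shot crc64():

static void __maybe_unused crc64_example(void)
{
	const char msg[] = "bcache";
	uint64_t csum = crc64(msg, sizeof(msg) - 1);
	uint64_t crc = ~0ULL;

	crc = crc64_update(crc, msg, 3);
	crc = crc64_update(crc, msg + 3, sizeof(msg) - 1 - 3);
	BUG_ON((crc ^ ~0ULL) != csum);
}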
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
new file mode 100644
index 000000000000..56705fdcc149
--- /dev/null
+++ b/drivers/md/bcache/util.h
@@ -0,0 +1,589 @@
1
2#ifndef _BCACHE_UTIL_H
3#define _BCACHE_UTIL_H
4
5#include <linux/errno.h>
6#include <linux/kernel.h>
7#include <linux/llist.h>
8#include <linux/ratelimit.h>
9#include <linux/vmalloc.h>
10#include <linux/workqueue.h>
11
12#include "closure.h"
13
14#define PAGE_SECTORS (PAGE_SIZE / 512)
15
16struct closure;
17
18#include <trace/events/bcache.h>
19
20#ifdef CONFIG_BCACHE_EDEBUG
21
22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
23#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
24
25#else /* EDEBUG */
26
27#define atomic_dec_bug(v) atomic_dec(v)
28#define atomic_inc_bug(v, i) atomic_inc(v)
29
30#endif
31
32#define BITMASK(name, type, field, offset, size) \
33static inline uint64_t name(const type *k) \
34{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
35 \
36static inline void SET_##name(type *k, uint64_t v) \
37{ \
38 k->field &= ~(~((uint64_t) ~0 << size) << offset); \
39 k->field |= v << offset; \
40}
41
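For illustration only (not part of the patch): BITMASK() is the pattern that generates accessor pairs such as the CACHE_DISCARD()/SET_CACHE_DISCARD() calls in the sysfs store code earlier in this patch. The struct, field layout and names below are made up:

struct example_sb {
	uint64_t	flags;
};

/* 1-bit flag at bit 0 and a 3-bit field at bits 1..3 of example_sb.flags */
BITMASK(EXAMPLE_DISCARD, struct example_sb, flags, 0, 1);
BITMASK(EXAMPLE_POLICY,  struct example_sb, flags, 1, 3);

/* SET_EXAMPLE_POLICY(&sb, 5); EXAMPLE_POLICY(&sb) now returns 5 */
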
42#define DECLARE_HEAP(type, name) \
43 struct { \
44 size_t size, used; \
45 type *data; \
46 } name
47
48#define init_heap(heap, _size, gfp) \
49({ \
50 size_t _bytes; \
51 (heap)->used = 0; \
52 (heap)->size = (_size); \
53 _bytes = (heap)->size * sizeof(*(heap)->data); \
54 (heap)->data = NULL; \
55 if (_bytes < KMALLOC_MAX_SIZE) \
56 (heap)->data = kmalloc(_bytes, (gfp)); \
57 if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \
58 (heap)->data = vmalloc(_bytes); \
59 (heap)->data; \
60})
61
62#define free_heap(heap) \
63do { \
64 if (is_vmalloc_addr((heap)->data)) \
65 vfree((heap)->data); \
66 else \
67 kfree((heap)->data); \
68 (heap)->data = NULL; \
69} while (0)
70
71#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
72
73#define heap_sift(h, i, cmp) \
74do { \
75 size_t _r, _j = i; \
76 \
77 for (; _j * 2 + 1 < (h)->used; _j = _r) { \
78 _r = _j * 2 + 1; \
79 if (_r + 1 < (h)->used && \
80 cmp((h)->data[_r], (h)->data[_r + 1])) \
81 _r++; \
82 \
83 if (cmp((h)->data[_r], (h)->data[_j])) \
84 break; \
85 heap_swap(h, _r, _j); \
86 } \
87} while (0)
88
89#define heap_sift_down(h, i, cmp) \
90do { \
91 while (i) { \
92 size_t p = (i - 1) / 2; \
93 if (cmp((h)->data[i], (h)->data[p])) \
94 break; \
95 heap_swap(h, i, p); \
96 i = p; \
97 } \
98} while (0)
99
100#define heap_add(h, d, cmp) \
101({ \
102 bool _r = !heap_full(h); \
103 if (_r) { \
104 size_t _i = (h)->used++; \
105 (h)->data[_i] = d; \
106 \
107 heap_sift_down(h, _i, cmp); \
108 heap_sift(h, _i, cmp); \
109 } \
110 _r; \
111})
112
113#define heap_pop(h, d, cmp) \
114({ \
115 bool _r = (h)->used; \
116 if (_r) { \
117 (d) = (h)->data[0]; \
118 (h)->used--; \
119 heap_swap(h, 0, (h)->used); \
120 heap_sift(h, 0, cmp); \
121 } \
122 _r; \
123})
124
125#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)
126
127#define heap_full(h) ((h)->used == (h)->size)
128
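For illustration only (not part of the patch): with a plain "less than" comparator these macros maintain a max-heap, so heap_pop() hands back the largest element first. The comparator and function names below are made up:

#define example_cmp(l, r)	((l) < (r))

static void __maybe_unused heap_example(void)
{
	DECLARE_HEAP(int, heap);
	int v;

	if (!init_heap(&heap, 16, GFP_KERNEL))
		return;

	heap_add(&heap, 3, example_cmp);
	heap_add(&heap, 7, example_cmp);
	heap_add(&heap, 5, example_cmp);

	while (heap_pop(&heap, v, example_cmp))
		pr_debug("%i", v);	/* 7, then 5, then 3 */

	free_heap(&heap);
}
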
129#define DECLARE_FIFO(type, name) \
130 struct { \
131 size_t front, back, size, mask; \
132 type *data; \
133 } name
134
135#define fifo_for_each(c, fifo, iter) \
136 for (iter = (fifo)->front; \
137 c = (fifo)->data[iter], iter != (fifo)->back; \
138 iter = (iter + 1) & (fifo)->mask)
139
140#define __init_fifo(fifo, gfp) \
141({ \
142 size_t _allocated_size, _bytes; \
143 BUG_ON(!(fifo)->size); \
144 \
145 _allocated_size = roundup_pow_of_two((fifo)->size + 1); \
146 _bytes = _allocated_size * sizeof(*(fifo)->data); \
147 \
148 (fifo)->mask = _allocated_size - 1; \
149 (fifo)->front = (fifo)->back = 0; \
150 (fifo)->data = NULL; \
151 \
152 if (_bytes < KMALLOC_MAX_SIZE) \
153 (fifo)->data = kmalloc(_bytes, (gfp)); \
154 if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \
155 (fifo)->data = vmalloc(_bytes); \
156 (fifo)->data; \
157})
158
159#define init_fifo_exact(fifo, _size, gfp) \
160({ \
161 (fifo)->size = (_size); \
162 __init_fifo(fifo, gfp); \
163})
164
165#define init_fifo(fifo, _size, gfp) \
166({ \
167 (fifo)->size = (_size); \
168 if ((fifo)->size > 4) \
169 (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \
170 __init_fifo(fifo, gfp); \
171})
172
173#define free_fifo(fifo) \
174do { \
175 if (is_vmalloc_addr((fifo)->data)) \
176 vfree((fifo)->data); \
177 else \
178 kfree((fifo)->data); \
179 (fifo)->data = NULL; \
180} while (0)
181
182#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask)
183#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
184
185#define fifo_empty(fifo) (!fifo_used(fifo))
186#define fifo_full(fifo) (!fifo_free(fifo))
187
188#define fifo_front(fifo) ((fifo)->data[(fifo)->front])
189#define fifo_back(fifo) \
190 ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
191
192#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask)
193
194#define fifo_push_back(fifo, i) \
195({ \
196 bool _r = !fifo_full((fifo)); \
197 if (_r) { \
198 (fifo)->data[(fifo)->back++] = (i); \
199 (fifo)->back &= (fifo)->mask; \
200 } \
201 _r; \
202})
203
204#define fifo_pop_front(fifo, i) \
205({ \
206 bool _r = !fifo_empty((fifo)); \
207 if (_r) { \
208 (i) = (fifo)->data[(fifo)->front++]; \
209 (fifo)->front &= (fifo)->mask; \
210 } \
211 _r; \
212})
213
214#define fifo_push_front(fifo, i) \
215({ \
216 bool _r = !fifo_full((fifo)); \
217 if (_r) { \
218 --(fifo)->front; \
219 (fifo)->front &= (fifo)->mask; \
220 (fifo)->data[(fifo)->front] = (i); \
221 } \
222 _r; \
223})
224
225#define fifo_pop_back(fifo, i) \
226({ \
227 bool _r = !fifo_empty((fifo)); \
228 if (_r) { \
229 --(fifo)->back; \
230 (fifo)->back &= (fifo)->mask; \
231		(i) = (fifo)->data[(fifo)->back];			\
232 } \
233 _r; \
234})
235
236#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
237#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
238
239#define fifo_swap(l, r) \
240do { \
241 swap((l)->front, (r)->front); \
242 swap((l)->back, (r)->back); \
243 swap((l)->size, (r)->size); \
244 swap((l)->mask, (r)->mask); \
245 swap((l)->data, (r)->data); \
246} while (0)
247
248#define fifo_move(dest, src) \
249do { \
250 typeof(*((dest)->data)) _t; \
251 while (!fifo_full(dest) && \
252 fifo_pop(src, _t)) \
253 fifo_push(dest, _t); \
254} while (0)
255
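For illustration only (not part of the patch), a small FIFO of longs, mirroring how the free-bucket list is resized in the freelist_percent store earlier in this patch:

static int __maybe_unused fifo_example(void)
{
	DECLARE_FIFO(long, fifo);
	long i;

	if (!init_fifo_exact(&fifo, 8, GFP_KERNEL))
		return -ENOMEM;

	fifo_push(&fifo, 42);

	while (fifo_pop(&fifo, i))
		pr_debug("popped %li", i);

	free_fifo(&fifo);
	return 0;
}
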
256/*
257 * Simple array based allocator - preallocates a number of elements and you can
258 * never allocate more than that, also has no locking.
259 *
260 * Handy because if you know you only need a fixed number of elements you don't
261 * have to worry about memory allocation failure, and sometimes a mempool isn't
262 * what you want.
263 *
264 * We treat the free elements as entries in a singly linked list, and the
265 * freelist as a stack - allocating and freeing push and pop off the freelist.
266 */
267
268#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
269 struct { \
270 type *freelist; \
271 type data[size]; \
272 } name
273
274#define array_alloc(array) \
275({ \
276 typeof((array)->freelist) _ret = (array)->freelist; \
277 \
278 if (_ret) \
279 (array)->freelist = *((typeof((array)->freelist) *) _ret);\
280 \
281 _ret; \
282})
283
284#define array_free(array, ptr) \
285do { \
286 typeof((array)->freelist) _ptr = ptr; \
287 \
288 *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
289 (array)->freelist = _ptr; \
290} while (0)
291
292#define array_allocator_init(array) \
293do { \
294 typeof((array)->freelist) _i; \
295 \
296 BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
297 (array)->freelist = NULL; \
298 \
299 for (_i = (array)->data; \
300 _i < (array)->data + ARRAY_SIZE((array)->data); \
301 _i++) \
302 array_free(array, _i); \
303} while (0)
304
305#define array_freelist_empty(array) ((array)->freelist == NULL)
306
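For illustration only (not part of the patch), a fixed pool of eight objects; the element type must be at least pointer-sized because free elements double as freelist links. The names below are made up:

struct example_io {
	void		*data;
	unsigned long	flags;
};

static DECLARE_ARRAY_ALLOCATOR(struct example_io, example_pool, 8);

static void __maybe_unused array_allocator_example(void)
{
	struct example_io *io;

	array_allocator_init(&example_pool);

	io = array_alloc(&example_pool);	/* NULL once the pool is empty */
	if (io)
		array_free(&example_pool, io);
}
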
307#define ANYSINT_MAX(t) \
308 ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
309
310int strtoint_h(const char *, int *);
311int strtouint_h(const char *, unsigned int *);
312int strtoll_h(const char *, long long *);
313int strtoull_h(const char *, unsigned long long *);
314
315static inline int strtol_h(const char *cp, long *res)
316{
317#if BITS_PER_LONG == 32
318 return strtoint_h(cp, (int *) res);
319#else
320 return strtoll_h(cp, (long long *) res);
321#endif
322}
323
324static inline int strtoul_h(const char *cp, long *res)
325{
326#if BITS_PER_LONG == 32
327 return strtouint_h(cp, (unsigned int *) res);
328#else
329 return strtoull_h(cp, (unsigned long long *) res);
330#endif
331}
332
333#define strtoi_h(cp, res) \
334 (__builtin_types_compatible_p(typeof(*res), int) \
335 ? strtoint_h(cp, (void *) res) \
336 : __builtin_types_compatible_p(typeof(*res), long) \
337 ? strtol_h(cp, (void *) res) \
338 : __builtin_types_compatible_p(typeof(*res), long long) \
339 ? strtoll_h(cp, (void *) res) \
340 : __builtin_types_compatible_p(typeof(*res), unsigned int) \
341 ? strtouint_h(cp, (void *) res) \
342 : __builtin_types_compatible_p(typeof(*res), unsigned long) \
343 ? strtoul_h(cp, (void *) res) \
344 : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
345 ? strtoull_h(cp, (void *) res) : -EINVAL)
346
347#define strtoul_safe(cp, var) \
348({ \
349 unsigned long _v; \
350 int _r = kstrtoul(cp, 10, &_v); \
351 if (!_r) \
352 var = _v; \
353 _r; \
354})
355
356#define strtoul_safe_clamp(cp, var, min, max) \
357({ \
358 unsigned long _v; \
359 int _r = kstrtoul(cp, 10, &_v); \
360 if (!_r) \
361 var = clamp_t(typeof(var), _v, min, max); \
362 _r; \
363})
364
365#define snprint(buf, size, var) \
366 snprintf(buf, size, \
367 __builtin_types_compatible_p(typeof(var), int) \
368 ? "%i\n" : \
369 __builtin_types_compatible_p(typeof(var), unsigned) \
370 ? "%u\n" : \
371 __builtin_types_compatible_p(typeof(var), long) \
372 ? "%li\n" : \
373 __builtin_types_compatible_p(typeof(var), unsigned long)\
374 ? "%lu\n" : \
375 __builtin_types_compatible_p(typeof(var), int64_t) \
376 ? "%lli\n" : \
377 __builtin_types_compatible_p(typeof(var), uint64_t) \
378 ? "%llu\n" : \
379 __builtin_types_compatible_p(typeof(var), const char *) \
380 ? "%s\n" : "%i\n", var)
381
382ssize_t hprint(char *buf, int64_t v);
383
384bool is_zero(const char *p, size_t n);
385int parse_uuid(const char *s, char *uuid);
386
387ssize_t snprint_string_list(char *buf, size_t size, const char * const list[],
388 size_t selected);
389
390ssize_t read_string_list(const char *buf, const char * const list[]);
391
392struct time_stats {
393 /*
394 * all fields are in nanoseconds, averages are ewmas stored left shifted
395 * by 8
396 */
397 uint64_t max_duration;
398 uint64_t average_duration;
399 uint64_t average_frequency;
400 uint64_t last;
401};
402
403void time_stats_update(struct time_stats *stats, uint64_t time);
404
405#define NSEC_PER_ns 1L
406#define NSEC_PER_us NSEC_PER_USEC
407#define NSEC_PER_ms NSEC_PER_MSEC
408#define NSEC_PER_sec NSEC_PER_SEC
409
410#define __print_time_stat(stats, name, stat, units) \
411 sysfs_print(name ## _ ## stat ## _ ## units, \
412 div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
413
414#define sysfs_print_time_stats(stats, name, \
415 frequency_units, \
416 duration_units) \
417do { \
418 __print_time_stat(stats, name, \
419 average_frequency, frequency_units); \
420 __print_time_stat(stats, name, \
421 average_duration, duration_units); \
422 __print_time_stat(stats, name, \
423 max_duration, duration_units); \
424 \
425 sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
426 ? div_s64(local_clock() - (stats)->last, \
427 NSEC_PER_ ## frequency_units) \
428 : -1LL); \
429} while (0)
430
431#define sysfs_time_stats_attribute(name, \
432 frequency_units, \
433 duration_units) \
434read_attribute(name ## _average_frequency_ ## frequency_units); \
435read_attribute(name ## _average_duration_ ## duration_units); \
436read_attribute(name ## _max_duration_ ## duration_units); \
437read_attribute(name ## _last_ ## frequency_units)
438
439#define sysfs_time_stats_attribute_list(name, \
440 frequency_units, \
441 duration_units) \
442&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
443&sysfs_ ## name ## _average_duration_ ## duration_units, \
444&sysfs_ ## name ## _max_duration_ ## duration_units, \
445&sysfs_ ## name ## _last_ ## frequency_units,
446
447#define ewma_add(ewma, val, weight, factor) \
448({ \
449 (ewma) *= (weight) - 1; \
450 (ewma) += (val) << factor; \
451 (ewma) /= (weight); \
452 (ewma) >> factor; \
453})
454
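For illustration only (not part of the patch): with weight 8 and factor 8, each call keeps 7/8 of the previous average and folds in 1/8 of the new sample, with the state held left-shifted by 8 bits of fraction (as the time_stats comment above describes); the macro evaluates to the unshifted value:

static uint64_t __maybe_unused ewma_example(uint64_t *avg, uint64_t sample_ns)
{
	/* *avg becomes (7 * *avg + (sample_ns << 8)) / 8; returns *avg >> 8 */
	return ewma_add(*avg, sample_ns, 8, 8);
}
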
455struct ratelimit {
456 uint64_t next;
457 unsigned rate;
458};
459
460static inline void ratelimit_reset(struct ratelimit *d)
461{
462 d->next = local_clock();
463}
464
465unsigned next_delay(struct ratelimit *d, uint64_t done);
466
467#define __DIV_SAFE(n, d, zero) \
468({ \
469 typeof(n) _n = (n); \
470 typeof(d) _d = (d); \
471 _d ? _n / _d : zero; \
472})
473
474#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
475
476#define container_of_or_null(ptr, type, member) \
477({ \
478 typeof(ptr) _ptr = ptr; \
479 _ptr ? container_of(_ptr, type, member) : NULL; \
480})
481
482#define RB_INSERT(root, new, member, cmp) \
483({ \
484 __label__ dup; \
485 struct rb_node **n = &(root)->rb_node, *parent = NULL; \
486 typeof(new) this; \
487 int res, ret = -1; \
488 \
489 while (*n) { \
490 parent = *n; \
491 this = container_of(*n, typeof(*(new)), member); \
492 res = cmp(new, this); \
493 if (!res) \
494 goto dup; \
495 n = res < 0 \
496 ? &(*n)->rb_left \
497 : &(*n)->rb_right; \
498 } \
499 \
500 rb_link_node(&(new)->member, parent, n); \
501 rb_insert_color(&(new)->member, root); \
502 ret = 0; \
503dup: \
504 ret; \
505})
506
507#define RB_SEARCH(root, search, member, cmp) \
508({ \
509 struct rb_node *n = (root)->rb_node; \
510 typeof(&(search)) this, ret = NULL; \
511 int res; \
512 \
513 while (n) { \
514 this = container_of(n, typeof(search), member); \
515 res = cmp(&(search), this); \
516 if (!res) { \
517 ret = this; \
518 break; \
519 } \
520 n = res < 0 \
521 ? n->rb_left \
522 : n->rb_right; \
523 } \
524 ret; \
525})
526
527#define RB_GREATER(root, search, member, cmp) \
528({ \
529 struct rb_node *n = (root)->rb_node; \
530 typeof(&(search)) this, ret = NULL; \
531 int res; \
532 \
533 while (n) { \
534 this = container_of(n, typeof(search), member); \
535 res = cmp(&(search), this); \
536 if (res < 0) { \
537 ret = this; \
538 n = n->rb_left; \
539 } else \
540 n = n->rb_right; \
541 } \
542 ret; \
543})
544
545#define RB_FIRST(root, type, member) \
546 container_of_or_null(rb_first(root), type, member)
547
548#define RB_LAST(root, type, member) \
549 container_of_or_null(rb_last(root), type, member)
550
551#define RB_NEXT(ptr, member) \
552 container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
553
554#define RB_PREV(ptr, member) \
555 container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
556
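For illustration only (not part of the patch), the rbtree wrappers applied to a made-up node type keyed by an int; RB_INSERT() returns non-zero when an equal key is already present:

struct example_node {
	struct rb_node	node;
	int		key;
};

static int example_node_cmp(struct example_node *l, struct example_node *r)
{
	return l->key - r->key;
}

static void __maybe_unused rb_wrappers_example(struct rb_root *root,
					       struct example_node *new)
{
	struct example_node search = { .key = new->key }, *found;

	if (RB_INSERT(root, new, node, example_node_cmp))
		return;		/* duplicate key, not inserted */

	found = RB_SEARCH(root, search, node, example_node_cmp);
	BUG_ON(found != new);
}
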
557/* Does linear interpolation between powers of two */
558static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
559{
560 unsigned fract = x & ~(~0 << fract_bits);
561
562 x >>= fract_bits;
563 x = 1 << x;
564 x += (x * fract) >> fract_bits;
565
566 return x;
567}
568
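A quick worked example (illustration only): with fract_bits = 3, an input of 28 encodes integer part 3 and fraction 4/8, so the function returns (1 << 3) + ((8 * 4) >> 3) = 8 + 4 = 12, halfway between 2^3 = 8 and 2^4 = 16.
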
569#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
570
571void bio_map(struct bio *bio, void *base);
572
573int bio_alloc_pages(struct bio *bio, gfp_t gfp);
574
575static inline sector_t bdev_sectors(struct block_device *bdev)
576{
577 return bdev->bd_inode->i_size >> 9;
578}
579
580#define closure_bio_submit(bio, cl, dev) \
581do { \
582 closure_get(cl); \
583 bch_generic_make_request(bio, &(dev)->bio_split_hook); \
584} while (0)
585
586uint64_t crc64_update(uint64_t, const void *, size_t);
587uint64_t crc64(const void *, size_t);
588
589#endif /* _BCACHE_UTIL_H */
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
new file mode 100644
index 000000000000..a80ee5373fd8
--- /dev/null
+++ b/drivers/md/bcache/writeback.c
@@ -0,0 +1,414 @@
1/*
2 * background writeback - scan btree for dirty data and write it to the backing
3 * device
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12
13static struct workqueue_struct *dirty_wq;
14
15static void read_dirty(struct closure *);
16
17struct dirty_io {
18 struct closure cl;
19 struct cached_dev *dc;
20 struct bio bio;
21};
22
23/* Rate limiting */
24
25static void __update_writeback_rate(struct cached_dev *dc)
26{
27 struct cache_set *c = dc->disk.c;
28 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
29 uint64_t cache_dirty_target =
30 div_u64(cache_sectors * dc->writeback_percent, 100);
31
32 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
33 c->cached_dev_sectors);
34
35 /* PD controller */
36
37 int change = 0;
38 int64_t error;
39 int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
40 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
41
42 dc->disk.sectors_dirty_last = dirty;
43
44 derivative *= dc->writeback_rate_d_term;
45 derivative = clamp(derivative, -dirty, dirty);
46
47 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
48 dc->writeback_rate_d_smooth, 0);
49
50 /* Avoid divide by zero */
51 if (!target)
52 goto out;
53
54 error = div64_s64((dirty + derivative - target) << 8, target);
55
56 change = div_s64((dc->writeback_rate.rate * error) >> 8,
57 dc->writeback_rate_p_term_inverse);
58
59 /* Don't increase writeback rate if the device isn't keeping up */
60 if (change > 0 &&
61 time_after64(local_clock(),
62 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
63 change = 0;
64
65 dc->writeback_rate.rate =
66 clamp_t(int64_t, dc->writeback_rate.rate + change,
67 1, NSEC_PER_MSEC);
68out:
69 dc->writeback_rate_derivative = derivative;
70 dc->writeback_rate_change = change;
71 dc->writeback_rate_target = target;
72
73 schedule_delayed_work(&dc->writeback_rate_update,
74 dc->writeback_rate_update_seconds * HZ);
75}
76
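In rough terms (a paraphrase of the code above, not text from the patch): target is the writeback_percent share of the cache's sectors, scaled by this backing device's fraction of cached_dev_sectors; error is the relative distance of dirty (plus its EWMA-smoothed derivative) from that target, in 1/256ths; and change nudges writeback_rate.rate proportionally, with the new rate clamped to [1, NSEC_PER_MSEC] and increases suppressed while the device is already running more than 10 ms behind its schedule.
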
77static void update_writeback_rate(struct work_struct *work)
78{
79 struct cached_dev *dc = container_of(to_delayed_work(work),
80 struct cached_dev,
81 writeback_rate_update);
82
83 down_read(&dc->writeback_lock);
84
85 if (atomic_read(&dc->has_dirty) &&
86 dc->writeback_percent)
87 __update_writeback_rate(dc);
88
89 up_read(&dc->writeback_lock);
90}
91
92static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
93{
94 if (atomic_read(&dc->disk.detaching) ||
95 !dc->writeback_percent)
96 return 0;
97
98 return next_delay(&dc->writeback_rate, sectors * 10000000ULL);
99}
100
101/* Background writeback */
102
103static bool dirty_pred(struct keybuf *buf, struct bkey *k)
104{
105 return KEY_DIRTY(k);
106}
107
108static void dirty_init(struct keybuf_key *w)
109{
110 struct dirty_io *io = w->private;
111 struct bio *bio = &io->bio;
112
113 bio_init(bio);
114 if (!io->dc->writeback_percent)
115 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
116
117 bio->bi_size = KEY_SIZE(&w->key) << 9;
118 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
119 bio->bi_private = w;
120 bio->bi_io_vec = bio->bi_inline_vecs;
121 bio_map(bio, NULL);
122}
123
124static void refill_dirty(struct closure *cl)
125{
126 struct cached_dev *dc = container_of(cl, struct cached_dev,
127 writeback.cl);
128 struct keybuf *buf = &dc->writeback_keys;
129 bool searched_from_start = false;
130 struct bkey end = MAX_KEY;
131 SET_KEY_INODE(&end, dc->disk.id);
132
133 if (!atomic_read(&dc->disk.detaching) &&
134 !dc->writeback_running)
135 closure_return(cl);
136
137 down_write(&dc->writeback_lock);
138
139 if (!atomic_read(&dc->has_dirty)) {
140 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
141 bch_write_bdev_super(dc, NULL);
142
143 up_write(&dc->writeback_lock);
144 closure_return(cl);
145 }
146
147 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
148 buf->last_scanned = KEY(dc->disk.id, 0, 0);
149 searched_from_start = true;
150 }
151
152 bch_refill_keybuf(dc->disk.c, buf, &end);
153
154 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
155 /* Searched the entire btree - delay awhile */
156
157 if (RB_EMPTY_ROOT(&buf->keys)) {
158 atomic_set(&dc->has_dirty, 0);
159 cached_dev_put(dc);
160 }
161
162 if (!atomic_read(&dc->disk.detaching))
163 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
164 }
165
166 up_write(&dc->writeback_lock);
167
168 ratelimit_reset(&dc->writeback_rate);
169
170 /* Punt to workqueue only so we don't recurse and blow the stack */
171 continue_at(cl, read_dirty, dirty_wq);
172}
173
174void bch_writeback_queue(struct cached_dev *dc)
175{
176 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
177 if (!atomic_read(&dc->disk.detaching))
178 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
179
180 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
181 }
182}
183
184void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
185{
186 atomic_long_add(sectors, &dc->disk.sectors_dirty);
187
188 if (!atomic_read(&dc->has_dirty) &&
189 !atomic_xchg(&dc->has_dirty, 1)) {
190 atomic_inc(&dc->count);
191
192 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
193 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
194 /* XXX: should do this synchronously */
195 bch_write_bdev_super(dc, NULL);
196 }
197
198 bch_writeback_queue(dc);
199
200 if (dc->writeback_percent)
201 schedule_delayed_work(&dc->writeback_rate_update,
202 dc->writeback_rate_update_seconds * HZ);
203 }
204}
205
206/* Background writeback - IO loop */
207
208static void dirty_io_destructor(struct closure *cl)
209{
210 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
211 kfree(io);
212}
213
214static void write_dirty_finish(struct closure *cl)
215{
216 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
217 struct keybuf_key *w = io->bio.bi_private;
218 struct cached_dev *dc = io->dc;
219 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
220
221 while (bv-- != io->bio.bi_io_vec)
222 __free_page(bv->bv_page);
223
224 /* This is kind of a dumb way of signalling errors. */
225 if (KEY_DIRTY(&w->key)) {
226 unsigned i;
227 struct btree_op op;
228 bch_btree_op_init_stack(&op);
229
230 op.type = BTREE_REPLACE;
231 bkey_copy(&op.replace, &w->key);
232
233 SET_KEY_DIRTY(&w->key, false);
234 bch_keylist_add(&op.keys, &w->key);
235
236 for (i = 0; i < KEY_PTRS(&w->key); i++)
237 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
238
239 pr_debug("clearing %s", pkey(&w->key));
240 bch_btree_insert(&op, dc->disk.c);
241 closure_sync(&op.cl);
242
243 atomic_long_inc(op.insert_collision
244 ? &dc->disk.c->writeback_keys_failed
245 : &dc->disk.c->writeback_keys_done);
246 }
247
248 bch_keybuf_del(&dc->writeback_keys, w);
249 atomic_dec_bug(&dc->in_flight);
250
251 closure_wake_up(&dc->writeback_wait);
252
253 closure_return_with_destructor(cl, dirty_io_destructor);
254}
255
256static void dirty_endio(struct bio *bio, int error)
257{
258 struct keybuf_key *w = bio->bi_private;
259 struct dirty_io *io = w->private;
260
261 if (error)
262 SET_KEY_DIRTY(&w->key, false);
263
264 closure_put(&io->cl);
265}
266
267static void write_dirty(struct closure *cl)
268{
269 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
270 struct keybuf_key *w = io->bio.bi_private;
271
272 dirty_init(w);
273 io->bio.bi_rw = WRITE;
274 io->bio.bi_sector = KEY_START(&w->key);
275 io->bio.bi_bdev = io->dc->bdev;
276 io->bio.bi_end_io = dirty_endio;
277
278 trace_bcache_write_dirty(&io->bio);
279 closure_bio_submit(&io->bio, cl, &io->dc->disk);
280
281 continue_at(cl, write_dirty_finish, dirty_wq);
282}
283
284static void read_dirty_endio(struct bio *bio, int error)
285{
286 struct keybuf_key *w = bio->bi_private;
287 struct dirty_io *io = w->private;
288
289 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
290 error, "reading dirty data from cache");
291
292 dirty_endio(bio, error);
293}
294
295static void read_dirty_submit(struct closure *cl)
296{
297 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
298
299 trace_bcache_read_dirty(&io->bio);
300 closure_bio_submit(&io->bio, cl, &io->dc->disk);
301
302 continue_at(cl, write_dirty, dirty_wq);
303}
304
305static void read_dirty(struct closure *cl)
306{
307 struct cached_dev *dc = container_of(cl, struct cached_dev,
308 writeback.cl);
309 unsigned delay = writeback_delay(dc, 0);
310 struct keybuf_key *w;
311 struct dirty_io *io;
312
313 /*
314 * XXX: if we error, background writeback just spins. Should use some
315 * mempools.
316 */
317
318 while (1) {
319 w = bch_keybuf_next(&dc->writeback_keys);
320 if (!w)
321 break;
322
323 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
324
325 if (delay > 0 &&
326 (KEY_START(&w->key) != dc->last_read ||
327 jiffies_to_msecs(delay) > 50)) {
328 w->private = NULL;
329
330 closure_delay(&dc->writeback, delay);
331 continue_at(cl, read_dirty, dirty_wq);
332 }
333
334 dc->last_read = KEY_OFFSET(&w->key);
335
336 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
337 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
338 GFP_KERNEL);
339 if (!io)
340 goto err;
341
342 w->private = io;
343 io->dc = dc;
344
345 dirty_init(w);
346 io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
347 io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
348 &w->key, 0)->bdev;
349 io->bio.bi_rw = READ;
350 io->bio.bi_end_io = read_dirty_endio;
351
352 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
353 goto err_free;
354
355 pr_debug("%s", pkey(&w->key));
356
357 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
358
359 delay = writeback_delay(dc, KEY_SIZE(&w->key));
360
361 atomic_inc(&dc->in_flight);
362
363 if (!closure_wait_event(&dc->writeback_wait, cl,
364 atomic_read(&dc->in_flight) < 64))
365 continue_at(cl, read_dirty, dirty_wq);
366 }
367
368 if (0) {
369err_free:
370 kfree(w->private);
371err:
372 bch_keybuf_del(&dc->writeback_keys, w);
373 }
374
375 refill_dirty(cl);
376}
377
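In outline (a paraphrase of the code, not text from the patch): refill_dirty() scans the btree for this device's dirty keys into writeback_keys, read_dirty() reads each extent from the cache device and paces itself with writeback_delay(), write_dirty() writes the data out to the backing device, and write_dirty_finish() clears the dirty bit via a BTREE_REPLACE insert; each stage hands off through continue_at() on dirty_wq so the closure chain never recurses on the stack.
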
378void bch_writeback_init_cached_dev(struct cached_dev *dc)
379{
380 closure_init_unlocked(&dc->writeback);
381 init_rwsem(&dc->writeback_lock);
382
383 bch_keybuf_init(&dc->writeback_keys, dirty_pred);
384
385 dc->writeback_metadata = true;
386 dc->writeback_running = true;
387 dc->writeback_percent = 10;
388 dc->writeback_delay = 30;
389 dc->writeback_rate.rate = 1024;
390
391 dc->writeback_rate_update_seconds = 30;
392 dc->writeback_rate_d_term = 16;
393 dc->writeback_rate_p_term_inverse = 64;
394 dc->writeback_rate_d_smooth = 8;
395
396 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
397 schedule_delayed_work(&dc->writeback_rate_update,
398 dc->writeback_rate_update_seconds * HZ);
399}
400
401void bch_writeback_exit(void)
402{
403 if (dirty_wq)
404 destroy_workqueue(dirty_wq);
405}
406
407int __init bch_writeback_init(void)
408{
409 dirty_wq = create_singlethread_workqueue("bcache_writeback");
410 if (!dirty_wq)
411 return -ENOMEM;
412
413 return 0;
414}