author    Linus Torvalds <torvalds@linux-foundation.org>    2014-04-01 22:43:53 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-04-01 22:43:53 -0400
commit    b33ce442993865180292df2a314ee5251ba38b50 (patch)
tree      004b703ea3cd19c932393086fe9cde96e8db8de7 /drivers/md
parent    7a48837732f87a574ee3e1855927dc250117f565 (diff)
parent    e84987a1f941b8e2e3173bb38510ddf25cc8c7f0 (diff)
Merge branch 'for-3.15/drivers' of git://git.kernel.dk/linux-block
Pull block driver update from Jens Axboe:
 "On top of the core pull request, here's the pull request for the
  driver related changes for 3.15. It contains:

   - Improvements for msi-x registration for block drivers (mtip32xx,
     skd, cciss, nvme) from Alexander Gordeev.

   - A round of cleanups and improvements for drbd from Andreas
     Gruenbacher and Rashika Kheria.

   - A round of cleanups and improvements for bcache from Kent.

   - Removal of sleep_on() and friends in DAC960, ataflop, swim3 from
     Arnd Bergmann.

   - Bug fix for a bug in the mtip32xx async completion code from Sam
     Bradshaw.

   - Bug fix for accidentally bouncing IO on 32-bit platforms with
     mtip32xx from Felipe Franciosi"

* 'for-3.15/drivers' of git://git.kernel.dk/linux-block: (103 commits)
  bcache: remove nested function usage
  bcache: Kill bucket->gc_gen
  bcache: Kill unused freelist
  bcache: Rework btree cache reserve handling
  bcache: Kill btree_io_wq
  bcache: btree locking rework
  bcache: Fix a race when freeing btree nodes
  bcache: Add a real GC_MARK_RECLAIMABLE
  bcache: Add bch_keylist_init_single()
  bcache: Improve priority_stats
  bcache: Better alloc tracepoints
  bcache: Kill dead cgroup code
  bcache: stop moving_gc marking buckets that can't be moved.
  bcache: Fix moving_pred()
  bcache: Fix moving_gc deadlocking with a foreground write
  bcache: Fix discard granularity
  bcache: Fix another bug recovering from unclean shutdown
  bcache: Fix a bug recovering from unclean shutdown
  bcache: Fix a journalling reclaim after recovery bug
  bcache: Fix a null ptr deref in journal replay
  ...
Diffstat (limited to 'drivers/md')
 -rw-r--r--  drivers/md/bcache/Kconfig    |   8
 -rw-r--r--  drivers/md/bcache/alloc.c    | 173
 -rw-r--r--  drivers/md/bcache/bcache.h   |  56
 -rw-r--r--  drivers/md/bcache/bset.c     |   4
 -rw-r--r--  drivers/md/bcache/bset.h     |   6
 -rw-r--r--  drivers/md/bcache/btree.c    | 592
 -rw-r--r--  drivers/md/bcache/btree.h    |  12
 -rw-r--r--  drivers/md/bcache/extents.c  |  36
 -rw-r--r--  drivers/md/bcache/journal.c  |  46
 -rw-r--r--  drivers/md/bcache/journal.h  |   1
 -rw-r--r--  drivers/md/bcache/movinggc.c |  18
 -rw-r--r--  drivers/md/bcache/request.c  | 201
 -rw-r--r--  drivers/md/bcache/request.h  |  19
 -rw-r--r--  drivers/md/bcache/stats.c    |   3
 -rw-r--r--  drivers/md/bcache/super.c    |  64
 -rw-r--r--  drivers/md/bcache/sysfs.c    | 155
 -rw-r--r--  drivers/md/bcache/trace.c    |   2
17 files changed, 629 insertions, 767 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19aa..4d200883c505 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
24 Keeps all active closures in a linked list and provides a debugfs 24 Keeps all active closures in a linked list and provides a debugfs
25 interface to list them, which makes it possible to see asynchronous 25 interface to list them, which makes it possible to see asynchronous
26 operations that get stuck. 26 operations that get stuck.
27
28# cgroup code needs to be updated:
29#
30#config CGROUP_BCACHE
31# bool "Cgroup controls for bcache"
32# depends on BCACHE && BLK_CGROUP
33# ---help---
34# TODO
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index c0d37d082443..443d03fbac47 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
78 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); 78 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
79 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); 79 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
80 80
81 if (CACHE_SYNC(&ca->set->sb)) {
82 ca->need_save_prio = max(ca->need_save_prio,
83 bucket_disk_gen(b));
84 WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
85 }
86
87 return ret; 81 return ret;
88} 82}
89 83
@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
120 mutex_unlock(&c->bucket_lock); 114 mutex_unlock(&c->bucket_lock);
121} 115}
122 116
123/* Allocation */ 117/*
118 * Background allocation thread: scans for buckets to be invalidated,
119 * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
120 * then optionally issues discard commands to the newly free buckets, then puts
121 * them on the various freelists.
122 */
124 123
125static inline bool can_inc_bucket_gen(struct bucket *b) 124static inline bool can_inc_bucket_gen(struct bucket *b)
126{ 125{
127 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && 126 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
128 bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
129} 127}
130 128
131bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) 129bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
132{ 130{
133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); 131 BUG_ON(!ca->set->gc_mark_valid);
134
135 if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
136 unsigned i;
137
138 for (i = 0; i < RESERVE_NONE; i++)
139 if (!fifo_full(&ca->free[i]))
140 goto add;
141 132
142 return false; 133 return (!GC_MARK(b) ||
143 } 134 GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
144add:
145 b->prio = 0;
146
147 if (can_inc_bucket_gen(b) &&
148 fifo_push(&ca->unused, b - ca->buckets)) {
149 atomic_inc(&b->pin);
150 return true;
151 }
152
153 return false;
154}
155
156static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
157{
158 return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
159 !atomic_read(&b->pin) && 135 !atomic_read(&b->pin) &&
160 can_inc_bucket_gen(b); 136 can_inc_bucket_gen(b);
161} 137}
162 138
163static void invalidate_one_bucket(struct cache *ca, struct bucket *b) 139void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
164{ 140{
141 lockdep_assert_held(&ca->set->bucket_lock);
142 BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
143
144 if (GC_SECTORS_USED(b))
145 trace_bcache_invalidate(ca, b - ca->buckets);
146
165 bch_inc_gen(ca, b); 147 bch_inc_gen(ca, b);
166 b->prio = INITIAL_PRIO; 148 b->prio = INITIAL_PRIO;
167 atomic_inc(&b->pin); 149 atomic_inc(&b->pin);
150}
151
152static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
153{
154 __bch_invalidate_one_bucket(ca, b);
155
168 fifo_push(&ca->free_inc, b - ca->buckets); 156 fifo_push(&ca->free_inc, b - ca->buckets);
169} 157}
170 158
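As a rough illustration of the invalidation rules this hunk settles on, the following standalone C sketch (an invented userspace model, not kernel code) mirrors the shape of bch_can_invalidate_bucket() and the gen bump in __bch_invalidate_one_bucket(): a bucket may be invalidated only if the last GC pass left it unmarked or marked reclaimable, nothing has it pinned, and its gc generation gap can still grow.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GC_GEN_MAX 96           /* stands in for BUCKET_GC_GEN_MAX */
#define MARK_NONE        0      /* unmarked by the last GC pass */
#define MARK_RECLAIMABLE 1      /* clean data, safe to reuse */
#define MARK_DIRTY       2
#define MARK_METADATA    3

struct bucket {
    uint8_t gen;       /* current generation */
    uint8_t last_gc;   /* oldest gen referenced from the btree */
    int     pin;       /* models atomic_t pin */
    int     mark;      /* models GC_MARK() */
};

/* gc "age" of the bucket; uint8_t arithmetic handles wraparound */
static uint8_t bucket_gc_gen(const struct bucket *b)
{
    return (uint8_t)(b->gen - b->last_gc);
}

static int can_invalidate(const struct bucket *b)
{
    return (b->mark == MARK_NONE || b->mark == MARK_RECLAIMABLE) &&
           b->pin == 0 &&
           bucket_gc_gen(b) < GC_GEN_MAX;
}

/* invalidating = bumping the gen so every existing pointer goes stale */
static void invalidate(struct bucket *b)
{
    assert(can_invalidate(b));
    b->gen++;
    b->pin++;          /* held until the new prios/gens reach disk */
}

int main(void)
{
    struct bucket b = { .gen = 10, .last_gc = 8 };

    assert(can_invalidate(&b));
    invalidate(&b);
    printf("gen=%u gc_gen=%u pin=%d\n", b.gen, bucket_gc_gen(&b), b.pin);

    b.mark = MARK_DIRTY;            /* dirty buckets must not be reused */
    b.pin = 0;
    assert(!can_invalidate(&b));
    return 0;
}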
@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
195 ca->heap.used = 0; 183 ca->heap.used = 0;
196 184
197 for_each_bucket(b, ca) { 185 for_each_bucket(b, ca) {
198 /* 186 if (!bch_can_invalidate_bucket(ca, b))
199 * If we fill up the unused list, if we then return before
200 * adding anything to the free_inc list we'll skip writing
201 * prios/gens and just go back to allocating from the unused
202 * list:
203 */
204 if (fifo_full(&ca->unused))
205 return;
206
207 if (!can_invalidate_bucket(ca, b))
208 continue;
209
210 if (!GC_SECTORS_USED(b) &&
211 bch_bucket_add_unused(ca, b))
212 continue; 187 continue;
213 188
214 if (!heap_full(&ca->heap)) 189 if (!heap_full(&ca->heap))
@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
233 return; 208 return;
234 } 209 }
235 210
236 invalidate_one_bucket(ca, b); 211 bch_invalidate_one_bucket(ca, b);
237 } 212 }
238} 213}
239 214
@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
249 224
250 b = ca->buckets + ca->fifo_last_bucket++; 225 b = ca->buckets + ca->fifo_last_bucket++;
251 226
252 if (can_invalidate_bucket(ca, b)) 227 if (bch_can_invalidate_bucket(ca, b))
253 invalidate_one_bucket(ca, b); 228 bch_invalidate_one_bucket(ca, b);
254 229
255 if (++checked >= ca->sb.nbuckets) { 230 if (++checked >= ca->sb.nbuckets) {
256 ca->invalidate_needs_gc = 1; 231 ca->invalidate_needs_gc = 1;
@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
274 249
275 b = ca->buckets + n; 250 b = ca->buckets + n;
276 251
277 if (can_invalidate_bucket(ca, b)) 252 if (bch_can_invalidate_bucket(ca, b))
278 invalidate_one_bucket(ca, b); 253 bch_invalidate_one_bucket(ca, b);
279 254
280 if (++checked >= ca->sb.nbuckets / 2) { 255 if (++checked >= ca->sb.nbuckets / 2) {
281 ca->invalidate_needs_gc = 1; 256 ca->invalidate_needs_gc = 1;
@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
287 262
288static void invalidate_buckets(struct cache *ca) 263static void invalidate_buckets(struct cache *ca)
289{ 264{
290 if (ca->invalidate_needs_gc) 265 BUG_ON(ca->invalidate_needs_gc);
291 return;
292 266
293 switch (CACHE_REPLACEMENT(&ca->sb)) { 267 switch (CACHE_REPLACEMENT(&ca->sb)) {
294 case CACHE_REPLACEMENT_LRU: 268 case CACHE_REPLACEMENT_LRU:
@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
301 invalidate_buckets_random(ca); 275 invalidate_buckets_random(ca);
302 break; 276 break;
303 } 277 }
304
305 trace_bcache_alloc_invalidate(ca);
306} 278}
307 279
308#define allocator_wait(ca, cond) \ 280#define allocator_wait(ca, cond) \
@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg)
350 * possibly issue discards to them, then we add the bucket to 322 * possibly issue discards to them, then we add the bucket to
351 * the free list: 323 * the free list:
352 */ 324 */
353 while (1) { 325 while (!fifo_empty(&ca->free_inc)) {
354 long bucket; 326 long bucket;
355 327
356 if ((!atomic_read(&ca->set->prio_blocked) || 328 fifo_pop(&ca->free_inc, bucket);
357 !CACHE_SYNC(&ca->set->sb)) &&
358 !fifo_empty(&ca->unused))
359 fifo_pop(&ca->unused, bucket);
360 else if (!fifo_empty(&ca->free_inc))
361 fifo_pop(&ca->free_inc, bucket);
362 else
363 break;
364 329
365 if (ca->discard) { 330 if (ca->discard) {
366 mutex_unlock(&ca->set->bucket_lock); 331 mutex_unlock(&ca->set->bucket_lock);
@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg)
371 } 336 }
372 337
373 allocator_wait(ca, bch_allocator_push(ca, bucket)); 338 allocator_wait(ca, bch_allocator_push(ca, bucket));
339 wake_up(&ca->set->btree_cache_wait);
374 wake_up(&ca->set->bucket_wait); 340 wake_up(&ca->set->bucket_wait);
375 } 341 }
376 342
@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg)
380 * them to the free_inc list: 346 * them to the free_inc list:
381 */ 347 */
382 348
349retry_invalidate:
383 allocator_wait(ca, ca->set->gc_mark_valid && 350 allocator_wait(ca, ca->set->gc_mark_valid &&
384 (ca->need_save_prio > 64 || 351 !ca->invalidate_needs_gc);
385 !ca->invalidate_needs_gc));
386 invalidate_buckets(ca); 352 invalidate_buckets(ca);
387 353
388 /* 354 /*
@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg)
390 * new stuff to them: 356 * new stuff to them:
391 */ 357 */
392 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); 358 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
393 if (CACHE_SYNC(&ca->set->sb) && 359 if (CACHE_SYNC(&ca->set->sb)) {
394 (!fifo_empty(&ca->free_inc) || 360 /*
395 ca->need_save_prio > 64)) 361 * This could deadlock if an allocation with a btree
362 * node locked ever blocked - having the btree node
363 * locked would block garbage collection, but here we're
364 * waiting on garbage collection before we invalidate
365 * and free anything.
366 *
367 * But this should be safe since the btree code always
368 * uses btree_check_reserve() before allocating now, and
369 * if it fails it blocks without btree nodes locked.
370 */
371 if (!fifo_full(&ca->free_inc))
372 goto retry_invalidate;
373
396 bch_prio_write(ca); 374 bch_prio_write(ca);
375 }
397 } 376 }
398} 377}
399 378
379/* Allocation */
380
400long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) 381long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
401{ 382{
402 DEFINE_WAIT(w); 383 DEFINE_WAIT(w);
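For reference, here is a compact single-threaded sketch of the loop shape the allocator thread has after this hunk: drain free_inc (discard each bucket, then hand it to the free lists), then keep invalidating until free_inc is actually full, and only then write prios/gens. It is an invented toy model (fixed-size arrays, printf standing in for discards and freelist pushes, no waiting on GC or on full freelists), not the kernel code.

#include <stdbool.h>
#include <stdio.h>

#define FREE_INC_SIZE 4

struct toy_cache {
    long free_inc[FREE_INC_SIZE];
    int  free_inc_used;
    long next_bucket;       /* pretend bucket numbers to invalidate */
};

static bool free_inc_full(struct toy_cache *ca)
{
    return ca->free_inc_used == FREE_INC_SIZE;
}

/* stand-in for invalidate_buckets_{lru,fifo,random}(); a pass may come
 * up short, which is why the caller loops until free_inc is full */
static void invalidate_buckets(struct toy_cache *ca)
{
    int found = 0;

    while (!free_inc_full(ca) && found < 3) {
        ca->free_inc[ca->free_inc_used++] = ca->next_bucket++;
        found++;
    }
}

static void allocator_pass(struct toy_cache *ca)
{
    /* 1) drain free_inc: in the kernel, issue a discard and push the
     *    bucket onto ca->free[]; here printf stands in for both */
    while (ca->free_inc_used) {
        long bucket = ca->free_inc[--ca->free_inc_used];
        printf("discard + free bucket %ld\n", bucket);
    }

    /* 2) refill free_inc, retrying until it is actually full
     *    (mirrors the retry_invalidate goto) ... */
    do {
        invalidate_buckets(ca);
    } while (!free_inc_full(ca));

    /* 3) ... and only then persist the new prios/gens */
    printf("bch_prio_write()\n");
}

int main(void)
{
    struct toy_cache ca = { .next_bucket = 100 };

    allocator_pass(&ca);
    allocator_pass(&ca);
    return 0;
}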
@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
408 fifo_pop(&ca->free[reserve], r)) 389 fifo_pop(&ca->free[reserve], r))
409 goto out; 390 goto out;
410 391
411 if (!wait) 392 if (!wait) {
393 trace_bcache_alloc_fail(ca, reserve);
412 return -1; 394 return -1;
395 }
413 396
414 do { 397 do {
415 prepare_to_wait(&ca->set->bucket_wait, &w, 398 prepare_to_wait(&ca->set->bucket_wait, &w,
@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
425out: 408out:
426 wake_up_process(ca->alloc_thread); 409 wake_up_process(ca->alloc_thread);
427 410
411 trace_bcache_alloc(ca, reserve);
412
428 if (expensive_debug_checks(ca->set)) { 413 if (expensive_debug_checks(ca->set)) {
429 size_t iter; 414 size_t iter;
430 long i; 415 long i;
@@ -438,8 +423,6 @@ out:
438 BUG_ON(i == r); 423 BUG_ON(i == r);
439 fifo_for_each(i, &ca->free_inc, iter) 424 fifo_for_each(i, &ca->free_inc, iter)
440 BUG_ON(i == r); 425 BUG_ON(i == r);
441 fifo_for_each(i, &ca->unused, iter)
442 BUG_ON(i == r);
443 } 426 }
444 427
445 b = ca->buckets + r; 428 b = ca->buckets + r;
@@ -461,17 +444,19 @@ out:
461 return r; 444 return r;
462} 445}
463 446
447void __bch_bucket_free(struct cache *ca, struct bucket *b)
448{
449 SET_GC_MARK(b, 0);
450 SET_GC_SECTORS_USED(b, 0);
451}
452
464void bch_bucket_free(struct cache_set *c, struct bkey *k) 453void bch_bucket_free(struct cache_set *c, struct bkey *k)
465{ 454{
466 unsigned i; 455 unsigned i;
467 456
468 for (i = 0; i < KEY_PTRS(k); i++) { 457 for (i = 0; i < KEY_PTRS(k); i++)
469 struct bucket *b = PTR_BUCKET(c, k, i); 458 __bch_bucket_free(PTR_CACHE(c, k, i),
470 459 PTR_BUCKET(c, k, i));
471 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
472 SET_GC_SECTORS_USED(b, 0);
473 bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
474 }
475} 460}
476 461
477int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 462int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
709 ca->alloc_thread = k; 694 ca->alloc_thread = k;
710 return 0; 695 return 0;
711} 696}
712
713int bch_cache_allocator_init(struct cache *ca)
714{
715 /*
716 * Reserve:
717 * Prio/gen writes first
718 * Then 8 for btree allocations
719 * Then half for the moving garbage collector
720 */
721#if 0
722 ca->watermark[WATERMARK_PRIO] = 0;
723
724 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
725
726 ca->watermark[WATERMARK_MOVINGGC] = 8 +
727 ca->watermark[WATERMARK_METADATA];
728
729 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
730 ca->watermark[WATERMARK_MOVINGGC];
731#endif
732 return 0;
733}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a4c7306ff43d..82c9c5d35251 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -195,9 +195,7 @@ struct bucket {
195 atomic_t pin; 195 atomic_t pin;
196 uint16_t prio; 196 uint16_t prio;
197 uint8_t gen; 197 uint8_t gen;
198 uint8_t disk_gen;
199 uint8_t last_gc; /* Most out of date gen in the btree */ 198 uint8_t last_gc; /* Most out of date gen in the btree */
200 uint8_t gc_gen;
201 uint16_t gc_mark; /* Bitfield used by GC. See below for field */ 199 uint16_t gc_mark; /* Bitfield used by GC. See below for field */
202}; 200};
203 201
@@ -207,9 +205,9 @@ struct bucket {
207 */ 205 */
208 206
209BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); 207BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
210#define GC_MARK_RECLAIMABLE 0 208#define GC_MARK_RECLAIMABLE 1
211#define GC_MARK_DIRTY 1 209#define GC_MARK_DIRTY 2
212#define GC_MARK_METADATA 2 210#define GC_MARK_METADATA 3
213#define GC_SECTORS_USED_SIZE 13 211#define GC_SECTORS_USED_SIZE 13
214#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) 212#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
215BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); 213BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
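The renumbering above frees value 0 to mean "not marked by the last GC pass", with reclaimable/dirty/metadata moving to 1/2/3, all packed into the 16-bit gc_mark word alongside the 13-bit sectors-used count. A small standalone model of that packing (hand-rolled shift/mask helpers for illustration, not the kernel's BITMASK() macro):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* gc_mark layout modelled here:
 *   bits 0..1   GC mark (0 = unmarked, 1 = reclaimable, 2 = dirty, 3 = metadata)
 *   bits 2..14  sectors used (GC_SECTORS_USED_SIZE = 13 bits)
 */
#define GC_MARK_RECLAIMABLE  1
#define GC_MARK_DIRTY        2
#define GC_MARK_METADATA     3
#define GC_SECTORS_USED_SIZE 13
#define MAX_GC_SECTORS_USED  ((1u << GC_SECTORS_USED_SIZE) - 1)

static unsigned get_mark(uint16_t gc_mark)    { return gc_mark & 0x3u; }
static unsigned get_sectors(uint16_t gc_mark) { return (gc_mark >> 2) & MAX_GC_SECTORS_USED; }

static uint16_t set_mark(uint16_t gc_mark, unsigned m)
{
    return (uint16_t)((gc_mark & ~0x3u) | (m & 0x3u));
}

static uint16_t set_sectors(uint16_t gc_mark, unsigned s)
{
    assert(s <= MAX_GC_SECTORS_USED);
    return (uint16_t)((gc_mark & 0x3u) | (s << 2));
}

int main(void)
{
    uint16_t gc_mark = 0;           /* freshly reset: unmarked, 0 sectors */

    gc_mark = set_mark(gc_mark, GC_MARK_DIRTY);
    gc_mark = set_sectors(gc_mark, 4096);

    assert(get_mark(gc_mark) == GC_MARK_DIRTY);
    assert(get_sectors(gc_mark) == 4096);
    printf("mark=%u sectors=%u\n", get_mark(gc_mark), get_sectors(gc_mark));
    return 0;
}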
@@ -426,14 +424,9 @@ struct cache {
426 * their new gen to disk. After prio_write() finishes writing the new 424 * their new gen to disk. After prio_write() finishes writing the new
427 * gens/prios, they'll be moved to the free list (and possibly discarded 425 * gens/prios, they'll be moved to the free list (and possibly discarded
428 * in the process) 426 * in the process)
429 *
430 * unused: GC found nothing pointing into these buckets (possibly
431 * because all the data they contained was overwritten), so we only
432 * need to discard them before they can be moved to the free list.
433 */ 427 */
434 DECLARE_FIFO(long, free)[RESERVE_NR]; 428 DECLARE_FIFO(long, free)[RESERVE_NR];
435 DECLARE_FIFO(long, free_inc); 429 DECLARE_FIFO(long, free_inc);
436 DECLARE_FIFO(long, unused);
437 430
438 size_t fifo_last_bucket; 431 size_t fifo_last_bucket;
439 432
@@ -443,12 +436,6 @@ struct cache {
443 DECLARE_HEAP(struct bucket *, heap); 436 DECLARE_HEAP(struct bucket *, heap);
444 437
445 /* 438 /*
446 * max(gen - disk_gen) for all buckets. When it gets too big we have to
447 * call prio_write() to keep gens from wrapping.
448 */
449 uint8_t need_save_prio;
450
451 /*
452 * If nonzero, we know we aren't going to find any buckets to invalidate 439 * If nonzero, we know we aren't going to find any buckets to invalidate
453 * until a gc finishes - otherwise we could pointlessly burn a ton of 440 * until a gc finishes - otherwise we could pointlessly burn a ton of
454 * cpu 441 * cpu
@@ -562,19 +549,16 @@ struct cache_set {
562 struct list_head btree_cache_freed; 549 struct list_head btree_cache_freed;
563 550
564 /* Number of elements in btree_cache + btree_cache_freeable lists */ 551 /* Number of elements in btree_cache + btree_cache_freeable lists */
565 unsigned bucket_cache_used; 552 unsigned btree_cache_used;
566 553
567 /* 554 /*
568 * If we need to allocate memory for a new btree node and that 555 * If we need to allocate memory for a new btree node and that
569 * allocation fails, we can cannibalize another node in the btree cache 556 * allocation fails, we can cannibalize another node in the btree cache
570 * to satisfy the allocation. However, only one thread can be doing this 557 * to satisfy the allocation - lock to guarantee only one thread does
571 * at a time, for obvious reasons - try_harder and try_wait are 558 * this at a time:
572 * basically a lock for this that we can wait on asynchronously. The
573 * btree_root() macro releases the lock when it returns.
574 */ 559 */
575 struct task_struct *try_harder; 560 wait_queue_head_t btree_cache_wait;
576 wait_queue_head_t try_wait; 561 struct task_struct *btree_cache_alloc_lock;
577 uint64_t try_harder_start;
578 562
579 /* 563 /*
580 * When we free a btree node, we increment the gen of the bucket the 564 * When we free a btree node, we increment the gen of the bucket the
@@ -603,7 +587,7 @@ struct cache_set {
603 uint16_t min_prio; 587 uint16_t min_prio;
604 588
605 /* 589 /*
606 * max(gen - gc_gen) for all buckets. When it gets too big we have to gc 590 * max(gen - last_gc) for all buckets. When it gets too big we have to gc
607 * to keep gens from wrapping around. 591 * to keep gens from wrapping around.
608 */ 592 */
609 uint8_t need_gc; 593 uint8_t need_gc;
@@ -628,6 +612,8 @@ struct cache_set {
628 /* Number of moving GC bios in flight */ 612 /* Number of moving GC bios in flight */
629 struct semaphore moving_in_flight; 613 struct semaphore moving_in_flight;
630 614
615 struct workqueue_struct *moving_gc_wq;
616
631 struct btree *root; 617 struct btree *root;
632 618
633#ifdef CONFIG_BCACHE_DEBUG 619#ifdef CONFIG_BCACHE_DEBUG
@@ -667,7 +653,6 @@ struct cache_set {
667 struct time_stats btree_gc_time; 653 struct time_stats btree_gc_time;
668 struct time_stats btree_split_time; 654 struct time_stats btree_split_time;
669 struct time_stats btree_read_time; 655 struct time_stats btree_read_time;
670 struct time_stats try_harder_time;
671 656
672 atomic_long_t cache_read_races; 657 atomic_long_t cache_read_races;
673 atomic_long_t writeback_keys_done; 658 atomic_long_t writeback_keys_done;
@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
850/* 835/*
851 * bucket_gc_gen() returns the difference between the bucket's current gen and 836 * bucket_gc_gen() returns the difference between the bucket's current gen and
852 * the oldest gen of any pointer into that bucket in the btree (last_gc). 837 * the oldest gen of any pointer into that bucket in the btree (last_gc).
853 *
854 * bucket_disk_gen() returns the difference between the current gen and the gen
855 * on disk; they're both used to make sure gens don't wrap around.
856 */ 838 */
857 839
858static inline uint8_t bucket_gc_gen(struct bucket *b) 840static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
860 return b->gen - b->last_gc; 842 return b->gen - b->last_gc;
861} 843}
862 844
863static inline uint8_t bucket_disk_gen(struct bucket *b)
864{
865 return b->gen - b->disk_gen;
866}
867
868#define BUCKET_GC_GEN_MAX 96U 845#define BUCKET_GC_GEN_MAX 96U
869#define BUCKET_DISK_GEN_MAX 64U
870 846
871#define kobj_attribute_write(n, fn) \ 847#define kobj_attribute_write(n, fn) \
872 static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) 848 static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
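With disk_gen and BUCKET_DISK_GEN_MAX gone, the only wrap guard left is bucket_gc_gen(), an 8-bit difference that stays meaningful across wraparound as long as it is kept below BUCKET_GC_GEN_MAX. A tiny standalone demonstration of that modular arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BUCKET_GC_GEN_MAX 96U

/* gen - last_gc in uint8_t: only the distance matters, so the check
 * keeps working even after gen wraps past 255. */
static uint8_t bucket_gc_gen(uint8_t gen, uint8_t last_gc)
{
    return (uint8_t)(gen - last_gc);
}

int main(void)
{
    /* no wrap: plain subtraction */
    assert(bucket_gc_gen(100, 90) == 10);

    /* gen wrapped from 250 past 0 to 3: the distance is still 9 */
    assert(bucket_gc_gen(3, 250) == 9);

    /* once the distance reaches 96, GC must run before another bump */
    assert(bucket_gc_gen(96, 0) >= BUCKET_GC_GEN_MAX);

    printf("gc_gen distances check out\n");
    return 0;
}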
@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
899 875
900uint8_t bch_inc_gen(struct cache *, struct bucket *); 876uint8_t bch_inc_gen(struct cache *, struct bucket *);
901void bch_rescale_priorities(struct cache_set *, int); 877void bch_rescale_priorities(struct cache_set *, int);
902bool bch_bucket_add_unused(struct cache *, struct bucket *);
903 878
904long bch_bucket_alloc(struct cache *, unsigned, bool); 879bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
880void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
881
882void __bch_bucket_free(struct cache *, struct bucket *);
905void bch_bucket_free(struct cache_set *, struct bkey *); 883void bch_bucket_free(struct cache_set *, struct bkey *);
906 884
885long bch_bucket_alloc(struct cache *, unsigned, bool);
907int __bch_bucket_alloc_set(struct cache_set *, unsigned, 886int __bch_bucket_alloc_set(struct cache_set *, unsigned,
908 struct bkey *, int, bool); 887 struct bkey *, int, bool);
909int bch_bucket_alloc_set(struct cache_set *, unsigned, 888int bch_bucket_alloc_set(struct cache_set *, unsigned,
@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
954void bch_open_buckets_free(struct cache_set *); 933void bch_open_buckets_free(struct cache_set *);
955 934
956int bch_cache_allocator_start(struct cache *ca); 935int bch_cache_allocator_start(struct cache *ca);
957int bch_cache_allocator_init(struct cache *ca);
958 936
959void bch_debug_exit(void); 937void bch_debug_exit(void);
960int bch_debug_init(struct kobject *); 938int bch_debug_init(struct kobject *);
961void bch_request_exit(void); 939void bch_request_exit(void);
962int bch_request_init(void); 940int bch_request_init(void);
963void bch_btree_exit(void);
964int bch_btree_init(void);
965 941
966#endif /* _BCACHE_H */ 942#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 3f74b4b0747b..545416415305 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
23 for (k = i->start; k < bset_bkey_last(i); k = next) { 23 for (k = i->start; k < bset_bkey_last(i); k = next) {
24 next = bkey_next(k); 24 next = bkey_next(k);
25 25
26 printk(KERN_ERR "block %u key %li/%u: ", set, 26 printk(KERN_ERR "block %u key %u/%u: ", set,
27 (uint64_t *) k - i->d, i->keys); 27 (unsigned) ((u64 *) k - i->d), i->keys);
28 28
29 if (b->ops->key_dump) 29 if (b->ops->key_dump)
30 b->ops->key_dump(b, k); 30 b->ops->key_dump(b, k);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 003260f4ddf6..5f6728d5d4dd 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
478 l->top_p = l->keys_p = l->inline_keys; 478 l->top_p = l->keys_p = l->inline_keys;
479} 479}
480 480
481static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
482{
483 l->keys = k;
484 l->top = bkey_next(k);
485}
486
481static inline void bch_keylist_push(struct keylist *l) 487static inline void bch_keylist_push(struct keylist *l)
482{ 488{
483 l->top = bkey_next(l->top); 489 l->top = bkey_next(l->top);
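bch_keylist_init_single() simply points the list at an existing key and sets top to that key's end, so the list holds exactly one key without copying it. A minimal standalone model of the pointer math (the key layout below is invented for illustration):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for a bcache key: a header word giving its own length in
 * 64-bit words, followed by a fixed payload. */
struct bkey {
    uint64_t u64s;        /* total words including this header */
    uint64_t ptr[2];
};

static struct bkey *bkey_next(struct bkey *k)
{
    return (struct bkey *)((uint64_t *)k + k->u64s);
}

/* A keylist is the half-open range [keys, top). */
struct keylist {
    struct bkey *keys;
    struct bkey *top;
};

/* models bch_keylist_init_single(): wrap one existing key in place */
static void keylist_init_single(struct keylist *l, struct bkey *k)
{
    l->keys = k;
    l->top  = bkey_next(k);
}

int main(void)
{
    struct bkey k = { .u64s = 3, .ptr = { 0x1234, 0x5678 } };
    struct keylist l;

    keylist_init_single(&l, &k);

    /* exactly one key sits between keys and top */
    assert((uint64_t *)l.top - (uint64_t *)l.keys == (long)k.u64s);
    printf("single-key list spans %llu words\n", (unsigned long long)k.u64s);
    return 0;
}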
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5f9c2a665ca5..7347b6100961 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -68,15 +68,11 @@
68 * alloc_bucket() cannot fail. This should be true but is not completely 68 * alloc_bucket() cannot fail. This should be true but is not completely
69 * obvious. 69 * obvious.
70 * 70 *
71 * Make sure all allocations get charged to the root cgroup
72 *
73 * Plugging? 71 * Plugging?
74 * 72 *
75 * If data write is less than hard sector size of ssd, round up offset in open 73 * If data write is less than hard sector size of ssd, round up offset in open
76 * bucket to the next whole sector 74 * bucket to the next whole sector
77 * 75 *
78 * Also lookup by cgroup in get_open_bucket()
79 *
80 * Superblock needs to be fleshed out for multiple cache devices 76 * Superblock needs to be fleshed out for multiple cache devices
81 * 77 *
82 * Add a sysfs tunable for the number of writeback IOs in flight 78 * Add a sysfs tunable for the number of writeback IOs in flight
@@ -97,8 +93,6 @@
97#define PTR_HASH(c, k) \ 93#define PTR_HASH(c, k) \
98 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 94 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
99 95
100static struct workqueue_struct *btree_io_wq;
101
102#define insert_lock(s, b) ((b)->level <= (s)->lock) 96#define insert_lock(s, b) ((b)->level <= (s)->lock)
103 97
104/* 98/*
@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq;
123({ \ 117({ \
124 int _r, l = (b)->level - 1; \ 118 int _r, l = (b)->level - 1; \
125 bool _w = l <= (op)->lock; \ 119 bool _w = l <= (op)->lock; \
126 struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ 120 struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
127 if (!IS_ERR(_child)) { \ 121 if (!IS_ERR(_child)) { \
128 _child->parent = (b); \ 122 _child->parent = (b); \
129 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ 123 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq;
152 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 146 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
153 } \ 147 } \
154 rw_unlock(_w, _b); \ 148 rw_unlock(_w, _b); \
149 bch_cannibalize_unlock(c); \
155 if (_r == -EINTR) \ 150 if (_r == -EINTR) \
156 schedule(); \ 151 schedule(); \
157 bch_cannibalize_unlock(c); \
158 if (_r == -ENOSPC) { \
159 wait_event((c)->try_wait, \
160 !(c)->try_harder); \
161 _r = -EINTR; \
162 } \
163 } while (_r == -EINTR); \ 152 } while (_r == -EINTR); \
164 \ 153 \
165 finish_wait(&(c)->bucket_wait, &(op)->wait); \ 154 finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
166 _r; \ 155 _r; \
167}) 156})
168 157
@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b)
171 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); 160 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
172} 161}
173 162
163static void bch_btree_init_next(struct btree *b)
164{
165 /* If not a leaf node, always sort */
166 if (b->level && b->keys.nsets)
167 bch_btree_sort(&b->keys, &b->c->sort);
168 else
169 bch_btree_sort_lazy(&b->keys, &b->c->sort);
170
171 if (b->written < btree_blocks(b))
172 bch_bset_init_next(&b->keys, write_block(b),
173 bset_magic(&b->c->sb));
174
175}
176
174/* Btree key manipulation */ 177/* Btree key manipulation */
175 178
176void bkey_put(struct cache_set *c, struct bkey *k) 179void bkey_put(struct cache_set *c, struct bkey *k)
@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
352 btree_complete_write(b, w); 355 btree_complete_write(b, w);
353 356
354 if (btree_node_dirty(b)) 357 if (btree_node_dirty(b))
355 queue_delayed_work(btree_io_wq, &b->work, 358 schedule_delayed_work(&b->work, 30 * HZ);
356 msecs_to_jiffies(30000));
357 359
358 closure_return_with_destructor(cl, btree_node_write_unlock); 360 closure_return_with_destructor(cl, btree_node_write_unlock);
359} 361}
@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b)
442 } 444 }
443} 445}
444 446
445void bch_btree_node_write(struct btree *b, struct closure *parent) 447void __bch_btree_node_write(struct btree *b, struct closure *parent)
446{ 448{
447 struct bset *i = btree_bset_last(b); 449 struct bset *i = btree_bset_last(b);
448 450
451 lockdep_assert_held(&b->write_lock);
452
449 trace_bcache_btree_write(b); 453 trace_bcache_btree_write(b);
450 454
451 BUG_ON(current->bio_list); 455 BUG_ON(current->bio_list);
@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
469 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 473 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
470 474
471 b->written += set_blocks(i, block_bytes(b->c)); 475 b->written += set_blocks(i, block_bytes(b->c));
476}
472 477
473 /* If not a leaf node, always sort */ 478void bch_btree_node_write(struct btree *b, struct closure *parent)
474 if (b->level && b->keys.nsets) 479{
475 bch_btree_sort(&b->keys, &b->c->sort); 480 unsigned nsets = b->keys.nsets;
476 else 481
477 bch_btree_sort_lazy(&b->keys, &b->c->sort); 482 lockdep_assert_held(&b->lock);
483
484 __bch_btree_node_write(b, parent);
478 485
479 /* 486 /*
480 * do verify if there was more than one set initially (i.e. we did a 487 * do verify if there was more than one set initially (i.e. we did a
481 * sort) and we sorted down to a single set: 488 * sort) and we sorted down to a single set:
482 */ 489 */
483 if (i != b->keys.set->data && !b->keys.nsets) 490 if (nsets && !b->keys.nsets)
484 bch_btree_verify(b); 491 bch_btree_verify(b);
485 492
486 if (b->written < btree_blocks(b)) 493 bch_btree_init_next(b);
487 bch_bset_init_next(&b->keys, write_block(b),
488 bset_magic(&b->c->sb));
489} 494}
490 495
491static void bch_btree_node_write_sync(struct btree *b) 496static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
493 struct closure cl; 498 struct closure cl;
494 499
495 closure_init_stack(&cl); 500 closure_init_stack(&cl);
501
502 mutex_lock(&b->write_lock);
496 bch_btree_node_write(b, &cl); 503 bch_btree_node_write(b, &cl);
504 mutex_unlock(&b->write_lock);
505
497 closure_sync(&cl); 506 closure_sync(&cl);
498} 507}
499 508
@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w)
501{ 510{
502 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 511 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
503 512
504 rw_lock(true, b, b->level); 513 mutex_lock(&b->write_lock);
505
506 if (btree_node_dirty(b)) 514 if (btree_node_dirty(b))
507 bch_btree_node_write(b, NULL); 515 __bch_btree_node_write(b, NULL);
508 rw_unlock(true, b); 516 mutex_unlock(&b->write_lock);
509} 517}
510 518
511static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 519static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
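The split above follows the usual locked/unlocked pairing: __bch_btree_node_write() assumes write_lock is already held, while the wrappers in this and the next hunks take the lock themselves. A minimal pthread sketch of that shape (names invented, not kernel API):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/* Toy "btree node": write_lock protects the pending write state,
 * mirroring the __bch_btree_node_write()/bch_btree_node_write() split. */
struct node {
    pthread_mutex_t write_lock;
    int dirty;
    int writes_issued;
};

/* Caller must already hold write_lock (the double-underscore variant);
 * in the kernel this is lockdep_assert_held(&b->write_lock). */
static void __node_write(struct node *b)
{
    b->writes_issued++;
    b->dirty = 0;
}

/* Convenience wrapper that takes the lock itself, like
 * bch_btree_node_write_sync() / btree_node_write_work() now do. */
static void node_write_sync(struct node *b)
{
    pthread_mutex_lock(&b->write_lock);
    if (b->dirty)
        __node_write(b);
    pthread_mutex_unlock(&b->write_lock);
}

int main(void)
{
    struct node b = { .write_lock = PTHREAD_MUTEX_INITIALIZER, .dirty = 1 };

    node_write_sync(&b);
    node_write_sync(&b);        /* second call finds the node clean */

    assert(b.writes_issued == 1);
    printf("writes issued: %d\n", b.writes_issued);
    return 0;
}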
@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
513 struct bset *i = btree_bset_last(b); 521 struct bset *i = btree_bset_last(b);
514 struct btree_write *w = btree_current_write(b); 522 struct btree_write *w = btree_current_write(b);
515 523
524 lockdep_assert_held(&b->write_lock);
525
516 BUG_ON(!b->written); 526 BUG_ON(!b->written);
517 BUG_ON(!i->keys); 527 BUG_ON(!i->keys);
518 528
519 if (!btree_node_dirty(b)) 529 if (!btree_node_dirty(b))
520 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); 530 schedule_delayed_work(&b->work, 30 * HZ);
521 531
522 set_btree_node_dirty(b); 532 set_btree_node_dirty(b);
523 533
@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
548#define mca_reserve(c) (((c->root && c->root->level) \ 558#define mca_reserve(c) (((c->root && c->root->level) \
549 ? c->root->level : 1) * 8 + 16) 559 ? c->root->level : 1) * 8 + 16)
550#define mca_can_free(c) \ 560#define mca_can_free(c) \
551 max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) 561 max_t(int, 0, c->btree_cache_used - mca_reserve(c))
552 562
553static void mca_data_free(struct btree *b) 563static void mca_data_free(struct btree *b)
554{ 564{
@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b)
556 566
557 bch_btree_keys_free(&b->keys); 567 bch_btree_keys_free(&b->keys);
558 568
559 b->c->bucket_cache_used--; 569 b->c->btree_cache_used--;
560 list_move(&b->list, &b->c->btree_cache_freed); 570 list_move(&b->list, &b->c->btree_cache_freed);
561} 571}
562 572
@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
581 ilog2(b->c->btree_pages), 591 ilog2(b->c->btree_pages),
582 btree_order(k)), 592 btree_order(k)),
583 gfp)) { 593 gfp)) {
584 b->c->bucket_cache_used++; 594 b->c->btree_cache_used++;
585 list_move(&b->list, &b->c->btree_cache); 595 list_move(&b->list, &b->c->btree_cache);
586 } else { 596 } else {
587 list_move(&b->list, &b->c->btree_cache_freed); 597 list_move(&b->list, &b->c->btree_cache_freed);
@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
597 607
598 init_rwsem(&b->lock); 608 init_rwsem(&b->lock);
599 lockdep_set_novalidate_class(&b->lock); 609 lockdep_set_novalidate_class(&b->lock);
610 mutex_init(&b->write_lock);
611 lockdep_set_novalidate_class(&b->write_lock);
600 INIT_LIST_HEAD(&b->list); 612 INIT_LIST_HEAD(&b->list);
601 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 613 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
602 b->c = c; 614 b->c = c;
@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
630 up(&b->io_mutex); 642 up(&b->io_mutex);
631 } 643 }
632 644
645 mutex_lock(&b->write_lock);
633 if (btree_node_dirty(b)) 646 if (btree_node_dirty(b))
634 bch_btree_node_write_sync(b); 647 __bch_btree_node_write(b, &cl);
648 mutex_unlock(&b->write_lock);
649
650 closure_sync(&cl);
635 651
636 /* wait for any in flight btree write */ 652 /* wait for any in flight btree write */
637 down(&b->io_mutex); 653 down(&b->io_mutex);
@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
654 if (c->shrinker_disabled) 670 if (c->shrinker_disabled)
655 return SHRINK_STOP; 671 return SHRINK_STOP;
656 672
657 if (c->try_harder) 673 if (c->btree_cache_alloc_lock)
658 return SHRINK_STOP; 674 return SHRINK_STOP;
659 675
660 /* Return -1 if we can't do anything right now */ 676 /* Return -1 if we can't do anything right now */
@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
686 } 702 }
687 } 703 }
688 704
689 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 705 for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
690 if (list_empty(&c->btree_cache)) 706 if (list_empty(&c->btree_cache))
691 goto out; 707 goto out;
692 708
@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
715 if (c->shrinker_disabled) 731 if (c->shrinker_disabled)
716 return 0; 732 return 0;
717 733
718 if (c->try_harder) 734 if (c->btree_cache_alloc_lock)
719 return 0; 735 return 0;
720 736
721 return mca_can_free(c) * c->btree_pages; 737 return mca_can_free(c) * c->btree_pages;
@@ -819,17 +835,30 @@ out:
819 return b; 835 return b;
820} 836}
821 837
822static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) 838static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
839{
840 struct task_struct *old;
841
842 old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
843 if (old && old != current) {
844 if (op)
845 prepare_to_wait(&c->btree_cache_wait, &op->wait,
846 TASK_UNINTERRUPTIBLE);
847 return -EINTR;
848 }
849
850 return 0;
851}
852
853static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
854 struct bkey *k)
823{ 855{
824 struct btree *b; 856 struct btree *b;
825 857
826 trace_bcache_btree_cache_cannibalize(c); 858 trace_bcache_btree_cache_cannibalize(c);
827 859
828 if (!c->try_harder) { 860 if (mca_cannibalize_lock(c, op))
829 c->try_harder = current; 861 return ERR_PTR(-EINTR);
830 c->try_harder_start = local_clock();
831 } else if (c->try_harder != current)
832 return ERR_PTR(-ENOSPC);
833 862
834 list_for_each_entry_reverse(b, &c->btree_cache, list) 863 list_for_each_entry_reverse(b, &c->btree_cache, list)
835 if (!mca_reap(b, btree_order(k), false)) 864 if (!mca_reap(b, btree_order(k), false))
@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
839 if (!mca_reap(b, btree_order(k), true)) 868 if (!mca_reap(b, btree_order(k), true))
840 return b; 869 return b;
841 870
871 WARN(1, "btree cache cannibalize failed\n");
842 return ERR_PTR(-ENOMEM); 872 return ERR_PTR(-ENOMEM);
843} 873}
844 874
@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
850 */ 880 */
851static void bch_cannibalize_unlock(struct cache_set *c) 881static void bch_cannibalize_unlock(struct cache_set *c)
852{ 882{
853 if (c->try_harder == current) { 883 if (c->btree_cache_alloc_lock == current) {
854 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 884 c->btree_cache_alloc_lock = NULL;
855 c->try_harder = NULL; 885 wake_up(&c->btree_cache_wait);
856 wake_up(&c->try_wait);
857 } 886 }
858} 887}
859 888
860static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) 889static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
890 struct bkey *k, int level)
861{ 891{
862 struct btree *b; 892 struct btree *b;
863 893
@@ -920,7 +950,7 @@ err:
920 if (b) 950 if (b)
921 rw_unlock(true, b); 951 rw_unlock(true, b);
922 952
923 b = mca_cannibalize(c, k); 953 b = mca_cannibalize(c, op, k);
924 if (!IS_ERR(b)) 954 if (!IS_ERR(b))
925 goto out; 955 goto out;
926 956
@@ -936,8 +966,8 @@ err:
936 * The btree node will have either a read or a write lock held, depending on 966 * The btree node will have either a read or a write lock held, depending on
937 * level and op->lock. 967 * level and op->lock.
938 */ 968 */
939struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 969struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
940 int level, bool write) 970 struct bkey *k, int level, bool write)
941{ 971{
942 int i = 0; 972 int i = 0;
943 struct btree *b; 973 struct btree *b;
@@ -951,7 +981,7 @@ retry:
951 return ERR_PTR(-EAGAIN); 981 return ERR_PTR(-EAGAIN);
952 982
953 mutex_lock(&c->bucket_lock); 983 mutex_lock(&c->bucket_lock);
954 b = mca_alloc(c, k, level); 984 b = mca_alloc(c, op, k, level);
955 mutex_unlock(&c->bucket_lock); 985 mutex_unlock(&c->bucket_lock);
956 986
957 if (!b) 987 if (!b)
@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
997 struct btree *b; 1027 struct btree *b;
998 1028
999 mutex_lock(&c->bucket_lock); 1029 mutex_lock(&c->bucket_lock);
1000 b = mca_alloc(c, k, level); 1030 b = mca_alloc(c, NULL, k, level);
1001 mutex_unlock(&c->bucket_lock); 1031 mutex_unlock(&c->bucket_lock);
1002 1032
1003 if (!IS_ERR_OR_NULL(b)) { 1033 if (!IS_ERR_OR_NULL(b)) {
@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
1010 1040
1011static void btree_node_free(struct btree *b) 1041static void btree_node_free(struct btree *b)
1012{ 1042{
1013 unsigned i;
1014
1015 trace_bcache_btree_node_free(b); 1043 trace_bcache_btree_node_free(b);
1016 1044
1017 BUG_ON(b == b->c->root); 1045 BUG_ON(b == b->c->root);
1018 1046
1047 mutex_lock(&b->write_lock);
1048
1019 if (btree_node_dirty(b)) 1049 if (btree_node_dirty(b))
1020 btree_complete_write(b, btree_current_write(b)); 1050 btree_complete_write(b, btree_current_write(b));
1021 clear_bit(BTREE_NODE_dirty, &b->flags); 1051 clear_bit(BTREE_NODE_dirty, &b->flags);
1022 1052
1053 mutex_unlock(&b->write_lock);
1054
1023 cancel_delayed_work(&b->work); 1055 cancel_delayed_work(&b->work);
1024 1056
1025 mutex_lock(&b->c->bucket_lock); 1057 mutex_lock(&b->c->bucket_lock);
1026
1027 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1028 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
1029
1030 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1031 PTR_BUCKET(b->c, &b->key, i));
1032 }
1033
1034 bch_bucket_free(b->c, &b->key); 1058 bch_bucket_free(b->c, &b->key);
1035 mca_bucket_free(b); 1059 mca_bucket_free(b);
1036 mutex_unlock(&b->c->bucket_lock); 1060 mutex_unlock(&b->c->bucket_lock);
1037} 1061}
1038 1062
1039struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) 1063struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
1064 int level)
1040{ 1065{
1041 BKEY_PADDED(key) k; 1066 BKEY_PADDED(key) k;
1042 struct btree *b = ERR_PTR(-EAGAIN); 1067 struct btree *b = ERR_PTR(-EAGAIN);
1043 1068
1044 mutex_lock(&c->bucket_lock); 1069 mutex_lock(&c->bucket_lock);
1045retry: 1070retry:
1046 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) 1071 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
1047 goto err; 1072 goto err;
1048 1073
1049 bkey_put(c, &k.key); 1074 bkey_put(c, &k.key);
1050 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1075 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1051 1076
1052 b = mca_alloc(c, &k.key, level); 1077 b = mca_alloc(c, op, &k.key, level);
1053 if (IS_ERR(b)) 1078 if (IS_ERR(b))
1054 goto err_free; 1079 goto err_free;
1055 1080
@@ -1075,12 +1100,15 @@ err:
1075 return b; 1100 return b;
1076} 1101}
1077 1102
1078static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1103static struct btree *btree_node_alloc_replacement(struct btree *b,
1104 struct btree_op *op)
1079{ 1105{
1080 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1106 struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
1081 if (!IS_ERR_OR_NULL(n)) { 1107 if (!IS_ERR_OR_NULL(n)) {
1108 mutex_lock(&n->write_lock);
1082 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); 1109 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1083 bkey_copy_key(&n->key, &b->key); 1110 bkey_copy_key(&n->key, &b->key);
1111 mutex_unlock(&n->write_lock);
1084 } 1112 }
1085 1113
1086 return n; 1114 return n;
@@ -1090,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1090{ 1118{
1091 unsigned i; 1119 unsigned i;
1092 1120
1121 mutex_lock(&b->c->bucket_lock);
1122
1123 atomic_inc(&b->c->prio_blocked);
1124
1093 bkey_copy(k, &b->key); 1125 bkey_copy(k, &b->key);
1094 bkey_copy_key(k, &ZERO_KEY); 1126 bkey_copy_key(k, &ZERO_KEY);
1095 1127
1096 for (i = 0; i < KEY_PTRS(k); i++) { 1128 for (i = 0; i < KEY_PTRS(k); i++)
1097 uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; 1129 SET_PTR_GEN(k, i,
1098 1130 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1099 SET_PTR_GEN(k, i, g); 1131 PTR_BUCKET(b->c, &b->key, i)));
1100 }
1101 1132
1102 atomic_inc(&b->c->prio_blocked); 1133 mutex_unlock(&b->c->bucket_lock);
1103} 1134}
1104 1135
1105static int btree_check_reserve(struct btree *b, struct btree_op *op) 1136static int btree_check_reserve(struct btree *b, struct btree_op *op)
1106{ 1137{
1107 struct cache_set *c = b->c; 1138 struct cache_set *c = b->c;
1108 struct cache *ca; 1139 struct cache *ca;
1109 unsigned i, reserve = c->root->level * 2 + 1; 1140 unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
1110 int ret = 0;
1111 1141
1112 mutex_lock(&c->bucket_lock); 1142 mutex_lock(&c->bucket_lock);
1113 1143
1114 for_each_cache(ca, c, i) 1144 for_each_cache(ca, c, i)
1115 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { 1145 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1116 if (op) 1146 if (op)
1117 prepare_to_wait(&c->bucket_wait, &op->wait, 1147 prepare_to_wait(&c->btree_cache_wait, &op->wait,
1118 TASK_UNINTERRUPTIBLE); 1148 TASK_UNINTERRUPTIBLE);
1119 ret = -EINTR; 1149 mutex_unlock(&c->bucket_lock);
1120 break; 1150 return -EINTR;
1121 } 1151 }
1122 1152
1123 mutex_unlock(&c->bucket_lock); 1153 mutex_unlock(&c->bucket_lock);
1124 return ret; 1154
1155 return mca_cannibalize_lock(b->c, op);
1125} 1156}
1126 1157
1127/* Garbage collection */ 1158/* Garbage collection */
1128 1159
1129uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1160static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
1161 struct bkey *k)
1130{ 1162{
1131 uint8_t stale = 0; 1163 uint8_t stale = 0;
1132 unsigned i; 1164 unsigned i;
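The reserve check above now scales with how far below the root the node being updated sits rather than with the root level alone: for example, with a root at level 2, an insert into a leaf (level 0) needs (2 - 0) * 2 + 1 = 5 free RESERVE_BTREE buckets in every cache before it proceeds, while splitting a level-1 node needs only (2 - 1) * 2 + 1 = 3. On failure the caller now queues on btree_cache_wait rather than bucket_wait, and a successful check additionally takes the cannibalize lock via mca_cannibalize_lock().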
@@ -1146,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1146 1178
1147 g = PTR_BUCKET(c, k, i); 1179 g = PTR_BUCKET(c, k, i);
1148 1180
1149 if (gen_after(g->gc_gen, PTR_GEN(k, i))) 1181 if (gen_after(g->last_gc, PTR_GEN(k, i)))
1150 g->gc_gen = PTR_GEN(k, i); 1182 g->last_gc = PTR_GEN(k, i);
1151 1183
1152 if (ptr_stale(c, k, i)) { 1184 if (ptr_stale(c, k, i)) {
1153 stale = max(stale, ptr_stale(c, k, i)); 1185 stale = max(stale, ptr_stale(c, k, i));
@@ -1163,6 +1195,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1163 SET_GC_MARK(g, GC_MARK_METADATA); 1195 SET_GC_MARK(g, GC_MARK_METADATA);
1164 else if (KEY_DIRTY(k)) 1196 else if (KEY_DIRTY(k))
1165 SET_GC_MARK(g, GC_MARK_DIRTY); 1197 SET_GC_MARK(g, GC_MARK_DIRTY);
1198 else if (!GC_MARK(g))
1199 SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
1166 1200
1167 /* guard against overflow */ 1201 /* guard against overflow */
1168 SET_GC_SECTORS_USED(g, min_t(unsigned, 1202 SET_GC_SECTORS_USED(g, min_t(unsigned,
@@ -1177,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1177 1211
1178#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1212#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1179 1213
1214void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
1215{
1216 unsigned i;
1217
1218 for (i = 0; i < KEY_PTRS(k); i++)
1219 if (ptr_available(c, k, i) &&
1220 !ptr_stale(c, k, i)) {
1221 struct bucket *b = PTR_BUCKET(c, k, i);
1222
1223 b->gen = PTR_GEN(k, i);
1224
1225 if (level && bkey_cmp(k, &ZERO_KEY))
1226 b->prio = BTREE_PRIO;
1227 else if (!level && b->prio == BTREE_PRIO)
1228 b->prio = INITIAL_PRIO;
1229 }
1230
1231 __bch_btree_mark_key(c, level, k);
1232}
1233
1180static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) 1234static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1181{ 1235{
1182 uint8_t stale = 0; 1236 uint8_t stale = 0;
@@ -1230,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
1230 struct keylist *, atomic_t *, struct bkey *); 1284 struct keylist *, atomic_t *, struct bkey *);
1231 1285
1232static int btree_gc_coalesce(struct btree *b, struct btree_op *op, 1286static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1233 struct keylist *keylist, struct gc_stat *gc, 1287 struct gc_stat *gc, struct gc_merge_info *r)
1234 struct gc_merge_info *r)
1235{ 1288{
1236 unsigned i, nodes = 0, keys = 0, blocks; 1289 unsigned i, nodes = 0, keys = 0, blocks;
1237 struct btree *new_nodes[GC_MERGE_NODES]; 1290 struct btree *new_nodes[GC_MERGE_NODES];
1291 struct keylist keylist;
1238 struct closure cl; 1292 struct closure cl;
1239 struct bkey *k; 1293 struct bkey *k;
1240 1294
1295 bch_keylist_init(&keylist);
1296
1297 if (btree_check_reserve(b, NULL))
1298 return 0;
1299
1241 memset(new_nodes, 0, sizeof(new_nodes)); 1300 memset(new_nodes, 0, sizeof(new_nodes));
1242 closure_init_stack(&cl); 1301 closure_init_stack(&cl);
1243 1302
@@ -1252,11 +1311,23 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1252 return 0; 1311 return 0;
1253 1312
1254 for (i = 0; i < nodes; i++) { 1313 for (i = 0; i < nodes; i++) {
1255 new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); 1314 new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
1256 if (IS_ERR_OR_NULL(new_nodes[i])) 1315 if (IS_ERR_OR_NULL(new_nodes[i]))
1257 goto out_nocoalesce; 1316 goto out_nocoalesce;
1258 } 1317 }
1259 1318
1319 /*
1320 * We have to check the reserve here, after we've allocated our new
1321 * nodes, to make sure the insert below will succeed - we also check
1322 * before as an optimization to potentially avoid a bunch of expensive
1323 * allocs/sorts
1324 */
1325 if (btree_check_reserve(b, NULL))
1326 goto out_nocoalesce;
1327
1328 for (i = 0; i < nodes; i++)
1329 mutex_lock(&new_nodes[i]->write_lock);
1330
1260 for (i = nodes - 1; i > 0; --i) { 1331 for (i = nodes - 1; i > 0; --i) {
1261 struct bset *n1 = btree_bset_first(new_nodes[i]); 1332 struct bset *n1 = btree_bset_first(new_nodes[i]);
1262 struct bset *n2 = btree_bset_first(new_nodes[i - 1]); 1333 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
@@ -1315,28 +1386,34 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1315 1386
1316 n2->keys -= keys; 1387 n2->keys -= keys;
1317 1388
1318 if (__bch_keylist_realloc(keylist, 1389 if (__bch_keylist_realloc(&keylist,
1319 bkey_u64s(&new_nodes[i]->key))) 1390 bkey_u64s(&new_nodes[i]->key)))
1320 goto out_nocoalesce; 1391 goto out_nocoalesce;
1321 1392
1322 bch_btree_node_write(new_nodes[i], &cl); 1393 bch_btree_node_write(new_nodes[i], &cl);
1323 bch_keylist_add(keylist, &new_nodes[i]->key); 1394 bch_keylist_add(&keylist, &new_nodes[i]->key);
1324 } 1395 }
1325 1396
1326 for (i = 0; i < nodes; i++) { 1397 for (i = 0; i < nodes; i++)
1327 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) 1398 mutex_unlock(&new_nodes[i]->write_lock);
1328 goto out_nocoalesce;
1329 1399
1330 make_btree_freeing_key(r[i].b, keylist->top); 1400 closure_sync(&cl);
1331 bch_keylist_push(keylist);
1332 }
1333 1401
1334 /* We emptied out this node */ 1402 /* We emptied out this node */
1335 BUG_ON(btree_bset_first(new_nodes[0])->keys); 1403 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1336 btree_node_free(new_nodes[0]); 1404 btree_node_free(new_nodes[0]);
1337 rw_unlock(true, new_nodes[0]); 1405 rw_unlock(true, new_nodes[0]);
1338 1406
1339 closure_sync(&cl); 1407 for (i = 0; i < nodes; i++) {
1408 if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
1409 goto out_nocoalesce;
1410
1411 make_btree_freeing_key(r[i].b, keylist.top);
1412 bch_keylist_push(&keylist);
1413 }
1414
1415 bch_btree_insert_node(b, op, &keylist, NULL, NULL);
1416 BUG_ON(!bch_keylist_empty(&keylist));
1340 1417
1341 for (i = 0; i < nodes; i++) { 1418 for (i = 0; i < nodes; i++) {
1342 btree_node_free(r[i].b); 1419 btree_node_free(r[i].b);
@@ -1345,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1345 r[i].b = new_nodes[i]; 1422 r[i].b = new_nodes[i];
1346 } 1423 }
1347 1424
1348 bch_btree_insert_node(b, op, keylist, NULL, NULL);
1349 BUG_ON(!bch_keylist_empty(keylist));
1350
1351 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); 1425 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1352 r[nodes - 1].b = ERR_PTR(-EINTR); 1426 r[nodes - 1].b = ERR_PTR(-EINTR);
1353 1427
1354 trace_bcache_btree_gc_coalesce(nodes); 1428 trace_bcache_btree_gc_coalesce(nodes);
1355 gc->nodes--; 1429 gc->nodes--;
1356 1430
1431 bch_keylist_free(&keylist);
1432
1357 /* Invalidated our iterator */ 1433 /* Invalidated our iterator */
1358 return -EINTR; 1434 return -EINTR;
1359 1435
1360out_nocoalesce: 1436out_nocoalesce:
1361 closure_sync(&cl); 1437 closure_sync(&cl);
1438 bch_keylist_free(&keylist);
1362 1439
1363 while ((k = bch_keylist_pop(keylist))) 1440 while ((k = bch_keylist_pop(&keylist)))
1364 if (!bkey_cmp(k, &ZERO_KEY)) 1441 if (!bkey_cmp(k, &ZERO_KEY))
1365 atomic_dec(&b->c->prio_blocked); 1442 atomic_dec(&b->c->prio_blocked);
1366 1443
@@ -1372,6 +1449,42 @@ out_nocoalesce:
1372 return 0; 1449 return 0;
1373} 1450}
1374 1451
1452static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
1453 struct btree *replace)
1454{
1455 struct keylist keys;
1456 struct btree *n;
1457
1458 if (btree_check_reserve(b, NULL))
1459 return 0;
1460
1461 n = btree_node_alloc_replacement(replace, NULL);
1462
1463 /* recheck reserve after allocating replacement node */
1464 if (btree_check_reserve(b, NULL)) {
1465 btree_node_free(n);
1466 rw_unlock(true, n);
1467 return 0;
1468 }
1469
1470 bch_btree_node_write_sync(n);
1471
1472 bch_keylist_init(&keys);
1473 bch_keylist_add(&keys, &n->key);
1474
1475 make_btree_freeing_key(replace, keys.top);
1476 bch_keylist_push(&keys);
1477
1478 bch_btree_insert_node(b, op, &keys, NULL, NULL);
1479 BUG_ON(!bch_keylist_empty(&keys));
1480
1481 btree_node_free(replace);
1482 rw_unlock(true, n);
1483
1484 /* Invalidated our iterator */
1485 return -EINTR;
1486}
1487
1375static unsigned btree_gc_count_keys(struct btree *b) 1488static unsigned btree_gc_count_keys(struct btree *b)
1376{ 1489{
1377 struct bkey *k; 1490 struct bkey *k;
@@ -1387,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b)
1387static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1500static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1388 struct closure *writes, struct gc_stat *gc) 1501 struct closure *writes, struct gc_stat *gc)
1389{ 1502{
1390 unsigned i;
1391 int ret = 0; 1503 int ret = 0;
1392 bool should_rewrite; 1504 bool should_rewrite;
1393 struct btree *n;
1394 struct bkey *k; 1505 struct bkey *k;
1395 struct keylist keys;
1396 struct btree_iter iter; 1506 struct btree_iter iter;
1397 struct gc_merge_info r[GC_MERGE_NODES]; 1507 struct gc_merge_info r[GC_MERGE_NODES];
1398 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1508 struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
1399 1509
1400 bch_keylist_init(&keys);
1401 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 1510 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1402 1511
1403 for (i = 0; i < GC_MERGE_NODES; i++) 1512 for (i = r; i < r + ARRAY_SIZE(r); i++)
1404 r[i].b = ERR_PTR(-EINTR); 1513 i->b = ERR_PTR(-EINTR);
1405 1514
1406 while (1) { 1515 while (1) {
1407 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1516 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1408 if (k) { 1517 if (k) {
1409 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1518 r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
1519 true);
1410 if (IS_ERR(r->b)) { 1520 if (IS_ERR(r->b)) {
1411 ret = PTR_ERR(r->b); 1521 ret = PTR_ERR(r->b);
1412 break; 1522 break;
@@ -1414,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1414 1524
1415 r->keys = btree_gc_count_keys(r->b); 1525 r->keys = btree_gc_count_keys(r->b);
1416 1526
1417 ret = btree_gc_coalesce(b, op, &keys, gc, r); 1527 ret = btree_gc_coalesce(b, op, gc, r);
1418 if (ret) 1528 if (ret)
1419 break; 1529 break;
1420 } 1530 }
@@ -1424,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1424 1534
1425 if (!IS_ERR(last->b)) { 1535 if (!IS_ERR(last->b)) {
1426 should_rewrite = btree_gc_mark_node(last->b, gc); 1536 should_rewrite = btree_gc_mark_node(last->b, gc);
1427 if (should_rewrite && 1537 if (should_rewrite) {
1428 !btree_check_reserve(b, NULL)) { 1538 ret = btree_gc_rewrite_node(b, op, last->b);
1429 n = btree_node_alloc_replacement(last->b, 1539 if (ret)
1430 false);
1431
1432 if (!IS_ERR_OR_NULL(n)) {
1433 bch_btree_node_write_sync(n);
1434 bch_keylist_add(&keys, &n->key);
1435
1436 make_btree_freeing_key(last->b,
1437 keys.top);
1438 bch_keylist_push(&keys);
1439
1440 btree_node_free(last->b);
1441
1442 bch_btree_insert_node(b, op, &keys,
1443 NULL, NULL);
1444 BUG_ON(!bch_keylist_empty(&keys));
1445
1446 rw_unlock(true, last->b);
1447 last->b = n;
1448
1449 /* Invalidated our iterator */
1450 ret = -EINTR;
1451 break; 1540 break;
1452 }
1453 } 1541 }
1454 1542
1455 if (last->b->level) { 1543 if (last->b->level) {
@@ -1464,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1464 * Must flush leaf nodes before gc ends, since replace 1552 * Must flush leaf nodes before gc ends, since replace
1465 * operations aren't journalled 1553 * operations aren't journalled
1466 */ 1554 */
1555 mutex_lock(&last->b->write_lock);
1467 if (btree_node_dirty(last->b)) 1556 if (btree_node_dirty(last->b))
1468 bch_btree_node_write(last->b, writes); 1557 bch_btree_node_write(last->b, writes);
1558 mutex_unlock(&last->b->write_lock);
1469 rw_unlock(true, last->b); 1559 rw_unlock(true, last->b);
1470 } 1560 }
1471 1561
@@ -1478,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1478 } 1568 }
1479 } 1569 }
1480 1570
1481 for (i = 0; i < GC_MERGE_NODES; i++) 1571 for (i = r; i < r + ARRAY_SIZE(r); i++)
1482 if (!IS_ERR_OR_NULL(r[i].b)) { 1572 if (!IS_ERR_OR_NULL(i->b)) {
1483 if (btree_node_dirty(r[i].b)) 1573 mutex_lock(&i->b->write_lock);
1484 bch_btree_node_write(r[i].b, writes); 1574 if (btree_node_dirty(i->b))
1485 rw_unlock(true, r[i].b); 1575 bch_btree_node_write(i->b, writes);
1576 mutex_unlock(&i->b->write_lock);
1577 rw_unlock(true, i->b);
1486 } 1578 }
1487 1579
1488 bch_keylist_free(&keys);
1489
1490 return ret; 1580 return ret;
1491} 1581}
1492 1582
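The btree_gc_recurse() hunks above replace the index-based loops over r[GC_MERGE_NODES] with pointer iteration bounded by ARRAY_SIZE(). A minimal user-space sketch of that idiom follows; the struct and field names are illustrative stand-ins, not the bcache ones.

#include <stdio.h>
#include <stddef.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

struct merge_info {                     /* stand-in for per-node GC state */
	int keys;
};

int main(void)
{
	struct merge_info r[4];
	struct merge_info *i, *last = r + ARRAY_SIZE(r) - 1;

	/* initialise every slot by walking pointers, not indices */
	for (i = r; i < r + ARRAY_SIZE(r); i++)
		i->keys = 0;

	last->keys = 42;

	for (i = r; i < r + ARRAY_SIZE(r); i++)
		printf("slot %td: %d keys\n", i - r, i->keys);

	return 0;
}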
@@ -1499,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1499 1589
1500 should_rewrite = btree_gc_mark_node(b, gc); 1590 should_rewrite = btree_gc_mark_node(b, gc);
1501 if (should_rewrite) { 1591 if (should_rewrite) {
1502 n = btree_node_alloc_replacement(b, false); 1592 n = btree_node_alloc_replacement(b, NULL);
1503 1593
1504 if (!IS_ERR_OR_NULL(n)) { 1594 if (!IS_ERR_OR_NULL(n)) {
1505 bch_btree_node_write_sync(n); 1595 bch_btree_node_write_sync(n);
1596
1506 bch_btree_set_root(n); 1597 bch_btree_set_root(n);
1507 btree_node_free(b); 1598 btree_node_free(b);
1508 rw_unlock(true, n); 1599 rw_unlock(true, n);
@@ -1511,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1511 } 1602 }
1512 } 1603 }
1513 1604
1605 __bch_btree_mark_key(b->c, b->level + 1, &b->key);
1606
1514 if (b->level) { 1607 if (b->level) {
1515 ret = btree_gc_recurse(b, op, writes, gc); 1608 ret = btree_gc_recurse(b, op, writes, gc);
1516 if (ret) 1609 if (ret)
@@ -1538,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c)
1538 1631
1539 for_each_cache(ca, c, i) 1632 for_each_cache(ca, c, i)
1540 for_each_bucket(b, ca) { 1633 for_each_bucket(b, ca) {
1541 b->gc_gen = b->gen; 1634 b->last_gc = b->gen;
1542 if (!atomic_read(&b->pin)) { 1635 if (!atomic_read(&b->pin)) {
1543 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1636 SET_GC_MARK(b, 0);
1544 SET_GC_SECTORS_USED(b, 0); 1637 SET_GC_SECTORS_USED(b, 0);
1545 } 1638 }
1546 } 1639 }
@@ -1548,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
1548 mutex_unlock(&c->bucket_lock); 1641 mutex_unlock(&c->bucket_lock);
1549} 1642}
1550 1643
1551size_t bch_btree_gc_finish(struct cache_set *c) 1644static size_t bch_btree_gc_finish(struct cache_set *c)
1552{ 1645{
1553 size_t available = 0; 1646 size_t available = 0;
1554 struct bucket *b; 1647 struct bucket *b;
@@ -1561,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1561 c->gc_mark_valid = 1; 1654 c->gc_mark_valid = 1;
1562 c->need_gc = 0; 1655 c->need_gc = 0;
1563 1656
1564 if (c->root)
1565 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1566 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1567 GC_MARK_METADATA);
1568
1569 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1657 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1570 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1658 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1571 GC_MARK_METADATA); 1659 GC_MARK_METADATA);
@@ -1605,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1605 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1693 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1606 1694
1607 for_each_bucket(b, ca) { 1695 for_each_bucket(b, ca) {
1608 b->last_gc = b->gc_gen;
1609 c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1696 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1610 1697
1611 if (!atomic_read(&b->pin) && 1698 if (atomic_read(&b->pin))
1612 GC_MARK(b) == GC_MARK_RECLAIMABLE) { 1699 continue;
1700
1701 BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1702
1703 if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
1613 available++; 1704 available++;
1614 if (!GC_SECTORS_USED(b))
1615 bch_bucket_add_unused(ca, b);
1616 }
1617 } 1705 }
1618 } 1706 }
1619 1707
@@ -1705,36 +1793,16 @@ int bch_gc_thread_start(struct cache_set *c)
1705 1793
1706/* Initial partial gc */ 1794/* Initial partial gc */
1707 1795
1708static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1796static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
1709 unsigned long **seen)
1710{ 1797{
1711 int ret = 0; 1798 int ret = 0;
1712 unsigned i;
1713 struct bkey *k, *p = NULL; 1799 struct bkey *k, *p = NULL;
1714 struct bucket *g;
1715 struct btree_iter iter; 1800 struct btree_iter iter;
1716 1801
1717 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { 1802 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1718 for (i = 0; i < KEY_PTRS(k); i++) { 1803 bch_initial_mark_key(b->c, b->level, k);
1719 if (!ptr_available(b->c, k, i))
1720 continue;
1721
1722 g = PTR_BUCKET(b->c, k, i);
1723 1804
1724 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), 1805 bch_initial_mark_key(b->c, b->level + 1, &b->key);
1725 seen[PTR_DEV(k, i)]) ||
1726 !ptr_stale(b->c, k, i)) {
1727 g->gen = PTR_GEN(k, i);
1728
1729 if (b->level)
1730 g->prio = BTREE_PRIO;
1731 else if (g->prio == BTREE_PRIO)
1732 g->prio = INITIAL_PRIO;
1733 }
1734 }
1735
1736 btree_mark_key(b, k);
1737 }
1738 1806
1739 if (b->level) { 1807 if (b->level) {
1740 bch_btree_iter_init(&b->keys, &iter, NULL); 1808 bch_btree_iter_init(&b->keys, &iter, NULL);
@@ -1746,40 +1814,58 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1746 btree_node_prefetch(b->c, k, b->level - 1); 1814 btree_node_prefetch(b->c, k, b->level - 1);
1747 1815
1748 if (p) 1816 if (p)
1749 ret = btree(check_recurse, p, b, op, seen); 1817 ret = btree(check_recurse, p, b, op);
1750 1818
1751 p = k; 1819 p = k;
1752 } while (p && !ret); 1820 } while (p && !ret);
1753 } 1821 }
1754 1822
1755 return 0; 1823 return ret;
1756} 1824}
1757 1825
1758int bch_btree_check(struct cache_set *c) 1826int bch_btree_check(struct cache_set *c)
1759{ 1827{
1760 int ret = -ENOMEM;
1761 unsigned i;
1762 unsigned long *seen[MAX_CACHES_PER_SET];
1763 struct btree_op op; 1828 struct btree_op op;
1764 1829
1765 memset(seen, 0, sizeof(seen));
1766 bch_btree_op_init(&op, SHRT_MAX); 1830 bch_btree_op_init(&op, SHRT_MAX);
1767 1831
1768 for (i = 0; c->cache[i]; i++) { 1832 return btree_root(check_recurse, c, &op);
1769 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1833}
1770 seen[i] = kmalloc(n, GFP_KERNEL); 1834
1771 if (!seen[i]) 1835void bch_initial_gc_finish(struct cache_set *c)
1772 goto err; 1836{
1837 struct cache *ca;
1838 struct bucket *b;
1839 unsigned i;
1840
1841 bch_btree_gc_finish(c);
1773 1842
1774 /* Disables the seen array until prio_read() uses it too */ 1843 mutex_lock(&c->bucket_lock);
1775 memset(seen[i], 0xFF, n); 1844
1845 /*
1846 * We need to put some unused buckets directly on the prio freelist in
1847 * order to get the allocator thread started - it needs freed buckets in
1848 * order to rewrite the prios and gens, and it needs to rewrite prios
1849 * and gens in order to free buckets.
1850 *
1851 * This is only safe for buckets that have no live data in them, which
1852 * there should always be some of.
1853 */
1854 for_each_cache(ca, c, i) {
1855 for_each_bucket(b, ca) {
1856 if (fifo_full(&ca->free[RESERVE_PRIO]))
1857 break;
1858
1859 if (bch_can_invalidate_bucket(ca, b) &&
1860 !GC_MARK(b)) {
1861 __bch_invalidate_one_bucket(ca, b);
1862 fifo_push(&ca->free[RESERVE_PRIO],
1863 b - ca->buckets);
1864 }
1865 }
1776 } 1866 }
1777 1867
1778 ret = btree_root(check_recurse, c, &op, seen); 1868 mutex_unlock(&c->bucket_lock);
1779err:
1780 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1781 kfree(seen[i]);
1782 return ret;
1783} 1869}
1784 1870
1785/* Btree insertion */ 1871/* Btree insertion */
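The new bch_initial_gc_finish() above seeds the RESERVE_PRIO freelist with buckets that hold no live data so the allocator thread can start rewriting prios and gens. A rough user-space model of that bootstrap is sketched below; the fixed-size FIFO and the can_invalidate() test are simplified stand-ins for the real bucket accounting.

#include <stdbool.h>
#include <stdio.h>

#define NBUCKETS	16
#define PRIO_RESERVE	4

struct bucket {
	unsigned mark;		/* 0 means "no live data" in this model */
	int pin;
};

static size_t fifo[PRIO_RESERVE];
static unsigned fifo_used;

static bool can_invalidate(const struct bucket *b)
{
	return !b->pin && !b->mark;	/* unpinned and holding nothing live */
}

int main(void)
{
	struct bucket buckets[NBUCKETS] = {
		[0] = { .mark = 1 },	/* a couple of in-use buckets */
		[3] = { .pin = 1 },
	};
	size_t i;

	/* push free buckets onto the priority reserve until it is full */
	for (i = 0; i < NBUCKETS && fifo_used < PRIO_RESERVE; i++)
		if (can_invalidate(&buckets[i]))
			fifo[fifo_used++] = i;

	for (i = 0; i < fifo_used; i++)
		printf("reserved bucket %zu\n", fifo[i]);

	return 0;
}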
@@ -1871,11 +1957,14 @@ static int btree_split(struct btree *b, struct btree_op *op,
1871 closure_init_stack(&cl); 1957 closure_init_stack(&cl);
1872 bch_keylist_init(&parent_keys); 1958 bch_keylist_init(&parent_keys);
1873 1959
1874 if (!b->level && 1960 if (btree_check_reserve(b, op)) {
1875 btree_check_reserve(b, op)) 1961 if (!b->level)
1876 return -EINTR; 1962 return -EINTR;
1963 else
1964 WARN(1, "insufficient reserve for split\n");
1965 }
1877 1966
1878 n1 = btree_node_alloc_replacement(b, true); 1967 n1 = btree_node_alloc_replacement(b, op);
1879 if (IS_ERR(n1)) 1968 if (IS_ERR(n1))
1880 goto err; 1969 goto err;
1881 1970
@@ -1887,16 +1976,19 @@ static int btree_split(struct btree *b, struct btree_op *op,
1887 1976
1888 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); 1977 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
1889 1978
1890 n2 = bch_btree_node_alloc(b->c, b->level, true); 1979 n2 = bch_btree_node_alloc(b->c, op, b->level);
1891 if (IS_ERR(n2)) 1980 if (IS_ERR(n2))
1892 goto err_free1; 1981 goto err_free1;
1893 1982
1894 if (!b->parent) { 1983 if (!b->parent) {
1895 n3 = bch_btree_node_alloc(b->c, b->level + 1, true); 1984 n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
1896 if (IS_ERR(n3)) 1985 if (IS_ERR(n3))
1897 goto err_free2; 1986 goto err_free2;
1898 } 1987 }
1899 1988
1989 mutex_lock(&n1->write_lock);
1990 mutex_lock(&n2->write_lock);
1991
1900 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1992 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
1901 1993
1902 /* 1994 /*
@@ -1923,45 +2015,45 @@ static int btree_split(struct btree *b, struct btree_op *op,
1923 2015
1924 bch_keylist_add(&parent_keys, &n2->key); 2016 bch_keylist_add(&parent_keys, &n2->key);
1925 bch_btree_node_write(n2, &cl); 2017 bch_btree_node_write(n2, &cl);
2018 mutex_unlock(&n2->write_lock);
1926 rw_unlock(true, n2); 2019 rw_unlock(true, n2);
1927 } else { 2020 } else {
1928 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); 2021 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
1929 2022
2023 mutex_lock(&n1->write_lock);
1930 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 2024 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
1931 } 2025 }
1932 2026
1933 bch_keylist_add(&parent_keys, &n1->key); 2027 bch_keylist_add(&parent_keys, &n1->key);
1934 bch_btree_node_write(n1, &cl); 2028 bch_btree_node_write(n1, &cl);
2029 mutex_unlock(&n1->write_lock);
1935 2030
1936 if (n3) { 2031 if (n3) {
1937 /* Depth increases, make a new root */ 2032 /* Depth increases, make a new root */
2033 mutex_lock(&n3->write_lock);
1938 bkey_copy_key(&n3->key, &MAX_KEY); 2034 bkey_copy_key(&n3->key, &MAX_KEY);
1939 bch_btree_insert_keys(n3, op, &parent_keys, NULL); 2035 bch_btree_insert_keys(n3, op, &parent_keys, NULL);
1940 bch_btree_node_write(n3, &cl); 2036 bch_btree_node_write(n3, &cl);
2037 mutex_unlock(&n3->write_lock);
1941 2038
1942 closure_sync(&cl); 2039 closure_sync(&cl);
1943 bch_btree_set_root(n3); 2040 bch_btree_set_root(n3);
1944 rw_unlock(true, n3); 2041 rw_unlock(true, n3);
1945
1946 btree_node_free(b);
1947 } else if (!b->parent) { 2042 } else if (!b->parent) {
1948 /* Root filled up but didn't need to be split */ 2043 /* Root filled up but didn't need to be split */
1949 closure_sync(&cl); 2044 closure_sync(&cl);
1950 bch_btree_set_root(n1); 2045 bch_btree_set_root(n1);
1951
1952 btree_node_free(b);
1953 } else { 2046 } else {
1954 /* Split a non root node */ 2047 /* Split a non root node */
1955 closure_sync(&cl); 2048 closure_sync(&cl);
1956 make_btree_freeing_key(b, parent_keys.top); 2049 make_btree_freeing_key(b, parent_keys.top);
1957 bch_keylist_push(&parent_keys); 2050 bch_keylist_push(&parent_keys);
1958 2051
1959 btree_node_free(b);
1960
1961 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); 2052 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
1962 BUG_ON(!bch_keylist_empty(&parent_keys)); 2053 BUG_ON(!bch_keylist_empty(&parent_keys));
1963 } 2054 }
1964 2055
2056 btree_node_free(b);
1965 rw_unlock(true, n1); 2057 rw_unlock(true, n1);
1966 2058
1967 bch_time_stats_update(&b->c->btree_split_time, start_time); 2059 bch_time_stats_update(&b->c->btree_split_time, start_time);
@@ -1976,7 +2068,7 @@ err_free1:
1976 btree_node_free(n1); 2068 btree_node_free(n1);
1977 rw_unlock(true, n1); 2069 rw_unlock(true, n1);
1978err: 2070err:
1979 WARN(1, "bcache: btree split failed"); 2071 WARN(1, "bcache: btree split failed (level %u)", b->level);
1980 2072
1981 if (n3 == ERR_PTR(-EAGAIN) || 2073 if (n3 == ERR_PTR(-EAGAIN) ||
1982 n2 == ERR_PTR(-EAGAIN) || 2074 n2 == ERR_PTR(-EAGAIN) ||
@@ -1991,33 +2083,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
1991 atomic_t *journal_ref, 2083 atomic_t *journal_ref,
1992 struct bkey *replace_key) 2084 struct bkey *replace_key)
1993{ 2085{
2086 struct closure cl;
2087
1994 BUG_ON(b->level && replace_key); 2088 BUG_ON(b->level && replace_key);
1995 2089
2090 closure_init_stack(&cl);
2091
2092 mutex_lock(&b->write_lock);
2093
2094 if (write_block(b) != btree_bset_last(b) &&
2095 b->keys.last_set_unwritten)
2096 bch_btree_init_next(b); /* just wrote a set */
2097
1996 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { 2098 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
1997 if (current->bio_list) { 2099 mutex_unlock(&b->write_lock);
1998 op->lock = b->c->root->level + 1; 2100 goto split;
1999 return -EAGAIN; 2101 }
2000 } else if (op->lock <= b->c->root->level) {
2001 op->lock = b->c->root->level + 1;
2002 return -EINTR;
2003 } else {
2004 /* Invalidated all iterators */
2005 int ret = btree_split(b, op, insert_keys, replace_key);
2006 2102
2007 return bch_keylist_empty(insert_keys) ? 2103 BUG_ON(write_block(b) != btree_bset_last(b));
2008 0 : ret ?: -EINTR;
2009 }
2010 } else {
2011 BUG_ON(write_block(b) != btree_bset_last(b));
2012 2104
2013 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2105 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2014 if (!b->level) 2106 if (!b->level)
2015 bch_btree_leaf_dirty(b, journal_ref); 2107 bch_btree_leaf_dirty(b, journal_ref);
2016 else 2108 else
2017 bch_btree_node_write_sync(b); 2109 bch_btree_node_write(b, &cl);
2018 } 2110 }
2019 2111
2020 return 0; 2112 mutex_unlock(&b->write_lock);
2113
2114 /* wait for btree node write if necessary, after unlock */
2115 closure_sync(&cl);
2116
2117 return 0;
2118split:
2119 if (current->bio_list) {
2120 op->lock = b->c->root->level + 1;
2121 return -EAGAIN;
2122 } else if (op->lock <= b->c->root->level) {
2123 op->lock = b->c->root->level + 1;
2124 return -EINTR;
2125 } else {
2126 /* Invalidated all iterators */
2127 int ret = btree_split(b, op, insert_keys, replace_key);
2128
2129 if (bch_keylist_empty(insert_keys))
2130 return 0;
2131 else if (!ret)
2132 return -EINTR;
2133 return ret;
2021 } 2134 }
2022} 2135}
2023 2136
@@ -2403,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf)
2403 spin_lock_init(&buf->lock); 2516 spin_lock_init(&buf->lock);
2404 array_allocator_init(&buf->freelist); 2517 array_allocator_init(&buf->freelist);
2405} 2518}
2406
2407void bch_btree_exit(void)
2408{
2409 if (btree_io_wq)
2410 destroy_workqueue(btree_io_wq);
2411}
2412
2413int __init bch_btree_init(void)
2414{
2415 btree_io_wq = create_singlethread_workqueue("bch_btree_io");
2416 if (!btree_io_wq)
2417 return -ENOMEM;
2418
2419 return 0;
2420}
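The reworked bch_btree_insert_node() above takes the node's write_lock, does the in-place insert when the keys fit, and only falls out to the split path when they do not; any node write it starts is synced after the lock is dropped. The pthread sketch below models just that fits/does-not-fit decision under a per-node mutex; the node layout, capacity, and return convention are simplifications, not the bcache structures.

#include <pthread.h>
#include <stdio.h>

#define NODE_CAPACITY 8

struct node {
	pthread_mutex_t write_lock;
	int nkeys;
};

/* 0 = inserted in place, 1 = caller must split and retry */
static int node_insert(struct node *n, int keys_to_add)
{
	int ret = 0;

	pthread_mutex_lock(&n->write_lock);
	if (n->nkeys + keys_to_add > NODE_CAPACITY)
		ret = 1;			/* does not fit: split path */
	else
		n->nkeys += keys_to_add;	/* fast path, done under the lock */
	pthread_mutex_unlock(&n->write_lock);

	return ret;
}

int main(void)
{
	struct node n = { .write_lock = PTHREAD_MUTEX_INITIALIZER, .nkeys = 6 };

	printf("insert 1 key  -> %s\n", node_insert(&n, 1) ? "split" : "ok");
	printf("insert 4 keys -> %s\n", node_insert(&n, 4) ? "split" : "ok");
	return 0;
}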
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af065e97e55c..91dfa5e69685 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -127,6 +127,8 @@ struct btree {
127 struct cache_set *c; 127 struct cache_set *c;
128 struct btree *parent; 128 struct btree *parent;
129 129
130 struct mutex write_lock;
131
130 unsigned long flags; 132 unsigned long flags;
131 uint16_t written; /* would be nice to kill */ 133 uint16_t written; /* would be nice to kill */
132 uint8_t level; 134 uint8_t level;
@@ -236,11 +238,13 @@ static inline void rw_unlock(bool w, struct btree *b)
236} 238}
237 239
238void bch_btree_node_read_done(struct btree *); 240void bch_btree_node_read_done(struct btree *);
241void __bch_btree_node_write(struct btree *, struct closure *);
239void bch_btree_node_write(struct btree *, struct closure *); 242void bch_btree_node_write(struct btree *, struct closure *);
240 243
241void bch_btree_set_root(struct btree *); 244void bch_btree_set_root(struct btree *);
242struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); 245struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
243struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); 246struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
247 struct bkey *, int, bool);
244 248
245int bch_btree_insert_check_key(struct btree *, struct btree_op *, 249int bch_btree_insert_check_key(struct btree *, struct btree_op *,
246 struct bkey *); 250 struct bkey *);
@@ -248,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
248 atomic_t *, struct bkey *); 252 atomic_t *, struct bkey *);
249 253
250int bch_gc_thread_start(struct cache_set *); 254int bch_gc_thread_start(struct cache_set *);
251size_t bch_btree_gc_finish(struct cache_set *); 255void bch_initial_gc_finish(struct cache_set *);
252void bch_moving_gc(struct cache_set *); 256void bch_moving_gc(struct cache_set *);
253int bch_btree_check(struct cache_set *); 257int bch_btree_check(struct cache_set *);
254uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 258void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
255 259
256static inline void wake_up_gc(struct cache_set *c) 260static inline void wake_up_gc(struct cache_set *c)
257{ 261{
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 416d1a3e028e..3a0de4cf9771 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -194,9 +194,9 @@ err:
194 mutex_unlock(&b->c->bucket_lock); 194 mutex_unlock(&b->c->bucket_lock);
195 bch_extent_to_text(buf, sizeof(buf), k); 195 bch_extent_to_text(buf, sizeof(buf), k);
196 btree_bug(b, 196 btree_bug(b,
197"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 197"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
198 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), 198 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
199 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 199 g->prio, g->gen, g->last_gc, GC_MARK(g));
200 return true; 200 return true;
201} 201}
202 202
@@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
308 return NULL; 308 return NULL;
309} 309}
310 310
311static void bch_subtract_dirty(struct bkey *k,
312 struct cache_set *c,
313 uint64_t offset,
314 int sectors)
315{
316 if (KEY_DIRTY(k))
317 bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
318 offset, -sectors);
319}
320
311static bool bch_extent_insert_fixup(struct btree_keys *b, 321static bool bch_extent_insert_fixup(struct btree_keys *b,
312 struct bkey *insert, 322 struct bkey *insert,
313 struct btree_iter *iter, 323 struct btree_iter *iter,
@@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
315{ 325{
316 struct cache_set *c = container_of(b, struct btree, keys)->c; 326 struct cache_set *c = container_of(b, struct btree, keys)->c;
317 327
318 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
319 {
320 if (KEY_DIRTY(k))
321 bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
322 offset, -sectors);
323 }
324
325 uint64_t old_offset; 328 uint64_t old_offset;
326 unsigned old_size, sectors_found = 0; 329 unsigned old_size, sectors_found = 0;
327 330
@@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
398 401
399 struct bkey *top; 402 struct bkey *top;
400 403
401 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 404 bch_subtract_dirty(k, c, KEY_START(insert),
405 KEY_SIZE(insert));
402 406
403 if (bkey_written(b, k)) { 407 if (bkey_written(b, k)) {
404 /* 408 /*
@@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
448 } 452 }
449 } 453 }
450 454
451 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 455 bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
452 } 456 }
453 457
454check_failed: 458check_failed:
@@ -499,9 +503,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
499 503
500 if (mutex_trylock(&b->c->bucket_lock)) { 504 if (mutex_trylock(&b->c->bucket_lock)) {
501 if (b->c->gc_mark_valid && 505 if (b->c->gc_mark_valid &&
502 ((GC_MARK(g) != GC_MARK_DIRTY && 506 (!GC_MARK(g) ||
503 KEY_DIRTY(k)) || 507 GC_MARK(g) == GC_MARK_METADATA ||
504 GC_MARK(g) == GC_MARK_METADATA)) 508 (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
505 goto err; 509 goto err;
506 510
507 if (g->prio == BTREE_PRIO) 511 if (g->prio == BTREE_PRIO)
@@ -515,9 +519,9 @@ err:
515 mutex_unlock(&b->c->bucket_lock); 519 mutex_unlock(&b->c->bucket_lock);
516 bch_extent_to_text(buf, sizeof(buf), k); 520 bch_extent_to_text(buf, sizeof(buf), k);
517 btree_bug(b, 521 btree_bug(b,
518"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 522"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
519 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), 523 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
520 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 524 g->prio, g->gen, g->last_gc, GC_MARK(g));
521 return true; 525 return true;
522} 526}
523 527
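The extents.c hunks above convert the GCC-specific nested function subtract_dirty() into the top-level helper bch_subtract_dirty(), passing the cache_set it used to capture as an explicit parameter. The same refactor in portable C looks roughly like this; the struct and field names below are invented for the example.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct cache_ctx {
	int64_t dirty_sectors;
};

struct extent {
	bool dirty;
};

/* was a nested function capturing its context; now it takes it explicitly */
static void subtract_dirty(const struct extent *e, struct cache_ctx *ctx,
			   int sectors)
{
	if (e->dirty)
		ctx->dirty_sectors -= sectors;
}

int main(void)
{
	struct cache_ctx ctx = { .dirty_sectors = 100 };
	struct extent e = { .dirty = true };

	subtract_dirty(&e, &ctx, 25);
	printf("dirty sectors now: %lld\n", (long long)ctx.dirty_sectors);
	return 0;
}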
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18039affc306..59e82021b5bb 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -237,8 +237,14 @@ bsearch:
237 for (i = 0; i < ca->sb.njournal_buckets; i++) 237 for (i = 0; i < ca->sb.njournal_buckets; i++)
238 if (ja->seq[i] > seq) { 238 if (ja->seq[i] > seq) {
239 seq = ja->seq[i]; 239 seq = ja->seq[i];
240 ja->cur_idx = ja->discard_idx = 240 /*
241 ja->last_idx = i; 241 * When journal_reclaim() goes to allocate for
242 * the first time, it'll use the bucket after
243 * ja->cur_idx
244 */
245 ja->cur_idx = i;
246 ja->last_idx = ja->discard_idx = (i + 1) %
247 ca->sb.njournal_buckets;
242 248
243 } 249 }
244 } 250 }
@@ -288,16 +294,11 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
288 k = bkey_next(k)) { 294 k = bkey_next(k)) {
289 unsigned j; 295 unsigned j;
290 296
291 for (j = 0; j < KEY_PTRS(k); j++) { 297 for (j = 0; j < KEY_PTRS(k); j++)
292 struct bucket *g = PTR_BUCKET(c, k, j); 298 if (ptr_available(c, k, j))
293 atomic_inc(&g->pin); 299 atomic_inc(&PTR_BUCKET(c, k, j)->pin);
294 300
295 if (g->prio == BTREE_PRIO && 301 bch_initial_mark_key(c, 0, k);
296 !ptr_stale(c, k, j))
297 g->prio = INITIAL_PRIO;
298 }
299
300 __bch_btree_mark_key(c, 0, k);
301 } 302 }
302 } 303 }
303} 304}
@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
312 uint64_t start = i->j.last_seq, end = i->j.seq, n = start; 313 uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
313 struct keylist keylist; 314 struct keylist keylist;
314 315
315 bch_keylist_init(&keylist);
316
317 list_for_each_entry(i, list, list) { 316 list_for_each_entry(i, list, list) {
318 BUG_ON(i->pin && atomic_read(i->pin) != 1); 317 BUG_ON(i->pin && atomic_read(i->pin) != 1);
319 318
@@ -326,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
326 k = bkey_next(k)) { 325 k = bkey_next(k)) {
327 trace_bcache_journal_replay_key(k); 326 trace_bcache_journal_replay_key(k);
328 327
329 bkey_copy(keylist.top, k); 328 bch_keylist_init_single(&keylist, k);
330 bch_keylist_push(&keylist);
331 329
332 ret = bch_btree_insert(s, &keylist, i->pin, NULL); 330 ret = bch_btree_insert(s, &keylist, i->pin, NULL);
333 if (ret) 331 if (ret)
@@ -383,16 +381,15 @@ retry:
383 381
384 b = best; 382 b = best;
385 if (b) { 383 if (b) {
386 rw_lock(true, b, b->level); 384 mutex_lock(&b->write_lock);
387
388 if (!btree_current_write(b)->journal) { 385 if (!btree_current_write(b)->journal) {
389 rw_unlock(true, b); 386 mutex_unlock(&b->write_lock);
390 /* We raced */ 387 /* We raced */
391 goto retry; 388 goto retry;
392 } 389 }
393 390
394 bch_btree_node_write(b, NULL); 391 __bch_btree_node_write(b, NULL);
395 rw_unlock(true, b); 392 mutex_unlock(&b->write_lock);
396 } 393 }
397} 394}
398 395
@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j)
536 atomic_set(&fifo_back(&j->pin), 1); 533 atomic_set(&fifo_back(&j->pin), 1);
537 534
538 j->cur->data->seq = ++j->seq; 535 j->cur->data->seq = ++j->seq;
536 j->cur->dirty = false;
539 j->cur->need_write = false; 537 j->cur->need_write = false;
540 j->cur->data->keys = 0; 538 j->cur->data->keys = 0;
541 539
@@ -731,7 +729,10 @@ static void journal_write_work(struct work_struct *work)
731 struct cache_set, 729 struct cache_set,
732 journal.work); 730 journal.work);
733 spin_lock(&c->journal.lock); 731 spin_lock(&c->journal.lock);
734 journal_try_write(c); 732 if (c->journal.cur->dirty)
733 journal_try_write(c);
734 else
735 spin_unlock(&c->journal.lock);
735} 736}
736 737
737/* 738/*
@@ -761,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c,
761 if (parent) { 762 if (parent) {
762 closure_wait(&w->wait, parent); 763 closure_wait(&w->wait, parent);
763 journal_try_write(c); 764 journal_try_write(c);
764 } else if (!w->need_write) { 765 } else if (!w->dirty) {
766 w->dirty = true;
765 schedule_delayed_work(&c->journal.work, 767 schedule_delayed_work(&c->journal.work,
766 msecs_to_jiffies(c->journal_delay_ms)); 768 msecs_to_jiffies(c->journal_delay_ms));
767 spin_unlock(&c->journal.lock); 769 spin_unlock(&c->journal.lock);
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 9180c4465075..e3c39457afbb 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -95,6 +95,7 @@ struct journal_write {
95 95
96 struct cache_set *c; 96 struct cache_set *c;
97 struct closure_waitlist wait; 97 struct closure_waitlist wait;
98 bool dirty;
98 bool need_write; 99 bool need_write;
99}; 100};
100 101
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9eb60d102de8..cd7490311e51 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
24 moving_gc_keys); 24 moving_gc_keys);
25 unsigned i; 25 unsigned i;
26 26
27 for (i = 0; i < KEY_PTRS(k); i++) { 27 for (i = 0; i < KEY_PTRS(k); i++)
28 struct bucket *g = PTR_BUCKET(c, k, i); 28 if (ptr_available(c, k, i) &&
29 29 GC_MOVE(PTR_BUCKET(c, k, i)))
30 if (GC_MOVE(g))
31 return true; 30 return true;
32 }
33 31
34 return false; 32 return false;
35} 33}
@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl)
115 closure_call(&op->cl, bch_data_insert, NULL, cl); 113 closure_call(&op->cl, bch_data_insert, NULL, cl);
116 } 114 }
117 115
118 continue_at(cl, write_moving_finish, system_wq); 116 continue_at(cl, write_moving_finish, op->wq);
119} 117}
120 118
121static void read_moving_submit(struct closure *cl) 119static void read_moving_submit(struct closure *cl)
@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl)
125 123
126 bch_submit_bbio(bio, io->op.c, &io->w->key, 0); 124 bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
127 125
128 continue_at(cl, write_moving, system_wq); 126 continue_at(cl, write_moving, io->op.wq);
129} 127}
130 128
131static void read_moving(struct cache_set *c) 129static void read_moving(struct cache_set *c)
@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c)
160 io->w = w; 158 io->w = w;
161 io->op.inode = KEY_INODE(&w->key); 159 io->op.inode = KEY_INODE(&w->key);
162 io->op.c = c; 160 io->op.c = c;
161 io->op.wq = c->moving_gc_wq;
163 162
164 moving_init(io); 163 moving_init(io);
165 bio = &io->bio.bio; 164 bio = &io->bio.bio;
@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c)
216 ca->heap.used = 0; 215 ca->heap.used = 0;
217 216
218 for_each_bucket(b, ca) { 217 for_each_bucket(b, ca) {
219 if (!GC_SECTORS_USED(b)) 218 if (GC_MARK(b) == GC_MARK_METADATA ||
219 !GC_SECTORS_USED(b) ||
220 GC_SECTORS_USED(b) == ca->sb.bucket_size ||
221 atomic_read(&b->pin))
220 continue; 222 continue;
221 223
222 if (!heap_full(&ca->heap)) { 224 if (!heap_full(&ca->heap)) {
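bch_moving_gc() above now skips buckets that are metadata, empty, completely full, or pinned before considering them for copying. A small model of that candidate filter follows; the bucket fields and bucket size are invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define BUCKET_SECTORS 64

enum mark { MARK_DATA, MARK_METADATA };

struct bucket {
	enum mark mark;
	unsigned sectors_used;
	int pin;
};

static bool moving_gc_candidate(const struct bucket *b)
{
	if (b->mark == MARK_METADATA)		/* never move metadata */
		return false;
	if (!b->sectors_used)			/* nothing to copy */
		return false;
	if (b->sectors_used == BUCKET_SECTORS)	/* full: moving it frees nothing */
		return false;
	if (b->pin)				/* in use right now */
		return false;
	return true;
}

int main(void)
{
	struct bucket buckets[] = {
		{ MARK_DATA, 10, 0 },		/* good candidate */
		{ MARK_METADATA, 10, 0 },
		{ MARK_DATA, 0, 0 },
		{ MARK_DATA, BUCKET_SECTORS, 0 },
		{ MARK_DATA, 20, 1 },
	};
	size_t i;

	for (i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++)
		printf("bucket %zu: %s\n", i,
		       moving_gc_candidate(&buckets[i]) ? "candidate" : "skip");
	return 0;
}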
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 5d5d031cf381..15fff4f68a7c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -12,11 +12,9 @@
12#include "request.h" 12#include "request.h"
13#include "writeback.h" 13#include "writeback.h"
14 14
15#include <linux/cgroup.h>
16#include <linux/module.h> 15#include <linux/module.h>
17#include <linux/hash.h> 16#include <linux/hash.h>
18#include <linux/random.h> 17#include <linux/random.h>
19#include "blk-cgroup.h"
20 18
21#include <trace/events/bcache.h> 19#include <trace/events/bcache.h>
22 20
@@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache;
27 25
28static void bch_data_insert_start(struct closure *); 26static void bch_data_insert_start(struct closure *);
29 27
30/* Cgroup interface */
31
32#ifdef CONFIG_CGROUP_BCACHE
33static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
34
35static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
36{
37 struct cgroup_subsys_state *css;
38 return cgroup &&
39 (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
40 ? container_of(css, struct bch_cgroup, css)
41 : &bcache_default_cgroup;
42}
43
44struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
45{
46 struct cgroup_subsys_state *css = bio->bi_css
47 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
48 : task_subsys_state(current, bcache_subsys_id);
49
50 return css
51 ? container_of(css, struct bch_cgroup, css)
52 : &bcache_default_cgroup;
53}
54
55static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
56 struct file *file,
57 char __user *buf, size_t nbytes, loff_t *ppos)
58{
59 char tmp[1024];
60 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
61 cgroup_to_bcache(cgrp)->cache_mode + 1);
62
63 if (len < 0)
64 return len;
65
66 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
67}
68
69static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
70 const char *buf)
71{
72 int v = bch_read_string_list(buf, bch_cache_modes);
73 if (v < 0)
74 return v;
75
76 cgroup_to_bcache(cgrp)->cache_mode = v - 1;
77 return 0;
78}
79
80static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
81{
82 return cgroup_to_bcache(cgrp)->verify;
83}
84
85static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
86{
87 cgroup_to_bcache(cgrp)->verify = val;
88 return 0;
89}
90
91static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
92{
93 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
94 return atomic_read(&bcachecg->stats.cache_hits);
95}
96
97static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
98{
99 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
100 return atomic_read(&bcachecg->stats.cache_misses);
101}
102
103static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
104 struct cftype *cft)
105{
106 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
107 return atomic_read(&bcachecg->stats.cache_bypass_hits);
108}
109
110static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
111 struct cftype *cft)
112{
113 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
114 return atomic_read(&bcachecg->stats.cache_bypass_misses);
115}
116
117static struct cftype bch_files[] = {
118 {
119 .name = "cache_mode",
120 .read = cache_mode_read,
121 .write_string = cache_mode_write,
122 },
123 {
124 .name = "verify",
125 .read_u64 = bch_verify_read,
126 .write_u64 = bch_verify_write,
127 },
128 {
129 .name = "cache_hits",
130 .read_u64 = bch_cache_hits_read,
131 },
132 {
133 .name = "cache_misses",
134 .read_u64 = bch_cache_misses_read,
135 },
136 {
137 .name = "cache_bypass_hits",
138 .read_u64 = bch_cache_bypass_hits_read,
139 },
140 {
141 .name = "cache_bypass_misses",
142 .read_u64 = bch_cache_bypass_misses_read,
143 },
144 { } /* terminate */
145};
146
147static void init_bch_cgroup(struct bch_cgroup *cg)
148{
149 cg->cache_mode = -1;
150}
151
152static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
153{
154 struct bch_cgroup *cg;
155
156 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
157 if (!cg)
158 return ERR_PTR(-ENOMEM);
159 init_bch_cgroup(cg);
160 return &cg->css;
161}
162
163static void bcachecg_destroy(struct cgroup *cgroup)
164{
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 kfree(cg);
167}
168
169struct cgroup_subsys bcache_subsys = {
170 .create = bcachecg_create,
171 .destroy = bcachecg_destroy,
172 .subsys_id = bcache_subsys_id,
173 .name = "bcache",
174 .module = THIS_MODULE,
175};
176EXPORT_SYMBOL_GPL(bcache_subsys);
177#endif
178
179static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) 28static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
180{ 29{
181#ifdef CONFIG_CGROUP_BCACHE
182 int r = bch_bio_to_cgroup(bio)->cache_mode;
183 if (r >= 0)
184 return r;
185#endif
186 return BDEV_CACHE_MODE(&dc->sb); 30 return BDEV_CACHE_MODE(&dc->sb);
187} 31}
188 32
189static bool verify(struct cached_dev *dc, struct bio *bio) 33static bool verify(struct cached_dev *dc, struct bio *bio)
190{ 34{
191#ifdef CONFIG_CGROUP_BCACHE
192 if (bch_bio_to_cgroup(bio)->verify)
193 return true;
194#endif
195 return dc->verify; 35 return dc->verify;
196} 36}
197 37
@@ -248,7 +88,7 @@ static void bch_data_insert_keys(struct closure *cl)
248 atomic_dec_bug(journal_ref); 88 atomic_dec_bug(journal_ref);
249 89
250 if (!op->insert_data_done) 90 if (!op->insert_data_done)
251 continue_at(cl, bch_data_insert_start, bcache_wq); 91 continue_at(cl, bch_data_insert_start, op->wq);
252 92
253 bch_keylist_free(&op->insert_keys); 93 bch_keylist_free(&op->insert_keys);
254 closure_return(cl); 94 closure_return(cl);
@@ -297,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl)
297 op->insert_data_done = true; 137 op->insert_data_done = true;
298 bio_put(bio); 138 bio_put(bio);
299out: 139out:
300 continue_at(cl, bch_data_insert_keys, bcache_wq); 140 continue_at(cl, bch_data_insert_keys, op->wq);
301} 141}
302 142
303static void bch_data_insert_error(struct closure *cl) 143static void bch_data_insert_error(struct closure *cl)
@@ -340,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error)
340 if (op->writeback) 180 if (op->writeback)
341 op->error = error; 181 op->error = error;
342 else if (!op->replace) 182 else if (!op->replace)
343 set_closure_fn(cl, bch_data_insert_error, bcache_wq); 183 set_closure_fn(cl, bch_data_insert_error, op->wq);
344 else 184 else
345 set_closure_fn(cl, NULL, NULL); 185 set_closure_fn(cl, NULL, NULL);
346 } 186 }
@@ -376,7 +216,7 @@ static void bch_data_insert_start(struct closure *cl)
376 if (bch_keylist_realloc(&op->insert_keys, 216 if (bch_keylist_realloc(&op->insert_keys,
377 3 + (op->csum ? 1 : 0), 217 3 + (op->csum ? 1 : 0),
378 op->c)) 218 op->c))
379 continue_at(cl, bch_data_insert_keys, bcache_wq); 219 continue_at(cl, bch_data_insert_keys, op->wq);
380 220
381 k = op->insert_keys.top; 221 k = op->insert_keys.top;
382 bkey_init(k); 222 bkey_init(k);
@@ -413,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl)
413 } while (n != bio); 253 } while (n != bio);
414 254
415 op->insert_data_done = true; 255 op->insert_data_done = true;
416 continue_at(cl, bch_data_insert_keys, bcache_wq); 256 continue_at(cl, bch_data_insert_keys, op->wq);
417err: 257err:
418 /* bch_alloc_sectors() blocks if s->writeback = true */ 258 /* bch_alloc_sectors() blocks if s->writeback = true */
419 BUG_ON(op->writeback); 259 BUG_ON(op->writeback);
@@ -442,7 +282,7 @@ err:
442 bio_put(bio); 282 bio_put(bio);
443 283
444 if (!bch_keylist_empty(&op->insert_keys)) 284 if (!bch_keylist_empty(&op->insert_keys))
445 continue_at(cl, bch_data_insert_keys, bcache_wq); 285 continue_at(cl, bch_data_insert_keys, op->wq);
446 else 286 else
447 closure_return(cl); 287 closure_return(cl);
448 } 288 }
@@ -824,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
824 s->iop.error = 0; 664 s->iop.error = 0;
825 s->iop.flags = 0; 665 s->iop.flags = 0;
826 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 666 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
667 s->iop.wq = bcache_wq;
827 668
828 return s; 669 return s;
829} 670}
@@ -1203,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
1203static int flash_dev_cache_miss(struct btree *b, struct search *s, 1044static int flash_dev_cache_miss(struct btree *b, struct search *s,
1204 struct bio *bio, unsigned sectors) 1045 struct bio *bio, unsigned sectors)
1205{ 1046{
1206 struct bio_vec bv; 1047 unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
1207 struct bvec_iter iter;
1208
1209 /* Zero fill bio */
1210 1048
1211 bio_for_each_segment(bv, bio, iter) { 1049 swap(bio->bi_iter.bi_size, bytes);
1212 unsigned j = min(bv.bv_len >> 9, sectors); 1050 zero_fill_bio(bio);
1213 1051 swap(bio->bi_iter.bi_size, bytes);
1214 void *p = kmap(bv.bv_page);
1215 memset(p + bv.bv_offset, 0, j << 9);
1216 kunmap(bv.bv_page);
1217
1218 sectors -= j;
1219 }
1220 1052
1221 bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); 1053 bio_advance(bio, bytes);
1222 1054
1223 if (!bio->bi_iter.bi_size) 1055 if (!bio->bi_iter.bi_size)
1224 return MAP_DONE; 1056 return MAP_DONE;
@@ -1313,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d)
1313 1145
1314void bch_request_exit(void) 1146void bch_request_exit(void)
1315{ 1147{
1316#ifdef CONFIG_CGROUP_BCACHE
1317 cgroup_unload_subsys(&bcache_subsys);
1318#endif
1319 if (bch_search_cache) 1148 if (bch_search_cache)
1320 kmem_cache_destroy(bch_search_cache); 1149 kmem_cache_destroy(bch_search_cache);
1321} 1150}
@@ -1326,11 +1155,5 @@ int __init bch_request_init(void)
1326 if (!bch_search_cache) 1155 if (!bch_search_cache)
1327 return -ENOMEM; 1156 return -ENOMEM;
1328 1157
1329#ifdef CONFIG_CGROUP_BCACHE
1330 cgroup_load_subsys(&bcache_subsys);
1331 init_bch_cgroup(&bcache_default_cgroup);
1332
1333 cgroup_add_cftypes(&bcache_subsys, bch_files);
1334#endif
1335 return 0; 1158 return 0;
1336} 1159}
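The flash_dev_cache_miss() rewrite above zero-fills only the missed portion of the bio by temporarily swapping bi_size down to the byte count it wants to touch, calling zero_fill_bio(), swapping the size back, and then advancing by that count. The same clamp-operate-restore idiom on a plain buffer is sketched below; the span struct and helper names are made up for the sketch.

#include <stdio.h>
#include <string.h>

struct span {
	unsigned char *data;
	size_t size;		/* bytes remaining in the span */
};

static void zero_fill_span(struct span *s)
{
	memset(s->data, 0, s->size);
}

#define SWAP(a, b) do { size_t _t = (a); (a) = (b); (b) = _t; } while (0)

int main(void)
{
	unsigned char buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	struct span s = { buf, sizeof(buf) };
	size_t bytes = 3;	/* only the first 3 bytes should be zeroed */
	size_t i;

	SWAP(s.size, bytes);	/* clamp the span to 3 bytes */
	zero_fill_span(&s);	/* operates on the clamped size only */
	SWAP(s.size, bytes);	/* restore the real size */

	s.data += bytes;	/* then advance past the zeroed region */
	s.size -= bytes;

	for (i = 0; i < sizeof(buf); i++)
		printf("%u ", (unsigned)buf[i]);
	printf("\n");		/* prints: 0 0 0 4 5 6 7 8 */
	return 0;
}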
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 39f21dbedc38..1ff36875c2b3 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -1,12 +1,11 @@
1#ifndef _BCACHE_REQUEST_H_ 1#ifndef _BCACHE_REQUEST_H_
2#define _BCACHE_REQUEST_H_ 2#define _BCACHE_REQUEST_H_
3 3
4#include <linux/cgroup.h>
5
6struct data_insert_op { 4struct data_insert_op {
7 struct closure cl; 5 struct closure cl;
8 struct cache_set *c; 6 struct cache_set *c;
9 struct bio *bio; 7 struct bio *bio;
8 struct workqueue_struct *wq;
10 9
11 unsigned inode; 10 unsigned inode;
12 uint16_t write_point; 11 uint16_t write_point;
@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d);
41 40
42extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; 41extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
43 42
44struct bch_cgroup {
45#ifdef CONFIG_CGROUP_BCACHE
46 struct cgroup_subsys_state css;
47#endif
48 /*
49 * We subtract one from the index into bch_cache_modes[], so that
50 * default == -1; this makes it so the rest match up with d->cache_mode,
51 * and we use d->cache_mode if cgrp->cache_mode < 0
52 */
53 short cache_mode;
54 bool verify;
55 struct cache_stat_collector stats;
56};
57
58struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
59
60#endif /* _BCACHE_REQUEST_H_ */ 43#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 84d0782f702e..0ca072c20d0d 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
201 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 201 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
202 mark_cache_stats(&dc->accounting.collector, hit, bypass); 202 mark_cache_stats(&dc->accounting.collector, hit, bypass);
203 mark_cache_stats(&c->accounting.collector, hit, bypass); 203 mark_cache_stats(&c->accounting.collector, hit, bypass);
204#ifdef CONFIG_CGROUP_BCACHE
205 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
206#endif
207} 204}
208 205
209void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) 206void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 24a3a1546caa..926ded8ccbf5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
541 closure_sync(cl); 541 closure_sync(cl);
542} 542}
543 543
544#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \
545 fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
546
547void bch_prio_write(struct cache *ca) 544void bch_prio_write(struct cache *ca)
548{ 545{
549 int i; 546 int i;
@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca)
554 551
555 lockdep_assert_held(&ca->set->bucket_lock); 552 lockdep_assert_held(&ca->set->bucket_lock);
556 553
557 for (b = ca->buckets;
558 b < ca->buckets + ca->sb.nbuckets; b++)
559 b->disk_gen = b->gen;
560
561 ca->disk_buckets->seq++; 554 ca->disk_buckets->seq++;
562 555
563 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), 556 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca)
601 594
602 mutex_lock(&ca->set->bucket_lock); 595 mutex_lock(&ca->set->bucket_lock);
603 596
604 ca->need_save_prio = 0;
605
606 /* 597 /*
607 * Don't want the old priorities to get garbage collected until after we 598 * Don't want the old priorities to get garbage collected until after we
608 * finish writing the new ones, and they're journalled 599 * finish writing the new ones, and they're journalled
609 */ 600 */
610 for (i = 0; i < prio_buckets(ca); i++) 601 for (i = 0; i < prio_buckets(ca); i++) {
602 if (ca->prio_last_buckets[i])
603 __bch_bucket_free(ca,
604 &ca->buckets[ca->prio_last_buckets[i]]);
605
611 ca->prio_last_buckets[i] = ca->prio_buckets[i]; 606 ca->prio_last_buckets[i] = ca->prio_buckets[i];
607 }
612} 608}
613 609
614static void prio_read(struct cache *ca, uint64_t bucket) 610static void prio_read(struct cache *ca, uint64_t bucket)
@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
639 } 635 }
640 636
641 b->prio = le16_to_cpu(d->prio); 637 b->prio = le16_to_cpu(d->prio);
642 b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; 638 b->gen = b->last_gc = d->gen;
643 } 639 }
644} 640}
645 641
@@ -843,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
843 q->limits.max_segment_size = UINT_MAX; 839 q->limits.max_segment_size = UINT_MAX;
844 q->limits.max_segments = BIO_MAX_PAGES; 840 q->limits.max_segments = BIO_MAX_PAGES;
845 q->limits.max_discard_sectors = UINT_MAX; 841 q->limits.max_discard_sectors = UINT_MAX;
842 q->limits.discard_granularity = 512;
846 q->limits.io_min = block_size; 843 q->limits.io_min = block_size;
847 q->limits.logical_block_size = block_size; 844 q->limits.logical_block_size = block_size;
848 q->limits.physical_block_size = block_size; 845 q->limits.physical_block_size = block_size;
@@ -1355,6 +1352,8 @@ static void cache_set_free(struct closure *cl)
1355 bch_bset_sort_state_free(&c->sort); 1352 bch_bset_sort_state_free(&c->sort);
1356 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1353 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1357 1354
1355 if (c->moving_gc_wq)
1356 destroy_workqueue(c->moving_gc_wq);
1358 if (c->bio_split) 1357 if (c->bio_split)
1359 bioset_free(c->bio_split); 1358 bioset_free(c->bio_split);
1360 if (c->fill_iter) 1359 if (c->fill_iter)
@@ -1395,14 +1394,21 @@ static void cache_set_flush(struct closure *cl)
1395 list_add(&c->root->list, &c->btree_cache); 1394 list_add(&c->root->list, &c->btree_cache);
1396 1395
1397 /* Should skip this if we're unregistering because of an error */ 1396 /* Should skip this if we're unregistering because of an error */
1398 list_for_each_entry(b, &c->btree_cache, list) 1397 list_for_each_entry(b, &c->btree_cache, list) {
1398 mutex_lock(&b->write_lock);
1399 if (btree_node_dirty(b)) 1399 if (btree_node_dirty(b))
1400 bch_btree_node_write(b, NULL); 1400 __bch_btree_node_write(b, NULL);
1401 mutex_unlock(&b->write_lock);
1402 }
1401 1403
1402 for_each_cache(ca, c, i) 1404 for_each_cache(ca, c, i)
1403 if (ca->alloc_thread) 1405 if (ca->alloc_thread)
1404 kthread_stop(ca->alloc_thread); 1406 kthread_stop(ca->alloc_thread);
1405 1407
1408 cancel_delayed_work_sync(&c->journal.work);
1409 /* flush last journal entry if needed */
1410 c->journal.work.work.func(&c->journal.work.work);
1411
1406 closure_return(cl); 1412 closure_return(cl);
1407} 1413}
1408 1414
@@ -1485,14 +1491,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1485 1491
1486 sema_init(&c->sb_write_mutex, 1); 1492 sema_init(&c->sb_write_mutex, 1);
1487 mutex_init(&c->bucket_lock); 1493 mutex_init(&c->bucket_lock);
1488 init_waitqueue_head(&c->try_wait); 1494 init_waitqueue_head(&c->btree_cache_wait);
1489 init_waitqueue_head(&c->bucket_wait); 1495 init_waitqueue_head(&c->bucket_wait);
1490 sema_init(&c->uuid_write_mutex, 1); 1496 sema_init(&c->uuid_write_mutex, 1);
1491 1497
1492 spin_lock_init(&c->btree_gc_time.lock); 1498 spin_lock_init(&c->btree_gc_time.lock);
1493 spin_lock_init(&c->btree_split_time.lock); 1499 spin_lock_init(&c->btree_split_time.lock);
1494 spin_lock_init(&c->btree_read_time.lock); 1500 spin_lock_init(&c->btree_read_time.lock);
1495 spin_lock_init(&c->try_harder_time.lock);
1496 1501
1497 bch_moving_init_cache_set(c); 1502 bch_moving_init_cache_set(c);
1498 1503
@@ -1517,6 +1522,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1517 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1522 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1518 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1523 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1519 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1524 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1525 !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
1520 bch_journal_alloc(c) || 1526 bch_journal_alloc(c) ||
1521 bch_btree_cache_alloc(c) || 1527 bch_btree_cache_alloc(c) ||
1522 bch_open_buckets_alloc(c) || 1528 bch_open_buckets_alloc(c) ||
@@ -1580,7 +1586,7 @@ static void run_cache_set(struct cache_set *c)
1580 goto err; 1586 goto err;
1581 1587
1582 err = "error reading btree root"; 1588 err = "error reading btree root";
1583 c->root = bch_btree_node_get(c, k, j->btree_level, true); 1589 c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
1584 if (IS_ERR_OR_NULL(c->root)) 1590 if (IS_ERR_OR_NULL(c->root))
1585 goto err; 1591 goto err;
1586 1592
@@ -1596,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
1596 goto err; 1602 goto err;
1597 1603
1598 bch_journal_mark(c, &journal); 1604 bch_journal_mark(c, &journal);
1599 bch_btree_gc_finish(c); 1605 bch_initial_gc_finish(c);
1600 pr_debug("btree_check() done"); 1606 pr_debug("btree_check() done");
1601 1607
1602 /* 1608 /*
@@ -1638,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
1638 ca->sb.d[j] = ca->sb.first_bucket + j; 1644 ca->sb.d[j] = ca->sb.first_bucket + j;
1639 } 1645 }
1640 1646
1641 bch_btree_gc_finish(c); 1647 bch_initial_gc_finish(c);
1642 1648
1643 err = "error starting allocator thread"; 1649 err = "error starting allocator thread";
1644 for_each_cache(ca, c, i) 1650 for_each_cache(ca, c, i)
@@ -1655,12 +1661,14 @@ static void run_cache_set(struct cache_set *c)
1655 goto err; 1661 goto err;
1656 1662
1657 err = "cannot allocate new btree root"; 1663 err = "cannot allocate new btree root";
1658 c->root = bch_btree_node_alloc(c, 0, true); 1664 c->root = bch_btree_node_alloc(c, NULL, 0);
1659 if (IS_ERR_OR_NULL(c->root)) 1665 if (IS_ERR_OR_NULL(c->root))
1660 goto err; 1666 goto err;
1661 1667
1668 mutex_lock(&c->root->write_lock);
1662 bkey_copy_key(&c->root->key, &MAX_KEY); 1669 bkey_copy_key(&c->root->key, &MAX_KEY);
1663 bch_btree_node_write(c->root, &cl); 1670 bch_btree_node_write(c->root, &cl);
1671 mutex_unlock(&c->root->write_lock);
1664 1672
1665 bch_btree_set_root(c->root); 1673 bch_btree_set_root(c->root);
1666 rw_unlock(true, c->root); 1674 rw_unlock(true, c->root);
@@ -1782,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
1782 vfree(ca->buckets); 1790 vfree(ca->buckets);
1783 1791
1784 free_heap(&ca->heap); 1792 free_heap(&ca->heap);
1785 free_fifo(&ca->unused);
1786 free_fifo(&ca->free_inc); 1793 free_fifo(&ca->free_inc);
1787 1794
1788 for (i = 0; i < RESERVE_NR; i++) 1795 for (i = 0; i < RESERVE_NR; i++)
@@ -1819,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1819 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || 1826 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
1820 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || 1827 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1821 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || 1828 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1822 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1823 !init_heap(&ca->heap, free << 3, GFP_KERNEL) || 1829 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
1824 !(ca->buckets = vzalloc(sizeof(struct bucket) * 1830 !(ca->buckets = vzalloc(sizeof(struct bucket) *
1825 ca->sb.nbuckets)) || 1831 ca->sb.nbuckets)) ||
@@ -1834,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1834 for_each_bucket(b, ca) 1840 for_each_bucket(b, ca)
1835 atomic_set(&b->pin, 0); 1841 atomic_set(&b->pin, 0);
1836 1842
1837 if (bch_cache_allocator_init(ca))
1838 goto err;
1839
1840 return 0; 1843 return 0;
1841err:
1842 kobject_put(&ca->kobj);
1843 return -ENOMEM;
1844} 1844}
1845 1845
1846static void register_cache(struct cache_sb *sb, struct page *sb_page, 1846static void register_cache(struct cache_sb *sb, struct page *sb_page,
@@ -1869,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
1869 if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) 1869 if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
1870 goto err; 1870 goto err;
1871 1871
1872 mutex_lock(&bch_register_lock);
1872 err = register_cache_set(ca); 1873 err = register_cache_set(ca);
1874 mutex_unlock(&bch_register_lock);
1875
1873 if (err) 1876 if (err)
1874 goto err; 1877 goto err;
1875 1878
@@ -1931,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1931 if (!try_module_get(THIS_MODULE)) 1934 if (!try_module_get(THIS_MODULE))
1932 return -EBUSY; 1935 return -EBUSY;
1933 1936
1934 mutex_lock(&bch_register_lock);
1935
1936 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || 1937 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
1937 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) 1938 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
1938 goto err; 1939 goto err;
@@ -1965,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1965 if (!dc) 1966 if (!dc)
1966 goto err_close; 1967 goto err_close;
1967 1968
1969 mutex_lock(&bch_register_lock);
1968 register_bdev(sb, sb_page, bdev, dc); 1970 register_bdev(sb, sb_page, bdev, dc);
1971 mutex_unlock(&bch_register_lock);
1969 } else { 1972 } else {
1970 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 1973 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1971 if (!ca) 1974 if (!ca)
@@ -1978,7 +1981,6 @@ out:
1978 put_page(sb_page); 1981 put_page(sb_page);
1979 kfree(sb); 1982 kfree(sb);
1980 kfree(path); 1983 kfree(path);
1981 mutex_unlock(&bch_register_lock);
1982 module_put(THIS_MODULE); 1984 module_put(THIS_MODULE);
1983 return ret; 1985 return ret;
1984 1986
@@ -2057,7 +2059,6 @@ static void bcache_exit(void)
2057{ 2059{
2058 bch_debug_exit(); 2060 bch_debug_exit();
2059 bch_request_exit(); 2061 bch_request_exit();
2060 bch_btree_exit();
2061 if (bcache_kobj) 2062 if (bcache_kobj)
2062 kobject_put(bcache_kobj); 2063 kobject_put(bcache_kobj);
2063 if (bcache_wq) 2064 if (bcache_wq)
@@ -2087,7 +2088,6 @@ static int __init bcache_init(void)
2087 if (!(bcache_wq = create_workqueue("bcache")) || 2088 if (!(bcache_wq = create_workqueue("bcache")) ||
2088 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || 2089 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
2089 sysfs_create_files(bcache_kobj, files) || 2090 sysfs_create_files(bcache_kobj, files) ||
2090 bch_btree_init() ||
2091 bch_request_init() || 2091 bch_request_init() ||
2092 bch_debug_init(bcache_kobj)) 2092 bch_debug_init(bcache_kobj))
2093 goto err; 2093 goto err;
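The super.c hunks above stop holding bch_register_lock across the whole register_bcache() path and instead take it only around register_cache_set()/register_bdev(), the steps that actually touch shared cache-set state. A pthread sketch of narrowing a critical section that way follows; the probe/attach split is a stand-in for the real registration steps, not the bcache code.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t register_lock = PTHREAD_MUTEX_INITIALIZER;
static int registered_devices;

/* slow, device-local work: no shared state touched, so no lock held */
static void probe_device(int id)
{
	printf("probing device %d (no lock held)\n", id);
	usleep(1000);		/* stands in for opening the bdev, reading the sb */
}

/* short update of shared state: the only part that needs the lock */
static void attach_device(int id)
{
	pthread_mutex_lock(&register_lock);
	registered_devices++;
	printf("attached device %d, total %d\n", id, registered_devices);
	pthread_mutex_unlock(&register_lock);
}

static void *register_device(void *arg)
{
	int id = (int)(long)arg;

	probe_device(id);
	attach_device(id);
	return NULL;
}

int main(void)
{
	pthread_t t[2];
	long i;

	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, register_device, (void *)i);
	for (i = 0; i < 2; i++)
		pthread_join(&t[i], NULL);
	return 0;
}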
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index d8458d477a12..b3ff57d61dde 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms);
54sysfs_time_stats_attribute(btree_split, sec, us); 54sysfs_time_stats_attribute(btree_split, sec, us);
55sysfs_time_stats_attribute(btree_sort, ms, us); 55sysfs_time_stats_attribute(btree_sort, ms, us);
56sysfs_time_stats_attribute(btree_read, ms, us); 56sysfs_time_stats_attribute(btree_read, ms, us);
57sysfs_time_stats_attribute(try_harder, ms, us);
58 57
59read_attribute(btree_nodes); 58read_attribute(btree_nodes);
60read_attribute(btree_used_percent); 59read_attribute(btree_used_percent);
@@ -406,7 +405,7 @@ struct bset_stats_op {
406 struct bset_stats stats; 405 struct bset_stats stats;
407}; 406};
408 407
409static int btree_bset_stats(struct btree_op *b_op, struct btree *b) 408static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
410{ 409{
411 struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); 410 struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
412 411
@@ -424,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
424 memset(&op, 0, sizeof(op)); 423 memset(&op, 0, sizeof(op));
425 bch_btree_op_init(&op.op, -1); 424 bch_btree_op_init(&op.op, -1);
426 425
427 ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); 426 ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
428 if (ret < 0) 427 if (ret < 0)
429 return ret; 428 return ret;
430 429
@@ -442,81 +441,81 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
 			op.stats.floats, op.stats.failed);
 }
 
-SHOW(__bch_cache_set)
+static unsigned bch_root_usage(struct cache_set *c)
 {
-	unsigned root_usage(struct cache_set *c)
-	{
-		unsigned bytes = 0;
-		struct bkey *k;
-		struct btree *b;
-		struct btree_iter iter;
+	unsigned bytes = 0;
+	struct bkey *k;
+	struct btree *b;
+	struct btree_iter iter;
 
-		goto lock_root;
+	goto lock_root;
 
-		do {
-			rw_unlock(false, b);
+	do {
+		rw_unlock(false, b);
 lock_root:
-			b = c->root;
-			rw_lock(false, b, b->level);
-		} while (b != c->root);
+		b = c->root;
+		rw_lock(false, b, b->level);
+	} while (b != c->root);
 
-		for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
-			bytes += bkey_bytes(k);
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+		bytes += bkey_bytes(k);
 
-		rw_unlock(false, b);
+	rw_unlock(false, b);
 
-		return (bytes * 100) / btree_bytes(c);
-	}
+	return (bytes * 100) / btree_bytes(c);
+}
 
-	size_t cache_size(struct cache_set *c)
-	{
-		size_t ret = 0;
-		struct btree *b;
+static size_t bch_cache_size(struct cache_set *c)
+{
+	size_t ret = 0;
+	struct btree *b;
 
-		mutex_lock(&c->bucket_lock);
-		list_for_each_entry(b, &c->btree_cache, list)
-			ret += 1 << (b->keys.page_order + PAGE_SHIFT);
+	mutex_lock(&c->bucket_lock);
+	list_for_each_entry(b, &c->btree_cache, list)
+		ret += 1 << (b->keys.page_order + PAGE_SHIFT);
 
-		mutex_unlock(&c->bucket_lock);
-		return ret;
-	}
-
-	unsigned cache_max_chain(struct cache_set *c)
-	{
-		unsigned ret = 0;
-		struct hlist_head *h;
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
 
-		mutex_lock(&c->bucket_lock);
+static unsigned bch_cache_max_chain(struct cache_set *c)
+{
+	unsigned ret = 0;
+	struct hlist_head *h;
 
-		for (h = c->bucket_hash;
-		     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
-		     h++) {
-			unsigned i = 0;
-			struct hlist_node *p;
+	mutex_lock(&c->bucket_lock);
 
-			hlist_for_each(p, h)
-				i++;
+	for (h = c->bucket_hash;
+	     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+	     h++) {
+		unsigned i = 0;
+		struct hlist_node *p;
 
-			ret = max(ret, i);
-		}
+		hlist_for_each(p, h)
+			i++;
 
-		mutex_unlock(&c->bucket_lock);
-		return ret;
+		ret = max(ret, i);
 	}
 
-	unsigned btree_used(struct cache_set *c)
-	{
-		return div64_u64(c->gc_stats.key_bytes * 100,
-				 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
-	}
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
 
-	unsigned average_key_size(struct cache_set *c)
-	{
-		return c->gc_stats.nkeys
-			? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
-			: 0;
-	}
+static unsigned bch_btree_used(struct cache_set *c)
+{
+	return div64_u64(c->gc_stats.key_bytes * 100,
+			 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
 
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+	return c->gc_stats.nkeys
+		? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+		: 0;
+}
+
+SHOW(__bch_cache_set)
+{
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 
 	sysfs_print(synchronous, CACHE_SYNC(&c->sb));
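The hunk above is the sysfs side of "bcache: remove nested function usage": helpers such as root_usage() and cache_size() were GNU C nested functions defined inside SHOW(__bch_cache_set), and they are hoisted to file-scope static functions with a bch_ prefix, their bodies unchanged apart from losing one level of indentation. A minimal before/after sketch of that refactor follows; the type and function names are hypothetical stand-ins, not the bcache code.

/* Hypothetical stand-in type; not the bcache cache_set. */
#include <stdio.h>

struct cache_like { unsigned used, capacity; };

#if 0	/* Before: a nested function (a GNU C extension) inside the show routine. */
int show_stats(struct cache_like *c)
{
	unsigned used_percent(struct cache_like *c)
	{
		return c->used * 100 / c->capacity;
	}

	return printf("used: %u%%\n", used_percent(c));
}
#endif

/* After: the helper is hoisted to a file-scope static function, the same
 * way root_usage() becomes bch_root_usage() in the hunk above.
 */
static unsigned used_percent(struct cache_like *c)
{
	return c->used * 100 / c->capacity;
}

int show_stats(struct cache_like *c)
{
	return printf("used: %u%%\n", used_percent(c));
}

int main(void)
{
	struct cache_like c = { .used = 3, .capacity = 8 };

	return show_stats(&c) < 0;	/* prints "used: 37%" */
}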
@@ -524,21 +523,20 @@ lock_root:
 	sysfs_hprint(bucket_size, bucket_bytes(c));
 	sysfs_hprint(block_size, block_bytes(c));
 	sysfs_print(tree_depth, c->root->level);
-	sysfs_print(root_usage_percent, root_usage(c));
+	sysfs_print(root_usage_percent, bch_root_usage(c));
 
-	sysfs_hprint(btree_cache_size, cache_size(c));
-	sysfs_print(btree_cache_max_chain, cache_max_chain(c));
+	sysfs_hprint(btree_cache_size, bch_cache_size(c));
+	sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
 	sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
 
 	sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
 	sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
 	sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
 	sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
-	sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
 
-	sysfs_print(btree_used_percent, btree_used(c));
+	sysfs_print(btree_used_percent, bch_btree_used(c));
 	sysfs_print(btree_nodes, c->gc_stats.nodes);
-	sysfs_hprint(average_key_size, average_key_size(c));
+	sysfs_hprint(average_key_size, bch_average_key_size(c));
 
 	sysfs_print(cache_read_races,
 		    atomic_long_read(&c->cache_read_races));
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	sysfs_time_stats_attribute_list(btree_split, sec, us)
 	sysfs_time_stats_attribute_list(btree_sort, ms, us)
 	sysfs_time_stats_attribute_list(btree_read, ms, us)
-	sysfs_time_stats_attribute_list(try_harder, ms, us)
 
 	&sysfs_btree_nodes,
 	&sysfs_btree_used_percent,
@@ -761,7 +758,9 @@ SHOW(__bch_cache)
 	int cmp(const void *l, const void *r)
 	{ return *((uint16_t *) r) - *((uint16_t *) l); }
 
-	size_t n = ca->sb.nbuckets, i, unused, btree;
+	struct bucket *b;
+	size_t n = ca->sb.nbuckets, i;
+	size_t unused = 0, available = 0, dirty = 0, meta = 0;
 	uint64_t sum = 0;
 	/* Compute 31 quantiles */
 	uint16_t q[31], *p, *cached;
@@ -772,6 +771,17 @@ SHOW(__bch_cache)
 		return -ENOMEM;
 
 	mutex_lock(&ca->set->bucket_lock);
+	for_each_bucket(b, ca) {
+		if (!GC_SECTORS_USED(b))
+			unused++;
+		if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
+			available++;
+		if (GC_MARK(b) == GC_MARK_DIRTY)
+			dirty++;
+		if (GC_MARK(b) == GC_MARK_METADATA)
+			meta++;
+	}
+
 	for (i = ca->sb.first_bucket; i < n; i++)
 		p[i] = ca->buckets[i].prio;
 	mutex_unlock(&ca->set->bucket_lock);
@@ -786,10 +796,7 @@ SHOW(__bch_cache)
 
 	while (cached < p + n &&
 	       *cached == BTREE_PRIO)
-		cached++;
-
-	btree = cached - p;
-	n -= btree;
+		cached++, n--;
 
 	for (i = 0; i < n; i++)
 		sum += INITIAL_PRIO - cached[i];
@@ -805,12 +812,16 @@ SHOW(__bch_cache)
 
 	ret = scnprintf(buf, PAGE_SIZE,
 			"Unused:		%zu%%\n"
+			"Clean:		%zu%%\n"
+			"Dirty:		%zu%%\n"
 			"Metadata:	%zu%%\n"
 			"Average:	%llu\n"
 			"Sectors per Q:	%zu\n"
 			"Quantiles:	[",
 			unused * 100 / (size_t) ca->sb.nbuckets,
-			btree * 100 / (size_t) ca->sb.nbuckets, sum,
+			available * 100 / (size_t) ca->sb.nbuckets,
+			dirty * 100 / (size_t) ca->sb.nbuckets,
+			meta * 100 / (size_t) ca->sb.nbuckets, sum,
 			n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
 
 	for (i = 0; i < ARRAY_SIZE(q); i++)
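With the hunks above, priority_stats no longer infers a btree-bucket count from prio == BTREE_PRIO; instead every bucket is classified once by its GC mark while bucket_lock is held, and the Unused/Clean/Dirty/Metadata lines are each reported as an integer percentage of ca->sb.nbuckets. A small self-contained sketch of that classify-then-report shape is below; the enum values, sample buckets, and counts are made up for illustration and do not reflect the real GC_MARK encoding.

/* Illustrative only: hypothetical marks and sample data. */
#include <stdio.h>
#include <stddef.h>

enum mark { MARK_RECLAIMABLE, MARK_DIRTY, MARK_METADATA };

struct bucket_like {
	enum mark mark;
	unsigned sectors_used;
};

int main(void)
{
	/* Stands in for for_each_bucket(b, ca) over ca->sb.nbuckets buckets. */
	struct bucket_like buckets[] = {
		{ MARK_RECLAIMABLE, 0 },	/* unused: no sectors in use */
		{ MARK_RECLAIMABLE, 12 },	/* clean, reclaimable data   */
		{ MARK_DIRTY,       40 },	/* dirty writeback data      */
		{ MARK_METADATA,    64 },	/* btree / journal metadata  */
	};
	size_t n = sizeof(buckets) / sizeof(buckets[0]);
	size_t unused = 0, available = 0, dirty = 0, meta = 0;

	for (size_t i = 0; i < n; i++) {
		if (!buckets[i].sectors_used)
			unused++;
		if (buckets[i].mark == MARK_RECLAIMABLE)
			available++;
		if (buckets[i].mark == MARK_DIRTY)
			dirty++;
		if (buckets[i].mark == MARK_METADATA)
			meta++;
	}

	/* Same integer arithmetic as the sysfs output: count * 100 / nbuckets. */
	printf("Unused:   %zu%%\n", unused * 100 / n);
	printf("Clean:    %zu%%\n", available * 100 / n);
	printf("Dirty:    %zu%%\n", dirty * 100 / n);
	printf("Metadata: %zu%%\n", meta * 100 / n);
	return 0;
}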
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index adbc3df17a80..b7820b0d2621 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
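The trace.c hunk only renames the exported symbol from bcache_alloc_invalidate to bcache_invalidate to match the renamed tracepoint. For context, EXPORT_TRACEPOINT_SYMBOL_GPL() is what allows modules to hook a tracepoint defined elsewhere; the usual in-kernel pattern looks roughly like the sketch below. The event prototype shown is a simplified hypothetical, not the actual definition in include/trace/events/bcache.h, and this only builds inside a kernel tree.

/* include/trace/events/foo.h -- simplified illustration of a tracepoint.
 * The bcache_invalidate prototype here is made up for the example.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM bcache

#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BCACHE_H

#include <linux/tracepoint.h>

TRACE_EVENT(bcache_invalidate,
	TP_PROTO(unsigned long bucket),
	TP_ARGS(bucket),

	TP_STRUCT__entry(
		__field(unsigned long, bucket)
	),

	TP_fast_assign(
		__entry->bucket = bucket;
	),

	TP_printk("invalidated bucket %lu", __entry->bucket)
);

#endif /* _TRACE_BCACHE_H */

/* This part must be outside the include guard. */
#include <trace/define_trace.h>

/* In one .c file (here, drivers/md/bcache/trace.c) the tracepoints are
 * instantiated once and exported so other objects and modules can call
 * and attach to them:
 *
 *	#define CREATE_TRACE_POINTS
 *	#include <trace/events/bcache.h>
 *
 *	EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
 *
 * Call sites in the driver then use trace_bcache_invalidate(bucket).
 */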